In [1]:
import pandas as pd
import numpy as np
import datetime
import time
import copy
import os
from collections import Counter

In [2]:
# Read the raw login and trade tables for the training and test splits.
df_train_login, df_train_trade, df_test_login, df_test_trade = (
    pd.read_csv(csv_path)
    for csv_path in ('t_login.csv', 't_trade.csv', 't_login_test.csv', 't_trade_test.csv')
)

In [3]:
# ------- Preprocess the data first -------

In [4]:
# Order both login tables in place by user id, then login timestamp, then result.
login_sort_keys = ['id', 'timestamp', 'result']
for login_frame in (df_train_login, df_test_login):
    login_frame.sort_values(by=login_sort_keys, inplace=True, ascending=True)

In [5]:
def get_timestamp(x):
    """Convert a ``"%Y-%m-%d %H:%M:%S"`` string into a timestamp string.

    The text is parsed into a naive ``datetime`` and turned into seconds with
    ``time.mktime`` (which interprets it in the machine's local time zone).
    The float result is returned as a string, e.g. ``"1420070400.0"``.
    """
    parsed = datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S")
    seconds = time.mktime(parsed.timetuple())
    return str(seconds)

In [6]:
# Turn each trade time string into a timestamp string (the last two characters
# are cut off before parsing), then order each trade table in place by user id
# and time.
for trade_frame in (df_train_trade, df_test_trade):
    trade_frame['time'] = trade_frame['time'].apply(lambda raw: get_timestamp(raw[:-2]))
    trade_frame.sort_values(by=['id', 'time'], inplace=True, ascending=True)

In [7]:
def get_datetime(x):
    """Return the local ``datetime`` for a timestamp given as a number or numeric string.

    Anything that is not already a ``float`` is first passed through ``float()``.
    """
    seconds = x if isinstance(x, float) else float(x)
    return datetime.datetime.fromtimestamp(seconds)

# Check whether target is one of the n most frequent values in array.
def is_majority(target, array, n):
    """Return 1 if ``target`` is among the ``n`` most common values of ``array``, else 0.

    When ``array`` holds exactly one element there is nothing to compare
    against, so the "undetermined" code 2 is returned instead.
    """
    if len(array) == 1:
        return 2

    top_values = [value for value, _count in Counter(array).most_common(n)]
    return 1 if any(value == target for value in top_values) else 0

# Check whether target occurs at least n times in array.
def is_exists(target, array, n):
    """Return 1 if ``array.count(target)`` is at least ``n``, otherwise 0."""
    occurrences = array.count(target)
    if occurrences >= n:
        return 1
    return 0

In [8]:
def build_feature_0(login, trade):
    """Copy the basic time columns of ``trade`` into a new feature frame.

    ``login`` is accepted for signature symmetry with the other feature
    builders but is not read. The returned frame holds ``time`` and the five
    ``trade_real_*`` columns, in that order; its shape is printed.
    """
    time_columns = (
        'time',
        'trade_real_month',
        'trade_real_day',
        'trade_real_hour',
        'trade_real_minute',
        'trade_real_second',
    )

    features = pd.DataFrame()
    for column in time_columns:
        features[column] = trade[column]

    print('Shape of Features 0:', features.shape)
    return features

In [9]:
def build_feature_1(login, trade):
    """Build login-history and trade-history features for every row of ``trade``.

    Parameters
    ----------
    login : pandas.DataFrame
        Login records. The columns read here are ``id``, ``time``, ``result``,
        ``timelong``, ``device``, ``log_from``, ``ip``, ``city``, ``type``,
        ``is_scan`` and ``is_sec``. Rows of one user are expected to be in
        chronological order, because the last row before a trade is taken as
        the "last login".
    trade : pandas.DataFrame
        Trade records with ``id`` and a numeric ``time`` column. Rows are
        expected to be sorted by ``id`` then ``time``: ``last_trade_time``
        is computed from the physically preceding row.

    Returns
    -------
    pandas.DataFrame
        One row per trade, carrying the index of ``trade`` so it can be
        concatenated column-wise with other per-trade feature frames.

    Sentinel values
    ---------------
    * The user has login rows, but none earlier than the trade:
      ``last_login_time``/``last_login_timelong`` are -1, ``last_login_result``
      is -40 and the ``*_exists_previous`` flags are -1.
    * The user has no login rows at all: the same columns are -2 / -41 / -2,
      and ``last_login_city``/``last_login_type`` are -1 instead of 0.
    * ``last_trade_time`` is -2 for the very first row and -1 when the
      preceding row belongs to another user.
    """
    # (feature-name fragment, login column) pairs for the categorical login attributes.
    categorical = (
        ('device', 'device'),
        ('from', 'log_from'),
        ('ip', 'ip'),
        ('city', 'city'),
        ('type', 'type'),
    )
    column_order = [
        'last_login_time',
        'last_login_result',
        'last_login_timelong',
        'last_login_device',
        'last_login_from',
        'last_login_ip',
        'last_login_city',
        'last_login_type',
        'last_login_is_scan',
        'last_login_is_sec',
        'last_trade_time',
        # count features
        'count_login_all',
        'count_login_previous',
        'count_trade_all',
        'count_trade_previous',
        'avg_login_previous',
        'max_login_previous',
        'min_login_previous',
        'count_login_fail_all',
        'count_login_fail_previous',
        'count_login_succ_all',
        'count_login_succ_previous',
        'count_login_ip_all',
        'count_login_device_all',
        'count_login_city_all',
        # categorical features
        'is_login_device_usual',
        'is_login_from_usual',
        'is_login_ip_usual',
        'is_login_city_usual',
        'is_login_type_usual',
        'is_login_device_exists_previous',
        'is_login_from_exists_previous',
        'is_login_ip_exists_previous',
        'is_login_city_exists_previous',
        'is_login_type_exists_previous',
    ]

    # How often each ip / device / city occurs in the whole login table.
    set_ip = Counter(login['ip'].tolist())
    set_device = Counter(login['device'].tolist())
    set_city = Counter(login['city'].tolist())

    # Group both tables by user id once, instead of scanning the full tables
    # again for every single trade.
    login_by_id = {key: rows for key, rows in login.groupby('id')}
    trade_times_by_id = {key: rows['time'].astype(float) for key, rows in trade.groupby('id')}

    trade_ids = trade['id'].tolist()
    trade_times = [float(value) for value in trade['time'].tolist()]

    records = []
    for index, (id_, trade_time) in enumerate(zip(trade_ids, trade_times)):
        if index % 1000 == 0:
            print("Processing till line: ", index)

        row = {}

        # ---- login history of this user ----
        related_rows = login_by_id.get(id_)
        has_login = related_rows is not None
        if has_login:
            all_results = related_rows['result'].tolist()
            # Logins that happened strictly before the current trade.
            previous_record = related_rows.loc[related_rows['time'] < trade_time]
            previous_results = previous_record['result'].tolist()
        else:
            all_results = []
            previous_record = None
            previous_results = []

        # A login counts as successful only when result == 1.
        row['count_login_all'] = len(all_results)
        row['count_login_fail_all'] = sum(1 for num in all_results if num != 1)
        row['count_login_succ_all'] = sum(1 for num in all_results if num == 1)
        row['count_login_previous'] = len(previous_results)
        row['count_login_fail_previous'] = sum(1 for num in previous_results if num != 1)
        row['count_login_succ_previous'] = sum(1 for num in previous_results if num == 1)

        if previous_results:
            # Most recent login before the trade.
            last_login_record = previous_record.iloc[-1]

            row['last_login_time'] = trade_time - last_login_record['time']
            row['last_login_result'] = last_login_record['result']
            row['last_login_timelong'] = last_login_record['timelong']
            row['last_login_is_scan'] = last_login_record['is_scan']
            row['last_login_is_sec'] = last_login_record['is_sec']

            # Statistics over ALL of the user's logins (not only the earlier
            # ones), relative to the duration of the last login.
            row['avg_login_previous'] = np.average(related_rows['timelong']) - last_login_record['timelong']
            row['max_login_previous'] = np.max(related_rows['timelong']) - last_login_record['timelong']
            row['min_login_previous'] = np.min(related_rows['timelong']) - last_login_record['timelong']

            # Global frequency of the last login's ip / device / city.
            row['count_login_ip_all'] = set_ip[last_login_record['ip']]
            row['count_login_device_all'] = set_device[last_login_record['device']]
            row['count_login_city_all'] = set_city[last_login_record['city']]

            for label, column in categorical:
                value = last_login_record[column]
                row['last_login_' + label] = value
                # Is the value one of the user's two most common ones?
                row['is_login_' + label + '_usual'] = is_majority(value, related_rows[column], 2)
                # previous_record contains the last login itself, so a count of
                # at least 2 means the value was also seen in an earlier login.
                row['is_login_' + label + '_exists_previous'] = is_exists(
                    value, previous_record[column].tolist(), 2)
        else:
            # No login before this trade: -1 sentinels when the user has login
            # rows (all of them later than the trade), -2 when there are none.
            row['last_login_time'] = -1 if has_login else -2
            row['last_login_result'] = -40 if has_login else -41
            row['last_login_timelong'] = -1 if has_login else -2
            row['last_login_device'] = 0
            row['last_login_from'] = 0
            row['last_login_ip'] = 0
            row['last_login_city'] = 0 if has_login else -1
            row['last_login_type'] = 0 if has_login else -1
            row['last_login_is_scan'] = -1
            row['last_login_is_sec'] = -1

            row['avg_login_previous'] = 0
            row['max_login_previous'] = 0
            row['min_login_previous'] = 0
            row['count_login_ip_all'] = 0
            row['count_login_device_all'] = 0
            row['count_login_city_all'] = 0

            for label, _column in categorical:
                row['is_login_' + label + '_usual'] = 0
                row['is_login_' + label + '_exists_previous'] = -1 if has_login else -2

        # ---- trade history of this user ----
        same_id_times = trade_times_by_id.get(id_)
        if same_id_times is None:
            row['count_trade_all'] = 0
            row['count_trade_previous'] = 0
        else:
            row['count_trade_all'] = same_id_times.shape[0]
            row['count_trade_previous'] = int((same_id_times < trade_time).sum())

        # Seconds since the physically preceding trade row, if it is the same user.
        if index == 0:
            row['last_trade_time'] = -2
        elif trade_ids[index - 1] == id_:
            row['last_trade_time'] = trade_time - trade_times[index - 1]
        else:
            row['last_trade_time'] = -1

        records.append(row)

    # Assemble all features at once and reuse the trade index so that a
    # column-wise concat with other per-trade frames lines up row by row.
    features = pd.DataFrame(records, columns=column_order)
    features.index = trade.index

    print('Shape of Features 1:', features.shape)
    return features

In [10]:
def _add_time_parts(frame, source_column, prefix):
    """Add ``<prefix>month/day/hour/minute/second`` columns derived from a timestamp column."""
    for part in ('month', 'day', 'hour', 'minute', 'second'):
        frame[prefix + part] = frame[source_column].apply(
            lambda x, part=part: getattr(get_datetime(x), part))


def build_feature(login, trade, type, train_trade):
    """Prepare the login/trade tables and build the combined per-trade feature frame.

    The frames passed in are modified in place (columns are added, renamed
    and deleted), so callers should pass copies.

    Parameters
    ----------
    login : pandas.DataFrame
        Login table with a ``timestamp`` column plus ``result``, ``is_scan``
        and ``is_sec``.
    trade : pandas.DataFrame
        Trade table with a ``time`` timestamp column and a ``rowkey`` column.
    type : str
        ``'test'`` additionally prepares ``train_trade``; any other value
        leaves ``train_trade`` untouched.
    train_trade : pandas.DataFrame or None
        Training trade table; only read when ``type == 'test'``.

    Returns
    -------
    pandas.DataFrame
        Column-wise concatenation of ``build_feature_0`` and ``build_feature_1``.
    """
    # basic time features
    _add_time_parts(login, 'timestamp', 'login_real_')
    _add_time_parts(trade, 'time', 'trade_real_')

    # use a common numeric 'time' column in both tables
    login['time'] = login['timestamp']
    trade['time'] = trade['time'].apply(lambda x: float(x))
    del login['timestamp']
    del trade['rowkey']

    # result > 0 is mapped to 1, everything else to 0
    login['login_result'] = login['result'].apply(lambda x: 1 if x > 0 else 0)
    login['is_scan'] = login['is_scan'].apply(lambda x: 1 if x else 0)
    login['is_sec'] = login['is_sec'].apply(lambda x: 1 if x else 0)

    # For the test split the historical training trades are prepared the same way.
    # NOTE(review): the prepared train_trade is not passed to any feature
    # builder yet, so it currently has no effect on the returned features.
    if type == 'test':
        _add_time_parts(train_trade, 'time', 'trade_real_')
        train_trade['time'] = train_trade['time'].apply(lambda x: float(x))
        del train_trade['rowkey']

    # start build feature
    feature_0 = build_feature_0(login, trade)
    feature_1 = build_feature_1(login, trade)

    # pd.concat (the original called the non-existent pd.contact) aligns the
    # two frames on their index when axis=1.
    all_features = pd.concat([feature_0, feature_1], axis=1)
    return all_features

In [11]:
# Build the feature matrices for both splits. Copies are passed because
# build_feature adds and deletes columns on the frames it receives.
train_x = build_feature(df_train_login.copy(), df_train_trade.copy(), type='train', train_trade=None)
test_x = build_feature(df_test_login.copy(), df_test_trade.copy(), type='test', train_trade=df_train_trade.copy())

# Persist the features. The second argument of to_hdf is the HDF5 key (the name
# of the dataset inside the file), not a file mode, so each frame is stored
# under the key 'w'. It is passed by keyword because recent pandas versions
# deprecate passing it positionally.
train_x.to_hdf('./train_x.hdf', key='w')
test_x.to_hdf('./test_x.hdf', key='w')

Shape of Features 0: (132719, 6)
Processing till line:  0
Processing till line:  100
Processing till line:  200
Processing till line:  300
Processing till line:  400
Processing till line:  500
Processing till line:  600
Processing till line:  700
Processing till line:  800
Processing till line:  900
Processing till line:  1000
Processing till line:  1100
Processing till line:  1200
Processing till line:  1300
Processing till line:  1400
Processing till line:  1500
Processing till line:  1600
Processing till line:  1700
Processing till line:  1800
Processing till line:  1900
Processing till line:  2000
Processing till line:  2100
Processing till line:  2200
Processing till line:  2300
Processing till line:  2400
Processing till line:  2500
Processing till line:  2600
Processing till line:  2700
Processing till line:  2800
Processing till line:  2900
Processing till line:  3000
Processing till line:  3100
Processing till line:  3200
Processing till line:  3300
Processing till line:  3400

KeyboardInterrupt: 

In [None]:
def one_hot(df_all_login, df_target, df_other, num):
    """One-hot encode the categorical "last login" columns of ``df_target``.

    Parameters
    ----------
    df_all_login : pandas.DataFrame
        Login table used to fit the encoders; the columns ``result``,
        ``log_from``, ``city`` and ``type`` are read.
    df_target : pandas.DataFrame
        Feature frame to encode. It is modified in place: the columns
        ``last_login_result``, ``last_login_real_time``,
        ``last_trade_real_time``, ``last_login_from``, ``last_login_city``
        and ``last_login_type`` are deleted.
    df_other, num
        Not used; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``df_target`` followed by the one-hot columns.
        Column names are ``<prefix><k>`` with ``k`` starting at 1 and running
        up to the number of categories the encoder produced.
    """
    # No import of OneHotEncoder is visible elsewhere in this notebook, so it
    # is imported here to keep the function usable on a fresh kernel.
    from sklearn.preprocessing import OneHotEncoder

    def fit_login_encoder(column):
        # A 0 is appended to the observed values before fitting — presumably
        # so that a 0 placeholder in df_target is a known category (confirm).
        encoder = OneHotEncoder()
        encoder.fit(np.append(df_all_login[column].values, 0).reshape(-1, 1))
        return encoder

    def encode(encoder, source_column, prefix):
        values = df_target[source_column].values.reshape(-1, 1)
        # Reuse df_target's index: a default RangeIndex would make the final
        # axis=1 concat misalign rows whenever df_target has another index.
        encoded = pd.DataFrame(encoder.transform(values).toarray(), index=df_target.index)
        # Name the columns after the actual number of categories instead of a
        # hard-coded count that breaks as soon as the data changes.
        encoded.columns = [prefix + str(k) for k in range(1, encoded.shape[1] + 1)]
        return encoded

    def encode_timezone(source_column, prefix):
        # process_time is expected to be defined elsewhere; it maps a time
        # value to a category, which is then encoded with its own encoder.
        df_target['timezone'] = df_target[source_column].apply(process_time)
        enc_time = OneHotEncoder()
        enc_time.fit(df_target['timezone'].values.reshape(-1, 1))
        encoded = encode(enc_time, 'timezone', prefix)
        del df_target[source_column]
        return encoded

    # Only the encoders that are actually applied below are fitted.
    enc_result = fit_login_encoder('result')
    enc_logfrom = fit_login_encoder('log_from')
    enc_city = fit_login_encoder('city')
    enc_type = fit_login_encoder('type')

    feature_1 = encode(enc_result, 'last_login_result', "last_login_result_")
    del df_target['last_login_result']

    feature_1_1 = encode_timezone('last_login_real_time', 'Last_Login_TimeZone_')
    feature_1_2 = encode_timezone('last_trade_real_time', 'Last_Trade_TimeZone_')
    del df_target['timezone']

    feature_1_3 = encode(enc_logfrom, 'last_login_from', "last_login_from_")
    del df_target['last_login_from']

    feature_1_4 = encode(enc_city, 'last_login_city', "last_login_city_")
    del df_target['last_login_city']

    feature_1_5 = encode(enc_type, 'last_login_type', "last_login_type_")
    del df_target['last_login_type']

    all_features = pd.concat(
        [df_target, feature_1, feature_1_1, feature_1_2, feature_1_3, feature_1_4, feature_1_5],
        axis=1,
    )

    # info() prints its report itself and returns None, so it is not wrapped in print().
    all_features.info()
    return all_features

In [None]:
import lightgbm as lgb
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

print("Loading Data ... ")

# Load the data.
# NOTE(review): load_data is not defined in the visible part of this notebook.
train_x, train_y, test_x = load_data()

# Split the training data with sklearn.model_selection.train_test_split:
# 95% is used for training and 5% is held out for validation (test_size=0.05).
X, val_X, y, val_y = train_test_split(
    train_x,
    train_y,
    test_size=0.05,
    random_state=1,
    stratify=train_y  # keep the class ratio of y the same in both parts
)

X_train = X
y_train = y
X_test = val_X  # validation features (held-out part of the training data)
y_test = val_y


# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# specify your configurations as a dict
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    # A list instead of a set, so the metric order is the same on every run.
    'metric': ['binary_logloss', 'auc'],
    'num_leaves': 5,
    'max_depth': 6,
    'min_data_in_leaf': 450,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5,
    'lambda_l1': 1,
    'lambda_l2': 0.001,  # L2 regularization weight; a larger value regularizes more strongly
    'min_gain_to_split': 0.2,
    'verbose': 5,
    'is_unbalance': True
}

# train
print('Start training...')
# Early stopping is configured through the callback: the early_stopping_rounds
# keyword of lgb.train is no longer accepted by recent LightGBM releases.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10000,
                valid_sets=[lgb_eval],
                callbacks=[lgb.early_stopping(stopping_rounds=500)])

print('Start predicting...')

preds = gbm.predict(test_x, num_iteration=gbm.best_iteration)  # predicted probabilities

# Turn the probabilities into 0/1 labels. The original loop overwrote a single
# variable on every iteration, so only the last prediction's label survived.
# NOTE(review): the labels are not written anywhere yet.
threshold = 0.5
pred_labels = [1 if pred > threshold else 0 for pred in preds]

# Export the feature importances, one "name, importance" pair per line.
importance = gbm.feature_importance()
names = gbm.feature_name()
with open('./feature_importance.txt', 'w+') as file:
    for name, im in zip(names, importance):
        file.write(name + ', ' + str(im) + '\n')