In [1]:
import os
import sys
import warnings

import lightgbm as lgb
import numpy as np
import pandas as pd

print("Python version: {}". format(sys.version))
print("NumPy version: {}". format(np.__version__))
print("pandas version: {}". format(pd.__version__))
print("LightGBM version: {}". format(lgb.__version__))

warnings.filterwarnings('ignore')
print('-'*25)
pd.options.display.max_columns = None
pd.options.display.max_rows = None

Python version: 3.8.5 (default, Sep  4 2020, 07:30:14) 
[GCC 7.3.0]
NumPy version: 1.19.5
pandas version: 1.2.2
LightGBM version: 3.1.1
-------------------------


In [2]:
# Load dataset
data_path = "data"


def _read_phone_indexed(filename):
    """Read ``<data_path>/<filename>`` as a DataFrame indexed by its 'phone' column."""
    return pd.read_csv(os.path.join(data_path, filename)).set_index('phone')


# Leaderboard A raw data.
# The literal two-character string '\N' is turned into a real missing value
# (presumably a NULL marker from the export -- confirm with the data owner).
# `np.nan` is used because the `np.NaN` alias no longer exists in NumPy >= 2.0.
data_a_df = _read_phone_indexed("data_a.csv").replace('\\N', np.nan)
to_pred_a = _read_phone_indexed("to_pred_a.csv")
train_label = _read_phone_indexed("train_label.csv")

# Leaderboard B raw data
data_b_df = _read_phone_indexed("data_b.csv").replace('\\N', np.nan)
to_pred_b = _read_phone_indexed("to_pred_b.csv")

In [3]:
# Cast the columns below from object to float so that arithmetic works on them
f_features = [
    'if_family', 'gprs_fee', 'overrun_flux_fee', 'out_actvcall_dur',
    'actvcall_fee', 'out_activcall_fee', 'monfix_fee', 'gift_acct_amt',
    'call_cnt', 'up_flux', 'down_flux', 'p2psms_up_cnt',
    'p2psms_cmnct_fee', 'p2psms_pkg_fee',
]

# Same conversion for the leaderboard A frame first, then the leaderboard B frame
for usage_df in (data_a_df, data_b_df):
    usage_df[f_features] = usage_df[f_features].astype('float')

In [4]:
# Split into training and test data: training is built around January (202001),
# test is built around March (202003); the following month of each is kept as well.
train_val = data_a_df.merge(train_label, on='phone', how='inner')
train_month = train_val['month']
X_train_01 = train_val[train_month == 202001]
X_train_02 = train_val[train_month == 202002]

# test = pd.merge(data_a_df, to_pred_a, on='phone', how='inner') # leaderboard A
test = data_b_df.merge(to_pred_b, on='phone', how='inner')  # leaderboard B
test_month = test['month']
X_test_03 = test[test_month == 202003]
X_test_04 = test[test_month == 202004]

In [5]:
# Append one feature of df2 to df1 (df1's own columns are kept as they are)
def conact_feature(df1, df2, feature):
    """Inner-join the single column ``feature`` of ``df2`` onto ``df1`` by 'phone'.

    Only ``df2[[feature]]`` takes part in the join, so no other df2 column is
    carried over. When ``feature`` is also a column of ``df1`` the two copies
    come out with pandas' default merge suffixes, i.e. ``<feature>_x`` (from
    df1) and ``<feature>_y`` (from df2).

    Returns a new DataFrame; neither input is modified.
    """
    return df1.merge(df2[[feature]], on='phone', how='inner')

# Add df2's value of a feature onto df1's. NaN handling: if exactly one side is
# NaN the result is the other side's value; if both are NaN the result is NaN.
def cumulative(df1, df2, feature):
    """Accumulate ``df2[feature]`` into ``df1[feature]`` row by row, in place.

    Rows are matched by index label: ``df2[feature]`` is first aligned to
    ``df1``'s index, so a different row order or extra/missing rows in ``df2``
    cannot pair values of different rows. Labels of ``df1`` that are absent
    from ``df2`` are treated as NaN on the df2 side, which keeps df1's value.
    When both indexes are already identical no re-alignment is done.

    Args:
        df1: frame that receives the sum; its ``feature`` column is overwritten.
        df2: frame providing the values to add; it is not modified.
        feature: name of the column present in both frames.

    Returns:
        ``df1`` itself (the same object), with ``feature`` updated.

    Raises:
        ValueError: from pandas, if the indexes differ and ``df2``'s index
            contains duplicate labels (it cannot be aligned unambiguously).
    """
    base = df1[feature]
    addend = df2[feature]
    if not addend.index.equals(base.index):
        # np.where below works on positions, so line the rows up by label first.
        addend = addend.reindex(base.index)

    # Missing values in df1 count as 0 only when df2 has something to add.
    base_or_zero = np.where(pd.isnull(base), 0, base)
    df1[feature] = np.where(pd.isnull(addend), base, addend + base_or_zero)
    return df1

# Take the union of a flag feature across df1 and df2; only meant for
# categorical (flag-like) variables.
def combine_cat_feature(df1, df2, feature):
    """Overwrite ``df1[feature]`` with the union of the flag in df1 and df2.

    For every index label of ``df1`` the two values are added with a missing
    value counting as 0; a sum of 2 (flag set on both sides) is folded back to
    1 and a sum of 0 becomes NaN. With the usual ``{NaN, 1}`` encoding the
    result is 1 when either frame has the flag and NaN otherwise.

    Compared with doing the NaN -> 0 replacement directly on the inputs:
      * ``df2`` is left untouched (its NaNs are not rewritten to 0);
      * ``df1`` is updated by a plain column assignment rather than by
        in-place methods on ``df1[feature]``, which do not write through to
        the frame when pandas Copy-on-Write is enabled;
      * a label of ``df1`` that does not occur in ``df2`` keeps df1's flag
        instead of turning into NaN.

    Args:
        df1: frame whose ``feature`` column is overwritten in place.
        df2: frame providing the second copy of the flag; not modified.
        feature: name of the column present in both frames.

    Returns:
        None. The result is written to ``df1[feature]``.
    """
    # fill_value=0 treats a value missing on one side (NaN or absent label) as 0;
    # when both sides are missing the sum stays NaN, which is the desired result.
    merged = df1[feature].add(df2[feature], fill_value=0)
    merged = merged.replace(2, 1).replace(0, np.nan)
    # Assignment aligns on df1's index, so labels that only exist in df2 are dropped.
    df1[feature] = merged

In [6]:
# Build the training-set features
def train_feature_engineering(df1, df2):
    """Return ``df1`` enriched with information from ``df2`` for training.

    Steps:
      1. join df2's 'monfix_fee' onto df1 via ``conact_feature``;
      2. merge the 'if_family' flag of both frames via ``combine_cat_feature``.
    """
    enriched = conact_feature(df1, df2, 'monfix_fee')
    combine_cat_feature(enriched, df2, 'if_family')
    return enriched

# Build the test-set features
def test_feature_engineering(df1, df2):
    """Return ``df1`` enriched with information from ``df2`` for prediction.

    Does the same two steps as the training variant (join 'monfix_fee', merge
    the 'if_family' flag) and additionally adds df2's values onto df1 for a
    fixed list of count/fee columns via ``cumulative``.
    """
    enriched = conact_feature(df1, df2, 'monfix_fee')
    combine_cat_feature(enriched, df2, 'if_family')

    # Columns whose df2 value is accumulated onto the df1 value, in this order.
    accumulated_cols = (
        'chrg_cnt',
        'gprs_fee',
        'overrun_flux_fee',
        'p2psms_up_cnt',
        'p2psms_cmnct_fee',
        'p2psms_pkg_fee',
    )
    for col in accumulated_cols:
        enriched = cumulative(enriched, df2, col)
    return enriched

# Training features: 202001 rows enriched with the 202002 rows.
X_train = train_feature_engineering(df1=X_train_01, df2=X_train_02)
# Test features: 202003 rows enriched with the 202004 rows.
X_test = test_feature_engineering(df1=X_test_03, df2=X_test_04)


In [7]:
# Turn the following three features into categorical features
cat_cols = ['if_family', 'if_group', 'sms_inpkg_ind']
for frame in (X_train, X_test):
    for cat_col in cat_cols:
        frame[cat_col] = frame[cat_col].astype('category')

# Take the target out before it is dropped from the feature frame
y = X_train['label']

# Columns that are not used for training
drop_cols = ['label', 'month', 'chrg_amt', 'up_flux', 'down_flux']
X_train.drop(columns=drop_cols, inplace=True)

# All features actually used for training
features = X_train.columns
features

Index(['if_family', 'if_group', 'chrg_cnt', 'gprs_fee', 'overrun_flux_fee',
       'out_actvcall_dur', 'actvcall_fee', 'out_activcall_fee', 'monfix_fee_x',
       'gift_acct_amt', 'call_cnt', 'sms_inpkg_ind', 'p2psms_up_cnt',
       'p2psms_cmnct_fee', 'p2psms_pkg_fee', 'monfix_fee_y'],
      dtype='object')

In [8]:
# Train the model with LightGBM (stratified, shuffled k-fold cross-validation)
NFOLD = 5
SEED = 2

dtrain = lgb.Dataset(X_train, y, free_raw_data=False)

lgb_params = dict(
    objective='binary',
    force_row_wise=True,
    boosting_type='gbdt',
    learning_rate=0.005,
    subsample=0.8,
    subsample_freq=3,
    colsample_bytree=0.8,
    num_leaves=55,
    max_depth=14,
    n_jobs=16,
    min_child_samples=20,
    min_child_weight=0.001,
    min_split_gain=0.0,
    metric=['auc', 'binary_logloss', 'binary_error'],
)

hist = lgb.cv(
    params=lgb_params,
    train_set=dtrain,
    num_boost_round=2000,
    nfold=NFOLD,
    stratified=True,
    shuffle=True,
    seed=SEED,
    categorical_feature=cat_cols,
    verbose_eval=100,
    # keep the per-fold boosters so they can be used for prediction below
    return_cvbooster=True,
)

# Report the last entry of each cross-validated metric history
for line_template, metric_key in (
    ('binary_logloss: {}', 'binary_logloss-mean'),
    ('binary_error:   {}', 'binary_error-mean'),
    ('auc:            {}', 'auc-mean'),
):
    print(line_template.format(hist[metric_key][-1]))

# Predict the test set with the CV boosters; later cells index the result by fold
cv_boosters = hist['cvbooster']
predictions_lgb = np.array(cv_boosters.predict(X_test[features]))

[LightGBM] [Info] Number of positive: 4787, number of negative: 341943
[LightGBM] [Info] Total Bins 2835
[LightGBM] [Info] Number of data points in the train set: 346730, number of used features: 16
[LightGBM] [Info] Number of positive: 4787, number of negative: 341943
[LightGBM] [Info] Total Bins 2835
[LightGBM] [Info] Number of data points in the train set: 346730, number of used features: 16
[LightGBM] [Info] Number of positive: 4787, number of negative: 341943
[LightGBM] [Info] Total Bins 2835
[LightGBM] [Info] Number of data points in the train set: 346730, number of used features: 16
[LightGBM] [Info] Number of positive: 4788, number of negative: 341943
[LightGBM] [Info] Total Bins 2835
[LightGBM] [Info] Number of data points in the train set: 346731, number of used features: 16
[LightGBM] [Info] Number of positive: 4787, number of negative: 341944
[LightGBM] [Info] Total Bins 2835
[LightGBM] [Info] Number of data points in the train set: 346731, number of used features: 16
[Ligh

In [9]:
# For every fold, print how many users would be flagged as fraud at each
# candidate threshold from 0.85 to 0.94 (step 0.01).
for fold_no, fold_pred in enumerate(predictions_lgb[:NFOLD], start=1):
    print("第 {} 折:".format(fold_no))
    for pct in range(85, 95):
        threshold = pct / 100
        # 1 = predicted fraud (score not below the threshold), 0 = normal
        flagged = np.where(fold_pred < threshold, 0, 1)
        print('阈值选择 {} 时, 预测有 {} 个fraud用户:'.format(threshold, flagged.sum()))

第 1 折:
阈值选择 0.85 时, 预测有 2605 个fraud用户:
阈值选择 0.86 时, 预测有 2590 个fraud用户:
阈值选择 0.87 时, 预测有 2590 个fraud用户:
阈值选择 0.88 时, 预测有 2589 个fraud用户:
阈值选择 0.89 时, 预测有 2579 个fraud用户:
阈值选择 0.9 时, 预测有 2576 个fraud用户:
阈值选择 0.91 时, 预测有 2205 个fraud用户:
阈值选择 0.92 时, 预测有 2205 个fraud用户:
阈值选择 0.93 时, 预测有 2205 个fraud用户:
阈值选择 0.94 时, 预测有 2205 个fraud用户:
第 2 折:
阈值选择 0.85 时, 预测有 2601 个fraud用户:
阈值选择 0.86 时, 预测有 2601 个fraud用户:
阈值选择 0.87 时, 预测有 2586 个fraud用户:
阈值选择 0.88 时, 预测有 2586 个fraud用户:
阈值选择 0.89 时, 预测有 2586 个fraud用户:
阈值选择 0.9 时, 预测有 2214 个fraud用户:
阈值选择 0.91 时, 预测有 2205 个fraud用户:
阈值选择 0.92 时, 预测有 2205 个fraud用户:
阈值选择 0.93 时, 预测有 1853 个fraud用户:
阈值选择 0.94 时, 预测有 1853 个fraud用户:
第 3 折:
阈值选择 0.85 时, 预测有 2594 个fraud用户:
阈值选择 0.86 时, 预测有 2579 个fraud用户:
阈值选择 0.87 时, 预测有 2579 个fraud用户:
阈值选择 0.88 时, 预测有 2578 个fraud用户:
阈值选择 0.89 时, 预测有 2577 个fraud用户:
阈值选择 0.9 时, 预测有 2577 个fraud用户:
阈值选择 0.91 时, 预测有 2205 个fraud用户:
阈值选择 0.92 时, 预测有 1853 个fraud用户:
阈值选择 0.93 时, 预测有 1853 个fraud用户:
阈值选择 0.94 时, 预测有 1845 个fraud用户:
第 4 折:
阈值选择 0.85 时, 预测

In [10]:
# The average over the 5 CV folds is not used; the first fold is used instead
# because it gave the best test result.
# Empirically a threshold of 0.9 works best. Submitting before the rules are
# applied scored F1 0.96318 on leaderboard A and 0.960016 on leaderboard B.
threshold = 0.9
first_fold_pred = predictions_lgb[0]
X_test['label'] = np.where(first_fold_pred < threshold, 0, 1)

# Attach the predicted label to the 202004 rows (their own 'label' column is dropped first)
april_rows = X_test_04.drop('label', axis=1)
submit = pd.merge(april_rows, X_test['label'], on='phone', how='inner')
print('使用规则前, 预测有 {} 个fraud用户'.format(sum(submit['label'])))

使用规则前, 预测有 2576 个fraud用户


In [11]:
# Use the April data to restore users that were wrongly flagged as fraud.
# The cut-offs were chosen empirically: a flagged user whose value in the given
# column is strictly greater than the cut-off is set back to normal (label 0).
recovery_rules = [
    ('out_activcall_fee', 0),
    ('out_actvcall_dur', 20),
    ('actvcall_fee', 5),
    ('gift_acct_amt', 8),
    ('call_cnt', 0),
]
for rule_col, rule_cutoff in recovery_rules:
    # A boolean mask addresses exactly the matching rows; going through
    # `submit[mask].index` would also hit other rows sharing the same phone label.
    is_false_alarm = (submit['label'] == 1) & (submit[rule_col] > rule_cutoff)
    submit.loc[is_false_alarm, 'label'] = 0

In [12]:
# Submitted after applying the rules; leaderboard B F1 score: 0.961095
submit = submit.reset_index()[['phone','label']]
n_fraud = sum(submit['label'])

# Make sure the output folder exists so to_csv does not fail on a fresh checkout
result_dir = './Single_model_results'
os.makedirs(result_dir, exist_ok=True)
submit.to_csv('{}/LightGBM_{}_{}.csv'.format(result_dir, threshold, n_fraud), index=False, encoding="utf-8")
print('使用规则后, 阈值选择 {} 时, 预测有 {} 个fraud用户'.format(threshold, n_fraud))

使用规则后, 阈值选择 0.9 时, 预测有 2564 个fraud用户
