In [1]:
import multiprocessing
import warnings
import os
import gc
import random
import itertools
import pickle
import time
from datetime import datetime, timedelta
from pathlib import Path
from collections import Counter
from datetime import datetime
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# visualization 
import seaborn as sns
import matplotlib.pyplot as plt

# data preprocessing 
from itertools import product
import pandas as pd
import numpy as np
import missingno
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.utils import shuffle

# model
import xgboost as xgb
import lightgbm as lgb
import catboost as cb 
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline

# eveluation 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support, roc_curve
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

# hyperparameters tuning 
from skopt.space import Real, Integer
from skopt.utils import use_named_args
from skopt import gp_minimize

# utils
import sys
sys.path.insert(0, '/tf/notebooks/other/kaggle')
from utils import utils_featexp
from utils import utils_features_engineering
from utils import utils_features_plots
from utils import utils_reduce_memory
from utils import utils_statistic

# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')
# Seed constant; no use of it is visible in this part of the notebook.
seed = 9527



In [2]:
# Folder that holds the pre-pickled train / test frames.
main_path = Path('../input/ieee-cis-fraud-detection/')

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with (main_path / 'train_df.pkl').open('rb') as handle:
    train_df = pickle.load(handle)

with (main_path / 'test_df.pkl').open('rb') as handle:
    test_df = pickle.load(handle)

In [3]:
def id_split(df):
    """Derive device / OS / browser columns from the identity fields, in place.

    Adds five columns to ``df`` and returns the same frame:

    * ``device_name``    - text before the first '/' in ``DeviceInfo``, then
      collapsed to a brand label by the substring rules below.
    * ``device_version`` - second '/'-separated piece of ``DeviceInfo``.
    * ``OS_id_30``       - first space-separated token of ``id_30``.
    * ``browser_id_31``  - first space-separated token of ``id_31``.
    * ``had_id``         - constant 1 on every row of ``df``.

    Device names occurring fewer than 100 times (after brand collapsing) are
    replaced by "Others".
    """
    device_parts = df['DeviceInfo'].str.split('/', expand=True)
    df['device_name'] = device_parts[0]
    df['device_version'] = device_parts[1]
    df['OS_id_30'] = df['id_30'].str.split(' ', expand=True)[0]
    df['browser_id_31'] = df['id_31'].str.split(' ', expand=True)[0]

    # (substring, brand) rules, applied strictly in this order: each rule sees
    # the names already rewritten by the rules before it.
    brand_rules = (
        ('SM', 'Samsung'),
        ('SAMSUNG', 'Samsung'),
        ('GT-', 'Samsung'),
        ('Moto G', 'Motorola'),
        ('Moto', 'Motorola'),
        ('moto', 'Motorola'),
        ('LG-', 'LG'),
        ('rv:', 'RV'),
        ('HUAWEI', 'Huawei'),
        ('ALE-', 'Huawei'),
        ('-L', 'Huawei'),
        ('Blade', 'ZTE'),
        ('BLADE', 'ZTE'),
        ('Linux', 'Linux'),
        ('XT', 'Sony'),
        ('HTC', 'HTC'),
        ('ASUS', 'Asus'),
    )
    for marker, brand in brand_rules:
        # na=False: rows with a missing device name never match.
        matches = df['device_name'].str.contains(marker, na=False)
        df.loc[matches, 'device_name'] = brand

    # Classes that are too sparse are folded into a single "Others" class.
    name_counts = df['device_name'].value_counts()
    rare_names = name_counts[name_counts < 100].index
    df.loc[df['device_name'].isin(rare_names), 'device_name'] = "Others"

    df['had_id'] = 1
    gc.collect()

    return df

In [4]:
# Add the device / OS / browser columns to both frames (train first, then test).
train_df, test_df = id_split(train_df), id_split(test_df)

In [5]:
# id_split added five fields; show the value distribution of the four
# categorical ones (had_id is a constant and is not printed).
for new_field in ('device_name', 'device_version', 'OS_id_30', 'browser_id_31'):
    print(train_df[new_field].value_counts())

Windows            47722
iOS Device         19782
MacOS              12573
Samsung            12092
Trident             7440
Others              4978
RV                  4385
Motorola            2935
Huawei              2377
LG                  2331
Sony                 575
ZTE                  518
HTC                  406
hi6210sft Build      190
F3213 Build          125
Linux                121
F5121 Build          116
Name: device_name, dtype: int64
7.0                      7440
NRD90M                   5908
MMB29K                   1874
MRA58K                   1446
MMB29M                   1342
LRX22G                    757
NMF26X                    754
LMY48B                    740
LMY47V                    637
NRD90U                    564
R16NW                     494
MMB29T                    418
LMY47I                    413
HuaweiALE-L23             312
NMA26.42-69               285
KTU84P                    276
NPJ25.93-14.7             219
KOT49H                    202
MXB

# Feature Engineering

In [6]:
# Running day / week / month indices counted from the start of 2017.
# A year is approximated as 365 days, 52 weeks and 12 months.
for df in (train_df, test_df):
    stamp = df['TransactionDT'].dt
    years_since_2017 = stamp.year - 2017
    df['DT_D'] = (years_since_2017 * 365 + stamp.dayofyear).astype(np.int16)
    df['DT_W'] = years_since_2017 * 52 + stamp.weekofyear
    df['DT_M'] = years_since_2017 * 12 + stamp.month
    

因为意识到ProductCD各个种类在欺诈和时间序列上的表现差异很大，所以将他们的count_encoding拆解为5个指标

Because the ProductCD categories behave very differently, both in fraud rate and over time, their daily count encoding is split into five separate indicators, one per category.

In [7]:
# Build one 'c_product_<code>_day' column per ProductCD value seen in train.
# do_count is a project helper not visible here; presumably it adds a per
# (ProductCD, DT_D) row count as `new_col_name` - confirm against utils.
for i in train_df['ProductCD'].unique():
    new_col_name = 'c_product_{}_day'.format(i)

    train_df = utils_features_engineering.do_count(
        data=train_df,
        group_cols=['ProductCD', 'DT_D'],
        target_col='isFraud',
        new_col_name=new_col_name,
        col_type=np.int16,
    )
    test_df = utils_features_engineering.do_count(
        data=test_df,
        group_cols=['ProductCD', 'DT_D'],
        target_col='TransactionAmt',
        new_col_name=new_col_name,
        col_type=np.int16,
    )

    # Rows that belong to a different product get the -999 sentinel.
    for frame in (train_df, test_df):
        frame.loc[frame.ProductCD != i, new_col_name] = -999

[INFO] Count isFraud with group by ProductCD-DT_D combination...
[INFO] Count TransactionAmt with group by ProductCD-DT_D combination...
[INFO] Count isFraud with group by ProductCD-DT_D combination...
[INFO] Count TransactionAmt with group by ProductCD-DT_D combination...
[INFO] Count isFraud with group by ProductCD-DT_D combination...
[INFO] Count TransactionAmt with group by ProductCD-DT_D combination...
[INFO] Count isFraud with group by ProductCD-DT_D combination...
[INFO] Count TransactionAmt with group by ProductCD-DT_D combination...
[INFO] Count isFraud with group by ProductCD-DT_D combination...
[INFO] Count TransactionAmt with group by ProductCD-DT_D combination...


**Use card features to create unique id**

open_card (開卡, card opening): the day index of the card's first transaction, computed as the transaction day (DT_D) minus D1.

In [11]:
# Columns whose string forms are joined with '_' to build the user id.
uid_parts = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6',
             'addr1', 'addr2', 'open_card']

for frame in (train_df, test_df):
    # Day index minus D1 / D2.
    frame['open_card'] = frame.DT_D - frame.D1
    frame['first_tran'] = frame.DT_D - frame.D2

    # uid1 = card1_card2_..._addr2_open_card, every piece cast to str.
    uid = frame[uid_parts[0]].astype(str)
    for part in uid_parts[1:]:
        uid = uid + '_' + frame[part].astype(str)
    frame['uid1'] = uid


In [19]:
# Report how many distinct uid1 values each frame contains.
# The second message previously said "train data" although it reports test_df.
print('Unique uid of train data:{}'.format(train_df['uid1'].nunique()))
print('Unique uid of test data:{}'.format(test_df['uid1'].nunique()))

Unique uid of train data:222518
Unique uid of train data:198011


In [22]:
# Identify a device using id_30, id_31, id_32, id_33, DeviceType, DeviceInfo
import hashlib
def device_hash(x):
    """Return a 15-hex-character fingerprint of one row's device fields.

    ``x`` is any mapping-like row (e.g. a DataFrame row) that can be indexed
    by the six field names. The fields are converted with ``str`` and
    concatenated without a separator, then hashed with SHA-256; the first 15
    characters of the hex digest are returned.
    """
    fields = ('id_30', 'id_31', 'id_32', 'id_33', 'DeviceType', 'DeviceInfo')
    fingerprint_source = ''.join(str(x[name]) for name in fields)
    digest = hashlib.sha256(fingerprint_source.encode('utf-8')).hexdigest()
    return digest[:15]

In [23]:
# Fingerprint every row of both frames (row-wise apply, so this is slow).
for df in (train_df, test_df):
    df['device_hash'] = df.apply(device_hash, axis=1)

In [24]:
# Number of distinct devices seen for each user id, over train + test combined.
concat_df = pd.concat([train_df[['uid1', 'device_hash']], test_df[['uid1', 'device_hash']]])
tmp = concat_df.groupby('uid1')['device_hash'].agg(['nunique'])
# Build the uid1 -> device-count lookup once instead of once per frame.
devices_per_uid = tmp['nunique'].to_dict()
train_df['uid_device_nunique'] = train_df.uid1.map(devices_per_uid)
# Fix: the test column must be looked up from test_df's own uid1. Mapping
# train_df.uid1 here assigned train rows' values to test rows by index label.
test_df['uid_device_nunique'] = test_df.uid1.map(devices_per_uid)
del concat_df


In [30]:
# Count how many decimal places of an amount are actually used (0 to 3).
def change(hoge):
    """Return the number of significant decimal places of ``hoge`` (0-3).

    The value is rounded to three decimals and converted to an integer number
    of thousandths; one decimal place is dropped for every trailing zero.
    Examples: 1.5 -> 1, 1.25 -> 2, 1.125 -> 3, 10.0 -> 0, 0 -> 0.

    Raises ``ValueError`` for NaN, because NaN cannot be converted to int.
    """
    thousandths = int(np.round(np.round(hoge, 3) * 1000))
    num = 3
    # Stop at num == 0. Without this bound an amount of 0 looped forever,
    # because 0 % 10 == 0 stays true however often it is divided.
    while num > 0 and thousandths % 10 == 0:
        num -= 1
        thousandths //= 10  # integer division keeps the value exact
    return num
  
# Number of used decimal places of the transaction amount, for both frames.
for frame in (train_df, test_df):
    frame['decimal_digit'] = frame['TransactionAmt'].map(change)

gc.collect()


49

In [35]:
# Rows without identity information get had_id = 0.
# NOTE(review): id_split sets had_id = 1 on every row it receives, so in the
# flow visible here there may be nothing left to fill - confirm.
for frame in (train_df, test_df):
    frame['had_id'] = frame['had_id'].fillna(0)

In [36]:
### Per the author's note, the D-series values grow over time (later data is
### larger than earlier data), so each value is divided by the maximum of its
### time bucket to keep only the relative size.
# Bucket column -> D columns scaled by that bucket's maximum.
revise_plan = (
    ('DT_W', ['D1', 'D2', 'D4', 'D6', 'D10', 'D11', 'D12', 'D14', 'D15']),
    ('DT_M', ['D3', 'D5', 'D7', 'D8', 'D13']),
)
for bucket_col, d_cols in revise_plan:
    for t in d_cols:
        for frame in (train_df, test_df):
            bucket_max = frame.groupby(bucket_col)[t].transform('max')
            frame[t + '_revised'] = frame[t] / bucket_max

In [49]:
### Finer time features: day of the week and hour of the day.
for frame in (train_df, test_df):
    frame['dow'] = frame['TransactionDT'].dt.dayofweek
    frame['hour'] = frame['TransactionDT'].dt.hour

    # 1 when purchaser and recipient e-mail domains are equal, else 0
    # (element-wise comparison of the raw value arrays).
    same_domain = frame['P_emaildomain'].values == frame['R_emaildomain'].values
    frame['email_domain_comp'] = same_domain.astype(int)

    # D9 is removed from the frame in place.
    frame.drop(['D9'], axis=1, inplace=True)


In [54]:
# Categorical variables: every column listed here is label-encoded.
cat_columns = ['uid1','id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 
               'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_32',
               'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo', 'ProductCD', 
               'M4','P_emaildomain', 'R_emaildomain', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 
               'addr1', 'addr2', 'M1', 'M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9', 'hour', 'dow','device_name', 
               'device_version', 'OS_id_30', 'browser_id_31']

# Columns that additionally receive a count encoding ('<col>_count_full').
count_columns = ['uid1', 'id_13', 'id_14', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_24', 'id_25',
                 'id_26', 'id_30', 'id_31', 'id_33', 'DeviceInfo', 'P_emaildomain', 'R_emaildomain', 
                 'card1', 'card2','card3','card5', 'card6', 'addr1','addr2','hour','device_version',
                 'OS_id_30', 'browser_id_31']

In [55]:
# Label-encode each categorical column with one encoder fitted on the string
# form of train + test, so both frames share the same integer codes.
for f in cat_columns:
    train_labels = train_df[f].astype(str).tolist()
    test_labels = test_df[f].astype(str).tolist()
    lbl = LabelEncoder()
    lbl.fit(train_labels + test_labels)
    train_df[f] = lbl.transform(train_labels)
    test_df[f] = lbl.transform(test_labels)

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

LabelEncoder()

In [56]:
# Replace every remaining missing value with the -999 sentinel, in place.
for frame in (train_df, test_df):
    frame.fillna(-999, inplace=True)

In [58]:
# Count encoding: '<col>_count_full' holds how often the row's value occurs in
# train + test combined.
for i in count_columns:
    combined_counts = pd.concat([train_df[i], test_df[i]],
                                ignore_index=True).value_counts(dropna=False)
    count_col = i + '_count_full'
    train_df[count_col] = train_df[i].map(combined_counts)
    test_df[count_col] = test_df[i].map(combined_counts)
    

In [60]:
# Number of transactions per calendar day and per calendar hour, counted over
# train + test combined and then split back by position.
train_len = len(train_df)
time_amt_cols = ['TransactionDT', 'TransactionAmt']
train_test_all = pd.concat([train_df[time_amt_cols], test_df[time_amt_cols]],
                           ignore_index=True, sort=False)

day_key = train_test_all.TransactionDT.dt.date
# The first 13 characters of the timestamp string are 'YYYY-MM-DD HH'.
hour_key = train_test_all.TransactionDT.map(lambda x: str(x)[:13])
train_test_all['day_count'] = train_test_all.groupby(day_key)['TransactionAmt'].transform('count')
train_test_all['hour_count'] = train_test_all.groupby(hour_key)['TransactionAmt'].transform('count')

# The first train_len rows belong to train, the rest to test.
for count_col in ('day_count', 'hour_count'):
    combined_values = train_test_all[count_col].tolist()
    train_df[count_col] = combined_values[:train_len]
    test_df[count_col] = combined_values[train_len:]


In [None]:
### Identify a product by its (price, product category) pair.
# Fix: this cell used `train` / `test`, names that no visible cell defines;
# every other cell works on `train_df` / `test_df`, so those are used here.
amt_product_train = train_df['TransactionAmt'].astype(str) + '_' + train_df['ProductCD'].astype(str)
amt_product_test = test_df['TransactionAmt'].astype(str) + '_' + test_df['ProductCD'].astype(str)

# One encoder fitted on train + test so both frames share the same ProductID codes.
le = LabelEncoder()
le.fit(list(amt_product_train.values) + list(amt_product_test.values))
train_df['ProductID'] = le.transform(list(amt_product_train.values))
test_df['ProductID'] = le.transform(list(amt_product_test.values))

# Count encoding of ProductID over train + test combined.
product_id_counts = pd.concat([train_df['ProductID'], test_df['ProductID']],
                              ignore_index=True).value_counts(dropna=False)
train_df['ProductID_count_full'] = train_df['ProductID'].map(product_id_counts)
test_df['ProductID_count_full'] = test_df['ProductID'].map(product_id_counts)