#### Libraries Installation
!pip install outlier_utils
!pip install imbalanced-learn 
!pip install delayed
!pip install missingno
!pip install plotly
!pip install lightgbm
!pip install xgboost
!pip install category_encoders

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

# allows us to implement a train test split strategy
from sklearn.model_selection import train_test_split

# access to the logistic regression class of Sklearn
from sklearn.linear_model import LogisticRegression

# access to the neural network model MLP Classifier
from sklearn.neural_network import MLPClassifier

# classification metrics
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import auc
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

In [2]:
# Read Training Set
df = pd.read_csv('train.csv',dtype={
                             'ip': np.uint8,
                             'app': np.uint8,
                             'device' : np.uint8,
                             'os' : np.uint8,
                             'channel' : np.uint8,
                             'is_attributed' : np.uint8    
                        },parse_dates=['click_time'])

In [3]:
df['day'] = df['click_time'].dt.day.astype('uint8')
# Get oot
oot = df[df['day']==8]

# Get Train
df = df[df['day']==7]

In [4]:
df['hour'] = df['click_time'].dt.hour.astype('uint8')
df['minute'] = df['click_time'].dt.minute.astype('uint8')
df['second'] = df['click_time'].dt.second.astype('uint8')

#Edit minute variable to two-digit string
df['minute'] = df['minute'].apply(lambda x: '{0:0>2}'.format(x))

In [5]:
click_outlier = 1250.0
ratio_outlier = 939.8804563492063

In [6]:
# Transform

ranges = [-1, 4, 9, 14, 19, 24, 29, 34, 39, 44, 49, 54, np.inf]  # np.inf for infinity
labels = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']
df['seconds_grp'] = pd.cut(df['second'], 
                                  bins=ranges, 
                                  labels=labels)
df['interval'] = df['hour'].astype(str) + df['minute'].astype(str) + df['seconds_grp'].astype(str)
#df['interval'] = df['interval'].astype('uint32')

df_interval = df.groupby(
    'interval'
).aggregate(
    {
        'is_attributed':sum,
        'click_time':"count"
    }
).rename(
    columns = {
        'is_attributed': 'total_dl', 
        'click_time': 'total_clicks'
    }, 
    inplace = False)

df_interval['conversion_rate'] = df_interval['total_dl']/df_interval['total_clicks']
df_interval['ratio'] = df_interval['total_clicks']/df_interval['total_dl']
df_interval.reset_index(level=0, inplace=True)

df_interval.loc[(df_interval['total_dl'] == 0) & (df_interval['total_clicks'] > click_outlier), 'isFraud'] = 1
df_interval.loc[(df_interval['total_dl'] != 0) & (df_interval['ratio'] > ratio_outlier), 'isFraud'] = 1
df_interval.loc[(df_interval['isFraud'].isnull()), 'isFraud'] = 0

df = pd.merge(df, df_interval[['interval', 'isFraud']], on='interval', how='left')

df.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,day,hour,minute,second,seconds_grp,interval,isFraud
0,56,2,1,32,237,2017-11-07,,0,7,0,0,0,1,1,0.0
1,68,3,1,25,168,2017-11-07,,0,7,0,0,0,1,1,0.0
2,142,8,2,13,145,2017-11-07,,0,7,0,0,0,1,1,0.0
3,219,2,1,13,122,2017-11-07,,0,7,0,0,0,1,1,0.0
4,167,15,1,8,245,2017-11-07,,0,7,0,0,0,1,1,0.0


In [7]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.3)

In [8]:
print(train.isFraud.sum()/len(train))
print(test.isFraud.sum()/len(test))

0.056153563455438864
0.0560960532516698


### Prep for Modelling

In [9]:
train = train.drop(["click_time","attributed_time", "is_attributed", "interval"], axis=1)
test = test.drop(["click_time","attributed_time", "is_attributed", "interval"], axis=1)

train.head()

Unnamed: 0,ip,app,device,os,channel,day,hour,minute,second,seconds_grp,isFraud
16732674,114,2,1,13,237,7,4,57,36,8,0.0
42364376,110,8,1,13,145,7,13,23,6,2,0.0
40989418,65,2,1,25,179,7,12,57,0,1,0.0
11686202,107,2,1,47,237,7,3,31,44,9,0.0
7642284,159,29,1,13,213,7,2,13,51,11,0.0


In [10]:
import category_encoders as ce
cat_features = ['ip', 'app', 'device', 'os', 'channel']
count_enc = ce.CountEncoder(cols=cat_features)
count_enc.fit(train[cat_features])

train = train.join(count_enc.transform(train[cat_features]).add_suffix("_count"))
test = test.join(count_enc.transform(test[cat_features]).add_suffix("_count"))

In [11]:
feature_cols = ['ip_count', 'app_count', 'device_count',
                'os_count', 'channel_count']

In [12]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=0)

y_train = train['isFraud']
X_train = train[feature_cols]

X_train, y_train = rus.fit_resample(X_train, y_train)

from collections import Counter

Counter(y_train)

Counter({0.0: 2344036, 1.0: 2344036})

### Logistic Regression

In [13]:
# Base model using all variables
logreg_model = LogisticRegression(random_state=280)

logreg_model.fit(X_train, y_train)

# Prediction on test data
y_pred_lr = logreg_model.predict(test[feature_cols].fillna(0))
predictions_lr = [round(value) for value in y_pred_lr]

# Evaluation of Baseline Logistic Regression

print("accuracy: ",accuracy_score(test['isFraud'],predictions_lr)*100)
print("auc: ",roc_auc_score(test['isFraud'],predictions_lr)*100)
print("precision_score: ",precision_score(test['isFraud'],predictions_lr)*100)
print("recall_score: ",recall_score(test['isFraud'],predictions_lr)*100)

accuracy:  51.38589489666094
auc:  51.678253363635804
precision_score:  5.973501817598326
recall_score:  52.00755711179623


### LightGBM

In [14]:
import lightgbm as lgb

dtrain = lgb.Dataset(X_train, label=y_train)
dtest = lgb.Dataset(test[feature_cols], label=test['isFraud'])

param = {'num_leaves': 64, 'objective': 'binary', 
             'metric': 'auc', 'seed': 12345}
num_round = 100
lgbm_model = lgb.train(param, dtrain, num_round, valid_sets=[dtest], early_stopping_rounds=3)

ypred_lgb = lgbm_model.predict(test[feature_cols])
predictions_lgb = [round(value) for value in ypred_lgb]
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

# evaluate predictions

print("accuracy: ",accuracy_score(test['isFraud'],predictions_lgb)*100)
print("auc: ",roc_auc_score(test['isFraud'],predictions_lgb)*100)
print("precision_score: ",precision_score(test['isFraud'],predictions_lgb)*100)
print("recall_score: ",recall_score(test['isFraud'],predictions_lgb)*100)

[LightGBM] [Info] Number of positive: 2344036, number of negative: 2344036
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 678
[LightGBM] [Info] Number of data points in the train set: 4688072, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[1]	valid_0's auc: 0.582015
Training until validation scores don't improve for 3 rounds
[2]	valid_0's auc: 0.583414
[3]	valid_0's auc: 0.583451
[4]	valid_0's auc: 0.584503
[5]	valid_0's auc: 0.584726
[6]	valid_0's auc: 0.584834
[7]	valid_0's auc: 0.585471
[8]	valid_0's auc: 0.586347
[9]	valid_0's auc: 0.586666
[10]	valid_0's auc: 0.587099
[11]	valid_0's auc: 0.587274
[12]	valid_0's auc: 0.587306
[13]	valid_0's auc: 0.58733
[14]	valid_0's auc: 0.587347
[15]	valid_0's auc: 0.587598
[16]	valid_0's auc: 0.587734
[17]	valid_0's auc: 0.587831
[18]	valid_0's auc: 0.587852
[19]	valid_0's auc: 0.587

In [18]:
import joblib
# save model
joblib.dump(lgbm_model, 'lgb_5s.pkl')
# load model
gbm_pickle = joblib.load('lgb_5s.pkl')

### XGBoost

In [15]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(max_depth=10, subsample=0.9, tree_method='hist', max_bin = 300)
xgb_model.fit(X_train, y_train)

y_pred_xgb = xgb_model.predict(test[feature_cols])
predictions_xgb = [round(value) for value in y_pred_xgb]

# evaluate predictions

print("accuracy: ",accuracy_score(test['isFraud'],predictions_xgb)*100)
print("auc: ",roc_auc_score(test['isFraud'],predictions_xgb)*100)
print("precision_score: ",precision_score(test['isFraud'],predictions_xgb)*100)
print("recall_score: ",recall_score(test['isFraud'],predictions_xgb)*100)

accuracy:  61.839247226088915
auc:  56.4427993598023
precision_score:  7.395610650442549
recall_score:  50.36440345251595


In [20]:
import joblib
# save model
joblib.dump(xgb_model, 'xgb_5s.pkl')
# load model
gbm_pickle = joblib.load('xgb_5s.pkl')

### Neural Net

### Out of Time Sample

In [None]:
lgbm_model = joblib.load('lgb_5s.pkl')

oot['hour'] = oot['click_time'].dt.hour.astype('uint8')
oot['minute'] = oot['click_time'].dt.minute.astype('uint8')
oot['second'] = oot['click_time'].dt.second.astype('uint8')

#Edit minute variable to two-digit string
oot['minute'] = oot['minute'].apply(lambda x: '{0:0>2}'.format(x))

# Transform

ranges = [-1, 14, 29, 44, np.inf]  # np.inf for infinity
labels = ['1', '2', '3', '4']
oot['seconds_grp'] = pd.cut(oot['second'], 
                                  bins=ranges, 
                                  labels=labels)
oot['interval'] = oot['hour'].astype(str) + oot['minute'].astype(str) + oot['seconds_grp'].astype(str)
#df['interval'] = df['interval'].astype('uint32')

oot_interval = oot.groupby(
    'interval'
).aggregate(
    {
        'is_attributed':sum,
        'click_time':"count"
    }
).rename(
    columns = {
        'is_attributed': 'total_dl', 
        'click_time': 'total_clicks'
    }, 
    inplace = False)

oot_interval['conversion_rate'] = oot_interval['total_dl']/oot_interval['total_clicks']
oot_interval['ratio'] = oot_interval['total_clicks']/oot_interval['total_dl']
oot_interval.reset_index(level=0, inplace=True)

oot_interval.loc[(oot_interval['total_dl'] == 0) & (oot_interval['total_clicks'] > click_outlier), 'isFraud'] = 1
oot_interval.loc[(oot_interval['total_dl'] != 0) & (oot_interval['ratio'] > ratio_outlier), 'isFraud'] = 1
oot_interval.loc[(oot_interval['isFraud'].isnull()), 'isFraud'] = 0

oot = pd.merge(df, oot_interval[['interval', 'isFraud']], on='interval', how='left')

oot = test.drop(["click_time","attributed_time", "is_attributed", "interval"], axis=1)

oot = oot.join(count_enc.transform(oot[cat_features]).add_suffix("_count"))


In [None]:
# Evaluation of Baseline Logistic Regression

print("accuracy: ",accuracy_score(oot['isFraud'],predictions_lr)*100)
print("auc: ",roc_auc_score(oot['isFraud'],predictions_lr)*100)
print("precision_score: ",precision_score(oot['isFraud'],predictions_lr)*100)
print("recall_score: ",recall_score(oot['isFraud'],predictions_lr)*100)

In [None]:
# Evaluation of LightGBM

print("accuracy: ",accuracy_score(oot['isFraud'],predictions_lgb)*100)
print("auc: ",roc_auc_score(oot['isFraud'],predictions_lgb)*100)
print("precision_score: ",precision_score(oot['isFraud'],predictions_lgb)*100)
print("recall_score: ",recall_score(oot['isFraud'],predictions_lgb)*100)

In [None]:
# Evaluation of XGBoost

print("accuracy: ",accuracy_score(oot['isFraud'],predictions_xgb)*100)
print("auc: ",roc_auc_score(oot['isFraud'],predictions_xgb)*100)
print("precision_score: ",precision_score(oot['isFraud'],predictions_xgb)*100)
print("recall_score: ",recall_score(oot['isFraud'],predictions_xgb)*100)