In [109]:
#### Libraries Installation
!pip install outlier_utils
!pip install imbalanced-learn 
!pip install delayed
!pip install missingno
!pip install plotly
!pip install lightgbm
!pip install xgboost
!pip install category_encoders

Collecting xgboost
  Downloading xgboost-1.4.2-py3-none-win_amd64.whl (97.8 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.4.2


In [82]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

# allows us to implement a train test split strategy
from sklearn.model_selection import train_test_split

# access to the logistic regression class of Sklearn
from sklearn.linear_model import LogisticRegression

# access to the neural network model MLP Classifier
from sklearn.neural_network import MLPClassifier


In [83]:
# Read Training Set
df = pd.read_csv('train_sample.csv',parse_dates=['click_time'])

In [84]:
df['day'] = df['click_time'].dt.day.astype('uint8')
df['hour'] = df['click_time'].dt.hour.astype('uint8')
df['minute'] = df['click_time'].dt.minute.astype('uint8')
df['second'] = df['click_time'].dt.second.astype('uint8')

In [85]:
df['minute'] = df['minute'].apply(lambda x: '{0:0>2}'.format(x))

In [86]:
df.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,day,hour,minute,second
0,87540,12,1,13,497,2017-11-07 09:30:38,,0,7,9,30,38
1,105560,25,1,17,259,2017-11-07 13:40:27,,0,7,13,40,27
2,101424,12,1,19,212,2017-11-07 18:05:24,,0,7,18,5,24
3,94584,13,1,13,477,2017-11-07 04:58:08,,0,7,4,58,8
4,68413,12,1,1,178,2017-11-09 09:00:09,,0,9,9,0,9


In [87]:
click_outlier = 31
anomaly = 48

In [88]:
# Transform

ranges = [-1, 14, 29, 44, np.inf]  # np.inf for infinity
labels = ['1', '2', '3', '4']
df['seconds_15'] = pd.cut(df['second'], 
                                  bins=ranges, 
                                  labels=labels)
df['hm_seconds_15'] = df['hour'].astype(str) + df['minute'].astype(str) + df['seconds_15'].astype(str)
df['hm_seconds_15'] = df['hm_seconds_15'].astype('int32')

conversion_seconds_15 = df.groupby(
    'hm_seconds_15'
).aggregate(
    {
        'is_attributed':sum,
        'click_time':"count"
    }
)
conversion_seconds_15['ratio'] = conversion_seconds_15['click_time']/conversion_seconds_15['is_attributed']
conversion_seconds_15['conversion_rate'] = conversion_seconds_15['is_attributed']/conversion_seconds_15['click_time']
conversion_seconds_15.reset_index(level=0, inplace=True)

conversion_seconds_15.loc[(conversion_seconds_15['is_attributed'] == 0) & (conversion_seconds_15['click_time'] > click_outlier), 'isFraud_15'] = 1
conversion_seconds_15.loc[(conversion_seconds_15['is_attributed'] != 0) & (conversion_seconds_15['ratio'] > anomaly), 'isFraud_15'] = 1
conversion_seconds_15.loc[(conversion_seconds_15['isFraud_15'].isnull()), 'isFraud_15'] = 0

df = pd.merge(df, conversion_seconds_15[['hm_seconds_15', 'isFraud_15']], on='hm_seconds_15', how='left')

df['minute'] = df['click_time'].dt.minute.astype('uint8')

df.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,day,hour,minute,second,seconds_15,hm_seconds_15,isFraud_15
0,87540,12,1,13,497,2017-11-07 09:30:38,,0,7,9,30,38,3,9303,0.0
1,105560,25,1,17,259,2017-11-07 13:40:27,,0,7,13,40,27,2,13402,0.0
2,101424,12,1,19,212,2017-11-07 18:05:24,,0,7,18,5,24,2,18052,0.0
3,94584,13,1,13,477,2017-11-07 04:58:08,,0,7,4,58,8,1,4581,0.0
4,68413,12,1,1,178,2017-11-09 09:00:09,,0,9,9,0,9,1,9001,0.0


In [89]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.3)

In [90]:
len(train)

70000

In [91]:
len(test)

30000

In [92]:
train.isFraud_15.sum()/len(train)

0.041928571428571426

In [93]:
test.isFraud_15.sum()/len(test)

0.042866666666666664

## Prep For Modelling

In [94]:
train.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,day,hour,minute,second,seconds_15,hm_seconds_15,isFraud_15
2150,221238,12,1,13,409,2017-11-08 03:43:39,,0,8,3,43,39,3,3433,0.0
22348,319641,14,1,16,483,2017-11-09 06:16:54,,0,9,6,16,54,4,6164,0.0
53165,2919,12,1,13,178,2017-11-09 09:38:27,,0,9,9,38,27,2,9382,0.0
69435,81896,9,2,6,466,2017-11-08 13:30:34,,0,8,13,30,34,3,13303,0.0
7817,59426,3,1,8,409,2017-11-09 03:39:11,,0,9,3,39,11,1,3391,0.0


In [95]:
train = train.drop(["click_time","attributed_time", "is_attributed", "hm_seconds_15"], axis=1)
test = test.drop(["click_time","attributed_time", "is_attributed", "hm_seconds_15"], axis=1)

train.head()

Unnamed: 0,ip,app,device,os,channel,day,hour,minute,second,seconds_15,isFraud_15
2150,221238,12,1,13,409,8,3,43,39,3,0.0
22348,319641,14,1,16,483,9,6,16,54,4,0.0
53165,2919,12,1,13,178,9,9,38,27,2,0.0
69435,81896,9,2,6,466,8,13,30,34,3,0.0
7817,59426,3,1,8,409,9,3,39,11,1,0.0


In [96]:
import category_encoders as ce
cat_features = ['ip', 'app', 'device', 'os', 'channel']
count_enc = ce.CountEncoder(cols=cat_features)
count_enc.fit(train[cat_features])

CountEncoder(cols=['ip', 'app', 'device', 'os', 'channel'],
             combine_min_nan_groups=True)

In [97]:
train = train.join(count_enc.transform(train[cat_features]).add_suffix("_count"))
test = test.join(count_enc.transform(test[cat_features]).add_suffix("_count"))

In [98]:
feature_cols = ['day', 'hour', 'minute', 'second', 
                'ip_count', 'app_count', 'device_count',
                'os_count', 'channel_count']

In [101]:
from imblearn.over_sampling import SMOTE

y_train = train['isFraud_15']
X_train = train[feature_cols]

# Perform SMOTE on training data
oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X_train, y_train)

In [103]:
from collections import Counter

# Check oversampled target distibution
Counter(y)

Counter({0.0: 67065, 1.0: 67065})

## Logistic Regression Model

In [131]:
# Base model using all variables
logreg_model = LogisticRegression(random_state=280)

logreg_model.fit(X_train, y_train)

LogisticRegression(random_state=280)

In [142]:
# classification metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import auc
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score


# Prediction on test data
y_pred_lr = logreg_model.predict(test[feature_cols].fillna(0))
predictions_lr = [round(value) for value in y_pred_lr]

# Evaluation of Baseline Logistic Regression

print("accuracy: ",accuracy_score(test['isFraud_15'],predictions_lr)*100)
print("auc: ",roc_auc_score(test['isFraud_15'],predictions_lr)*100)
print("precision_score: ",precision_score(test['isFraud_15'],predictions_lr)*100)
print("recall_score: ",recall_score(test['isFraud_15'],predictions_lr)*100)

accuracy:  59.03
auc:  57.50263146463688
precision_score:  5.771240253998874
recall_score:  55.83203732503888


## LightGBM Model

In [121]:
import lightgbm as lgb

dtrain = lgb.Dataset(X_train, label=y_train)
dtest = lgb.Dataset(test[feature_cols], label=test['isFraud_15'])

param = {'num_leaves': 64, 'objective': 'binary', 
             'metric': 'auc', 'seed': 12345}
num_round = 100
bst = lgb.train(param, dtrain, num_round, valid_sets=[dtest], early_stopping_rounds=3)

[LightGBM] [Info] Number of positive: 67065, number of negative: 67065
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1157
[LightGBM] [Info] Number of data points in the train set: 134130, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[1]	valid_0's auc: 0.717093
Training until validation scores don't improve for 3 rounds
[2]	valid_0's auc: 0.723876
[3]	valid_0's auc: 0.725076
[4]	valid_0's auc: 0.726153
[5]	valid_0's auc: 0.731128
[6]	valid_0's auc: 0.732206
[7]	valid_0's auc: 0.731759
[8]	valid_0's auc: 0.733471
[9]	valid_0's auc: 0.734546
[10]	valid_0's auc: 0.735611
[11]	valid_0's auc: 0.736366
[12]	valid_0's auc: 0.736987
[13]	valid_0's auc: 0.736928
[14]	valid_0's auc: 0.735934
[15]	valid_0's auc: 0.736742
Early stopping, best iteration is:
[12]	valid_0's auc: 0.736987


In [143]:
from sklearn import metrics

ypred_lgb = bst.predict(test[feature_cols])
predictions_lgb = [round(value) for value in ypred_lgb]
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

# evaluate predictions

print("accuracy: ",accuracy_score(test['isFraud_15'],predictions_lgb)*100)
print("auc: ",roc_auc_score(test['isFraud_15'],predictions_lgb)*100)
print("precision_score: ",precision_score(test['isFraud_15'],predictions_lgb)*100)
print("recall_score: ",recall_score(test['isFraud_15'],predictions_lgb)*100)

accuracy:  69.71000000000001
auc:  65.68151711451304
precision_score:  8.403540578010023
recall_score:  61.27527216174183


## XGB Model

In [144]:
from xgboost import XGBClassifier

model = XGBClassifier(max_depth=10, subsample=0.9, tree_method='hist', max_bin = 300)
model.fit(X_train, y_train)

# make predictions for test data

y_pred_xgb = model.predict(test[feature_cols])
predictions_xgb = [round(value) for value in y_pred_xgb]

# evaluate predictions

print("accuracy: ",accuracy_score(test['isFraud_15'],predictions_xgb)*100)
print("auc: ",roc_auc_score(test['isFraud_15'],predictions_xgb)*100)
print("precision_score: ",precision_score(test['isFraud_15'],predictions_xgb)*100)
print("recall_score: ",recall_score(test['isFraud_15'],predictions_xgb)*100)

accuracy:  89.87666666666667
auc:  62.88356907739555
precision_score:  16.443081640475278
recall_score:  33.3592534992224


## Neural Net

In [149]:
from sklearn.neural_network import MLPClassifier
nn_model = MLPClassifier(hidden_layer_sizes=(150,100,50), max_iter=300,activation = 'relu',solver='adam',random_state=280)
nn_model.fit(X_train, y_train)

y_pred_nn = nn_model.predict(test[feature_cols].fillna(0))
predictions_nn = [round(value) for value in y_pred_nn]

# evaluate predictions

print("accuracy: ",accuracy_score(test['isFraud_15'],predictions_nn)*100)
print("auc: ",roc_auc_score(test['isFraud_15'],predictions_nn)*100)
print("precision_score: ",precision_score(test['isFraud_15'],predictions_nn)*100)
print("recall_score: ",recall_score(test['isFraud_15'],predictions_nn)*100)

accuracy:  94.99
auc:  50.03066386135981
precision_score:  4.602510460251046
recall_score:  0.8553654743390358


Unnamed: 0,day,hour,minute,second,ip_count,app_count,device_count,os_count,channel_count
78114,7,9,14,23,3.0,6349.0,3066.0,536.0,1400.0
43344,6,18,37,2,4.0,5966.0,66020.0,16748.0,315.0
13706,9,11,52,10,9.0,9275.0,66020.0,14874.0,2051.0
16623,7,0,35,51,2.0,3815.0,66020.0,3363.0,1028.0
90663,8,9,1,59,2.0,9275.0,66020.0,16748.0,741.0
