In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

# allows us to implement a train test split strategy
from sklearn.model_selection import train_test_split

# access to the logistic regression class of Sklearn
from sklearn.linear_model import LogisticRegression

# access to the neural network model MLP Classifier
from sklearn.neural_network import MLPClassifier


In [2]:
# Read Training Set
df = pd.read_csv('train.csv',parse_dates=['click_time'])

In [3]:
df['day'] = df['click_time'].dt.day.astype('uint8')
df['hour'] = df['click_time'].dt.hour.astype('uint8')
df['minute'] = df['click_time'].dt.minute.astype('uint8')
df['second'] = df['click_time'].dt.second.astype('uint8')

In [4]:
df = df[df['day']==7]

In [5]:
df['minute'] = df['minute'].apply(lambda x: '{0:0>2}'.format(x))

In [6]:
df.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,day,hour,minute,second
9308568,70712,2,1,32,237,2017-11-07,,0,7,0,0,0
9308569,45892,3,1,25,424,2017-11-07,,0,7,0,0,0
9308570,37774,8,2,13,145,2017-11-07,,0,7,0,0,0
9308571,41179,2,1,13,122,2017-11-07,,0,7,0,0,0
9308572,83111,15,1,8,245,2017-11-07,,0,7,0,0,0


In [8]:
click_outlier = 14507.5
anomaly = 812.8342105263157

In [9]:
# Transform

ranges = [-1, 14, 29, 44, np.inf]  # np.inf for infinity
labels = ['1', '2', '3', '4']
df['seconds_15'] = pd.cut(df['second'], 
                                  bins=ranges, 
                                  labels=labels)
df['hm_seconds_15'] = df['hour'].astype(str) + df['minute'].astype(str) + df['seconds_15'].astype(str)
df['hm_seconds_15'] = df['hm_seconds_15'].astype('int32')

conversion_seconds_15 = df.groupby(
    'hm_seconds_15'
).aggregate(
    {
        'is_attributed':sum,
        'click_time':"count"
    }
)
conversion_seconds_15['ratio'] = conversion_seconds_15['click_time']/conversion_seconds_15['is_attributed']
conversion_seconds_15['conversion_rate'] = conversion_seconds_15['is_attributed']/conversion_seconds_15['click_time']
conversion_seconds_15.reset_index(level=0, inplace=True)

conversion_seconds_15.loc[(conversion_seconds_15['is_attributed'] == 0) & (conversion_seconds_15['click_time'] > click_outlier), 'isFraud_15'] = 1
conversion_seconds_15.loc[(conversion_seconds_15['is_attributed'] != 0) & (conversion_seconds_15['ratio'] > anomaly), 'isFraud_15'] = 1
conversion_seconds_15.loc[(conversion_seconds_15['isFraud_15'].isnull()), 'isFraud_15'] = 0

df = pd.merge(df, conversion_seconds_15[['hm_seconds_15', 'isFraud_15']], on='hm_seconds_15', how='left')

df['minute'] = df['click_time'].dt.minute.astype('uint8')

df.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,day,hour,minute,second,seconds_15,hm_seconds_15,isFraud_15
0,70712,2,1,32,237,2017-11-07,,0,7,0,0,0,1,1,0.0
1,45892,3,1,25,424,2017-11-07,,0,7,0,0,0,1,1,0.0
2,37774,8,2,13,145,2017-11-07,,0,7,0,0,0,1,1,0.0
3,41179,2,1,13,122,2017-11-07,,0,7,0,0,0,1,1,0.0
4,83111,15,1,8,245,2017-11-07,,0,7,0,0,0,1,1,0.0


In [10]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.3)

In [11]:
len(train)

41743317

In [12]:
len(test)

17889993

In [13]:
# Set Thresholds
click_outlier = 14507.5
anomaly = 812.8342105263157

In [14]:
# Transform

ranges = [-1, 14, 29, 44, np.inf]  # np.inf for infinity
labels = ['1', '2', '3', '4']
train['seconds_15'] = pd.cut(train['second'], 
                                  bins=ranges, 
                                  labels=labels)
train['hm_seconds_15'] = train['hour'].astype(str) + train['minute'].astype(str) + train['seconds_15'].astype(str)
train['hm_seconds_15'] = train['hm_seconds_15'].astype('int32')

conversion_seconds_15 = train.groupby(
    'hm_seconds_15'
).aggregate(
    {
        'is_attributed':sum,
        'click_time':"count"
    }
)
conversion_seconds_15['ratio'] = conversion_seconds_15['click_time']/conversion_seconds_15['is_attributed']
conversion_seconds_15['conversion_rate'] = conversion_seconds_15['is_attributed']/conversion_seconds_15['click_time']
conversion_seconds_15.reset_index(level=0, inplace=True)

conversion_seconds_15.loc[(conversion_seconds_15['is_attributed'] == 0) & (conversion_seconds_15['click_time'] > click_outlier), 'isFraud_15'] = 1
conversion_seconds_15.loc[(conversion_seconds_15['is_attributed'] != 0) & (conversion_seconds_15['ratio'] > anomaly), 'isFraud_15'] = 1
conversion_seconds_15.loc[(conversion_seconds_15['isFraud_15'].isnull()), 'isFraud_15'] = 0

train = pd.merge(train, conversion_seconds_15[['hm_seconds_15', 'isFraud_15']], on='hm_seconds_15', how='left')

train['minute'] = train['click_time'].dt.minute.astype('uint8')

train.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,day,hour,minute,second,seconds_15,hm_seconds_15,isFraud_15_x,isFraud_15_y
0,45336,9,1,22,215,2017-11-07 19:53:29,,0,7,19,53,29,2,19532,0.0,1.0
1,5648,9,1,10,232,2017-11-07 04:37:27,,0,7,4,37,27,2,4372,0.0,0.0
2,32523,20,1,19,259,2017-11-07 20:36:59,,0,7,20,36,59,4,20364,0.0,0.0
3,118183,12,1,19,219,2017-11-07 12:18:14,,0,7,12,18,14,1,12181,0.0,0.0
4,8557,12,1,27,124,2017-11-07 11:35:58,,0,7,11,35,58,4,11354,0.0,0.0


In [15]:
train.isFraud_15.sum()/len(train)

AttributeError: 'DataFrame' object has no attribute 'isFraud_15'

In [None]:
# Transform

ranges = [-1, 14, 29, 44, np.inf]  # np.inf for infinity
labels = ['1', '2', '3', '4']
test['seconds_15'] = pd.cut(test['second'], 
                                  bins=ranges, 
                                  labels=labels)
test['hm_seconds_15'] = test['hour'].astype(str) + test['minute'].astype(str) + test['seconds_15'].astype(str)
test['hm_seconds_15'] = test['hm_seconds_15'].astype('int32')

conversion_seconds_15 = test.groupby(
    'hm_seconds_15'
).aggregate(
    {
        'is_attributed':sum,
        'click_time':"count"
    }
)
conversion_seconds_15['ratio'] = conversion_seconds_15['click_time']/conversion_seconds_15['is_attributed']
conversion_seconds_15['conversion_rate'] = conversion_seconds_15['is_attributed']/conversion_seconds_15['click_time']
conversion_seconds_15.reset_index(level=0, inplace=True)

conversion_seconds_15.loc[(conversion_seconds_15['is_attributed'] == 0) & (conversion_seconds_15['click_time'] > click_outlier), 'isFraud_15'] = 1
conversion_seconds_15.loc[(conversion_seconds_15['is_attributed'] != 0) & (conversion_seconds_15['ratio'] > anomaly), 'isFraud_15'] = 1
conversion_seconds_15.loc[(conversion_seconds_15['isFraud_15'].isnull()), 'isFraud_15'] = 0

test = pd.merge(test, conversion_seconds_15[['hm_seconds_15', 'isFraud_15']], on='hm_seconds_15', how='left')

test['minute'] = test['click_time'].dt.minute.astype('uint8')

test.head()

In [None]:
test.isFraud_15.sum()/len(test)

## Prep For Modelling

In [None]:
train.head()

In [None]:
train = train.drop(["click_time","attributed_time", "is_attributed", "hm_seconds_15", "day"], axis=1)
test = test.drop(["click_time","attributed_time", "is_attributed", "hm_seconds_15", "day"], axis=1)

train.head()

In [None]:
import category_encoders as ce
cat_features = ['ip', 'app', 'device', 'os', 'channel']
count_enc = ce.CountEncoder(cols=cat_features)
count_enc.fit(train[cat_features])

In [None]:
train_encoded = train.join(count_enc.transform(train[cat_features]).add_suffix("_count"))
test_encoded = test.join(count_enc.transform(test[cat_features]).add_suffix("_count"))

In [None]:
train_encoded.head()

In [None]:
feature_cols = ['day', 'hour', 'minute', 'second', 
                'ip_labels', 'app_labels', 'device_labels',
                'os_labels', 'channel_labels']

## LightGBM Model

In [None]:
import lightgbm as lgb

dtrain = lgb.Dataset(train[feature_cols], label=train['isFraud_15'])
dtest = lgb.Dataset(test[feature_cols], label=test['isFraud_15'])

param = {'num_leaves': 64, 'objective': 'binary', 
             'metric': 'auc', 'seed': 12345}
num_round = 100
bst = lgb.train(param, dtrain, num_round, valid_sets=[dtest], early_stopping_rounds=3)

In [None]:
from sklearn import metrics

ypred = bst.predict(test[feature_cols])
score = metrics.roc_auc_score(test['isFraud_15'], ypred)
print(f"ROC AUC: {score}")
score = metrics.auc(test['isFraud_15'], ypred)
print(f"AUC: {score}")
score = metrics.precision_score(test['isFraud_15'], ypred)
print(f"Precision Score: {score}")
score = metrics.recall_score(test['isFraud_15'], ypred)
print(f"Recall Score: {score}")

In [None]:

ypred = bst.predict(valid[feature_cols])
score = metrics.roc_auc_score(valid['isFraud_15'], ypred)
print(f"ROC AUC: {score}")
score = metrics.auc(valid['isFraud_15'], ypred)
print(f"AUC: {score}")
score = metrics.precision_score(valid['isFraud_15'], ypred)
print(f"Precision Score: {score}")
score = metrics.recall_score(valid['isFraud_15'], ypred)
print(f"Recall Score: {score}")

## Validation for day 8

valid['minute'] = valid['minute'].apply(lambda x: '{0:0>2}'.format(x))

ranges = [-1, 14, 29, 44, np.inf]  # np.inf for infinity
labels = ['1', '2', '3', '4']
valid['seconds_15'] = pd.cut(valid['second'], 
                                  bins=ranges, 
                                  labels=labels)
valid['hm_seconds_15'] = valid['hour'].astype(str) + train['minute'].astype(str) + train['seconds_15'].astype(str)
valid['hm_seconds_15'] = valid['hm_seconds_15'].astype('int32')

conversion_seconds_15 = valid.groupby(
    'hm_seconds_15'
).aggregate(
    {
        'is_attributed':sum,
        'click_time':"count"
    }
)
conversion_seconds_15['ratio'] = conversion_seconds_15['click_time']/conversion_seconds_15['is_attributed']
conversion_seconds_15['conversion_rate'] = conversion_seconds_15['is_attributed']/conversion_seconds_15['click_time']
conversion_seconds_15.reset_index(level=0, inplace=True)

conversion_seconds_15.loc[(conversion_seconds_15['is_attributed'] == 0) & (conversion_seconds_15['click_time'] > click_outlier), 'isFraud_15'] = 1
conversion_seconds_15.loc[(conversion_seconds_15['is_attributed'] != 0) & (conversion_seconds_15['ratio'] > anomaly), 'isFraud_15'] = 1
conversion_seconds_15.loc[(conversion_seconds_15['isFraud_15'].isnull()), 'isFraud_15'] = 0

valid_data = pd.merge(valid, conversion_seconds_15[['hm_seconds_15', 'isFraud_15']], on='hm_seconds_15', how='left')

valid_data['minute'] = valid_data['click_time'].dt.minute.astype('uint8')

from sklearn import metrics

ypred = bst.predict(valid[feature_cols])
score = metrics.roc_auc_score(valid['isFraud_15'], ypred)
print(f"Valid score: {score}")