In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [4]:
# Load the click log. `click_time` is parsed into datetimes at read time because
# the feature-engineering cell below relies on its `.dt` accessor.
data = pd.read_csv(
    "train_sample.csv",
    parse_dates=["click_time"],
)

In [6]:
# Preview the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,89489,3,1,13,379,2017-11-06 15:13:23,,0
1,204158,35,1,13,21,2017-11-06 15:41:07,2017-11-07 08:17:19,1
2,3437,6,1,13,459,2017-11-06 15:42:32,,0
3,167543,3,1,13,379,2017-11-06 15:56:17,,0
4,147509,3,1,13,379,2017-11-06 15:57:01,,0


In [8]:
# Break the click timestamp into calendar/time-of-day parts.
# `assign` returns a new frame, so `data` itself is left untouched.
# Every part fits in uint8 (day <= 31, hour <= 23, minute/second <= 59).
click_times = data['click_time']
clicks = data.assign(
    day=click_times.dt.day.astype('uint8'),
    hour=click_times.dt.hour.astype('uint8'),
    minute=click_times.dt.minute.astype('uint8'),
    second=click_times.dt.second.astype('uint8'),
)

In [10]:
from sklearn import preprocessing

# Identifier-like columns; each one gets an integer-coded `<name>_labels` companion.
cat_features = ['ip', 'app', 'device', 'os', 'channel']
label_encoder = preprocessing.LabelEncoder()

# A single encoder instance is reused: `fit_transform` refits it on every column,
# so once the loop finishes it only holds the mapping of the last feature.
# NOTE(review): the encoder is fitted on the whole frame, i.e. before the
# train/valid/test split performed further down — confirm this is intended.
for feature in cat_features:
    encoded = label_encoder.fit_transform(clicks[feature])
    clicks[f'{feature}_labels'] = encoded

In [12]:
# Spot-check the engineered time parts and label columns on the first two rows.
clicks.head(n=2)

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,day,hour,minute,second,ip_labels,app_labels,device_labels,os_labels,channel_labels
0,89489,3,1,13,379,2017-11-06 15:13:23,,0,6,15,13,23,27226,3,1,13,120
1,204158,35,1,13,21,2017-11-06 15:41:07,2017-11-07 08:17:19,1,6,15,41,7,110007,35,1,13,10


In [13]:
# Columns fed to the model: the time parts plus the label-encoded identifiers.
feature_cols = ['day', 'hour', 'minute', 'second',
                'ip_labels', 'app_labels', 'device_labels',
                'os_labels', 'channel_labels']

# Chronological split: the model is validated and tested on the most recent
# clicks. valid size == test size, taken from the last two sections of the data.
valid_fraction = 0.1
clicks_srt = clicks.sort_values('click_time')
valid_rows = int(len(clicks_srt) * valid_fraction)

# Use explicit, clamped positive boundaries rather than negative slices.
# With `valid_rows == 0` (tiny frame or tiny fraction) the negative form
# degenerates: `[:-0]` is empty and `[-0:]` is the whole frame, which would
# leave `train` empty and put every row in `test`.
valid_start = max(len(clicks_srt) - 2 * valid_rows, 0)
test_start = max(len(clicks_srt) - valid_rows, 0)

train = clicks_srt.iloc[:valid_start]
valid = clicks_srt.iloc[valid_start:test_start]
test = clicks_srt.iloc[test_start:]

In [16]:
import lightgbm as lgb

# Wrap each split in a LightGBM Dataset (features + binary target).
dtrain = lgb.Dataset(train[feature_cols], label=train['is_attributed'])
dvalid = lgb.Dataset(valid[feature_cols], label=valid['is_attributed'])
# NOTE(review): `dtest` is not used by the cells shown here (scoring predicts on
# the raw `test` frame); kept in case a later cell relies on it.
dtest = lgb.Dataset(test[feature_cols], label=test['is_attributed'])

# Binary classifier evaluated with AUC on the validation set.
param = {'num_leaves': 64, 'objective': 'binary', 'metric': 'auc'}
num_round = 1000

# Early stopping is passed as a callback: the `early_stopping_rounds=` keyword of
# `lgb.train` was removed in LightGBM 4.0 and raises TypeError there, while the
# callback form works on both 3.x and 4.x. Training stops once validation AUC
# has not improved for 10 consecutive rounds (at most `num_round` rounds).
# On 4.x, add `lgb.log_evaluation()` to the callbacks to get per-round metric lines.
bst = lgb.train(
    param,
    dtrain,
    num_round,
    valid_sets=[dvalid],
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)

[LightGBM] [Info] Number of positive: 363974, number of negative: 1476475
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1066
[LightGBM] [Info] Number of data points in the train set: 1840449, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.197764 -> initscore=-1.400330
[LightGBM] [Info] Start training from score -1.400330
[1]	valid_0's auc: 0.948979
Training until validation scores don't improve for 10 rounds
[2]	valid_0's auc: 0.949235
[3]	valid_0's auc: 0.950126
[4]	valid_0's auc: 0.950072
[5]	valid_0's auc: 0.950536
[6]	valid_0's auc: 0.950943
[7]	valid_0's auc: 0.951453
[8]	valid_0's auc: 0.951518
[9]	valid_0's auc: 0.952385
[10]	valid_0's auc: 0.952434
[11]	valid_0's auc: 0.952465
[12]	valid_0's auc: 0.952638
[13]	valid_0's auc: 0.95266
[14]	valid_0's auc: 0.952766
[15]	valid_0's auc: 0.953203
[16]	valid_0's auc: 0.953489
[17]	valid_0's auc: 0.953783
[18]	valid_0's auc: 0.953952
[19]	valid_0's auc: 0.954153
[20]

[262]	valid_0's auc: 0.962319
[263]	valid_0's auc: 0.962323
[264]	valid_0's auc: 0.962323
[265]	valid_0's auc: 0.962318
[266]	valid_0's auc: 0.962324
[267]	valid_0's auc: 0.96232
[268]	valid_0's auc: 0.962323
[269]	valid_0's auc: 0.962334
[270]	valid_0's auc: 0.962334
[271]	valid_0's auc: 0.962333
[272]	valid_0's auc: 0.962334
[273]	valid_0's auc: 0.962333
[274]	valid_0's auc: 0.962334
[275]	valid_0's auc: 0.962334
[276]	valid_0's auc: 0.962323
[277]	valid_0's auc: 0.962323
[278]	valid_0's auc: 0.962328
[279]	valid_0's auc: 0.962332
[280]	valid_0's auc: 0.962338
[281]	valid_0's auc: 0.962345
[282]	valid_0's auc: 0.962347
[283]	valid_0's auc: 0.962346
[284]	valid_0's auc: 0.962347
[285]	valid_0's auc: 0.962357
[286]	valid_0's auc: 0.962352
[287]	valid_0's auc: 0.962354
[288]	valid_0's auc: 0.962357
[289]	valid_0's auc: 0.962365
[290]	valid_0's auc: 0.962368
[291]	valid_0's auc: 0.96237
[292]	valid_0's auc: 0.962372
[293]	valid_0's auc: 0.962376
[294]	valid_0's auc: 0.962376
[295]	valid_

In [18]:
import sklearn.metrics as metrics

# Score the held-out test split: predicted probabilities vs. true labels, as ROC AUC.
ypred = bst.predict(test[feature_cols])
score = metrics.roc_auc_score(
    y_true=test['is_attributed'],
    y_score=ypred,
)
print(f"Test score: {score}")

Test score: 0.9728934889952952
