Using lightgbm and simple date features, this script scores ~ 0.674 on Public LB.

### Load libraries and data

In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [17]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [3]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [4]:
print(train.shape , test.shape)

(12137810, 10) (3706907, 9)


In [5]:
train.head()

Unnamed: 0,ID,datetime,siteid,offerid,category,merchant,countrycode,browserid,devid,click
0,IDsrk7SoW,2017-01-14 09:42:09,4709696.0,887235,17714,20301556,e,Firefox,,0
1,IDmMSxHur,2017-01-18 17:50:53,5189467.0,178235,21407,9434818,b,Mozilla Firefox,Desktop,0
2,IDVLNN0Ut,2017-01-11 12:46:49,98480.0,518539,25085,2050923,a,Edge,,0
3,ID32T6wwQ,2017-01-17 10:18:43,8896401.0,390352,40339,72089744,c,Firefox,Mobile,0
4,IDqUShzMg,2017-01-14 16:02:33,5635120.0,472937,12052,39507200,d,Mozilla Firefox,Desktop,0


In [6]:
# check missing values per column
train.isnull().sum(axis=0)/train.shape[0]

ID             0.000000
datetime       0.000000
siteid         0.099896
offerid        0.000000
category       0.000000
merchant       0.000000
countrycode    0.000000
browserid      0.050118
devid          0.149969
click          0.000000
dtype: float64

### Clean Data and Create Features

In [7]:
train['siteid'].fillna(-999, inplace=True)
test['siteid'].fillna(-999, inplace=True)

train['browserid'].fillna("None", inplace=True)
test['browserid'].fillna("None", inplace=True)

train['devid'].fillna("None", inplace=True)
test['devid'].fillna("None", inplace=True)

In [8]:
# set datatime
train['datetime'] = pd.to_datetime(train['datetime'])
test['datetime'] = pd.to_datetime(test['datetime'])

In [9]:
# create datetime variable
train['tweekday'] = train['datetime'].dt.weekday
train['thour'] = train['datetime'].dt.hour
train['tminute'] = train['datetime'].dt.minute

test['tweekday'] = test['datetime'].dt.weekday
test['thour'] = test['datetime'].dt.hour
test['tminute'] = test['datetime'].dt.minute

In [10]:
cols = ['siteid','offerid','category','merchant']

for x in cols:
    train[x] = train[x].astype('object')
    test[x] = test[x].astype('object')

In [11]:
cat_cols = cols + ['countrycode','browserid','devid']

In [12]:
for col in cat_cols:
    lbl = LabelEncoder()
    lbl.fit(list(train[col].values) + list(test[col].values))
    train[col] = lbl.transform(list(train[col].values))
    test[col] = lbl.transform(list(test[col].values))

In [13]:
train.head()

Unnamed: 0,ID,datetime,siteid,offerid,category,merchant,countrycode,browserid,devid,click,tweekday,thour,tminute
0,IDsrk7SoW,2017-01-14 09:42:09,128865,784773,48,127,4,2,2,0,5,9,42
1,IDmMSxHur,2017-01-18 17:50:53,142053,157563,59,65,1,8,0,0,2,17,50
2,IDVLNN0Ut,2017-01-11 12:46:49,2618,458279,69,15,0,1,2,0,2,12,46
3,ID32T6wwQ,2017-01-17 10:18:43,243406,345067,117,507,2,2,1,0,1,10,18
4,IDqUShzMg,2017-01-14 16:02:33,154278,417948,36,276,3,8,0,0,5,16,2


In [None]:
# def hour_bu

In [21]:
cols_to_transform = ['devid','browserid','countrycode']

In [22]:
train = pd.get_dummies( data = train, columns = cols_to_transform )

In [24]:
test = pd.get_dummies( data = test, columns = cols_to_transform )

In [25]:
print (test.shape, train.shape)

(3706907, 31) (12137810, 32)


In [18]:
pd.unique(train.category)

array([ 48,  59,  69, 117,  36,  97, 187,  46, 249, 213, 125, 267, 200,
       258,  37, 120, 227,   3, 100,  32, 204, 121, 261, 221, 179,  65,
         6,  20,  82, 188, 151,  94, 260, 154, 226,  29, 192,   2, 169,
       269, 155, 123, 231, 266,  27, 224, 143, 254, 150, 238, 185, 116,
        96,  34,  57, 158,   8,  33, 139,  56,  73, 245, 118,  67, 103,
        17, 193, 268, 176, 108,  76, 164,  44, 126,  77, 132, 186, 244,
       173,  41,  47,  88, 205,  43,  12, 168, 109, 203,  40, 210, 130,
        91,  87, 222, 138, 199,  79, 137, 119, 161,  23,  26, 230, 174,
       180,  66, 202, 172, 165,  98, 248,  16, 255,  19, 216,  71,  11,
       197, 217,  31, 127, 223,  22, 209, 177, 243,  58, 225,  63,  95,
       220,  78,  15, 159, 239, 263, 232, 141, 235, 262,  80,  81,  64,
        21, 163, 149, 167, 234, 107,   9, 134, 128, 194, 110, 142, 144,
         4, 250,  38, 131, 206, 240,  74, 183, 247,  83, 257, 133, 148,
       265,  99, 111,  85,  54, 157, 102, 190,  24, 229,   7,  6

In [15]:
pd.unique(train.browserid)

array([ 2,  8,  1,  3,  9,  7,  0,  4,  6, 10,  5, 11], dtype=int64)

### Model Training

In [27]:
cols_to_use = list(set(train.columns) - set(['ID','datetime','click']))

In [28]:
X_train, X_test, y_train, y_test = train_test_split(train[cols_to_use], train['click'], test_size = 0.2)

In [29]:
print(X_train.shape, X_test.shape)

(9710248, 29) (2427562, 29)


In [30]:
dtrain = lgb.Dataset(X_train, y_train)
dval = lgb.Dataset(X_test, y_test)

In [31]:
params = {
    
    'num_leaves' : 256,
    'learning_rate':0.03,
    'metric':'auc',
    'objective':'binary',
    'early_stopping_round': 40,
    'max_depth':10,
    'bagging_fraction':0.5,
    'feature_fraction':0.6,
    'bagging_seed':2017,
    'feature_fraction_seed':2017,
    'verbose' : 1
    
    
}

In [32]:
clf = lgb.train(params, dtrain,num_boost_round=500,valid_sets=dval,verbose_eval=20)

[20]	valid_0's auc: 0.968797
[40]	valid_0's auc: 0.969668
[60]	valid_0's auc: 0.970003
[80]	valid_0's auc: 0.970335
[100]	valid_0's auc: 0.970489
[120]	valid_0's auc: 0.970719
[140]	valid_0's auc: 0.970948
[160]	valid_0's auc: 0.971157
[180]	valid_0's auc: 0.971365
[200]	valid_0's auc: 0.971621
[220]	valid_0's auc: 0.971902
[240]	valid_0's auc: 0.972146
[260]	valid_0's auc: 0.972395
[280]	valid_0's auc: 0.97264
[300]	valid_0's auc: 0.972857
[320]	valid_0's auc: 0.973218
[340]	valid_0's auc: 0.973603
[360]	valid_0's auc: 0.973993
[380]	valid_0's auc: 0.974269
[400]	valid_0's auc: 0.974604
[420]	valid_0's auc: 0.974866
[440]	valid_0's auc: 0.975028
[460]	valid_0's auc: 0.975207
[480]	valid_0's auc: 0.975368
[500]	valid_0's auc: 0.975519


In [33]:
preds = clf.predict(test[cols_to_use])

In [35]:
sub = pd.DataFrame({'ID':test['ID'], 'click':preds})
sub.to_csv('lgb_pyst_OHE_10Aug.csv', index=False)