Using lightgbm and simple date features, this script scores ~ 0.674 on Public LB.

### Load libraries and data

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, LabelEncoder



In [3]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [4]:
print (train.shape, test.shape)

(12137810, 10) (3706907, 9)


In [72]:
# modeling on sampled (3e6) rows
rows = np.random.choice(train.index.values, int(0.3 * train.shape[0]))
train = train.loc[rows].reset_index(drop = True)

In [73]:
train.columns

Index(['ID', 'datetime', 'siteid', 'offerid', 'category', 'merchant',
       'countrycode', 'browserid', 'devid', 'click'],
      dtype='object')

In [74]:
train['siteid'].fillna(-999, inplace=True)
test['siteid'].fillna(-999, inplace=True)

train['browserid'].fillna("None", inplace=True)
test['browserid'].fillna("None", inplace=True)

train['devid'].fillna("None", inplace=True)
test['devid'].fillna("None", inplace=True)

## Create count vars

In [75]:
train.shape

(3641343, 10)

In [76]:
merchant = train.merchant.tolist()
merchant_cnt_map = pd.Series(merchant).value_counts().to_dict()
train['merchant_cnt'] = train.merchant.map(merchant_cnt_map)
test['merchant_cnt'] = test.merchant.map(merchant_cnt_map)


siteid =  train.siteid.tolist()
siteid_cnt_map = pd.Series(siteid).value_counts().to_dict()
train['siteid_cnt'] = train.siteid.map(siteid_cnt_map)
test['siteid_cnt'] = test.siteid.map(siteid_cnt_map)


category = train.category.tolist()
category_map = pd.Series(category).value_counts().to_dict()
train['category_cnt'] = train.category.map(category_map)
test['category_cnt'] = test.category.map(category_map)


offerid = train.offerid.tolist()
offerid_cnt_map = pd.Series(offerid).value_counts().to_dict()
train['offerid_cnt'] = train.offerid.map(offerid_cnt_map)
test['offerid_cnt'] = test.offerid.map(offerid_cnt_map)
del siteid, merchant, offerid, category, siteid_cnt_map, merchant_cnt_map, category_map, offerid_cnt_map

In [77]:
merc_offer = train[['merchant','offerid','ID']].groupby(['merchant','offerid'],\
                                                        as_index = False).count().rename(columns = {'ID': 'merc_offer_cnt'})
merc_site = train[['merchant','siteid','ID']].groupby(['merchant','siteid'] , \
                                                      as_index = False).count().rename(columns = {'ID': 'merc_site_cnt'})
merc_cat = train[['merchant','category','ID']].groupby(['merchant','category'] \
                                                       ,as_index = False).count().rename(columns = {'ID': 'merc_cat_cnt'})
site_offer = train[['siteid','offerid','ID']].groupby(['siteid','offerid'],\
                                                      as_index = False).count().rename(columns = {'ID': 'site_offer_cnt'})
site_cat = train[['siteid','category','ID']].groupby(['siteid','category'] ,\
                                                     as_index = False).count().rename(columns = {'ID': 'site_cat_cnt'})
cat_offer = train[['category','offerid','ID']].groupby(['category','offerid'],\
                                                       as_index = False).count().rename(columns = {'ID': 'cat_offer_cnt'})

In [78]:
train.columns

Index(['ID', 'datetime', 'siteid', 'offerid', 'category', 'merchant',
       'countrycode', 'browserid', 'devid', 'click', 'merchant_cnt',
       'siteid_cnt', 'category_cnt', 'offerid_cnt'],
      dtype='object')

In [79]:
# merc_offer = train.groupby(['merchant','offerid']).agg({'ID' : 'count'}).reset_index()
train = train.merge(merc_offer, how = 'left')
train = train.merge(merc_site, how = 'left')
train = train.merge(merc_cat, how = 'left')
train = train.merge(site_offer, how = 'left')
train = train.merge(site_cat, how = 'left')
train = train.merge(cat_offer, how = 'left')

test = test.merge(merc_offer, how = 'left')
test = test.merge(merc_site, how = 'left')
test = test.merge(merc_cat, how = 'left')
test = test.merge(site_offer, how = 'left')
test = test.merge(site_cat, how = 'left')
test = test.merge(cat_offer, how = 'left')

## undersampling

In [80]:
train_pos = train[train.click == 1]
train_neg = train[train.click == 0]
train_neg_sample = train_neg.loc [np.random.choice(train_neg.index.values, 9 * train_pos.shape[0] ) ]
train = train_pos.append(train_neg_sample)
# del train_neg, train_neg_sample, train_pos

In [81]:
train.click.value_counts()/train.shape[0]

0    0.9
1    0.1
Name: click, dtype: float64

In [82]:
train.shape

(1309060, 20)

### Clean Data and Create Features

In [83]:
# set datatime
train['datetime'] = pd.to_datetime(train['datetime'])
test['datetime'] = pd.to_datetime(test['datetime'])
# create datetime variable
train['tweekday'] = train['datetime'].dt.weekday
train['thour'] = train['datetime'].dt.hour
train['tminute'] = train['datetime'].dt.minute

test['tweekday'] = test['datetime'].dt.weekday
test['thour'] = test['datetime'].dt.hour
test['tminute'] = test['datetime'].dt.minute

In [84]:
cols = ['siteid','offerid','category','merchant']

for x in cols:
    train[x] = train[x].astype('object')
    test[x] = test[x].astype('object')

In [85]:
cat_cols = cols + ['countrycode','browserid','devid']
for col in cat_cols:
    lbl = LabelEncoder()
    lbl.fit(list(train[col].values) + list(test[col].values))
    train[col] = lbl.transform(list(train[col].values))
    test[col] = lbl.transform(list(test[col].values))

In [86]:
cols_to_transform = ['devid','browserid','countrycode']

In [87]:
train = pd.get_dummies( data = train, columns = cols_to_transform )
test = pd.get_dummies( data = test, columns = cols_to_transform )

In [88]:
print (test.shape, train.shape)

(3706907, 41) (1309060, 42)


### Model Training

In [89]:
train.head()

Unnamed: 0,ID,datetime,siteid,offerid,category,merchant,click,merchant_cnt,siteid_cnt,category_cnt,...,browserid_8,browserid_9,browserid_10,browserid_11,countrycode_0,countrycode_1,countrycode_2,countrycode_3,countrycode_4,countrycode_5
71,IDLmaEOcR,2017-01-18 17:51:52,45197,202526,150,383,1,19744,124,33005,...,0,0,0,0,0,1,0,0,0,0
74,IDYK7BCsJ,2017-01-14 09:45:59,178650,231743,82,205,1,53200,2,5239,...,0,0,0,0,0,0,0,1,0,0
127,IDAv0IcWo,2017-01-14 12:23:06,93218,493610,32,166,1,29485,41,53305,...,0,0,0,0,0,0,1,0,0,0
142,IDdBKCKlf,2017-01-19 09:32:29,90816,165527,231,266,1,28876,34,89828,...,0,0,0,0,0,0,0,1,0,0
149,ID7sLHCVA,2017-01-10 16:12:08,12853,135566,88,127,1,56348,1,555,...,0,0,0,0,0,0,1,0,0,0


In [90]:
cols_to_use = list(set(train.columns) - set(['ID','datetime','click']))

In [91]:
params = {}
params['objective'] = 'binary:logistic'
params['eta'] = 0.5
params['silent'] = 1
params['eval_metric'] = 'auc'

In [92]:
train.columns

Index(['ID', 'datetime', 'siteid', 'offerid', 'category', 'merchant', 'click',
       'merchant_cnt', 'siteid_cnt', 'category_cnt', 'offerid_cnt',
       'merc_offer_cnt', 'merc_site_cnt', 'merc_cat_cnt', 'site_offer_cnt',
       'site_cat_cnt', 'cat_offer_cnt', 'tweekday', 'thour', 'tminute',
       'devid_0', 'devid_1', 'devid_2', 'devid_3', 'browserid_0',
       'browserid_1', 'browserid_2', 'browserid_3', 'browserid_4',
       'browserid_5', 'browserid_6', 'browserid_7', 'browserid_8',
       'browserid_9', 'browserid_10', 'browserid_11', 'countrycode_0',
       'countrycode_1', 'countrycode_2', 'countrycode_3', 'countrycode_4',
       'countrycode_5'],
      dtype='object')

In [93]:
test.columns

Index(['ID', 'datetime', 'siteid', 'offerid', 'category', 'merchant',
       'merchant_cnt', 'siteid_cnt', 'category_cnt', 'offerid_cnt',
       'merc_offer_cnt', 'merc_site_cnt', 'merc_cat_cnt', 'site_offer_cnt',
       'site_cat_cnt', 'cat_offer_cnt', 'tweekday', 'thour', 'tminute',
       'devid_0', 'devid_1', 'devid_2', 'devid_3', 'browserid_0',
       'browserid_1', 'browserid_2', 'browserid_3', 'browserid_4',
       'browserid_5', 'browserid_6', 'browserid_7', 'browserid_8',
       'browserid_9', 'browserid_10', 'browserid_11', 'countrycode_0',
       'countrycode_1', 'countrycode_2', 'countrycode_3', 'countrycode_4',
       'countrycode_5'],
      dtype='object')

In [94]:
X, y = train[cols_to_use], train['click']
dtrain = xgb.DMatrix(X, label= y)
dtest = xgb.DMatrix(test[cols_to_use])
ids = test['ID']

In [95]:
num_rounds = 600
cv_output = xgb.cv(params, dtrain, num_boost_round=num_rounds, early_stopping_rounds=10, verbose_eval=20, show_stdv=False)
# [580]	train-auc:0.993018	test-auc:0.987913
# [300]	train-auc:0.99624	test-auc:0.990564
# [580]	train-auc:0.998785	test-auc:0.994601
# [580]	train-auc:0.990572	test-auc:0.977709

[0]	train-auc:0.968371	test-auc:0.968252
[20]	train-auc:0.987224	test-auc:0.986157
[40]	train-auc:0.989037	test-auc:0.987119
[60]	train-auc:0.990273	test-auc:0.987669
[80]	train-auc:0.991243	test-auc:0.988022
[100]	train-auc:0.99213	test-auc:0.988369
[120]	train-auc:0.992818	test-auc:0.988614
[140]	train-auc:0.993425	test-auc:0.988784
[160]	train-auc:0.994042	test-auc:0.988931
[180]	train-auc:0.994588	test-auc:0.989002
[200]	train-auc:0.99506	test-auc:0.989084
[220]	train-auc:0.995471	test-auc:0.989175
[240]	train-auc:0.995853	test-auc:0.989265
[260]	train-auc:0.996223	test-auc:0.989359
[280]	train-auc:0.99654	test-auc:0.989433
[300]	train-auc:0.996821	test-auc:0.989485
[320]	train-auc:0.997103	test-auc:0.98952
[340]	train-auc:0.997362	test-auc:0.989547
[360]	train-auc:0.997601	test-auc:0.989561
[380]	train-auc:0.997802	test-auc:0.989598
[400]	train-auc:0.998021	test-auc:0.989626


In [96]:
len(cv_output)

403

In [97]:
num_rounds = len(cv_output)
watchlist = [(dtrain ,'dtrain')]
clf_xgb_main = xgb.train(dtrain=dtrain, params=params, num_boost_round=num_rounds, evals=watchlist, verbose_eval=100)

[0]	dtrain-auc:0.967991
[100]	dtrain-auc:0.991429
[200]	dtrain-auc:0.994099
[300]	dtrain-auc:0.995734
[400]	dtrain-auc:0.99696


In [98]:
preds = clf_xgb_main.predict(dtest)

In [None]:
# clf = lgb.train(params, dtrain,num_boost_round=500,valid_sets=dval,verbose_eval=20)

In [None]:
# preds = clf.predict(test[cols_to_use])

In [99]:
sub = pd.DataFrame({'ID':ids, 'click':preds})
sub.to_csv('xgb5_10pos_final.csv', index=False)

In [None]:
len(preds)