In [1]:
import gc  # garbage collector

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# sklearn.cross_validation was removed in scikit-learn 0.20; its replacement,
# sklearn.model_selection, has existed since 0.18. Keep the legacy path as a fallback.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

%matplotlib inline



In [2]:
# Load the test set and preview five randomly sampled rows
test_data = pd.read_csv('test.csv')
test_data.sample(5)

Unnamed: 0,click_id,ip,app,device,os,channel,click_time
6613266,6613266,96021,21,1,13,232,2017-11-10 09:07:53
12347028,12347029,75403,12,1,6,178,2017-11-10 13:00:27
10142197,10142197,75936,12,1,19,265,2017-11-10 10:18:34
12485268,12485268,53454,9,2,13,215,2017-11-10 13:03:09
7880214,7880214,73875,15,1,13,140,2017-11-10 09:33:50


In [3]:
# Load the first 10,000,000 rows of the training set and preview the top of the frame
train_data = pd.read_csv('train.csv', nrows=10 ** 7)
train_data.head()


Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,83230,3,1,13,379,2017-11-06 14:32:21,,0
1,17357,3,1,19,379,2017-11-06 14:33:34,,0
2,35810,3,1,13,379,2017-11-06 14:34:12,,0
3,45745,14,1,13,478,2017-11-06 14:34:52,,0
4,161007,3,1,13,379,2017-11-06 14:35:08,,0


In [4]:
# Preview click_time; the dtype reported under the values shows how the column was parsed
train_data.click_time.head()

0    2017-11-06 14:32:21
1    2017-11-06 14:33:34
2    2017-11-06 14:34:12
3    2017-11-06 14:34:52
4    2017-11-06 14:35:08
Name: click_time, dtype: object

The `click_time` column has dtype object, so we need to convert it to datetime. We also want to extract the day and hour of each click.

In [5]:
def change_date(df, time):
    """Add ``hour`` and ``day`` columns to ``df`` from a column of timestamps.

    Parameters
    ----------
    df : DataFrame that receives the new columns (modified in place).
    time : Series of timestamps, or of strings ``pd.to_datetime`` can parse.

    Returns
    -------
    The same ``df`` object, now carrying ``hour`` and ``day`` as uint8.
    """
    # Parse once; the .dt accessor then exposes the calendar fields.
    stamps = pd.to_datetime(time)
    for part in ('hour', 'day'):
        df[part] = getattr(stamps.dt, part).astype('uint8')
    return df

In [6]:
# Add hour/day columns to the test set.
# change_date writes into the frame it is given and returns that same frame.
test_data = change_date(test_data, test_data['click_time'])
test_data.head()

Unnamed: 0,click_id,ip,app,device,os,channel,click_time,hour,day
0,0,5744,9,1,3,107,2017-11-10 04:00:00,4,10
1,1,119901,9,1,3,466,2017-11-10 04:00:00,4,10
2,2,72287,21,1,19,128,2017-11-10 04:00:00,4,10
3,3,78477,15,1,13,111,2017-11-10 04:00:00,4,10
4,4,123080,12,1,13,328,2017-11-10 04:00:00,4,10


In [7]:
# Add hour/day columns to the training set.
# change_date writes into the frame it is given and returns that same frame.
train_data = change_date(train_data, train_data['click_time'])
train_data.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,hour,day
0,83230,3,1,13,379,2017-11-06 14:32:21,,0,14,6
1,17357,3,1,19,379,2017-11-06 14:33:34,,0,14,6
2,35810,3,1,13,379,2017-11-06 14:34:12,,0,14,6
3,45745,14,1,13,478,2017-11-06 14:34:52,,0,14,6
4,161007,3,1,13,379,2017-11-06 14:35:08,,0,14,6


###### Analyse the label (is_attributed) by the features

In [8]:
def group(dataset, g, x):
    """Attach a per-group row count to every row of ``dataset``.

    Rows are grouped by the columns in ``g``; the number of non-null
    ``channel`` values in each group is stored in a new column named ``x``
    and left-joined back onto the rows.

    Parameters
    ----------
    dataset : DataFrame containing the columns in ``g`` plus ``channel``.
    g : list of column names to group by.
    x : name of the count column to add.

    Returns
    -------
    A new DataFrame (``dataset`` itself is not modified) with column ``x``
    appended.
    """
    counts = (
        dataset.groupby(by=g)[['channel']]
        .count()
        .reset_index()
        .rename(columns={'channel': x})
    )
    # The join is column-on-column, so the index of ``counts`` is ignored;
    # there is no need to relabel it before merging.
    return dataset.merge(counts, on=g, how='left')

In [9]:
# Clicks per ip within each (day, hour) bucket
g = ['ip', 'day', 'hour']
x = 'ip_tcount'
test_data = group(test_data, g, x)
train_data = group(train_data, g, x)

In [10]:
# Clicks per (ip, app, os) combination
g = ['ip', 'app', 'os']
x = 'ip_app_os_count'
test_data = group(test_data, g, x)
train_data = group(train_data, g, x)

In [11]:
# Clicks per (ip, device, os) combination
g = ['ip', 'device', 'os']
x = 'ip_device_os_count'
test_data = group(test_data, g, x)
train_data = group(train_data, g, x)

In [12]:
# Clicks per (ip, app, device, os) combination.
# The column name mentions is_attributed, but the label is not part of the
# grouping key; the name is kept because the feature list refers to it.
g = ['ip', 'app', 'device', 'os']
x = 'ip_is_attributed_count'
test_data = group(test_data, g, x)
train_data = group(train_data, g, x)

In [13]:
# Keep 60% of the rows for training and hold out the rest for validation
# (fixed random_state so the split is repeatable).
# NOTE: train_data is rebound here, so re-running only this cell would split
# the already-reduced frame a second time.
train_data, validation_data = train_test_split(
    train_data,
    train_size=0.60,
    random_state=0,
)
print(train_data.shape)
print(validation_data.shape)


(6000000, 14)
(4000000, 14)


In [14]:
# Wrap the train / validation frames as LightGBM datasets.
# Call shape: lgb.Dataset(data, label=label, feature_name=['c1', 'c2', 'c3'], categorical_feature=['c3'])
y = 'is_attributed'
categorical = ['app', 'device', 'os', 'channel', 'hour', 'day']
# Model inputs: the categorical columns followed by the engineered count columns.
# NOTE(review): the previewed training rows all fall on day 6 while the
# previewed test rows fall on day 10 -- confirm that `day` is a useful feature.
x = categorical + ['ip_tcount', 'ip_app_os_count',
                   'ip_device_os_count', 'ip_is_attributed_count']

# Both datasets are built the same way, training frame first.
l_train, l_validation = (
    lgb.Dataset(frame[x].values,
                label=frame[y].values,
                feature_name=x,
                categorical_feature=categorical)
    for frame in (train_data, validation_data)
)


In [15]:
# LightGBM training parameters: gradient-boosted trees, binary objective, scored by AUC.
# The NOTE(review) remarks below rely on the LightGBM parameter documentation
# (alias table and defaults); confirm them against the installed version.
parameters = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric':'auc',
    'learning_rate': 0.1,
    # NOTE(review): with max_depth 2 a tree can have at most 4 leaves, so
    # num_leaves=150 presumably never takes effect.
    'num_leaves': 150,
    'max_depth':2,  
    # NOTE(review): min_child_samples is documented as an alias of
    # min_data_in_leaf, so these two entries set the same option to different
    # values -- keep only the intended one.
    'min_data_in_leaf': 1500,
    'min_child_samples': 100,
    'max_bin': 100,  
    # NOTE(review): subsample (bagging_fraction) is documented to apply only
    # when the bagging frequency is non-zero; with subsample_freq commented
    # out below, row subsampling is presumably inactive.
    'subsample': 0.7,  
#     'subsample_freq': 2,
    'colsample_bytree': 0.7,  
    'min_child_weight': 0,  
    'min_split_gain': 0,
    # L1 / L2 regularisation are both switched off.
    'reg_alpha': 0, 
    'reg_lambda': 0,
}


In [16]:
# Preview the training split with the engineered count columns attached
train_data.head(5)

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,hour,day,ip_tcount,ip_app_os_count,ip_device_os_count,ip_is_attributed_count
4994731,188186,12,1,13,245,2017-11-06 20:27:11,,0,20,6,68,6,27,6
670940,114276,3,1,13,137,2017-11-06 16:13:56,,0,16,6,6467,656,4662,623
20167,169710,8,1,13,140,2017-11-06 16:00:26,,0,16,6,15,1,21,1
7175328,146001,3,1,19,409,2017-11-06 23:07:20,,0,23,6,1458,138,1282,135
2340969,105910,19,0,0,213,2017-11-06 17:00:59,,0,17,6,516,745,743,741


In [17]:
# Fit the booster: up to 300 rounds, evaluated on both the training and the
# validation dataset, with a progress line every 10 rounds and early stopping
# after 100 rounds without improvement.
# NOTE(review): early_stopping_rounds / verbose_eval are keyword arguments of
# older lightgbm.train releases; newer releases expect callbacks -- confirm
# against the installed version.
model = lgb.train(
    parameters,
    l_train,
    num_boost_round=300,
    valid_sets=[l_train, l_validation],
    valid_names=['train', 'valid'],
    early_stopping_rounds=100,
    verbose_eval=10,
    feval=None,
)
# Despite the name, this stores the best iteration number, not a model object.
trained_model = model.best_iteration



Training until validation scores don't improve for 100 rounds.
[10]	train's auc: 0.88975	valid's auc: 0.884114
[20]	train's auc: 0.907618	valid's auc: 0.901295
[30]	train's auc: 0.909344	valid's auc: 0.902205
[40]	train's auc: 0.930105	valid's auc: 0.922843
[50]	train's auc: 0.931301	valid's auc: 0.924078
[60]	train's auc: 0.948837	valid's auc: 0.942642
[70]	train's auc: 0.956313	valid's auc: 0.950062
[80]	train's auc: 0.960053	valid's auc: 0.954996
[90]	train's auc: 0.962434	valid's auc: 0.957636
[100]	train's auc: 0.964273	valid's auc: 0.959413
[110]	train's auc: 0.965324	valid's auc: 0.960225
[120]	train's auc: 0.965869	valid's auc: 0.960817
[130]	train's auc: 0.966325	valid's auc: 0.961088
[140]	train's auc: 0.96708	valid's auc: 0.961808
[150]	train's auc: 0.967465	valid's auc: 0.962044
[160]	train's auc: 0.96789	valid's auc: 0.962339
[170]	train's auc: 0.968102	valid's auc: 0.962531
[180]	train's auc: 0.968498	valid's auc: 0.962996
[190]	train's auc: 0.968962	valid's auc: 0.963286

In [18]:
# Score the test set and write the submission file:
# one row per click_id with the predicted is_attributed value.
test_pred = pd.DataFrame({
    'click_id': test_data['click_id'].astype('int'),
    'is_attributed': model.predict(test_data[x]),
})
test_pred.to_csv('submission_fraud.csv', index=False)