In [8]:
import os
import gc
import time
from time import gmtime, strftime

from sklearn.model_selection import train_test_split
import lightgbm as lgb
import xgboost as xgb
import numpy as np
import pandas as pd
%matplotlib inline

# Features

In [18]:
def group_label(df, group_cols):
    """Attach one group-identifier column per column combination.

    For every combination in ``group_cols`` a new column named after the
    joined column names (e.g. ``['app', 'device']`` -> ``'app_device'``) is
    merged onto ``df``. Its value is the index label of the first row in
    which that combination of values occurs, so all rows of a group share
    one label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``group_cols``.
    group_cols : list of list of str
        Column combinations to label. An empty list returns ``df`` unchanged.

    Returns
    -------
    pandas.DataFrame
        The merged frame (a new object whenever at least one combination
        was processed).

    Notes
    -----
    The label is taken from ``df``'s index via ``reset_index``; if the index
    is not unique, different groups can end up with the same label.
    """
    for i, cols in enumerate(group_cols):
        # Join the column names of *this* combination. Joining the outer
        # ``group_cols`` (a list of lists) raised TypeError for any
        # non-empty input and would have reused one name for every entry.
        col_name = "_".join(cols)
        print(i, col_name)
        # First occurrence of each group; reset_index exposes the row label
        # of that occurrence as an 'index' column, which becomes the label.
        group_idx = df.drop_duplicates(cols)[cols].reset_index()
        group_idx = group_idx.rename(columns={'index': col_name})
        df = df.merge(group_idx, on=cols, how='left')
        del group_idx
        gc.collect()
    return df
    
def count_agg(df, group_cols):
    """Add a group-size column for every column combination.

    For each list of columns in ``group_cols`` the number of rows sharing
    the same values is computed and left-merged back onto ``df`` as
    ``<cols joined by '_'>_count``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all referenced columns.
    group_cols : list of list of str
        Column combinations to count.

    Returns
    -------
    pandas.DataFrame
        Frame with one extra ``*_count`` column per combination.
    """
    for position, key_cols in enumerate(group_cols):
        feature_name = "_".join(key_cols) + '_count'
        print(position, feature_name)
        # Rows per group, turned into a lookup table keyed by key_cols.
        group_sizes = df.groupby(key_cols).size()
        size_table = group_sizes.reset_index(name=feature_name)
        del group_sizes
        df = df.merge(size_table, how='left', on=key_cols)
        # Release the lookup table before the next (possibly large) groupby.
        del size_table
        gc.collect()
    return df

def count_cum(df, group_cols):
    """Add a running within-group row counter for every column combination.

    For each list of columns in ``group_cols`` a column named
    ``<cols joined by '_'>_countAccum`` is written into ``df`` holding the
    0-based position of each row inside its group, in frame order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    group_cols : list of list of str
        Column combinations to enumerate.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for position, key_cols in enumerate(group_cols):
        feature_name = "_".join(key_cols) + '_countAccum'
        print(position, feature_name)
        running_rank = df.groupby(key_cols).cumcount()
        df[feature_name] = running_rank
        del running_rank
        gc.collect()
    return df

def count_uniq(df, group_uniq_cols):
    """Add distinct-value counts of one column within groups of others.

    Each entry of ``group_uniq_cols`` is a pair ``[group_columns, column]``.
    The number of distinct values of ``column`` inside each group is
    left-merged onto ``df`` as
    ``<group columns joined by '_'>_uniq_<column>_countUniq``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all referenced columns.
    group_uniq_cols : list of [list of str, str]
        Grouping columns paired with the column whose distinct values are
        counted.

    Returns
    -------
    pandas.DataFrame
        Frame with one extra ``*_countUniq`` column per entry.
    """
    for position, spec in enumerate(group_uniq_cols):
        key_cols = spec[0]
        counted_col = spec[1]
        feature_name = "_".join(key_cols) + '_uniq_' + counted_col + '_countUniq'
        print(position, feature_name)
        distinct_counts = df.groupby(key_cols)[counted_col].nunique()
        lookup = distinct_counts.reset_index(name=feature_name)
        df = df.merge(lookup, how='left', on=key_cols)
        del distinct_counts, lookup
        gc.collect()
    return df

def next_click(df, group_cols):
    """Add the gap to the following click of the same group.

    For each list of columns in ``group_cols`` a float32 column named
    ``<cols joined by '_'>_nextClick`` is written into ``df``. It holds the
    ``click_time`` of the next row (in frame order) belonging to the same
    group minus the row's own ``click_time``; the last row of every group
    has no successor and gets NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``click_time`` column whose differences can be cast to
        float32; it is modified in place.
    group_cols : list of list of str
        Column combinations that define "the same clicker".

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for position, key_cols in enumerate(group_cols):
        feature_name = "_".join(key_cols) + '_nextClick'
        print(position, feature_name)
        # click_time of the next row within the same group (NaN if none).
        following = df.groupby(key_cols)['click_time'].shift(-1)
        gap = following - df['click_time']
        df[feature_name] = gap.astype(np.float32)
        del following, gap
        gc.collect()
    return df

def frequence(df, group_cols):
    """Add the mean next-click gap of each group.

    For each list of columns in ``group_cols`` the existing
    ``<cols joined by '_'>_nextClick`` column is averaged per group and
    left-merged onto ``df`` as ``<cols joined by '_'>_clickFreq``. Groups
    whose mean is NaN are dropped from the lookup, so their rows receive
    NaN from the left merge.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that already contains the matching ``*_nextClick`` columns.
    group_cols : list of list of str
        Column combinations to aggregate.

    Returns
    -------
    pandas.DataFrame
        Frame with one extra ``*_clickFreq`` column per combination.
    """
    for position, key_cols in enumerate(group_cols):
        prefix = "_".join(key_cols)
        source_col = prefix + '_nextClick'
        print(position, source_col)
        mean_gap = df.groupby(key_cols)[source_col].mean().dropna()
        freq_table = mean_gap.reset_index(name=(prefix + '_clickFreq'))
        del mean_gap
        df = df.merge(freq_table, how='left', on=key_cols)
        del freq_table
        gc.collect()
    return df

In [19]:
def generate_features(df):
    """Build the full engineered feature set on a click log.

    Steps, in order:
      1. derive ``day``, ``hour`` and ``in_test_hh`` from ``click_time``;
      2. add group labels, group counts, cumulative counts and
         distinct-value counts for the combinations listed below;
      3. convert ``click_time`` to integer seconds and add next-click gaps
         and click frequencies;
      4. drop the helper columns ``ip``, ``click_time``, ``day`` and
         ``in_test_hh``.

    The order matters: the time features need ``click_time`` as a datetime
    column, while the next-click features are computed after it has been
    converted to integers.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``ip``, ``app``, ``device``, ``os``, ``channel`` and a
        datetime ``click_time`` column. The time-feature columns are written
        into this object directly, so the caller's frame is modified.

    Returns
    -------
    pandas.DataFrame
        Frame with the engineered columns and without the dropped helper
        columns.
    """
    print('generating time features...')
    df['day'] = df['click_time'].dt.day.astype('uint8')
    df['hour'] = df['click_time'].dt.hour.astype('uint8')
    # in_test_hh is 1 for hours in the first list, 2 for hours in the second
    # list and 3 for every other hour.
    df['in_test_hh'] = (3 - 2 * df['hour'].isin([4, 5, 9, 10, 13, 14]) # most frequent
                          - 1 * df['hour'].isin([6, 11, 15])).astype('uint8') # least frequent
    print('done')
    gc.collect()
    
    # Combinations passed to group_label (currently none enabled).
    group_combinations = [
        #['app', 'device'],
        #['app', 'channel']
    ]
    
    # Combinations passed to count_agg.
    # NOTE(review): the trailing numbers presumably record each feature's
    # importance from an earlier run -- confirm.
    count_combinations = [
        ['app'],
        ['ip'], # 3.03
        ['channel'],
        ['os'],
        ['ip', 'device'], # 9.88
        ['day', 'hour', 'app'], # 4.08
        ['app', 'channel'], # 2.8
        ['ip', 'day', 'in_test_hh'], # 1.74
        ['ip', 'day', 'hour'], # 0.52
        ['os', 'device'], # 0.44
        ['ip', 'os', 'day', 'hour'], # 0.41
        ['ip', 'device', 'day', 'hour'], # 0.31
        ['ip', 'app', 'os'] # 0.21
    ]
    
    # [group columns, column whose distinct values are counted] pairs
    # passed to count_uniq.
    countUniq_combinations = [
        #[['app'],'ip'],
        #[['app', 'device', 'os', 'channel'], 'ip'],
        [['ip'], 'channel'], # 0.9
        [['ip'], 'app'], # 1.3
        [['ip'], 'os'] # 0.45
    ]
    
    # Combinations passed to next_click.
    nextClick_combinations = [
        ['ip', 'os'],
        ['ip', 'device', 'os'],
        ['ip', 'app', 'device', 'os'],
        ['ip', 'app', 'device', 'os', 'channel']
    ]
    
    # Combinations passed to frequence (currently none enabled); each one
    # needs the matching *_nextClick column produced by next_click.
    freq_combinations = [
        #['ip', 'app', 'device', 'os']
    ]
    
    # Combinations passed to count_cum.
    accum_combinations = [
        #['app'],
        ['ip'] # 3.03
        #['day', 'hour', 'app']
    ]
    
    
    df = group_label(df, group_combinations)
    df = count_agg(df, count_combinations)
    df = count_cum(df, accum_combinations)
    df = count_uniq(df, countUniq_combinations)
    # Convert click_time to int32 before computing gaps.
    # NOTE(review): assumes a datetime64[ns] column, so that the int64 view
    # is nanoseconds and // 10**9 yields whole seconds -- confirm.
    df['click_time'] = (df['click_time'].astype(np.int64) // 10 ** 9).astype(np.int32)
    df = next_click(df, nextClick_combinations)
    df = frequence(df, freq_combinations)
    
    # Helper columns used only to build features are removed before returning.
    df.drop(['ip', 'click_time', 'day', 'in_test_hh'], axis=1, inplace=True)
    gc.collect()
    # NOTE(review): DataFrame.info() prints the summary itself and returns
    # None, so this line also prints "None".
    print(df.info())
    return df

# Load Data

In [20]:
# Explicit unsigned dtypes for the CSV columns, passed to every read_csv call.
dtype = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
    click_id='uint32',
)

# Columns read from each file: train carries the label, test carries click_id.
train_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed']
test_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'click_id']

# Recorded shapes -- train: (184903890, 7), test: (18790469, 7)
train_df = pd.read_csv(
    'data/train.csv',
    usecols=train_cols,
    dtype=dtype,
    parse_dates=['click_time'],
)

# Features are built on test_supplement rather than on test.csv.
test_df = pd.read_csv(
    'data/test_supplement.csv',
    usecols=test_cols,
    dtype=dtype,
    parse_dates=['click_time'],
)

In [21]:
# Stack train and test rows (train first) restricted to the columns both
# files share, so group statistics are computed over all clicks.
common_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time']
all_df = pd.concat([frame[common_cols] for frame in (train_df, test_df)])

In [22]:
# generate data
# NOTE: this cell is not re-runnable. generate_features drops 'click_time'
# (and 'ip') from its result, and the same function reads df['click_time']
# on entry, so calling it again on the reassigned all_df raises KeyError.
# Re-run the concat cell first.
all_df = generate_features(all_df)

generating time features...
done
0 app_count
1 ip_count
2 channel_count
3 os_count
4 ip_device_count
5 day_hour_app_count
6 app_channel_count
7 ip_day_in_test_hh_count
8 ip_day_hour_count
9 os_device_count
10 ip_os_day_hour_count
11 ip_device_day_hour_count
12 ip_app_os_count
0 ip_countAccum
0 ip_uniq_channel_countUniq
1 ip_uniq_app_countUniq
2 ip_uniq_os_countUniq
0 ip_os_nextClick
1 ip_device_os_nextClick
2 ip_app_device_os_nextClick
3 ip_app_device_os_channel_nextClick
<class 'pandas.core.frame.DataFrame'>
Int64Index: 200000 entries, 0 to 199999
Data columns (total 26 columns):
app                                   200000 non-null uint16
device                                200000 non-null uint16
os                                    200000 non-null uint16
channel                               200000 non-null uint16
hour                                  200000 non-null uint8
app_count                             200000 non-null int64
ip_count                              200000 non

In [10]:
# Undo the earlier concat: train rows were stacked first, so the first
# len(train_df) rows are training features and the remainder are test rows.
# This relies on generate_features having kept the row order.
train_row_count = len(train_df)
train_features = all_df.iloc[:train_row_count]
test_features = all_df.iloc[train_row_count:]
gc.collect()

24

# Train LightGBM Model

In [11]:
# Evaluation metric; also used later to look up the recorded scores.
metrics = 'auc'

# LightGBM hyper-parameters for the binary "is_attributed" classifier.
lgb_params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric=metrics,
    learning_rate=0.1,
    num_leaves=7,
    max_depth=4,
    min_child_samples=100,
    max_bin=100,
    subsample=0.7,
    subsample_freq=1,
    colsample_bytree=0.7,
    min_child_weight=0,
    min_split_gain=0,
    nthread=24,
    verbose=1,
    scale_pos_weight=200,
)

target = 'is_attributed'
# Columns treated as categorical by LightGBM.
category = ['app', 'device', 'os', 'channel', 'hour']
# Every column of the feature frame except bookkeeping / label columns.
non_feature_cols = ('level_0', 'index', 'is_attributed')
features = [name for name in train_features.columns if name not in non_feature_cols]

In [12]:
# train valid split
# Hold out the last 5,000,000 training rows (no shuffling) for validation.
# The split always starts from the full training slice of all_df instead of
# from `train_features`: this cell rebinds `train_features` to the reduced
# frame, so splitting that name again (on a re-run, or in a later cell)
# would shrink the features while the labels are rebuilt at full length,
# leaving the two misaligned.
labels = train_df.is_attributed.values
train_features, valid_features = train_test_split(all_df.iloc[:len(train_df)], test_size=5000000, shuffle=False)
train_labels, valid_labels = train_test_split(labels, test_size=5000000, shuffle=False)
# Features and labels must stay row-aligned.
assert len(train_features) == len(train_labels), 'train features/labels misaligned'
assert len(valid_features) == len(valid_labels), 'valid features/labels misaligned'
print('Train size:', len(train_features))
print('Valid size:', len(valid_features))
gc.collect()

Train size: 179903890
Valid size: 5000000


12

In [13]:
# convert data into dataset. Warning: Memory Peak
# Despite the "xg" prefix this is a LightGBM Dataset for the training rows.
# The feature matrix is passed as a temporary (`.values`) and not bound to a
# name, so no extra reference to it is kept alive by this cell.
xgtrain = lgb.Dataset(train_features[features].values, 
                      label=train_labels,
                      feature_name=features,
                      categorical_feature=category)

In [14]:
# LightGBM Dataset for the held-out validation rows, built with the same
# feature names and categorical columns as the training Dataset.
xgvalid = lgb.Dataset(valid_features[features].values, 
                      label=valid_labels,
                      feature_name=features,
                      categorical_feature=category)

In [15]:
# Train LightGBM with early stopping on the validation set, then report the
# best score, write feature importances to CSV and save the model.
print('Training...')
# Filled by lgb.train with the per-iteration validation scores.
evals_results = {}
# NOTE(review): early_stopping_rounds is 100 here, but the saved output of
# this cell reports "50 rounds", so that output came from a different
# setting.
model = lgb.train(lgb_params,
                  xgtrain,
                  valid_sets=[xgvalid],
                  valid_names=['valid'],
                  evals_result=evals_results,
                  num_boost_round=5000,
                  early_stopping_rounds=100,
                  verbose_eval=1,
                  feval=None)
n_estimators = model.best_iteration

print('\nModel Info:')
print('n_estimators:', n_estimators)
# presumably best_iteration is 1-based, hence the -1 when indexing the
# recorded scores -- verify against the LightGBM version in use
print(metrics + ':', evals_results['valid'][metrics][n_estimators - 1])

# Feature importance table: split counts plus gain as a percentage of the
# total gain, sorted by gain (highest first).
gain = model.feature_importance('gain')
ft = pd.DataFrame({'feature': model.feature_name(), 'split': model.feature_importance('split'),
                   'gain': 100 * gain / gain.sum()}).sort_values('gain', ascending=False)
ft.to_csv('feature_importance_ref.csv', index=False)
print(ft)

# Save the model under a UTC-timestamped file name.
model_name = 'model-%s' % strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model.save_model(model_name)
print('model saved as %s' % model_name)

Training...




[1]	valid's auc: 0.966352
Training until validation scores don't improve for 50 rounds.
[2]	valid's auc: 0.965255
[3]	valid's auc: 0.965736
[4]	valid's auc: 0.967262
[5]	valid's auc: 0.968275
[6]	valid's auc: 0.968514
[7]	valid's auc: 0.969005
[8]	valid's auc: 0.969135
[9]	valid's auc: 0.969544
[10]	valid's auc: 0.969897
[11]	valid's auc: 0.971303
[12]	valid's auc: 0.970808
[13]	valid's auc: 0.972213
[14]	valid's auc: 0.972248
[15]	valid's auc: 0.972585
[16]	valid's auc: 0.973142
[17]	valid's auc: 0.972927
[18]	valid's auc: 0.973638
[19]	valid's auc: 0.973712
[20]	valid's auc: 0.973566
[21]	valid's auc: 0.97354
[22]	valid's auc: 0.974188
[23]	valid's auc: 0.974211
[24]	valid's auc: 0.974127
[25]	valid's auc: 0.975213
[26]	valid's auc: 0.975339
[27]	valid's auc: 0.975524
[28]	valid's auc: 0.975912
[29]	valid's auc: 0.976048
[30]	valid's auc: 0.976783
[31]	valid's auc: 0.977161
[32]	valid's auc: 0.977056
[33]	valid's auc: 0.977343
[34]	valid's auc: 0.977964
[35]	valid's auc: 0.978315
[36

[297]	valid's auc: 0.989073
[298]	valid's auc: 0.989077
[299]	valid's auc: 0.989073
[300]	valid's auc: 0.989086
[301]	valid's auc: 0.989088
[302]	valid's auc: 0.989092
[303]	valid's auc: 0.989092
[304]	valid's auc: 0.989096
[305]	valid's auc: 0.989102
[306]	valid's auc: 0.989102
[307]	valid's auc: 0.989111
[308]	valid's auc: 0.989113
[309]	valid's auc: 0.989125
[310]	valid's auc: 0.989129
[311]	valid's auc: 0.98913
[312]	valid's auc: 0.989132
[313]	valid's auc: 0.989128
[314]	valid's auc: 0.989138
[315]	valid's auc: 0.989141
[316]	valid's auc: 0.989145
[317]	valid's auc: 0.98915
[318]	valid's auc: 0.989162
[319]	valid's auc: 0.989167
[320]	valid's auc: 0.989169
[321]	valid's auc: 0.989171
[322]	valid's auc: 0.989168
[323]	valid's auc: 0.989171
[324]	valid's auc: 0.989172
[325]	valid's auc: 0.989175
[326]	valid's auc: 0.989175
[327]	valid's auc: 0.989182
[328]	valid's auc: 0.989182
[329]	valid's auc: 0.989193
[330]	valid's auc: 0.989196
[331]	valid's auc: 0.989205
[332]	valid's auc: 0.9

[591]	valid's auc: 0.989638
[592]	valid's auc: 0.989641
[593]	valid's auc: 0.989639
[594]	valid's auc: 0.989639
[595]	valid's auc: 0.98964
[596]	valid's auc: 0.989638
[597]	valid's auc: 0.989647
[598]	valid's auc: 0.989646
[599]	valid's auc: 0.989645
[600]	valid's auc: 0.989646
[601]	valid's auc: 0.989649
[602]	valid's auc: 0.989652
[603]	valid's auc: 0.989653
[604]	valid's auc: 0.98965
[605]	valid's auc: 0.989655
[606]	valid's auc: 0.989659
[607]	valid's auc: 0.989662
[608]	valid's auc: 0.989662
[609]	valid's auc: 0.989662
[610]	valid's auc: 0.989663
[611]	valid's auc: 0.989662
[612]	valid's auc: 0.989663
[613]	valid's auc: 0.989663
[614]	valid's auc: 0.989661
[615]	valid's auc: 0.989661
[616]	valid's auc: 0.989662
[617]	valid's auc: 0.989663
[618]	valid's auc: 0.989666
[619]	valid's auc: 0.989665
[620]	valid's auc: 0.989668
[621]	valid's auc: 0.989667
[622]	valid's auc: 0.989664
[623]	valid's auc: 0.989668
[624]	valid's auc: 0.989669
[625]	valid's auc: 0.989669
[626]	valid's auc: 0.9

[885]	valid's auc: 0.989849
[886]	valid's auc: 0.989848
[887]	valid's auc: 0.989852
[888]	valid's auc: 0.989852
[889]	valid's auc: 0.989847
[890]	valid's auc: 0.989848
[891]	valid's auc: 0.989848
[892]	valid's auc: 0.989849
[893]	valid's auc: 0.989848
[894]	valid's auc: 0.989847
[895]	valid's auc: 0.989849
[896]	valid's auc: 0.98985
[897]	valid's auc: 0.98985
[898]	valid's auc: 0.989845
[899]	valid's auc: 0.989845
[900]	valid's auc: 0.989844
[901]	valid's auc: 0.989843
[902]	valid's auc: 0.989841
[903]	valid's auc: 0.989836
[904]	valid's auc: 0.989837
[905]	valid's auc: 0.989838
[906]	valid's auc: 0.989836
[907]	valid's auc: 0.989837
[908]	valid's auc: 0.989834
[909]	valid's auc: 0.989834
[910]	valid's auc: 0.989831
[911]	valid's auc: 0.989831
[912]	valid's auc: 0.989832
[913]	valid's auc: 0.989832
[914]	valid's auc: 0.989834
[915]	valid's auc: 0.989835
[916]	valid's auc: 0.989836
[917]	valid's auc: 0.989837
[918]	valid's auc: 0.989838
[919]	valid's auc: 0.989841
[920]	valid's auc: 0.9

# LGB Prediction

In [16]:
# Score the test_supplement rows with the best iteration and store the
# predictions on test_df (test_features was sliced from the rows that
# test_df contributed to all_df, so the two are assumed to be row-aligned).
print('Predicting...')
test_df['is_attributed'] = model.predict(test_features[features], num_iteration=model.best_iteration)

Predicting...


In [17]:
# Read test.csv, the file whose click_id values go into the submission.
print('loading test')
test = pd.read_csv(
    'data/test.csv',
    usecols=test_cols,
    dtype=dtype,
    parse_dates=['click_time'],
)

loading test


In [18]:
# Transfer the predictions made on test_supplement onto the test.csv rows
# by joining on the raw click attributes, then write the submission file.
print('merging test_supplement to test')
join_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time']
all_cols = join_cols + ['is_attributed']

scored_clicks = test_df[all_cols]
test = test.merge(scored_clicks, on=join_cols, how='left')
del scored_clicks
# A test row can match several supplement rows with identical attributes;
# keep the first match so every click_id appears once.
test = test.drop_duplicates(subset=['click_id'])

print("Writing the submission data into a csv file...")
test[['click_id', 'is_attributed']].to_csv(
    'submit_lgb_875.gz',
    index=False,
    float_format='%.9f',
    compression='gzip',
)
print("All done...")

merging test_supplement to test
Writing the submission data into a csv file...
All done...


In [None]:
# Drop the merged submission frame and collect garbage before the XGBoost
# section; `test` is read from disk again before the XGBoost submission.
del test
gc.collect()

# Train XGBoost

In [24]:
# XGBoost hyper-parameters (histogram tree method, loss-guided growth).
# NOTE(review): a tree limited to max_depth=4 can have at most 2**4 = 16
# leaves, so max_leaves=1400 is presumably never the binding limit -- confirm
# which of the two is intended.
xgb_params = dict(
    eta=0.1,
    tree_method="hist",
    grow_policy="lossguide",
    max_leaves=1400,
    max_depth=4,
    subsample=0.9,
    colsample_bytree=0.7,
    colsample_bylevel=0.7,
    min_child_weight=0,
    alpha=0,
    objective='binary:logistic',
    eval_metric='auc',
    nthread=24,
    random_state=42,
    scale_pos_weight=200,
    silent=True,
)

In [None]:
# train valid split
# Use the first 95% of the training rows for fitting and the last 5% for
# validation (no shuffling).
# The features are rebuilt from the full training slice of all_df: by this
# point `train_features` has already been reduced by the LightGBM split, so
# taking 95% of it while taking 95% of the full-length `labels` would give
# feature and label arrays of different lengths.
labels = train_df.is_attributed.values
train_features, valid_features = train_test_split(all_df.iloc[:len(train_df)], train_size=.95, shuffle=False)
train_labels, valid_labels = train_test_split(labels, train_size=.95, shuffle=False)
# Features and labels must stay row-aligned.
assert len(train_features) == len(train_labels), 'train features/labels misaligned'
assert len(valid_features) == len(valid_labels), 'valid features/labels misaligned'
print('Train size:', len(train_features))
print('Valid size:', len(valid_features))
gc.collect()

In [25]:
# Wrap the train / validation splits as DMatrix objects; only the
# validation set is watched during training.
dtrain = xgb.DMatrix(train_features, label=train_labels)
dvalid = xgb.DMatrix(valid_features, label=valid_labels)
watchlist = [(dvalid, 'valid')]

In [None]:
# Train up to 2000 rounds, stopping early once validation AUC has not
# improved for 50 rounds; progress is logged every 5 rounds.
xgb_model = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    evals=watchlist,
    num_boost_round=2000,
    early_stopping_rounds=50,
    maximize=True,
    verbose_eval=5,
)

[0]	valid-auc:0.964034
Will train until valid-auc hasn't improved in 50 rounds.
[5]	valid-auc:0.971544
[10]	valid-auc:0.972256
[15]	valid-auc:0.973251
[20]	valid-auc:0.974015
[25]	valid-auc:0.975026
[30]	valid-auc:0.976547
[35]	valid-auc:0.97817
[40]	valid-auc:0.979116
[45]	valid-auc:0.980232
[50]	valid-auc:0.981004
[55]	valid-auc:0.981727
[60]	valid-auc:0.982595
[65]	valid-auc:0.98306
[70]	valid-auc:0.983537
[75]	valid-auc:0.984043
[80]	valid-auc:0.984369
[85]	valid-auc:0.984825
[90]	valid-auc:0.985054
[95]	valid-auc:0.985282
[100]	valid-auc:0.985492
[105]	valid-auc:0.985699
[110]	valid-auc:0.985984
[115]	valid-auc:0.986107
[120]	valid-auc:0.986308
[125]	valid-auc:0.986371
[130]	valid-auc:0.98647
[135]	valid-auc:0.986554
[140]	valid-auc:0.986716
[145]	valid-auc:0.98681
[150]	valid-auc:0.986853
[155]	valid-auc:0.986941
[160]	valid-auc:0.987068
[165]	valid-auc:0.987109
[170]	valid-auc:0.987232
[175]	valid-auc:0.987289
[180]	valid-auc:0.9876
[185]	valid-auc:0.987754
[190]	valid-auc:0.987

In [None]:
# Plot the trained booster's feature importances.
xgb.plot_importance(xgb_model)

In [None]:
import operator

# Rank features by their f-score, highest first; the sorted list of
# (feature, score) pairs is the cell's displayed value.
fscore_by_feature = xgb_model.get_fscore()
sorted(fscore_by_feature.items(), key=operator.itemgetter(1), reverse=True)

# XGB Prediction

In [None]:
# Save the booster under a UTC-timestamped file name.
saved_at = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model_name = 'xgb-model-%s' % saved_at
xgb_model.save_model(model_name)
print('model saved as %s' % model_name)

In [None]:
# Unlabelled DMatrix over the test_supplement feature rows, using all
# columns of test_features (the same columns the training DMatrix was
# built from).
dtest = xgb.DMatrix(test_features)

In [None]:
# Score the test_supplement rows, limiting the booster to its best number
# of trees, and overwrite test_df['is_attributed'] (which previously held
# the LightGBM predictions).
print('Predicting...')
test_df['is_attributed'] = xgb_model.predict(dtest, ntree_limit=xgb_model.best_ntree_limit)

In [None]:
# Re-read test.csv (the earlier copy was deleted after the LightGBM
# submission); its click_id values go into the submission.
print('loading test')
test = pd.read_csv(
    'data/test.csv',
    usecols=test_cols,
    dtype=dtype,
    parse_dates=['click_time'],
)

In [None]:
# Transfer the XGBoost predictions made on test_supplement onto the
# test.csv rows by joining on the raw click attributes, then write the
# submission file named after the number of trees used.
print('merging test_supplement to test')
join_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time']
all_cols = join_cols + ['is_attributed']

scored_clicks = test_df[all_cols]
test = test.merge(scored_clicks, on=join_cols, how='left')
del scored_clicks
# A test row can match several supplement rows with identical attributes;
# keep the first match so every click_id appears once.
test = test.drop_duplicates(subset=['click_id'])

print("Writing the submission data into a csv file...")
test[['click_id', 'is_attributed']].to_csv(
    'submit_xgb_%s.gz'%xgb_model.best_ntree_limit,
    index=False,
    float_format='%.9f',
    compression='gzip',
)
print("All done...")