In [10]:
import os
import gc
import time
from time import gmtime, strftime

from sklearn.model_selection import train_test_split
import lightgbm as lgb
import numpy as np
import pandas as pd

# Features

In [2]:
def count_agg(df, group_cols):
    """Attach the size of each row's group as a new column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in ``group_cols``.
    group_cols : list of str
        Columns whose value combinations define the groups.

    Returns
    -------
    pandas.DataFrame
        The result of left-merging the group sizes back onto ``df``. The new
        column is named ``"<col1>_<col2>_..._count"``. The input frame is not
        modified.
    """
    feature_name = "_".join(group_cols) + '_count'
    group_sizes = df.groupby(group_cols).size().rename(feature_name).reset_index()
    merged = df.merge(group_sizes, how='left', on=group_cols)
    # Drop the lookup table right away and ask the collector to reclaim it.
    del group_sizes
    gc.collect()
    return merged

def count_cum(df, group_cols):
    """Number each row within its group, starting at 0, in row order.

    The new column ``"<col1>_<col2>_..._countAccum"`` is written into ``df``
    itself, and that same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in ``group_cols``.
    group_cols : list of str
        Columns whose value combinations define the groups.
    """
    feature_name = '%s_countAccum' % "_".join(group_cols)
    position_in_group = df.groupby(group_cols).cumcount()
    df[feature_name] = position_in_group
    gc.collect()
    return df

def count_uniq(df, group_cols, uniq_col):
    """Attach, per group, the number of distinct values of ``uniq_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``uniq_col`` and every column in ``group_cols``.
    group_cols : list of str
        Columns whose value combinations define the groups.
    uniq_col : str
        Column whose distinct values are counted inside each group.

    Returns
    -------
    pandas.DataFrame
        The result of left-merging the counts back onto ``df``. The new column
        is named ``"<group cols>_uniq_<uniq_col>_countUniq"``.
    """
    feature_name = '{}_uniq_{}_countUniq'.format("_".join(group_cols), uniq_col)
    distinct_counts = df.groupby(group_cols)[uniq_col].nunique()
    distinct_counts = distinct_counts.reset_index(name=feature_name)
    merged = df.merge(distinct_counts, how='left', on=group_cols)
    # Drop the lookup table right away and ask the collector to reclaim it.
    del distinct_counts
    gc.collect()
    return merged

def next_click(df, group_cols):
    """Add the gap between each row's ``click_time`` and the next row of its group.

    For every row the value is ``click_time`` of the following row in the same
    group minus the row's own ``click_time``, stored as float32. The last row
    of each group has no successor and gets NaN. The column
    ``"<col1>_<col2>_..._nextClick"`` is written into ``df`` itself, which is
    also returned.

    NOTE(review): "following row" means row order inside the frame; this is a
    time-to-next-click only if rows are already ordered by click_time —
    confirm against the caller.
    """
    feature_name = "_".join(group_cols) + '_nextClick'
    following_time = df.groupby(group_cols)['click_time'].shift(-1)
    gap = following_time - df['click_time']
    df[feature_name] = gap.astype(np.float32)
    gc.collect()
    return df

In [3]:
def generate_features(df):
    """Turn raw click records into the feature matrix used for modelling.

    Steps, in order:
      1. derive ``day``, ``hour`` and the ``in_test_hh`` hour bucket from
         ``click_time``;
      2. add group-size features through ``count_agg``;
      3. add per-``ip`` distinct-value counts through ``count_uniq``;
      4. convert ``click_time`` to whole epoch seconds and add
         gap-to-next-row features through ``next_click``;
      5. drop the helper columns ``ip``, ``click_time``, ``day`` and
         ``in_test_hh``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``ip``, ``app``, ``device``, ``os``, ``channel`` and a
        datetime ``click_time`` column. The ``day``/``hour``/``in_test_hh``
        columns are written into the passed-in frame before the first merge
        creates a new one.

    Returns
    -------
    pandas.DataFrame
        The feature frame. Row order follows the input because every merge
        is a left merge.
    """
    print('generating time features...')
    df['day'] = df['click_time'].dt.day.astype('uint8')
    df['hour'] = df['click_time'].dt.hour.astype('uint8')
    # Hour bucket: 1 for hours in the first list, 2 for hours in the second
    # list, 3 for every other hour.
    df['in_test_hh'] = (3
                        - 2 * df['hour'].isin([4, 5, 9, 10, 13, 14]) # most frequent
                        - 1 * df['hour'].isin([6, 11, 15])).astype('uint8') # least frequent
    print('done')
    gc.collect()

    # Trailing numbers are the original author's annotations
    # (presumably feature-importance scores — TODO confirm).
    count_combinations = [
        ['app'],
        ['ip', 'device'], # 9.88
        ['day', 'hour', 'app'], # 4.08
        ['ip'], # 3.03
        ['app', 'channel'], # 2.8
        ['ip', 'day', 'in_test_hh'], # 1.74
        ['os', 'app', 'channel'], # 0.72
        ['ip', 'day', 'hour'], # 0.52
        ['os', 'device'], # 0.44
        ['ip', 'os', 'day', 'hour'], # 0.41
        ['ip', 'app', 'day', 'hour'], # 0.28
        ['ip', 'device', 'day', 'hour'], # 0.31
        ['ip', 'app', 'os'] # 0.21
    ]

    # Each entry is [group columns, column whose distinct values are counted].
    countUniq_combinations = [
        [['ip'], 'channel'], # 0.9
        [['ip'], 'app'], # 1.3
        [['ip'], 'os'] # 0.45
    ]

    # NOTE(review): the first and last entries group by the same set of
    # columns (only the order differs), so they produce two columns with
    # identical values under different names. Kept as-is so the feature set
    # does not change.
    nextClick_combinations = [
        ['ip', 'app', 'device', 'os'],
        ['ip', 'app', 'device', 'os', 'channel'],
        ['ip', 'os', 'device'],
        ['ip', 'os', 'device', 'app']
    ]

    # count features
    for i, cols in enumerate(count_combinations):
        print(i, cols)
        df = count_agg(df, cols)

    # count unique features
    for i, cols in enumerate(countUniq_combinations):
        print(i, cols)
        df = count_uniq(df, cols[0], cols[1])

    # next click features
    # Datetimes become integer nanoseconds; floor-dividing by 10**9 yields
    # whole seconds, which are then stored as int32.
    df['click_time'] = (df['click_time'].astype(np.int64) // 10 ** 9).astype(np.int32)
    for i, cols in enumerate(nextClick_combinations):
        print(i, cols)
        df = next_click(df, cols)

    # These columns were only needed to build the features above.
    df.drop(['ip', 'click_time', 'day', 'in_test_hh'], axis=1, inplace=True)
    gc.collect()
    # DataFrame.info() writes its report itself and returns None, so it must
    # not be wrapped in print() (that would emit an extra "None" line).
    df.info()
    return df

# Load Data

In [4]:
# Explicit narrow integer types for the id-like columns.
dtype = {
    'ip': 'uint32',
    'app': 'uint16',
    'device': 'uint16',
    'os': 'uint16',
    'channel': 'uint16',
    'is_attributed': 'uint8',
    'click_id': 'uint32',
}

# train: (184903890, 7)
# test: (18790469, 7)
train_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed']
# Line 0 of the file is the header. Skipping lines 1..84903890 and then
# reading 100,000,000 rows loads the last 100M rows of the 184,903,890-row file.
train_df = pd.read_csv(
    'data/train.csv',
    usecols=train_cols,
    dtype=dtype,
    parse_dates=['click_time'],
    skiprows=range(1, 84903891),
    nrows=100000000,
)

test_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'click_id']
# using test_supplement
test_df = pd.read_csv(
    'data/test_supplement.csv',
    usecols=test_cols,
    dtype=dtype,
    parse_dates=['click_time'],
)

In [5]:
# combine train and test data
# Train rows are stacked first and test rows second; the later positional
# split at len(train_df) relies on this order.
common_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time']
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# both inputs contribute their own 0-based labels, so every label in the
# shorter frame would appear twice.
all_df = pd.concat([train_df[common_cols], test_df[common_cols]], ignore_index=True)

In [6]:
# generate data
# NOTE: generate_features reads 'click_time' and drops it (together with 'ip')
# from the frame it returns, so this cell cannot be re-run on its own output.
# Rebuild all_df from train_df/test_df before executing it a second time.
all_df = generate_features(all_df)

generating time features...
done
0 ['app']
1 ['ip', 'device']
2 ['day', 'hour', 'app']
3 ['ip']
4 ['app', 'channel']
5 ['ip', 'day', 'in_test_hh']
6 ['os', 'app', 'channel']
7 ['ip', 'day', 'hour']
8 ['os', 'device']
9 ['ip', 'os', 'day', 'hour']
10 ['ip', 'app', 'day', 'hour']
11 ['ip', 'device', 'day', 'hour']
12 ['ip', 'app', 'os']
0 [['ip'], 'channel']
1 [['ip'], 'app']
2 [['ip'], 'os']
0 ['ip', 'app', 'device', 'os']
1 ['ip', 'app', 'device', 'os', 'channel']
2 ['ip', 'os', 'device']
3 ['ip', 'os', 'device', 'app']
<class 'pandas.core.frame.DataFrame'>
Int64Index: 157537505 entries, 0 to 157537504
Data columns (total 25 columns):
app                                   uint16
device                                uint16
os                                    uint16
channel                               uint16
hour                                  uint8
app_count                             int64
ip_device_count                       int64
day_hour_app_count                    int64
i

In [14]:
# Split the concatenated feature frame back into its train and test parts.
# The frame was built train-first, so a positional cut at len(train_df)
# separates the two.
train_features = all_df.iloc[:len(train_df)]
test_features = all_df.iloc[len(train_df):]
gc.collect()

0

# Train LightGBM Model

In [16]:
metrics = 'auc'
lgb_params = {
    # task
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': metrics,
    # tree shape and learning rate
    'learning_rate': 0.1,
    'num_leaves': 7,
    'max_depth': 4,
    'min_child_samples': 100,
    'max_bin': 100,
    # row / column subsampling
    'subsample': 0.7,
    'subsample_freq': 1,
    'colsample_bytree': 0.7,
    # split constraints
    'min_child_weight': 0,
    'min_split_gain': 0,
    # runtime
    'nthread': 4,
    'verbose': 1,
    # weight applied to the positive class
    'scale_pos_weight': 100,
}

target = 'is_attributed'
# Every column of the feature frame except the label (if it is present).
features = [name for name in train_features.columns if name != target]
category = ['app', 'device', 'os', 'channel', 'hour']

In [15]:
# train valid split
# Always split from the full training slice of all_df rather than from
# train_features itself. Rebinding train_features to a 95% slice of itself
# made this cell non-idempotent: a second run would shrink the features again
# while the labels were still cut from all of train_df, leaving the two with
# different lengths.
labels = train_df.is_attributed.values
# One call splits features and labels together, so they stay row-aligned.
# shuffle=False keeps file order: the last 5% of rows become validation.
train_features, valid_features, train_labels, valid_labels = train_test_split(
    all_df.iloc[:len(train_df)], labels, train_size=.95, shuffle=False)
print('Train size:', len(train_features))
print('Valid size:', len(valid_features))
gc.collect()



Train size: 95000000
Valid size: 5000000


12

In [17]:
# convert data into dataset
# The data is handed over as a bare NumPy matrix, so the column names and the
# list of categorical columns are passed explicitly.
xgtrain = lgb.Dataset(
    data=train_features[features].values,
    label=train_labels,
    categorical_feature=category,
    feature_name=features,
)

13

In [18]:
# Validation dataset, built the same way as the training dataset.
xgvalid = lgb.Dataset(
    data=valid_features[features].values,
    label=valid_labels,
    categorical_feature=category,
    feature_name=features,
)

22

In [None]:
print('Training...')
evals_results = {}
# Train for at most 1000 rounds; stop once the validation metric has not
# improved for 50 consecutive rounds.
model = lgb.train(lgb_params,
                  xgtrain,
                  valid_sets=[xgvalid],
                  valid_names=['valid'],
                  evals_result=evals_results,
                  num_boost_round=1000,
                  early_stopping_rounds=50,
                  verbose_eval=1,
                  feval=None)
# best_iteration may be left at a non-positive value when early stopping
# never triggers (version dependent). Fall back to the number of rounds
# actually trained so the report below shows a real round and indexes the
# matching metric entry instead of landing on [-1] by accident.
best_round = model.best_iteration
n_estimators = best_round if best_round and best_round > 0 else model.current_iteration()

print('\nModel Info:')
print('n_estimators:', n_estimators)
print(metrics + ':', evals_results['valid'][metrics][n_estimators - 1])

# Feature importance table: split counts plus gain as a percentage of the
# total gain, sorted by gain.
gain = model.feature_importance('gain')
ft = pd.DataFrame({'feature': model.feature_name(), 'split': model.feature_importance('split'),
                   'gain': 100 * gain / gain.sum()}).sort_values('gain', ascending=False)
ft.to_csv('feature_importance_ref.csv', index=False)
print(ft)

# The model file name carries a UTC timestamp.
model_name = 'model-%s' % strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model.save_model(model_name)
print('model saved as %s' % model_name)

Training...




[1]	valid's auc: 0.96549
Training until validation scores don't improve for 50 rounds.
[2]	valid's auc: 0.967828
[3]	valid's auc: 0.968225
[4]	valid's auc: 0.968619
[5]	valid's auc: 0.968648
[6]	valid's auc: 0.96881
[7]	valid's auc: 0.969037
[8]	valid's auc: 0.970435
[9]	valid's auc: 0.970981
[10]	valid's auc: 0.971074
[11]	valid's auc: 0.972817
[12]	valid's auc: 0.974043
[13]	valid's auc: 0.974107
[14]	valid's auc: 0.975028
[15]	valid's auc: 0.975738
[16]	valid's auc: 0.975824
[17]	valid's auc: 0.9754
[18]	valid's auc: 0.976584
[19]	valid's auc: 0.975984
[20]	valid's auc: 0.975938
[21]	valid's auc: 0.975698
[22]	valid's auc: 0.975901
[23]	valid's auc: 0.976262
[24]	valid's auc: 0.97629
[25]	valid's auc: 0.97638
[26]	valid's auc: 0.976402
[27]	valid's auc: 0.976581
[28]	valid's auc: 0.976882
[29]	valid's auc: 0.977113
[30]	valid's auc: 0.977184
[31]	valid's auc: 0.97735
[32]	valid's auc: 0.977634
[33]	valid's auc: 0.978189
[34]	valid's auc: 0.97843
[35]	valid's auc: 0.978825
[36]	valid

[297]	valid's auc: 0.988954
[298]	valid's auc: 0.988951
[299]	valid's auc: 0.988954
[300]	valid's auc: 0.988963
[301]	valid's auc: 0.988956
[302]	valid's auc: 0.988963
[303]	valid's auc: 0.988967
[304]	valid's auc: 0.988967
[305]	valid's auc: 0.988975
[306]	valid's auc: 0.988982
[307]	valid's auc: 0.988983
[308]	valid's auc: 0.988985
[309]	valid's auc: 0.988986
[310]	valid's auc: 0.988985
[311]	valid's auc: 0.988983
[312]	valid's auc: 0.988985
[313]	valid's auc: 0.988985
[314]	valid's auc: 0.988993
[315]	valid's auc: 0.988995
[316]	valid's auc: 0.988995
[317]	valid's auc: 0.989007
[318]	valid's auc: 0.989003
[319]	valid's auc: 0.989004
[320]	valid's auc: 0.989003
[321]	valid's auc: 0.989002
[322]	valid's auc: 0.989008
[323]	valid's auc: 0.989013
[324]	valid's auc: 0.98901
[325]	valid's auc: 0.989022
[326]	valid's auc: 0.98903
[327]	valid's auc: 0.989031
[328]	valid's auc: 0.989034
[329]	valid's auc: 0.989034
[330]	valid's auc: 0.989042
[331]	valid's auc: 0.98904
[332]	valid's auc: 0.98

[591]	valid's auc: 0.989351
[592]	valid's auc: 0.989358
[593]	valid's auc: 0.98938
[594]	valid's auc: 0.989378
[595]	valid's auc: 0.989377
[596]	valid's auc: 0.989378
[597]	valid's auc: 0.989378
[598]	valid's auc: 0.98938
[599]	valid's auc: 0.989381
[600]	valid's auc: 0.98938
[601]	valid's auc: 0.989386
[602]	valid's auc: 0.989388
[603]	valid's auc: 0.989385
[604]	valid's auc: 0.989388
[605]	valid's auc: 0.989387
[606]	valid's auc: 0.989386
[607]	valid's auc: 0.989388
[608]	valid's auc: 0.989389
[609]	valid's auc: 0.989392
[610]	valid's auc: 0.989391
[611]	valid's auc: 0.989391
[612]	valid's auc: 0.989393
[613]	valid's auc: 0.989395
[614]	valid's auc: 0.989397
[615]	valid's auc: 0.9894
[616]	valid's auc: 0.989397
[617]	valid's auc: 0.9894
[618]	valid's auc: 0.989403
[619]	valid's auc: 0.989403
[620]	valid's auc: 0.989404
[621]	valid's auc: 0.989405
[622]	valid's auc: 0.989406
[623]	valid's auc: 0.989402
[624]	valid's auc: 0.989404
[625]	valid's auc: 0.989404
[626]	valid's auc: 0.9894
[

# Prediction

In [23]:
print('Predicting...')
# Score the test_supplement feature rows using the trees up to the model's
# best iteration and store the scores on test_df.
# NOTE(review): the assignment is positional, so it assumes test_features rows
# are still in test_df row order (they were sliced from the train+test
# concatenation) — confirm if the feature pipeline ever reorders rows.
test_df['is_attributed'] = model.predict(test_features[features], num_iteration=model.best_iteration)

Predicting...


In [24]:
print('loading test')
# Read test.csv with the same column list and dtypes used for test_supplement.
test = pd.read_csv(
    'data/test.csv',
    usecols=test_cols,
    dtype=dtype,
    parse_dates=['click_time'],
)

loading test


In [25]:
print('merging test_supplement to test')
join_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time']
all_cols = join_cols + ['is_attributed']

# Look up each test row's score among the scored test_supplement rows. One
# key can match several supplement rows, so keep only the first resulting
# row per click_id.
test = (
    test
    .merge(test_df[all_cols], how='left', on=join_cols)
    .drop_duplicates(subset=['click_id'])
)

print("Writing the submission data into a csv file...")
test[['click_id', 'is_attributed']].to_csv(
    'submit_lgb.gz',
    index=False,
    float_format='%.9f',
    compression='gzip',
)
print("All done...")

merging test_supplement to test
Writing the submission data into a csv file...
All done...


In [None]:
#model = lgb.Booster(model_file='model-2018-05-03-09-35-27')
#submit['is_attributed'] = model.predict(test_df[features], num_iteration=616)

In [26]:
# The LightGBM datasets are no longer needed; release them before moving on.
del xgtrain, xgvalid
gc.collect()

35

# Train XGBoost

In [27]:
xgb_params = {
    # learning rate and tree construction
    'eta': 0.1,
    'tree_method': "hist",
    'grow_policy': "lossguide",
    # 'max_leaves': 1400,
    'max_depth': 5,
    # row / column subsampling
    'subsample': 0.9,
    'colsample_bytree': 0.7,
    'colsample_bylevel': 0.7,
    # regularisation
    'min_child_weight': 0,
    'alpha': 4,
    # objective and evaluation
    'objective': 'binary:logistic',
    'scale_pos_weight': 9,
    'eval_metric': 'auc',
    # runtime
    'nthread': 24,
    'random_state': 42,
    'silent': True,
}

In [28]:
# XGBoost can take a pandas DataFrame directly; it only needs to know which
# columns are categorical, which is signalled through the 'category' dtype.
#
# Both frames must share ONE category list per column. A plain
# astype('category') infers the categories separately from each frame, so the
# same raw value could receive different integer codes in train and valid.
for col in ['app', 'device', 'os', 'channel', 'hour']:
    # Sorted union of the values seen in either frame.
    observed_values = np.union1d(train_features[col].values, valid_features[col].values)
    shared_dtype = pd.api.types.CategoricalDtype(categories=observed_values)
    train_features[col] = train_features[col].astype(shared_dtype)
    valid_features[col] = valid_features[col].astype(shared_dtype)

NameError: name 'train_features' is not defined

In [None]:
# NOTE(review): `xgb` is not imported in any visible cell; add
# `import xgboost as xgb` to the import cell at the top of the notebook.
#
# The frames carry 'category' columns, which DMatrix only accepts when
# enable_categorical=True is passed.
dtrain = xgb.DMatrix(train_features, label=train_labels, enable_categorical=True)
dvalid = xgb.DMatrix(valid_features, label=valid_labels, enable_categorical=True)
# Evaluation sets reported (and used for early stopping) during training.
watchlist = [(dvalid, 'valid')]

In [None]:
# Train for at most 1000 rounds, reporting every 5 rounds; stop when the
# validation metric has not improved for 25 rounds. maximize=True because a
# higher AUC is better.
xgb_model = xgb.train(xgb_params,
                      dtrain,
                      num_boost_round=1000,
                      evals=watchlist,
                      maximize=True,
                      early_stopping_rounds=25,
                      verbose_eval=5)

# Plot the importances of the XGBoost booster trained above. The original call
# used a bare, never-imported `plot_importance` and passed `model`, which is
# the LightGBM booster.
xgb.plot_importance(xgb_model)

In [None]:
print('Predicting...')
# Booster.predict only accepts a DMatrix, so the test frame has to be wrapped
# first. Casting to the training frame's dtypes makes any categorical column
# use the training category list, so test values get the same integer codes
# the model was trained on. Values unseen in training become missing.
test_matrix = xgb.DMatrix(
    test_features[features].astype(train_features[features].dtypes.to_dict()),
    enable_categorical=True,
)
test_df['is_attributed'] = xgb_model.predict(test_matrix, ntree_limit=xgb_model.best_ntree_limit)

In [None]:
print('loading test')
# Reload test.csv; the previous `test` frame still holds the LightGBM scores.
test = pd.read_csv(
    'data/test.csv',
    usecols=test_cols,
    dtype=dtype,
    parse_dates=['click_time'],
)

In [None]:
print('merging test_supplement to test')
join_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time']
all_cols = join_cols + ['is_attributed']

# Look up each test row's score among the scored test_supplement rows. One
# key can match several supplement rows, so keep only the first resulting
# row per click_id.
test = (
    test
    .merge(test_df[all_cols], how='left', on=join_cols)
    .drop_duplicates(subset=['click_id'])
)

print("Writing the submission data into a csv file...")
test[['click_id', 'is_attributed']].to_csv(
    'submit_xgb.gz',
    index=False,
    float_format='%.9f',
    compression='gzip',
)
print("All done...")