In [1]:
import gc
import os

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Compact unsigned dtypes for the CSV columns; handed to read_csv so the
# frames are loaded with these dtypes.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
    click_id='uint32',
)

print('Loading train.csv...')
train_cols = ['ip', 'app', 'device', 'os', 'channel', 'is_attributed', 'click_time']
# Skip file rows 1..64903890 (row 0, the header line, is kept) and read at
# most the 120,000,000 rows that follow.
train_df = pd.read_csv(
    './data/train.csv',
    skiprows=range(1, 64903891),
    nrows=120000000,
    dtype=dtypes,
    usecols=train_cols,
    parse_dates=['click_time'],
)

print('Loading test_supplement.csv...')
test_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_id', 'click_time']
test_df = pd.read_csv(
    './data/test_supplement.csv',
    dtype=dtypes,
    usecols=test_cols,
    parse_dates=['click_time'],
)

print('Preprocessing...')

def add_counts(df, cols):
    """Add a column holding how often each row's combination of `cols` occurs.

    The new column is named ``"<col1>_<col2>_..._count"`` and is written into
    `df` in place as ``uint32``; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate. The columns in `cols` must hold non-negative
        integers, because their values are used as multi-dimensional indices.
    cols : list of str
        Columns whose joint value defines a group.
    """
    agg_name = "_".join(cols)+"_count"
    arr_slice = df[cols].values
    if arr_slice.shape[0] == 0:
        # Nothing to count; max() below would raise on an empty array.
        df[agg_name] = np.zeros(0, dtype='uint32')
        return
    # Widen before adding 1: with narrow unsigned columns (e.g. a uint8 column
    # containing 255) `max + 1` would wrap around to 0 and break the dims.
    dims = arr_slice.max(axis=0).astype(np.int64) + 1
    # Encode every row's key tuple as one flat integer, then count the
    # distinct codes and map the counts back onto the rows.
    _, unqtags, counts = np.unique(np.ravel_multi_index(arr_slice.T, dims),
                                   return_inverse=True, return_counts=True)
    df[agg_name] = counts[unqtags].astype('uint32')
    del arr_slice
    del unqtags
    del counts
    gc.collect()

def do_countuniq(df, group_cols, counted):
    """Return `df` with the number of distinct `counted` values per group added.

    The new ``uint32`` column is named ``"<group cols joined by _>_by_<counted>_countuniq"``.
    The result comes from a left merge on `group_cols`, so it is a new frame;
    the input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    group_cols : list of str
        Columns that define a group.
    counted : str
        Column whose distinct values are counted inside each group.
    """
    feature = '{groups}_by_{target}_countuniq'.format(groups='_'.join(group_cols), target=counted)
    uniq_per_group = (
        df[group_cols + [counted]]
        .groupby(group_cols)[counted]
        .nunique()
        .reset_index()
        .rename(columns={counted: feature})
    )
    merged = df.merge(uniq_per_group, how='left', on=group_cols)
    del uniq_per_group
    merged[feature] = merged[feature].astype('uint32')
    gc.collect()
    return merged

def add_next_click(df):
    """Add, in place, the gap in seconds to the next row of the same group.

    Three ``float32`` columns are written into `df`:

    * ``next_click``  - grouped by ip, app, device, os, channel
    * ``next_click2`` - grouped by ip, app, device, os
    * ``next_click3`` - grouped by ip, device, os

    The last row of each group gets NaN. ``shift(-1)`` takes the following row
    in the frame's current order, so the values are only "time until the next
    click" when the rows are already in chronological order; that is not
    enforced here.

    Side effect: ``click_time`` is round-tripped through whole epoch seconds,
    so any sub-second part of the timestamps is dropped. Returns None.
    """
    nanos_per_second = 10 ** 9
    gap_features = (
        ('next_click', ['ip', 'app', 'device', 'os', 'channel']),
        ('next_click2', ['ip', 'app', 'device', 'os']),
        ('next_click3', ['ip', 'device', 'os']),
    )

    # Work on integer epoch seconds while the gaps are computed.
    epoch_seconds = df['click_time'].astype('int64') // nanos_per_second
    df['click_time'] = epoch_seconds.astype('int32')

    for feature, keys in gap_features:
        following = df.groupby(keys)['click_time'].shift(-1)
        df[feature] = (following - df['click_time']).astype('float32')

    # Restore the column to timestamps for the callers.
    df['click_time'] = pd.to_datetime(df['click_time'].astype('int64') * nanos_per_second)
    gc.collect()

def preproc_data(df):
    """Build the engineered feature columns and return the resulting frame.

    Steps, in order:

    1. ``hour`` (uint8) from ``click_time``.
    2. The ``next_click*`` gap features via `add_next_click`.
    3. Distinct-value counts via `do_countuniq`.
    4. Group-size counts via `add_counts`.

    Steps 1-2 write into the passed frame in place, while step 3 rebinds `df`
    to the merged frames returned by `do_countuniq`. Callers must therefore
    use the returned frame and should expect the input frame to have gained
    the columns from steps 1-2.

    A ``df.info()`` report is printed after each stage as a progress log.
    """
    df['hour'] = df['click_time'].dt.hour.astype('uint8')
    gc.collect()

    print('Adding next_click features...')
    add_next_click(df)
    # info() prints its report itself and returns None, so wrapping it in
    # print() only appended a stray "None" line to the log.
    df.info()

    print('Adding counts features...')

    # (group columns, counted column) pairs for the distinct-count features.
    uniq_specs = (
        (['ip'], 'channel'),
        (['ip'], 'app'),
        (['ip', 'device', 'os'], 'app'),
    )
    for group_cols, counted in uniq_specs:
        df = do_countuniq(df, group_cols, counted)
    df.info()

    # Group-size features, batched so that a report is logged after each batch.
    count_batches = (
        (['ip'], ['os', 'app', 'channel']),
        (['ip', 'hour'], ['ip', 'os', 'hour'], ['ip', 'device']),
        (['ip', 'app', 'os'], ['hour', 'app'], ['channel', 'app']),
    )
    for batch in count_batches:
        for cols in batch:
            add_counts(df, cols)
        df.info()

    return df

# Labels are taken before the frames are stacked; they stay aligned with the
# training rows by position.
y = train_df.is_attributed.values

# Stack train and test so the features are computed over the combined click
# stream, then split the result apart again by position.
train_len = len(train_df)
common_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time']
train_df = pd.concat([train_df[common_cols], test_df[common_cols]])

train_df = preproc_data(train_df)

test_df = train_df.iloc[train_len:]
train_df = train_df.iloc[:train_len]

gc.collect()

lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.15,
    'num_leaves': 24,
    'max_depth': 4,
    'min_child_samples': 80,
    'max_bin': 100,
    'subsample': 0.65,
    'subsample_freq': 1,
    'colsample_bytree': 0.65,
    'min_child_weight': 0,
    'min_split_gain': 0,
    'nthread': 4,
    'verbose': 1,
    'scale_pos_weight': 50
}

target = 'is_attributed'

# Keep the frame's own column order. Building the list through a set made the
# feature order depend on set iteration order, which can differ between
# interpreter runs and so made the trained model hard to reproduce.
excluded_cols = {target, 'ip', 'click_time'}
inputs = [col for col in train_df.columns if col not in excluded_cols]
cat_vars = ['app', 'device', 'os', 'channel', 'hour']

# Unshuffled split: the last 2% of the training rows become the validation
# set. Splitting frame and labels in one call keeps them aligned.
train_df, val_df, y_train, y_val = train_test_split(train_df, y, train_size=0.98, shuffle=False)

print('Train size:', len(train_df))
print('Valid size:', len(val_df))

gc.collect()

# Create the output directory before the long training run so that a missing
# folder cannot make savefig / to_csv fail only at the very end.
os.makedirs('./results', exist_ok=True)

print('Training...')

xgtrain = lgb.Dataset(train_df[inputs].values, label=y_train, feature_name=inputs, categorical_feature=cat_vars)
del train_df
gc.collect()

xgvalid = lgb.Dataset(val_df[inputs].values, label=y_val, feature_name=inputs, categorical_feature=cat_vars)
del val_df
gc.collect()

eval_result = {}

# Learning-rate schedule: 0.15 up to and including round 250, 0.05 afterwards.
model = lgb.train(lgb_params,
          xgtrain,
          valid_sets= [xgtrain, xgvalid],
          valid_names=['train', 'valid'],
          num_boost_round=1000,
          early_stopping_rounds=100,
          evals_result=eval_result,
          learning_rates=lambda round_idx: 0.15 if round_idx <= 250 else 0.05,
          verbose_eval=10)

print('Plotting feature importances...')
lgb.plot_importance(model)
plt.savefig('./results/feature_importance.png', dpi=600, bbox_inches="tight")
plt.close()

print('Plotting auc curve...')
lgb.plot_metric(eval_result, metric='auc')
plt.savefig('./results/auc_curve.png', dpi=600, bbox_inches="tight")
plt.close()

print('Plotting tree...')
lgb.plot_tree(model)
plt.savefig('./results/tree.png', dpi=1200, bbox_inches="tight")
plt.close()

del xgtrain
del xgvalid
gc.collect()

print('Predicting...')
# `test_df` is an iloc slice of the combined frame; assigning a new column onto
# it in place can raise pandas' SettingWithCopyWarning, so build a new frame.
test_df = test_df.assign(is_attributed=model.predict(test_df[inputs]))

print('Projecting prediction onto test.csv...')
test = pd.read_csv("./data/test.csv", dtype=dtypes, usecols=test_cols, parse_dates=['click_time'])

join_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time']
all_cols = join_cols + ['is_attributed']

# The join key is not unique, so the left merge can yield several rows per
# click_id; keep the first one for each.
test = test.merge(test_df[all_cols], how='left', on=join_cols)
test = test.drop_duplicates(subset=['click_id'])

# Rows without a match end up as NaN in the submission file; make that visible.
n_unmatched = int(test['is_attributed'].isna().sum())
if n_unmatched:
    print('Warning: {} test rows have no matching prediction'.format(n_unmatched))

print('Creating output file...')
test[['click_id', 'is_attributed']].to_csv('./results/sokazaki.csv', index=False, float_format='%.9f')
print('Done!')

# public LB: 0.9808138, private LB: 0.9816301

Loading train.csv...
Loading test_supplement.csv...
Preprocessing...
Adding next_click features...
<class 'pandas.core.frame.DataFrame'>
Int64Index: 177537505 entries, 0 to 57537504
Data columns (total 10 columns):
ip             uint32
app            uint16
device         uint16
os             uint16
channel        uint16
click_time     datetime64[ns]
hour           uint8
next_click     float32
next_click2    float32
next_click3    float32
dtypes: datetime64[ns](1), float32(3), uint16(4), uint32(1), uint8(1)
memory usage: 6.8 GB
None
Adding counts features...
<class 'pandas.core.frame.DataFrame'>
Int64Index: 177537505 entries, 0 to 177537504
Data columns (total 13 columns):
ip                               uint32
app                              uint16
device                           uint16
os                               uint16
channel                          uint16
click_time                       datetime64[ns]
hour                             uint8
next_click                   



Train size: 117600000
Valid size: 2400000
Training...




Training until validation scores don't improve for 100 rounds.
[10]	train's auc: 0.970205	valid's auc: 0.977943
[20]	train's auc: 0.976484	valid's auc: 0.981184
[30]	train's auc: 0.979289	valid's auc: 0.984237
[40]	train's auc: 0.980937	valid's auc: 0.985851
[50]	train's auc: 0.982034	valid's auc: 0.987108
[60]	train's auc: 0.982933	valid's auc: 0.988398
[70]	train's auc: 0.983406	valid's auc: 0.989111
[80]	train's auc: 0.983838	valid's auc: 0.989528
[90]	train's auc: 0.984143	valid's auc: 0.989786
[100]	train's auc: 0.984372	valid's auc: 0.990028
[110]	train's auc: 0.98457	valid's auc: 0.990104
[120]	train's auc: 0.984703	valid's auc: 0.990188
[130]	train's auc: 0.984848	valid's auc: 0.990246
[140]	train's auc: 0.984946	valid's auc: 0.990352
[150]	train's auc: 0.98505	valid's auc: 0.990394
[160]	train's auc: 0.98514	valid's auc: 0.990364
[170]	train's auc: 0.985225	valid's auc: 0.990408
[180]	train's auc: 0.985294	valid's auc: 0.990445
[190]	train's auc: 0.98537	valid's auc: 0.990525
