In [1]:
import os
import gc
import time
from time import gmtime, strftime

from sklearn.model_selection import train_test_split
import lightgbm as lgb
import xgboost as xgb
import numpy as np
import pandas as pd
%matplotlib inline

# Features

In [2]:
def group_label(df, group_cols):
    """Attach an integer id column identifying each distinct ``group_cols`` combination.

    The id is the 0-based row *position* of the first row of ``df`` in which the
    combination occurs. It is stored in a new column named by joining
    ``group_cols`` with ``_``.

    Row positions are used instead of index labels so that ids stay unique even
    when ``df`` carries repeated index labels (for example a ``pd.concat`` result
    built without ``ignore_index=True``). For a frame with a default RangeIndex
    the ids are the same as the index-label based ones.

    Args:
        df: DataFrame containing every column named in ``group_cols``.
        group_cols: list of column names that together define a group.

    Returns:
        A new DataFrame: ``df`` left-merged with the id table on ``group_cols``.
    """
    col_name = "_".join(group_cols)
    # Discard the existing index so the reset_index() below exposes row positions.
    keys = df[group_cols].reset_index(drop=True)
    group_idx = keys.drop_duplicates(group_cols).reset_index()
    group_idx.rename(columns={'index': col_name}, inplace=True)
    del keys
    df = df.merge(group_idx, on=group_cols, how='left')
    del group_idx
    gc.collect()
    return df
    
def count_agg(df, group_cols):
    """Add a ``<cols>_count`` column holding the number of rows per ``group_cols`` key.

    Args:
        df: DataFrame containing every column named in ``group_cols``.
        group_cols: list of column names to group by.

    Returns:
        A new DataFrame: ``df`` left-merged with the per-group row counts.
    """
    sizes = (
        df.groupby(group_cols)
          .size()
          .reset_index(name="_".join(group_cols) + '_count')
    )
    merged = df.merge(sizes, how='left', on=group_cols)
    del sizes
    gc.collect()
    return merged

def count_cum(df, group_cols):
    """Add a ``<cols>_countAccum`` column: 0-based running count of rows within each group.

    The column is written onto ``df`` in place and the same object is returned.
    NOTE: ``count_cum`` is defined a second time further down this cell; that
    later definition is the one in effect once the cell has run.
    """
    running = df.groupby(group_cols).cumcount()
    df["_".join(group_cols) + '_countAccum'] = running
    gc.collect()
    return df

def count_uniq(df, group_cols, uniq_col):
    """Add a column with the number of distinct ``uniq_col`` values per ``group_cols`` key.

    The new column is named ``<cols>_uniq_<uniq_col>_countUniq``.

    Args:
        df: DataFrame containing ``group_cols`` and ``uniq_col``.
        group_cols: list of column names to group by.
        uniq_col: name of the column whose distinct values are counted.

    Returns:
        A new DataFrame: ``df`` left-merged with the per-group distinct counts.
    """
    feature = '{}_uniq_{}_countUniq'.format("_".join(group_cols), uniq_col)
    distinct = df.groupby(group_cols)[uniq_col].nunique()
    distinct = distinct.reset_index(name=feature)
    merged = df.merge(distinct, how='left', on=group_cols)
    del distinct
    gc.collect()
    return merged

def next_click(df, group_cols):
    """Add a ``<cols>_nextClick`` column: gap to the next row's ``click_time`` in the same group.

    The gap is the following row's ``click_time`` minus this row's, cast to
    float32; the last row of each group gets NaN. The column is written onto
    ``df`` in place and the same object is returned.
    """
    feature = "_".join(group_cols) + '_nextClick'
    following = df.groupby(group_cols)['click_time'].shift(-1)
    df[feature] = (following - df['click_time']).astype(np.float32)
    gc.collect()
    return df

def frequence(df, group_cols):
    """Add a ``<cols>_clickFreq`` column: per-group mean of the ``<cols>_nextClick`` column.

    Requires ``df`` to already contain ``<cols>_nextClick``. Groups whose mean is
    NaN are dropped from the lookup table, so their rows get NaN from the left merge.

    Returns:
        A new DataFrame: ``df`` left-merged with the per-group means.
    """
    prefix = "_".join(group_cols)
    group_means = df.groupby(group_cols)[prefix + '_nextClick'].mean()
    freq_table = group_means.dropna().reset_index(name=prefix + '_clickFreq')
    merged = df.merge(freq_table, how='left', on=group_cols)
    del group_means, freq_table
    gc.collect()
    return merged

# Accumulated (running) count per group; rows must already be sorted by click time.
# NOTE: this redefines count_cum from earlier in this cell and is the version in effect.
def count_cum(df, group_cols):
    """Write a 0-based running row count per ``group_cols`` key into ``<cols>_countAccum``.

    The column is added to ``df`` in place and the same object is returned.
    """
    df["_".join(group_cols) + '_countAccum'] = df.groupby(group_cols).cumcount()
    return df

In [3]:
def generate_features(df):
    """Build every model feature on a frame of raw click rows.

    Steps, in order: time features (``day``, ``hour``, ``in_test_hh``), group ids,
    group counts, per-group distinct counts, time-to-next-click, click frequency
    (currently no combinations enabled) and running counts. Finally ``ip``,
    ``click_time``, ``day`` and ``in_test_hh`` are dropped.

    Args:
        df: DataFrame with ``ip``, ``app``, ``device``, ``os``, ``channel`` and a
            datetime ``click_time`` column (``.dt`` accessors are used on it).
            The time-feature columns are added to this object in place before
            the first merge creates a new frame.

    Returns:
        The feature DataFrame. Because ``click_time`` is dropped, the result
        cannot be passed through this function a second time.
    """
    print('generating time features...')
    df['day'] = df['click_time'].dt.day.astype('uint8')
    df['hour'] = df['click_time'].dt.hour.astype('uint8')
    # 1 for hours in the first list, 2 for hours in the second, 3 otherwise.
    df['in_test_hh'] = (3 - 2 * df['hour'].isin([4, 5, 9, 10, 13, 14]) # most frequent
                          - 1 * df['hour'].isin([6, 11, 15])).astype('uint8') # least frequent
    print('done')
    gc.collect()

    # Trailing numbers on the entries below are the original author's notes
    # (presumably feature-importance figures -- unverified).
    group_combinations = [
        ['app', 'device'],
        ['app', 'channel']
    ]

    count_combinations = [
        ['app'],
        ['ip'], # 3.03
        ['channel'],
        ['os'],
        ['ip', 'device'], # 9.88
        ['day', 'hour', 'app'], # 4.08
        ['app', 'channel'], # 2.8
        ['ip', 'day', 'in_test_hh'], # 1.74
        ['ip', 'day', 'hour'], # 0.52
        ['os', 'device'], # 0.44
        ['ip', 'os', 'day', 'hour'], # 0.41
        ['ip', 'device', 'day', 'hour'], # 0.31
        ['ip', 'app', 'os'] # 0.21
    ]

    countUniq_combinations = [
        #[['app'],'ip'],
        #[['app', 'device', 'os', 'channel'], 'ip'],
        [['ip'], 'channel'], # 0.9
        [['ip'], 'app'], # 1.3
        [['ip'], 'os'] # 0.45
    ]

    nextClick_combinations = [
        ['ip', 'os'],
        ['ip', 'device', 'os'],
        ['ip', 'app', 'device', 'os'],
        ['ip', 'app', 'device', 'os', 'channel']
    ]

    freq_combinations = [
        #['ip', 'app', 'device', 'os']
    ]

    accum_combinations = [
        #['app'],
        ['ip'] # 3.03
        #['day', 'hour', 'app']
    ]

    # group labels
    for i, cols in enumerate(group_combinations):
        print(i, cols)
        df = group_label(df, cols)

    # count features
    for i, cols in enumerate(count_combinations):
        print(i, cols)
        df = count_agg(df, cols)

    # count unique features
    for i, cols in enumerate(countUniq_combinations):
        print(i, cols)
        df = count_uniq(df, cols[0], cols[1])

    # next click features
    # Convert timestamps to whole seconds so next_click yields gaps in seconds
    # (the // 10 ** 9 assumes nanosecond-resolution datetimes -- TODO confirm).
    df['click_time'] = (df['click_time'].astype(np.int64) // 10 ** 9).astype(np.int32)
    for i, cols in enumerate(nextClick_combinations):
        print(i, cols)
        df = next_click(df, cols)

    # click frequence (needs the matching *_nextClick column from the step above)
    for i, cols in enumerate(freq_combinations):
        print(i, cols)
        df = frequence(df, cols)

    # accum count
    for i, cols in enumerate(accum_combinations):
        print(i, cols)
        df = count_cum(df, cols)

    df.drop(['ip', 'click_time', 'day', 'in_test_hh'], axis=1, inplace=True)
    gc.collect()
    # DataFrame.info() prints its report itself and returns None, so it must not
    # be wrapped in print() (that emitted a stray "None" line).
    df.info()
    return df

# Load Data

In [None]:
# Narrow unsigned dtypes for the raw columns, shared by every read_csv call below.
dtype = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
    click_id='uint32',
)

# Row/column counts noted by the author:
# train: (184903890, 7)
# test: (18790469, 7)
train_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed']
train_df = pd.read_csv(
    'data/train.csv',
    usecols=train_cols,
    dtype=dtype,
    parse_dates=['click_time'],
)

test_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'click_id']
# Features are built on test_supplement; predictions are joined onto test.csv later.
test_df = pd.read_csv(
    'data/test_supplement.csv',
    usecols=test_cols,
    dtype=dtype,
    parse_dates=['click_time'],
)

In [None]:
# Combine train and test rows so group/count features are computed over both.
# ignore_index=True gives the stacked frame a fresh 0..n-1 index; without it the
# test rows reuse the train rows' index labels, which corrupts any feature that
# is derived from the index (e.g. the ids produced by group_label).
common_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time']
all_df = pd.concat([train_df[common_cols], test_df[common_cols]], ignore_index=True)

In [None]:
# generate data
# NOTE: generate_features reads 'click_time' and drops it (together with 'ip', 'day'
# and 'in_test_hh') from the frame it returns, so this cell cannot be re-run on its
# own output -- rebuild all_df from train_df/test_df first.
all_df = generate_features(all_df)

In [None]:
# Undo the concat positionally: the first len(train_df) rows are the training
# rows, everything after them belongs to test_supplement.
train_features = all_df.iloc[:len(train_df)]
test_features = all_df.iloc[len(train_df):]
gc.collect()

# Train LightGBM Model

In [None]:
# Validation metric; also used as the key when reading back the evaluation history.
metrics = 'auc'
lgb_params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric=metrics,
    learning_rate=0.1,
    num_leaves=7,
    max_depth=4,
    min_child_samples=100,
    max_bin=100,
    subsample=0.7,
    subsample_freq=1,
    colsample_bytree=0.7,
    min_child_weight=0,
    min_split_gain=0,
    nthread=24,
    verbose=1,
    scale_pos_weight=200,
)

target = 'is_attributed'
# Every generated column is a model input, minus bookkeeping/label columns if present.
features = [c for c in train_features.columns if c not in {'level_0', 'index', 'is_attributed'}]
# Columns LightGBM should treat as categorical rather than numeric.
category = ['app', 'device', 'os', 'channel', 'hour']

In [None]:
# train valid split
# Hold out the last 5,000,000 training rows (no shuffling, so the split follows row order).
# The features are always taken from the full training slice of all_df rather than
# from train_features: this cell overwrites train_features, so splitting it again on
# a re-run would shrink the data and leave it out of step with the labels.
labels = train_df.is_attributed.values
train_features, valid_features = train_test_split(all_df.iloc[:len(train_df)], test_size=5000000, shuffle=False)
train_labels, valid_labels = train_test_split(labels, test_size=5000000, shuffle=False)
assert len(train_features) == len(train_labels), 'train features/labels out of sync'
assert len(valid_features) == len(valid_labels), 'valid features/labels out of sync'
print('Train size:', len(train_features))
print('Valid size:', len(valid_features))
gc.collect()

In [None]:
# Build the LightGBM training Dataset (despite the "xg" prefix this is an lgb.Dataset).
# Warning: memory peak -- `.values` materialises the selected columns as a NumPy array.
xgtrain = lgb.Dataset(train_features[features].values, 
                      label=train_labels,
                      feature_name=features,
                      categorical_feature=category)

In [None]:
# LightGBM validation Dataset, built with the same feature names and categorical
# columns as the training Dataset.
xgvalid = lgb.Dataset(valid_features[features].values, 
                      label=valid_labels,
                      feature_name=features,
                      categorical_feature=category)

In [None]:
print('Training...')
# Receives the evaluation history; read back below as evals_results['valid'][metrics].
evals_results = {}
# NOTE(review): evals_result / early_stopping_rounds / verbose_eval are passed as
# keyword arguments here -- confirm the installed LightGBM version still accepts them.
model = lgb.train(lgb_params,
                  xgtrain,
                  valid_sets=[xgvalid],
                  valid_names=['valid'],
                  evals_result=evals_results,
                  num_boost_round=2000,
                  early_stopping_rounds=50,
                  verbose_eval=1,
                  feval=None)
n_estimators = model.best_iteration

print('\nModel Info:')
print('n_estimators:', n_estimators)
# presumably best_iteration is 1-based, hence the -1 when indexing the history -- verify
print(metrics + ':', evals_results['valid'][metrics][n_estimators - 1])

# Feature importance table: split counts plus gain expressed as a percentage of total gain.
gain = model.feature_importance('gain')
ft = pd.DataFrame({'feature': model.feature_name(), 'split': model.feature_importance('split'),
                   'gain': 100 * gain / gain.sum()}).sort_values('gain', ascending=False)
ft.to_csv('feature_importance_ref.csv', index=False)
print(ft)

# Save the model under a UTC-timestamped file name in the working directory.
model_name = 'model-%s' % strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model.save_model(model_name)
print('model saved as %s' % model_name)

# Keep train lgb model

# LGB Prediction

In [None]:
print('Predicting...')
# Score the test_supplement rows and store the predictions on test_df as
# 'is_attributed' (the XGBoost prediction cell later overwrites this column).
test_df['is_attributed'] = model.predict(test_features[features], num_iteration=model.best_iteration)

In [None]:
print('loading test')
# The submission rows (with click_id); predictions are joined onto these from test_df.
test = pd.read_csv('data/test.csv', dtype=dtype, usecols=test_cols, parse_dates=['click_time'])

In [None]:
print('merging test_supplement to test')
join_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time']
all_cols = join_cols + ['is_attributed']

# Several test_supplement rows can share one join key. Keep only the first
# prediction per key *before* merging: merging first and de-duplicating on
# click_id afterwards keeps that same first match, but only after materialising
# every duplicate pairing in memory.
supplement_preds = test_df[all_cols].drop_duplicates(subset=join_cols)
test = test.merge(supplement_preds, how='left', on=join_cols)
del supplement_preds
# Safety net: still guarantee exactly one row per click_id in the submission.
test = test.drop_duplicates(subset=['click_id'])

print("Writing the submission data into a csv file...")
test[['click_id', 'is_attributed']].to_csv('submit_lgb_885.gz', index=False, float_format='%.9f', compression='gzip')
print("All done...")

In [None]:
# Release the merged submission frame before the XGBoost section allocates its data.
del test
gc.collect()

# Train XGBoost

In [None]:
# XGBoost booster settings; the values mirror the LightGBM run where the two
# libraries have matching options (depth 4, 0.7 row/column sampling,
# scale_pos_weight 200, 24 threads).
xgb_params = dict(
    eta=0.1,
    tree_method="hist",
    grow_policy="lossguide",
    # max_leaves=1400,  (disabled)
    max_depth=4,
    subsample=0.7,
    colsample_bytree=0.7,
    colsample_bylevel=0.7,
    min_child_weight=0,
    alpha=0,
    objective='binary:logistic',
    eval_metric='auc',
    nthread=24,
    random_state=42,
    scale_pos_weight=200,
    silent=True,
)

In [None]:
# train valid split (first 95% of training rows for fitting, last 5% for validation)
# The features must be split from the *full* training slice of all_df. By this point
# train_features has already been cut down by the LightGBM split above, whereas
# `labels` covers every training row, so splitting train_features again would give
# features and labels of different lengths.
labels = train_df.is_attributed.values
train_features, valid_features = train_test_split(all_df.iloc[:len(train_df)], train_size=.95, shuffle=False)
train_labels, valid_labels = train_test_split(labels, train_size=.95, shuffle=False)
assert len(train_features) == len(train_labels), 'train features/labels out of sync'
assert len(valid_features) == len(valid_labels), 'valid features/labels out of sync'
print('Train size:', len(train_features))
print('Valid size:', len(valid_features))
gc.collect()

In [None]:
# Wrap the splits as DMatrix objects. Every column of the feature frames is passed
# (no `features` selection, unlike the LightGBM Datasets).
dtrain = xgb.DMatrix(train_features, train_labels)
dvalid = xgb.DMatrix(valid_features, valid_labels)
# Only the validation set is evaluated during training; early stopping watches it.
watchlist = [(dvalid, 'valid')]

In [None]:
# Train up to 2000 rounds, stopping once the 'valid' metric has not improved for
# 50 rounds; maximize=True because the eval metric (auc) is higher-is-better.
# Progress is printed every 5 rounds.
xgb_model = xgb.train(xgb_params, 
                      dtrain, 
                      num_boost_round=2000, 
                      evals=watchlist, 
                      maximize=True, 
                      early_stopping_rounds = 50, 
                      verbose_eval=5)

In [None]:
# Bar chart of the trained booster's feature importances.
xgb.plot_importance(xgb_model)

In [None]:
# (feature, score) pairs reported by get_fscore(), highest score first.
# A lambda key is used so this cell needs no import of its own in mid-notebook.
sorted(xgb_model.get_fscore().items(), key=lambda item: item[1], reverse=True)

# XGB Prediction

In [None]:
# Save the booster under a UTC-timestamped file name in the working directory.
# (Reuses the model_name variable from the LightGBM section.)
model_name = 'xgb-model-%s' % strftime("%Y-%m-%d-%H-%M-%S", gmtime())
xgb_model.save_model(model_name)
print('model saved as %s' % model_name)

In [None]:
# test_supplement features as a DMatrix (all columns, matching dtrain/dvalid).
dtest = xgb.DMatrix(test_features)

In [None]:
print('Predicting...')
# Overwrites the LightGBM predictions stored in test_df['is_attributed'].
# NOTE(review): ntree_limit / best_ntree_limit depend on the installed XGBoost
# version -- confirm they are still supported.
test_df['is_attributed'] = xgb_model.predict(dtest, ntree_limit=xgb_model.best_ntree_limit)

In [None]:
print('loading test')
# Re-read the submission rows (the earlier `test` frame was deleted after the LGB submission).
test = pd.read_csv('data/test.csv', dtype=dtype, usecols=test_cols, parse_dates=['click_time'])

In [None]:
print('merging test_supplement to test')
join_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time']
all_cols = join_cols + ['is_attributed']

# Several test_supplement rows can share one join key. Keep only the first
# prediction per key *before* merging: merging first and de-duplicating on
# click_id afterwards keeps that same first match, but only after materialising
# every duplicate pairing in memory.
supplement_preds = test_df[all_cols].drop_duplicates(subset=join_cols)
test = test.merge(supplement_preds, how='left', on=join_cols)
del supplement_preds
# Safety net: still guarantee exactly one row per click_id in the submission.
test = test.drop_duplicates(subset=['click_id'])

print("Writing the submission data into a csv file...")
test[['click_id', 'is_attributed']].to_csv('submit_xgb_895.gz', index=False, float_format='%.9f', compression='gzip')
print("All done...")