## Impressions and Reviews  
- GroupKFold : 특정 변수의 동일한 값 내에서 train test split을 원치 않을 때 (성능 과대평가 방지)

- time 추가변수외에는 추가 X, kfold의 pred값들에 대해 통계값을 기반으로 stacking

- 원소의 개수가 다른 데이터를 dataframe화 할 때 dict_comprehension으로 dict생성후 이를 list로 묶어 dataframe()에 넣으면 됨

- json data라 따로 json format에 따른 전처리가 필요하고 large dataset으로 인해 load가 colab에서 불가 (memory problem)

### Introduction

In this kernel I demonstrate how to create predictions at Session level and then use them at User level so that LightGBM can learn how to better sum individual session predictions. 

It is a sort of mini stacker, and to avoid leakage we use a GroupKFold strategy.


In [3]:
import os

# Show which files are present in the Colab working directory.
content_entries = os.listdir("/content/")
print(content_entries)

['.config', 'train.csv.zip', 'sample_submission_v2.csv.zip', 'sample_submission.csv.zip', 'train_v2.csv.zip', 'train_v2.csv', 'test.csv.zip', 'test_v2.csv.zip', 'test_v2.csv', 'sample_data']


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
import gc
import time
from pandas.core.common import SettingWithCopyWarning
import warnings
import lightgbm as lgb
from sklearn.model_selection import GroupKFold

# Make sure the cyclic garbage collector is switched on.
gc.enable()
# Escalate pandas' SettingWithCopyWarning to an exception, so any statement that
# triggers it stops the notebook immediately.
warnings.simplefilter('error', SettingWithCopyWarning)
%matplotlib inline

### Get the extracted data

In [3]:
# The full files do not fit in memory here, so only the first N_ROWS rows of
# each CSV are read.
# NOTE(review): these are the raw v2 exports; the preview below shows 'device',
# 'geoNetwork', 'totals' and 'trafficSource' still holding JSON strings, while
# later cells read flattened names such as 'totals.transactionRevenue'.
# Confirm that a flattening step (or the pre-extracted files) is used before
# running the rest of the notebook.
DATA_DIR = '/content'
N_ROWS = 30000
# Identifier-like columns are read as strings rather than parsed as numbers.
ID_DTYPES = {'date': str, 'fullVisitorId': str, 'sessionId': str}

train = pd.read_csv(DATA_DIR + '/train_v2.csv', dtype=ID_DTYPES, nrows=N_ROWS)
test = pd.read_csv(DATA_DIR + '/test_v2.csv', dtype=ID_DTYPES, nrows=N_ROWS)
train.shape, test.shape

((30000, 13), (30000, 13))

In [5]:
# Preview the first rows of the loaded training sessions.
train.head()

Unnamed: 0,channelGrouping,customDimensions,date,device,fullVisitorId,geoNetwork,hits,socialEngagementType,totals,trafficSource,visitId,visitNumber,visitStartTime
0,Organic Search,"[{'index': '4', 'value': 'EMEA'}]",20171016,"{""browser"": ""Firefox"", ""browserVersion"": ""not ...",3162355547410993243,"{""continent"": ""Europe"", ""subContinent"": ""Weste...","[{'hitNumber': '1', 'time': '0', 'hour': '17',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"",...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1508198450,1,1508198450
1,Referral,"[{'index': '4', 'value': 'North America'}]",20171016,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",8934116514970143966,"{""continent"": ""Americas"", ""subContinent"": ""Nor...","[{'hitNumber': '1', 'time': '0', 'hour': '10',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""2"", ""pageviews"": ""2"",...","{""referralPath"": ""/a/google.com/transportation...",1508176307,6,1508176307
2,Direct,"[{'index': '4', 'value': 'North America'}]",20171016,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",7992466427990357681,"{""continent"": ""Americas"", ""subContinent"": ""Nor...","[{'hitNumber': '1', 'time': '0', 'hour': '17',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""2"", ""pageviews"": ""2"",...","{""campaign"": ""(not set)"", ""source"": ""(direct)""...",1508201613,1,1508201613
3,Organic Search,"[{'index': '4', 'value': 'EMEA'}]",20171016,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",9075655783635761930,"{""continent"": ""Asia"", ""subContinent"": ""Western...","[{'hitNumber': '1', 'time': '0', 'hour': '9', ...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""2"", ""pageviews"": ""2"",...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1508169851,1,1508169851
4,Organic Search,"[{'index': '4', 'value': 'Central America'}]",20171016,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",6960673291025684308,"{""continent"": ""Americas"", ""subContinent"": ""Cen...","[{'hitNumber': '1', 'time': '0', 'hour': '14',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""2"", ""pageviews"": ""2"",...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1508190552,1,1508190552


### Define folding strategy

In [6]:
# See https://ssoondata.tistory.com/29 for a diagram of GroupKFold.
# Rows sharing a fullVisitorId are kept on the same side of every split, so one
# visitor never appears in both the training and the validation part.
def get_folds(df=None, n_splits=5):
    """Build visitor-grouped K-fold splits expressed as positional row indices.

    Parameters
    ----------
    df : DataFrame with a 'fullVisitorId' column.
    n_splits : number of folds handed to GroupKFold.

    Returns
    -------
    list of [train_positions, validation_positions] pairs, one per fold; each
    element is an integer array of row positions into ``df``.
    """
    visitor_col = df['fullVisitorId']
    # Sorting makes the visitor order, and therefore the folds, reproducible.
    visitors = np.array(sorted(visitor_col.unique()))
    row_positions = np.arange(df.shape[0])

    # The split is done over unique visitors (each visitor is its own group),
    # then mapped back to every row that belongs to those visitors.
    splitter = GroupKFold(n_splits=n_splits)
    return [
        [
            row_positions[visitor_col.isin(visitors[trn_idx])],
            row_positions[visitor_col.isin(visitors[val_idx])],
        ]
        for trn_idx, val_idx in splitter.split(X=visitors, y=visitors, groups=visitors)
    ]

### Get session target

In [None]:
# Session-level target: transaction revenue, with missing values read as 0.
revenue_col = 'totals.transactionRevenue'
y_reg = train[revenue_col].fillna(0)

# Remove the revenue column from both frames so it cannot be used as a feature.
for frame in (train, test):
    if revenue_col in frame.columns:
        del frame[revenue_col]

### Add date features

Only add the ones I think can generalize

In [7]:
# List the columns currently present in the training frame.
train.columns

Index(['channelGrouping', 'date', 'device', 'fullVisitorId', 'geoNetwork',
       'sessionId', 'socialEngagementType', 'totals', 'trafficSource',
       'visitId', 'visitNumber', 'visitStartTime'],
      dtype='object')

In [8]:
# Derived time features: session timestamp, day of week, hour and day of month.
# 'date' is overwritten with the timestamp parsed from 'visitStartTime' (epoch seconds).
for frame in (train, test):
    session_start = pd.to_datetime(frame['visitStartTime'], unit='s')
    frame['date'] = session_start
    frame['sess_date_dow'] = session_start.dt.dayofweek
    frame['sess_date_hours'] = session_start.dt.hour
    frame['sess_date_dom'] = session_start.dt.day

In [11]:
# Check the newly added time features on the first rows.
train[['date', 'sess_date_dow', 'sess_date_hours', 'sess_date_dom']].head()

Unnamed: 0,date,sess_date_dow,sess_date_hours,sess_date_dom
0,2016-09-02 15:33:05,4,15,2
1,2016-09-03 05:22:27,5,5,3
2,2016-09-03 01:16:26,5,1,3
3,2016-09-03 05:40:13,5,5,3
4,2016-09-02 13:23:20,4,13,2


### Create features list

In [12]:
# Columns that never go into the model: identifiers, raw timestamps and the target.
excluded_features = [
    'date', 'fullVisitorId', 'sessionId', 'totals.transactionRevenue', 
    'visitId', 'visitStartTime'
]

# Every remaining column stored with the generic object dtype is treated as categorical.
categorical_features = []
for _col in train.columns:
    if _col in excluded_features:
        continue
    if train[_col].dtype == 'object':
        categorical_features.append(_col)

### Factorize categoricals

In [None]:
# Integer-encode the categorical columns (similar to label encoding).
# The codes are learned on train only; test values are looked up in the train
# vocabulary, and get_indexer returns -1 for values that are not in it.
for col in categorical_features:
    codes, train_vocabulary = pd.factorize(train[col])
    train[col] = codes
    test[col] = train_vocabulary.get_indexer(test[col])

### Predict revenues at session level

In [None]:
# Visitor-grouped folds: all sessions of one visitor stay on the same side of a split.
folds = get_folds(df=train, n_splits=5)

# 'predictions' and 'target' are added to `train` by later cells. Excluding them
# here keeps this cell safe to re-run: otherwise the model would be fed its own
# earlier output and the label.
train_features = [
    _f for _f in train.columns
    if _f not in excluded_features and _f not in ('predictions', 'target')
]
print(train_features)

# Out-of-fold predictions for train (log1p scale) and fold-averaged predictions
# for test (original revenue scale).
oof_reg_preds = np.zeros(train.shape[0])
sub_reg_preds = np.zeros(test.shape[0])
# Per-fold importance frames are collected here and concatenated once after the loop.
fold_importances = []

# NOTE(review): `early_stopping_rounds=` and `verbose=` are accepted by fit() in
# LightGBM 3.x but were removed in 4.0 in favour of callbacks; confirm the
# installed version before upgrading.
# Train one LightGBM regressor per fold on log1p(revenue).
for fold_, (trn_, val_) in enumerate(folds):
    trn_x, trn_y = train[train_features].iloc[trn_], y_reg.iloc[trn_]
    val_x, val_y = train[train_features].iloc[val_], y_reg.iloc[val_]

    reg = lgb.LGBMRegressor(
        num_leaves=31,
        learning_rate=0.03,
        n_estimators=1000,
        subsample=.9,
        colsample_bytree=.9,
        random_state=1
    )
    reg.fit(
        trn_x, np.log1p(trn_y),
        eval_set=[(val_x, np.log1p(val_y))],
        early_stopping_rounds=50,
        verbose=100,
        eval_metric='rmse'
    )

    # Gain-based importance of every feature for this fold.
    imp_df = pd.DataFrame()
    imp_df['feature'] = train_features
    imp_df['gain'] = reg.booster_.feature_importance(importance_type='gain')
    imp_df['fold'] = fold_ + 1
    fold_importances.append(imp_df)

    # Store the predictions for the held-out rows; negative values are clipped to 0.
    oof_reg_preds[val_] = reg.predict(val_x, num_iteration=reg.best_iteration_)
    oof_reg_preds[oof_reg_preds < 0] = 0

    # Predict the test set with this fold's model, clip negatives to 0, convert
    # back from log1p scale and add this fold's share of the average.
    _preds = reg.predict(test[train_features], num_iteration=reg.best_iteration_)
    _preds[_preds < 0] = 0
    sub_reg_preds += np.expm1(_preds) / len(folds)

importances = pd.concat(fold_importances, axis=0, sort=False)

# Out-of-fold RMSE on the log1p scale.
mean_squared_error(np.log1p(y_reg), oof_reg_preds) ** .5

### Display feature importances

In [None]:
# `warnings` comes from the notebook's import cell, so it is not re-imported here.
warnings.simplefilter('ignore', FutureWarning)

# log1p of the gain keeps the few dominant features from flattening all other bars.
importances['gain_log'] = np.log1p(importances['gain'])

# Mean gain of each feature across folds; used only to order the bars.
mean_gain = importances[['gain', 'feature']].groupby('feature').mean()
importances['mean_gain'] = importances['feature'].map(mean_gain['gain'])

fig, ax = plt.subplots(figsize=(8, 12))
ax.set_title('Session-level model: feature importance (log1p gain)')
sns.barplot(x='gain_log', y='feature', data=importances.sort_values('mean_gain', ascending=False), ax=ax)

### Create user level predictions

In [None]:
# Test set: fold-averaged session predictions, already on the revenue scale.
test['predictions'] = sub_reg_preds

# Train set: out-of-fold session predictions, converted back from log1p scale.
train['predictions'] = np.expm1(oof_reg_preds)

In [None]:
# Collapse the session rows to one row per visitor by averaging every feature.
user_level_columns = train_features + ['fullVisitorId']
trn_data = train[user_level_columns].groupby('fullVisitorId').mean()

In [None]:
%%time
# Series indexed by fullVisitorId whose values are dicts of that visitor's
# session predictions, keyed by position, e.g.
#   a0b1  {'pred_0': '2', 'pred_1': '8'}
# (the toy example in the next cell shows the same steps on a small frame).
visitor_groups = train[['fullVisitorId', 'predictions']].groupby('fullVisitorId')
preds_per_visitor = visitor_groups.apply(lambda grp: list(grp.predictions))
trn_pred_list = preds_per_visitor.apply(
    lambda preds: {'pred_'+str(pos): value for pos, value in enumerate(preds)}
)

In [20]:
# Reminder: oof_reg_preds holds the out-of-fold predictions for the train set,
# sub_reg_preds the fold-averaged predictions for the test set.
# Toy frame used to illustrate the list -> dict reshaping of those predictions.
# np.array stores the mixed str/int rows with one common string dtype, which is
# why the prediction values are shown as strings in the output.
toy_rows = np.array([['a0b1', 2],
                     ['c0a1', 5],
                     ['a0b1', 8],
                     ['b0b1', 8]])
df2 = pd.DataFrame(toy_rows, columns=['fullVisitorId', 'predictions'])

# Step 1: gather all predictions of one fullVisitorId into a list.
toy_pred_lists = df2[['fullVisitorId', 'predictions']].groupby('fullVisitorId').apply(
    lambda grp: list(grp.predictions))

# Step 2: turn each list into a {'pred_<position>': value} dict and show the result.
display(toy_pred_lists.apply(
    lambda preds: {'pred_'+str(pos): value for pos, value in enumerate(preds)}))

fullVisitorId
a0b1    [2, 8]
b0b1       [8]
c0a1       [5]
dtype: object

fullVisitorId
a0b1    {'pred_0': '2', 'pred_1': '8'}
b0b1                   {'pred_0': '8'}
c0a1                   {'pred_0': '5'}
dtype: object

In [None]:
# Per-visitor statistics of the session predictions (the toy cell below shows
# the dict -> DataFrame expansion on a small example).
# One row per visitor and one 'pred_<position>' column per session position;
# visitors with fewer sessions have NaN in the columns they have no value for.
trn_all_predictions = pd.DataFrame(list(trn_pred_list.values), index=trn_data.index)
trn_feats = trn_all_predictions.columns

# Copy of the raw prediction columns, taken before any statistic column is added.
session_preds = trn_all_predictions[trn_feats]
trn_all_predictions['t_mean'] = np.log1p(session_preds.mean(axis=1))
trn_all_predictions['t_median'] = np.log1p(session_preds.median(axis=1))
# Sum of log1p(prediction) versus log1p of the summed predictions.
trn_all_predictions['t_sum_log'] = np.log1p(session_preds).sum(axis=1)
trn_all_predictions['t_sum_act'] = np.log1p(session_preds.fillna(0).sum(axis=1))
# NOTE(review): despite its name this counts the NaN (missing) prediction slots
# per visitor, not the sessions; the test-set cell computes it the same way, so
# any change must be made in both places together.
trn_all_predictions['t_nb_sess'] = session_preds.isnull().sum(axis=1)
full_data = pd.concat([trn_data, trn_all_predictions], axis=1)

# Release the intermediates; only full_data is used from here on.
del trn_data, trn_all_predictions, session_preds
gc.collect()

full_data.shape

In [23]:
# Expand the per-visitor dicts of the toy frame into one column per session
# position; visitors with fewer sessions get NaN in the remaining columns.
# The intermediate Series has its own name so that `df2` stays the toy frame
# and this cell can be run more than once.
df2_pred_dicts = df2[['fullVisitorId', 'predictions']].groupby('fullVisitorId') \
                                             .apply(lambda df: list(df.predictions)) \
                                             .apply(lambda x: {'pred_'+str(i): pred for i, pred in enumerate(x)})
pd.DataFrame(list(df2_pred_dicts.values), index=df2_pred_dicts.index)

Unnamed: 0_level_0,pred_0,pred_1
fullVisitorId,Unnamed: 1_level_1,Unnamed: 2_level_1
a0b1,2,8.0
b0b1,8,
c0a1,5,


In [None]:
%%time
# Same reshaping as for the train set: one dict of position-keyed session
# predictions per fullVisitorId.
visitor_groups = test[['fullVisitorId', 'predictions']].groupby('fullVisitorId')
preds_per_visitor = visitor_groups.apply(lambda grp: list(grp.predictions))
sub_pred_list = preds_per_visitor.apply(
    lambda preds: {'pred_'+str(pos): value for pos, value in enumerate(preds)}
)

In [None]:
# One row per test visitor: feature means plus the expanded session predictions.
user_level_columns = train_features + ['fullVisitorId']
sub_data = test[user_level_columns].groupby('fullVisitorId').mean()
sub_all_predictions = pd.DataFrame(list(sub_pred_list.values), index=sub_data.index)

# Add, filled with NaN, every prediction column that exists for train but not for test.
missing_pred_cols = [col for col in trn_feats if col not in sub_all_predictions.columns]
for col in missing_pred_cols:
    sub_all_predictions[col] = np.nan

# Statistics are computed over the train prediction columns only (trn_feats).
session_preds = sub_all_predictions[trn_feats]
sub_all_predictions['t_mean'] = np.log1p(session_preds.mean(axis=1))
sub_all_predictions['t_median'] = np.log1p(session_preds.median(axis=1))
sub_all_predictions['t_sum_log'] = np.log1p(session_preds).sum(axis=1)
sub_all_predictions['t_sum_act'] = np.log1p(session_preds.fillna(0).sum(axis=1))
# NOTE(review): despite its name this counts the NaN (missing) prediction slots
# per visitor, not the sessions; the train-set cell computes it the same way, so
# any change must be made in both places together.
sub_all_predictions['t_nb_sess'] = session_preds.isnull().sum(axis=1)
sub_full_data = pd.concat([sub_data, sub_all_predictions], axis=1)

# Release the intermediates; only sub_full_data is used from here on.
del sub_data, sub_all_predictions, session_preds
gc.collect()
sub_full_data.shape

### Create target at Visitor level

In [None]:
# Attach the session revenue to the frame, then sum it per visitor.
train['target'] = y_reg
session_targets = train[['fullVisitorId', 'target']]
trn_user_target = session_targets.groupby('fullVisitorId').sum()

### Train a model at Visitor level

In [None]:
# X: per-visitor feature means plus statistics of the session-level predictions.
# y: per-visitor sum of the session revenue.
# Fold assignment only needs the visitor ids, which are the index of
# `full_data`, so they are passed directly instead of being pulled out through
# an unrelated feature column.
folds = get_folds(df=pd.DataFrame({'fullVisitorId': full_data.index}), n_splits=5)

oof_preds = np.zeros(full_data.shape[0])
sub_preds = np.zeros(sub_full_data.shape[0])
# Per-fold importance frames are collected here and concatenated once after the loop.
fold_importances = []

# NOTE(review): `early_stopping_rounds=` and `verbose=` are accepted by fit() in
# LightGBM 3.x but were removed in 4.0 in favour of callbacks; confirm the
# installed version before upgrading.
for fold_, (trn_, val_) in enumerate(folds):
    # Positional indexing: `full_data` and `trn_user_target` both come from a
    # groupby on 'fullVisitorId' over `train`, so their rows are expected to be
    # in the same order.
    trn_x, trn_y = full_data.iloc[trn_], trn_user_target['target'].iloc[trn_]
    val_x, val_y = full_data.iloc[val_], trn_user_target['target'].iloc[val_]

    reg = lgb.LGBMRegressor(
        num_leaves=31,
        learning_rate=0.03,
        n_estimators=1000,
        subsample=.9,
        colsample_bytree=.9,
        random_state=1
    )
    reg.fit(
        trn_x, np.log1p(trn_y),
        eval_set=[(trn_x, np.log1p(trn_y)), (val_x, np.log1p(val_y))],
        eval_names=['TRAIN', 'VALID'],
        early_stopping_rounds=50,
        eval_metric='rmse',
        verbose=100
    )

    # Gain-based importance of every feature for this fold.
    imp_df = pd.DataFrame()
    imp_df['feature'] = trn_x.columns
    imp_df['gain'] = reg.booster_.feature_importance(importance_type='gain')
    imp_df['fold'] = fold_ + 1
    fold_importances.append(imp_df)

    # Out-of-fold predictions, used only to evaluate the model; negatives clipped to 0.
    oof_preds[val_] = reg.predict(val_x, num_iteration=reg.best_iteration_)
    oof_preds[oof_preds < 0] = 0

    # Test-set prediction with this fold's model; the final value is the mean
    # over folds and stays on the log1p scale.
    _preds = reg.predict(sub_full_data[full_data.columns], num_iteration=reg.best_iteration_)
    _preds[_preds < 0] = 0
    sub_preds += _preds / len(folds)

vis_importances = pd.concat(fold_importances, axis=0, sort=False)

# Out-of-fold RMSE on the log1p scale.
mean_squared_error(np.log1p(trn_user_target['target']), oof_preds) ** .5

### Display feature importances

In [None]:
# Feature-importance chart for the visitor-level model.
# Bars show log1p(gain); features are ordered by their mean gain across folds,
# and only the first 300 rows of the sorted frame are drawn.
vis_importances['gain_log'] = np.log1p(vis_importances['gain'])
mean_gain = vis_importances[['gain', 'feature']].groupby('feature').mean()
vis_importances['mean_gain'] = vis_importances['feature'].map(mean_gain['gain'])

ranked_importances = vis_importances.sort_values('mean_gain', ascending=False).iloc[:300]
fig, ax = plt.subplots(figsize=(8, 25))
sns.barplot(x='gain_log', y='feature', data=ranked_importances, ax=ax)

### Save predictions

In [None]:
# Attach the visitor-level predictions (log1p scale) and write them out, keeping
# the fullVisitorId index as the first CSV column.
sub_full_data['PredictedLogRevenue'] = sub_preds
submission = sub_full_data[['PredictedLogRevenue']]
submission.to_csv('new_test.csv', index=True)