## Regressors and Classification by Optimal Rounding
**■Classification steps**<BR>
**Step 1** Create Regressor Models : Create multiple training datasets using k-fold splitting (`StratifiedKFold`) and train one regression model on each of them. I used **CatBoost**, **XGBoost** and **LightGBM**.<BR>
**Step 2** Predict each Model<BR>
**Step 3** Optimize Rounding Coefficients : The rounding coefficients (the thresholds that turn a regression output into a class) are optimized on the average of the predictions of all models. The optimization uses `scipy.optimize.minimize()`.<BR>
**Step 4** Final Classification

In [None]:
import os, sys
import datetime
from time import time
from tqdm import tqdm_notebook as tqdm
from collections import Counter

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import train_test_split as split
from sklearn.metrics import cohen_kappa_score
import category_encoders as ce

from catboost import CatBoostRegressor
import lightgbm as lgb
import xgboost as xgb

from functools import partial
import scipy as sp              # for optimize.minimize()

In [None]:
# Execution environment setting
#   Kaggle = True  -> read the competition input directory, task_type 'CPU'
#   Kaggle = False -> read a local copy of the data, task_type 'GPU'
Kaggle = True

DIR, task_type = (
    ('../input/data-science-bowl-2019', 'CPU')
    if Kaggle
    else ('./data-science-bowl-2019', 'GPU')
)

## Observe the data

In [None]:
# Load the raw event logs of the train and test sets (train first, then test).
train, test = (pd.read_csv(os.path.join(DIR, csv_name))
               for csv_name in ('train.csv', 'test.csv'))
# The remaining files are not needed by the cells below:
#train_labels = pd.read_csv(os.path.join(DIR,'train_labels.csv'))
#specs = pd.read_csv(os.path.join(DIR,'specs.csv'))
#media_seq = pd.read_csv(os.path.join('../input/dsb2019-external-data','media_sequence.csv'))

In [None]:
#print('train:\t\t',train.shape)
#print('train_labels:\t',train_labels.shape)
#print('specs:\t\t',specs.shape)
#print('test:\t\t',test.shape)
#print('media_seq:\t\t',media_seq.shape)

### 1. train

In [None]:
#train.head()

In [None]:
#event_code_n = train['event_code'].nunique()
#print("num of unique 'event_code':", event_code_n)
#print("'event_code': ",
#      train['event_code'].min(), "-", train['event_code'].max())

In [None]:
# 'event_data' example
#print(train['event_data'][40])
#print(train['event_data'][41])
#print(train['event_data'][43])

### 2. train_labels

In [None]:
#train_labels.head()

In [None]:
#train_labels[['game_session','installation_id', 'title']].describe()

In [None]:
# unique 'title' list
#train_labels['title'].unique()

### 3. specs

In [None]:
#specs.head()

In [None]:
#specs.describe()

In [None]:
# 'info' example
#print(specs['info'][0],'\n')
#print(specs['info'][6],'\n')
#print(specs['info'][7])

In [None]:
# 'args' example
#print(specs['args'][0],'\n')
#print(specs['args'][1])

### 4. test

In [None]:
#test.head(8)
#test['world'].unique()

In [None]:
#test[['event_id','game_session','installation_id',
#       'title','type','world']].describe()

In [None]:
# Make the lists of every 'title', 'event_code' and 'event_id' found in train or test.
# value_counts() skips missing values. The union is sorted because the iteration
# order of a set of strings changes from one interpreter run to the next, and these
# lists drive both the title -> number encoding and the order of the feature columns.
title_list = sorted(set(train['title'].value_counts().index).union(test['title'].value_counts().index))
event_code_list = sorted(set(train['event_code'].value_counts().index).union(test['event_code'].value_counts().index))
event_id_list = sorted(set(train['event_id'].value_counts().index).union(test['event_id'].value_counts().index))


In [None]:
# Number the titles 0..len(title_list)-1 in list order.
title_numbers = np.arange(len(title_list))

# dict 'title -> number (integer)'
title2num = {title: num for title, num in zip(title_list, title_numbers)}
# dict 'number -> title'
num2title = {num: title for num, title in zip(title_numbers, title_list)}

# dict 'title number -> win event_code'
# (4100 for every title, except 'Bird Measurer (Assessment)' which uses 4110)
win_codes = np.full(len(title2num), 4100, dtype='int')
title2win_code = {num: code for num, code in zip(title2num.values(), win_codes)}
title2win_code[title2num['Bird Measurer (Assessment)']] = 4110

In [None]:
# Convert 'title' to its integer number
train['title'] = train['title'].map(title2num)
test['title'] = test['title'].map(title2num)
#train_labels['title'] = train_labels['title'].map(title2num)

# Convert 'timestamp' to datetime.
# Each frame is parsed from its own column: assigning train's timestamps to test
# would align on the row index and give the test events unrelated times.
train['timestamp'] = pd.to_datetime(train['timestamp'])
test['timestamp'] = pd.to_datetime(test['timestamp'])

In [None]:
#train = train[train['installation_id'].isin(train_labels['installation_id'].unique())]

In [None]:
# Fixed time credited to each 'Clip' title. agr_session multiplies the value by
# 1000 to fill 'game_time_sum' -- presumably the clip length in seconds; confirm.
clip_time = {
    'Welcome to Lost Lagoon!': 19,
    'Tree Top City - Level 1': 17,
    'Ordering Spheres': 61,
    'Costume Box': 61,
    '12 Monkeys': 109,
    'Tree Top City - Level 2': 25,
    "Pirate's Tale": 80,
    'Treasure Map': 156,
    'Tree Top City - Level 3': 26,
    'Rulers': 126,
    'Magma Peak - Level 1': 20,
    'Slop Problem': 60,
    'Magma Peak - Level 2': 22,
    'Crystal Caves - Level 1': 18,
    'Balancing Act': 72,
    'Lifting Heavy Things': 118,
    'Crystal Caves - Level 2': 24,
    'Honey Cake': 142,
    'Crystal Caves - Level 3': 19,
    'Heavy, Heavier, Heaviest': 61,
}

In [None]:
#user_sample = train[train['installation_id']=='0006a69f']
#user_sample.head()
#session = user_sample[user_sample['game_session'] == '2b9d5af79bcdb79f']
#session.head()
#num2title

In [None]:
#clip_time[num2title[session.title.iloc[0]]]

In [None]:
def agr_session(user_sample):
    '''
    Aggregate the raw events of one installation into one record per game session.

    user_sample : DataFrame from train/test group by 'installation_id'

    Returns a list with one dict per 'game_session' (in order of first appearance).
    Each dict holds:
      - the number of occurrences of every event_code in event_code_list and of
        every event_id in event_id_list (0 when the session has none),
      - 'installation_id', 'game_session', 'title', 'type', 'world',
      - 'timestamp_st' / 'timestamp_en' (first / last event time), 'event_count',
      - 'game_time_sum' and 'game_time_nonzeros',
      - 'time_to_first_ans', 'time_to_final_ans', 'win_n', 'loss_n', 'accuracy'
        and 'accuracy_group', which are NaN unless the session is an Assessment.
    '''
    session_agr = []

    for _, session in user_sample.groupby('game_session', sort=False):
        session = session.sort_values('timestamp')

        # Occurrences of every known event_code, then of every known event_id.
        session_grp = {code: 0 for code in event_code_list}
        for code, count in Counter(session['event_code']).items():
            session_grp[code] += count
        event_id_count = {event_id: 0 for event_id in event_id_list}
        for event_id, count in Counter(session['event_id']).items():
            event_id_count[event_id] += count
        session_grp.update(event_id_count)

        session_title = session['title'].iloc[0]
        session_type = session['type'].iloc[0]  # Game/Assessment/Activity/Clip
        session_grp['installation_id'] = session['installation_id'].iloc[0]
        session_grp['game_session'] = session['game_session'].iloc[0]
        session_grp['title'] = session_title
        session_grp['type'] = session_type
        session_grp['world'] = session['world'].iloc[0]
        # The session is sorted by time, so first/last rows are its start/end.
        # The column is addressed by name rather than by its position in the file.
        session_grp['timestamp_st'] = session['timestamp'].iloc[0]
        session_grp['timestamp_en'] = session['timestamp'].iloc[-1]
        session_grp['event_count'] = len(session)

        if session_type == 'Clip':
            # Clips get a fixed per-title duration from clip_time instead of
            # their logged game_time.
            session_grp['game_time_sum'] = clip_time[num2title[session_title]]*1000
        else:
            session_grp['game_time_sum'] = session['game_time'].iloc[-1]

        # Number of consecutive events whose game_time actually changed.
        # (The first diff is NaN and must not be counted.)
        game_time_diff = session['game_time'].diff(1)
        moved = (game_time_diff != 0) & game_time_diff.notnull()
        session_grp['game_time_nonzeros'] = int(moved.sum())

        if session_type == 'Assessment':
            # Answer events: event_code 4100 (4110 for 'Bird Measurer')
            win_code = title2win_code[session_title]
            all_4100 = session[session['event_code'] == win_code]
            time_to_first_ans = all_4100['game_time'].min()
            time_to_final_ans = all_4100['game_time'].max()
            # Numbers of wins and losses
            win_n = all_4100['event_data'].str.contains('true').sum()
            loss_n = all_4100['event_data'].str.contains('false').sum()
            attempts = win_n + loss_n
            accuracy = win_n / attempts if attempts > 0 else 0

            # Map the accuracy to the label: 0 -> 0, 1 -> 3, 0.5 -> 2, else 1
            if accuracy == 0:
                accuracy_group = 0
            elif accuracy == 1:
                accuracy_group = 3
            elif accuracy == 0.5:
                accuracy_group = 2
            else:
                accuracy_group = 1
        else:
            time_to_first_ans = np.nan
            time_to_final_ans = np.nan
            win_n = np.nan
            loss_n = np.nan
            accuracy = np.nan
            accuracy_group = np.nan

        session_grp['time_to_first_ans'] = time_to_first_ans
        session_grp['time_to_final_ans'] = time_to_final_ans
        session_grp['win_n'] = win_n
        session_grp['loss_n'] = loss_n
        session_grp['accuracy'] = accuracy
        session_grp['accuracy_group'] = accuracy_group

        session_agr.append(session_grp)
    return session_agr


In [None]:
# Apply agr_session to the events of every installation_id of the train set
installation_n = train['installation_id'].nunique()
compiled_data = []
train_groups = train.groupby('installation_id', sort=False)
for ins_id, user_sample in tqdm(train_groups, total=installation_n):
    # user_sample : DataFrame with the events of one 'installation_id'
    compiled_data.extend(agr_session(user_sample))

In [None]:
# Drop the raw train events first, then build the session-level frame
# (one row per game_session, as returned by agr_session).
del train
train_agr = pd.DataFrame(compiled_data)

In [None]:
# Aggregate the test set per game_session, the same way as the train set
compiled_data = []
test_groups = test.groupby('installation_id', sort=False)
for ins_id, user_sample in tqdm(test_groups, total=1000):
    compiled_data.extend(agr_session(user_sample))
test_agr = pd.DataFrame(compiled_data)
# The session records and the raw test events are no longer needed
del compiled_data, test

In [None]:
#ase = train_agr[train_agr['type']=='Assessment']
#ase = ase[ase['installation_id']=='0006a69f']
#e7e7db2a241eadcc
#assess_hist = ase.iloc[0:2]
#print(assess_hist)
#assess_info = ase.iloc[1]
#assess_info

In [None]:
#assess_hist

In [None]:
#same_assess = assess_hist[assess_hist['title']==assess_info['title']]
#
#assess_hist['title']==assess_info['title']
#same_assess = same_assess[same_assess['timestamp_st']<assess_info['timestamp_st']]
#len(same_assess)
#train_labels[train_labels['game_session']=='e7e7db2a241eadcc']

In [None]:
#types_count = {'Clip':0, 'Activity':0, 'Assessment':0, 'Game':0}
#ase = train_agr[train_agr['type']=='Assessment']

#user_sample = train_agr[train_agr['installation_id']=='0006a69f']
#user_sample = user_sample.sort_values('timestamp_st')
#assessment_inst = user_sample[user_sample['type']=='Assessment']
#assess_time =assessment_inst.timestamp_st.iloc[0]
#assess_sample = user_sample[user_sample['timestamp_st'] <= assess_time]
#assess_sample = assess_sample.sort_values('timestamp_st')
#assess_sample = assess_sample[assess_sample['timestamp_st'] < assess_time]

#assess_sample
#ase

In [None]:
#assess_sample[assess_sample['type']=='Activity'].event_count.sum()

In [None]:
#def change_dic_key(d,old_key,new_key,default_value=None):
#    d[new_key] = d.pop(old_key,default_value)

In [None]:
#types_count = {'Clip':0, 'Activity':0, 'Assessment':0, 'Game':0}
#for key in list(types_count.keys()):
#    types_count[key+'_count'] = types_count[key]
#    del types_count[key]
#types_count

In [None]:
def preprocess_data(user_sample,train_set = True):
    '''
    Build one feature record per Assessment session of a single installation.

    user_sample : session-level DataFrame (rows as produced by agr_session) of
                  one 'installation_id'
    train_set   : True  -> return a list with one feature dict for every
                           assessment that has at least one win/loss answer;
                           each dict also carries the 'accuracy_group' label
                  False -> return only the feature dict of the last assessment

    Every record describes the history before the assessment started: event
    counts, time spent per title/type/world, results of earlier assessments, etc.

    Raises IndexError when train_set is False and user_sample contains no
    Assessment session.
    '''
    prep_data = []
    user_sample = user_sample.sort_values('timestamp_st')
    assessment_inst = user_sample[user_sample['type']=='Assessment']

    # Running statistics over the assessments visited so far
    accumu_accuracy_group = 0
    accumu_accuracy = 0
    accumu_win_n = 0
    accumu_loss_n = 0
    counter = 0
    accuracy_groups = {0.0:0, 1.0:0, 2.0:0, 3.0:0}
    durations = []

    for assess_time in assessment_inst['timestamp_st']:
        # Sessions started up to (and including) this assessment, and the
        # sessions started strictly before it.
        sessions_upto = user_sample[user_sample['timestamp_st'] <= assess_time]
        sessions_before = sessions_upto[sessions_upto['timestamp_st'] < assess_time]

        # Assessment history; its last row is the assessment being described.
        assess_hist = sessions_upto.sort_values('timestamp_st')
        assess_hist = assess_hist[assess_hist['type']=='Assessment']
        assess_info = assess_hist.iloc[-1]
        assess_count = len(assess_hist)

        # Accumulated event_code / event_id counts before the assessment
        features = {code: sessions_before[code].sum() for code in event_code_list}
        features.update({event_id: sessions_before[event_id].sum()
                         for event_id in event_id_list})
        # Accumulated number of events per session type and per world
        for key in ('Clip', 'Activity', 'Assessment', 'Game'):
            of_type = sessions_before[sessions_before['type']==key]
            features[key+'_count'] = of_type['event_count'].sum()
        for key in ('MAGMAPEAK', 'NONE', 'CRYSTALCAVES', 'TREETOPCITY'):
            in_world = sessions_before[sessions_before['world']==key]
            features[key+'_count'] = in_world['event_count'].sum()

        #features['installation_id'] = assess_info['installation_id']
        features['title'] = assess_info['title']
        features['world'] = assess_info['world']
        features['assess_count'] = assess_count
        features['unique_assess'] = assess_hist['title'].nunique()
        features['unique_title'] = sessions_upto['title'].nunique()
        features['unique_world'] = sessions_upto['world'].nunique()

        features['dayofweek'] = assess_info['timestamp_st'].dayofweek
        features['hour'] = assess_info['timestamp_st'].hour

        # Gap between the end of a session and the start of the next one.
        # NOTE: '.seconds' is only the seconds component of the Timedelta
        # (whole days are not included); the last row has no next session and
        # yields NaN.
        interval = sessions_upto['timestamp_st'].shift(-1) - sessions_upto['timestamp_en']
        interval = interval.map(lambda delta: delta.seconds)
        features['interval_ave'] = interval.mean()
        features['interval_min'] = interval.min()
        features['interval_max'] = interval.max()

        # Statistics of the sessions played before the assessment
        # (event_count statistics ignore single-event sessions)
        multi_event_counts = sessions_before[sessions_before['event_count']>1]['event_count']
        features['session_count'] = len(sessions_before)
        features['event_count_mean'] = multi_event_counts.mean()
        features['event_count_max'] = multi_event_counts.max()
        features['event_count_min'] = multi_event_counts.min()
        features['event_count_std'] = multi_event_counts.std()
        features['accum_actions'] = sessions_before['event_count'].sum()
        features['accum_game_time'] = sessions_before['game_time_sum'].sum()
        features['game_time_nonzero_count'] = sessions_before['game_time_nonzeros'].sum()

        # Game time accumulated per title (keyed by title name), type and world
        time_spent_each_title = {title:0 for title in title_list}
        game_time_title = sessions_before.groupby('title')['game_time_sum'].sum()
        for title_num, spent in game_time_title.items():
            time_spent_each_title[num2title[title_num]] += spent
        features.update(time_spent_each_title)

        types_time = {'Clip':0, 'Activity':0, 'Assessment':0, 'Game':0}
        game_time_type = sessions_before.groupby('type')['game_time_sum'].sum()
        for session_type, spent in game_time_type.items():
            types_time[session_type] = spent
        features.update(types_time)

        world_time = {'MAGMAPEAK':0, 'NONE':0, 'CRYSTALCAVES':0, 'TREETOPCITY':0}
        game_time_world = sessions_before.groupby('world')['game_time_sum'].sum()
        for world, spent in game_time_world.items():
            world_time[world] = spent
        features.update(world_time)

        # Results of the previous assessments. Every running total is written to
        # the record first and only then updated with the current assessment, so
        # the features never contain the outcome being predicted.
        features['accumu_win_n'] = accumu_win_n
        features['accumu_loss_n'] = accumu_loss_n
        accumu_win_n += assess_info['win_n']
        accumu_loss_n += assess_info['loss_n']

        features.update(accuracy_groups)
        accuracy_groups[assess_info['accuracy_group']] += 1
        features['accuracy_ave'] = accumu_accuracy / counter if counter > 0 else 0
        accumu_accuracy += assess_info['accuracy']
        features['accuracy_group_ave'] = \
                    accumu_accuracy_group / counter if counter > 0 else 0
        accumu_accuracy_group += assess_info['accuracy_group']
        counter += 1

        # Duration statistics of the previous assessments
        if not durations:
            features['duration_mean'] = 0
            features['duration_std'] = 0
            features['duration_max'] = 0
        else:
            features['duration_mean'] = np.mean(durations)
            features['duration_std'] = np.std(durations)
            features['duration_max'] = np.max(durations)
        durations.append(assess_info['game_time_sum'])

        # Result of the assessment taken just before this one (NaN if none)
        if assess_count > 1:
            previous = assess_hist.iloc[-2]
            features['last_title'] = previous['title']
            features['last_win_n'] = previous['win_n']
            features['last_loss_n'] = previous['loss_n']
            features['last_accuracy'] = previous['accuracy']
            features['last_accuracy_group'] = previous['accuracy_group']
        else:
            features['last_title'] = np.nan
            features['last_win_n'] = np.nan
            features['last_loss_n'] = np.nan
            features['last_accuracy'] = np.nan
            features['last_accuracy_group'] = np.nan

        # Earlier tries of the same assessment title
        features['same_accuracy_try'] = 0
        features['same_accuracy_game_time'] = 0
        features['same_accuracy_event_count'] = 0
        if assess_count > 1:
            same_assess = assess_hist[assess_hist['title']==assess_info['title']]
            same_assess = same_assess[same_assess['timestamp_st']<assess_info['timestamp_st']]
            if len(same_assess) > 0:
                features['same_accuracy_try'] = len(same_assess)
                features['same_accuracy_game_time'] = same_assess['game_time_sum'].sum()
                features['same_accuracy_event_count'] = same_assess['event_count'].sum()

        if train_set:
            features['accuracy_group'] = assess_info['accuracy_group']
        # Training keeps only the assessments that were actually answered;
        # for the test set every assessment is kept so the last one is available.
        if not train_set or (assess_info['win_n'] + assess_info['loss_n']) > 0:
            prep_data.append(features)

    if not train_set:
        return prep_data[-1]
    return prep_data
#features

In [None]:
# Build the training records: preprocess_data returns a list of feature dicts
# for every installation_id, and all of them are collected in train_data.
installation_n = train_agr['installation_id'].nunique()
train_data = []
train_groups = train_agr.groupby('installation_id', sort=False)
for ins_id, user_sample in tqdm(train_groups, total=installation_n):
    train_data.extend(preprocess_data(user_sample))

In [None]:
#train_data = pd.DataFrame(train_data)
#train_data.head()

In [None]:
# Turn the list of feature dicts into a DataFrame (one row per kept assessment)
train_data = pd.DataFrame(train_data)
# The session-level train frame is not used after this point
del train_agr

In [None]:
# Number of training rows per 'accuracy_group' label
train_data.groupby('accuracy_group').size()

In [None]:
# process test set, the same that was done with the train set
test_data = []
for ins_id, user_sample in tqdm(test_agr.groupby('installation_id',sort=False),
                                total=1000):
    #print(user_sample.installation_id.iloc[0])
    test_data.append(preprocess_data(user_sample,train_set = False))
test_data = pd.DataFrame(test_data)
#del test_agr

In [None]:
# Every column except the label y ('accuracy_group') is a feature
all_features = [col for col in train_data.columns if col != 'accuracy_group']
# Columns treated as categorical
categorical_features = ['world']

In [None]:
# Show the first rows of the training features
train_data.head()

In [None]:
#object_cols = train_data.columns[train_data.dtypes == 'object']
#for object_col in object_cols:
#    train_data[object_col] = train_data[object_col].astype('float')

In [None]:
#test_agr[test_agr['type']=='Assessment'].head(30)
#user_sample = test_agr[test_agr['installation_id']=='12771ee9']
#user_sample = user_sample.sort_values('timestamp_st')
#assessment_inst = user_sample[user_sample['type']=='Assessment']

#print(assessment_inst)
 
#test_agr[test_agr['type']=='Assessment'].iloc[1]['accuracy_group'].isnan()
#aa['accuracy_group']

In [None]:
# Concatenate train and test features so the encoder is fitted on both
n_train = len(train_data)
temp_df = pd.concat([train_data[all_features], test_data[all_features]])

# Ordinal-encode the categorical columns
encoder = ce.ordinal.OrdinalEncoder(cols=categorical_features)
temp_df = encoder.fit_transform(temp_df)

# Split back into datasets: the first n_train rows are train, the rest is test
X = temp_df.iloc[:n_train, :]
y = train_data['accuracy_group']
X_test = temp_df.iloc[n_train:, :]

In [None]:
# The folds are used to create several datasets, hence several models
# (they are not used for cross-validation).
NFOLDS = 5
folds = StratifiedKFold(
    n_splits=NFOLDS,
    shuffle=True,
    random_state=42,
)
# Alternative that keeps an installation_id inside a single fold:
#groups = X.installation_id
#folds = GroupKFold(n_splits=NFOLDS)
#X = X.drop('installation_id',axis =1)

In [None]:
# LightGBM: train one regressor per fold
start_time = time()
lgb_models = []
scores = []

params = {
    'n_jobs': -1,
    'seed': 42,
    'boosting_type': 'gbdt',
    'objective': 'regression',
#     'num_iteration': 100,
    'metric': 'rmse',
#     'eval_metric': 'cappa',
    # 'feature_fraction' used to appear twice in this dict (0.998495, then 0.9);
    # Python keeps the last value of a repeated key, so 0.9 is the value in effect.
    'feature_fraction': 0.9,        # = colsample_bytree
    # NOTE(review): LightGBM only applies bagging_fraction when bagging_freq
    # is also set to a non-zero value -- confirm whether bagging is intended.
    'bagging_fraction': 0.872417,   # = subsample
    'learning_rate': 0.02,
    'max_depth': 13,
    'num_leaves': 1028,             # below the 2^max_depth (= 8192) leaves a depth-13 tree can hold
    'min_gain_to_split':0.085502,
    'min_child_weight':1.087712,
    'lambda_l1': 1,
    'lambda_l2': 1,
    'verbose': 100,
}

# Only declare the categorical columns that are present in X
lgb_cat_features = [col for col in categorical_features if col in X.columns]

# Train and make models
for fold, (train_ids, val_ids) in enumerate(folds.split(X,y)):
#for fold, (train_ids, val_ids) in enumerate(folds.split(X,y,groups)):
    print('● Fold :', fold+1,'/',NFOLDS)
    train_set = lgb.Dataset(X.iloc[train_ids], y.iloc[train_ids],
                           categorical_feature=lgb_cat_features)
    val_set = lgb.Dataset(X.iloc[val_ids], y.iloc[val_ids],
                         categorical_feature=lgb_cat_features)
    model = lgb.train(params=params,
                      train_set=train_set,
                      valid_sets=[train_set, val_set],
                      num_boost_round=5000,
                      early_stopping_rounds=100,
                      verbose_eval=200
                     )
    lgb_models.append(model)

print('\nTime:', time() - start_time)

In [None]:
# Importance table of the last trained model, most important feature first;
# feature_importance() is called with its default importance type.
importance = (
    pd.DataFrame({'importance': model.feature_importance()},
                 index=train_set.feature_name)
    .sort_values(by='importance', ascending=False)
)
importance[0:50]

In [None]:
# Plot the 50 most important features of the last trained model
fig, ax = plt.subplots(figsize=(10, 15))
lgb.plot_importance(model, max_num_features=50, ax=ax, importance_type='gain') # 'gain' importance; the other available type is 'split'.

In [None]:
# Feature selection: keep the columns whose 'gain' importance in the most
# recently trained LightGBM model is above the threshold.
imp = model.feature_importance(importance_type='gain') # importance as a numpy array
th = 500
use_col = X.columns[imp > th]
X = X[use_col]
X_test = X_test[use_col]
# A categorical column may have been dropped by the selection; keep only the
# surviving ones so that later training cells never name a column missing from X.
categorical_features = [col for col in categorical_features if col in use_col]

In [None]:
# LightGBM: retrain one regressor per fold on the selected features
start_time = time()
lgb_models = []
scores = []

params = {
    'n_jobs': -1,
    'seed': 42,
    'boosting_type': 'gbdt',
    'objective': 'regression',
#     'num_iteration': 100,
    'metric': 'rmse',
#     'eval_metric': 'cappa',
    # 'feature_fraction' used to appear twice in this dict (0.998495, then 0.9);
    # Python keeps the last value of a repeated key, so 0.9 is the value in effect.
    'feature_fraction': 0.9,        # = colsample_bytree
    # NOTE(review): LightGBM only applies bagging_fraction when bagging_freq
    # is also set to a non-zero value -- confirm whether bagging is intended.
    'bagging_fraction': 0.872417,   # = subsample
    'learning_rate': 0.02,
    'max_depth': 13,
    'num_leaves': 1028,             # below the 2^max_depth (= 8192) leaves a depth-13 tree can hold
    'min_gain_to_split':0.085502,
    'min_child_weight':1.087712,
    'lambda_l1': 1,
    'lambda_l2': 1,
    'verbose': 100,
}

# Only declare the categorical columns that are still present in X
lgb_cat_features = [col for col in categorical_features if col in X.columns]

# Train and make models
for fold, (train_ids, val_ids) in enumerate(folds.split(X,y)):
#for fold, (train_ids, val_ids) in enumerate(folds.split(X,y,groups)):
    print('● Fold :', fold+1,'/',NFOLDS)
    train_set = lgb.Dataset(X.iloc[train_ids], y.iloc[train_ids],
                           categorical_feature=lgb_cat_features)
    val_set = lgb.Dataset(X.iloc[val_ids], y.iloc[val_ids],
                         categorical_feature=lgb_cat_features)
    model = lgb.train(params=params,
                      train_set=train_set,
                      valid_sets=[train_set, val_set],
                      num_boost_round=5000,
                      early_stopping_rounds=100,
                      verbose_eval=200
                     )
    lgb_models.append(model)

print('\nTime:', time() - start_time)

In [None]:
# Show the first rows of the training features
train_data.head()

In [None]:
# Plot the 50 most important features of the last retrained model
fig, ax = plt.subplots(figsize=(10, 15))
lgb.plot_importance(model, max_num_features=50, ax=ax, importance_type='gain') # 'gain' importance; the other available type is 'split'.
#use = test[test['installation_id']=='00abaee7']
#use[use['type']=='Assessment']

In [None]:
# process test set, the same that was done with the train set
#installation_n = train['installation_id'].nunique()
#train_agr = pd.DataFrame()
#for ins_id, user_sample in tqdm(train.groupby('installation_id',sort=False),
#                                total=installation_n):
#    train_agr = train_agr.append(agr_session(user_sample))
#user_sample = train[train['installation_id']=='0006a69f']
#train_agr = agr_session(user_sample)

In [None]:
# process test set, the same that was done with the train set
#test_agr = pd.DataFrame()
#for ins_id, user_sample in tqdm(test.groupby('installation_id',sort=False),
#                                total=1000):
#    test_agr = test_agr.append(agr_session(user_sample))

In [None]:
#train_data = preprocess_data(user_sample)
#train_data.head()
#installation_n = train_agr['installation_id'].nunique()
#train_data = pd.DataFrame()
#for ins_id, user_sample in tqdm(train_agr.groupby('installation_id',sort=False),
#                                total=installation_n):
#    train_data = train_data.append(preprocess_data(user_sample))

In [None]:
#test_data = pd.DataFrame()
#for ins_id, user_sample in tqdm(test_agr.groupby('installation_id',sort=False),
#                                total=1000):
#    test_data = test_data.append(preprocess_data(user_sample,False))

In [None]:
# Create multiple datasets to create multiple models (not for CV).
#NFOLDS = 5
#folds = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=42)

In [None]:
# XGBoost: train one regressor per fold
start_time = time()
xgb_models = []
scores = []

# The number of trees is controlled by num_boost_round / early stopping below;
# 'n_estimators' belongs to the scikit-learn wrapper and is not read by
# xgb.train, so it is not part of these parameters.
params = {
    'max_depth': 9,
    'learning_rate': 0.01,          # = eta
    'objective': 'reg:squarederror',  # current name of the deprecated 'reg:linear'
    'subsample': 0.6,               # range (0, 1]
    'colsample_bytree': 1.0,        # range (0, 1]
    'gamma': 0.0,
    'min_child_weight': 5,
    'seed' : 42,
}

# Train and make models
for fold, (train_ids, val_ids) in enumerate(folds.split(X,y)):
    print('● Fold :', fold+1,'/',NFOLDS)
    # folds.split yields positions, so X and y are both indexed positionally
    dtrain = xgb.DMatrix(X.iloc[train_ids], y.iloc[train_ids])
    dval = xgb.DMatrix(X.iloc[val_ids], y.iloc[val_ids])
    model = xgb.train(params=params,
                      dtrain=dtrain,
                      num_boost_round=5000,
                      evals=[(dtrain, 'train'), (dval, 'val')],
                      early_stopping_rounds=100,
                      verbose_eval=100
                     )
    xgb_models.append(model)

print('Time:', time() - start_time)

In [None]:
# makes the model and set the parameters
def make_CatBoost(task_type):
    model = CatBoostRegressor(
        iterations=5000,
        learning_rate=0.02,
        loss_function='RMSE',
        random_seed=42,
        depth=10,                            # add
        border_count=108,                    # add
        bagging_temperature=2.348502,        # add
        task_type=task_type,
        early_stopping_rounds=200
    )
    return model

In [None]:
# CatBoost: train one regressor per fold
start_time = time()
cat_models = []
scores = []

# Only declare the categorical columns that are present in X
cat_cols = [col for col in categorical_features if col in X.columns]

# Train and make models
for fold, (train_ids, val_ids) in enumerate(folds.split(X, y)):
    print('● Fold :', fold+1,'/',NFOLDS)
    model = make_CatBoost(task_type)
    # folds.split yields row positions, so the rows are selected with iloc
    # (as in the LightGBM cells) instead of label-based loc.
    model.fit(X.iloc[train_ids], y.iloc[train_ids],
              eval_set=(X.iloc[val_ids], y.iloc[val_ids]),
              use_best_model=False,
              verbose=500,
              cat_features=cat_cols)
    cat_models.append(model)

print('Time:', time() - start_time)

In [None]:
# Predict the training set with every model; one prediction vector per model
preds = []

# CatBoost models
for model in cat_models:
    preds.append(model.predict(X))

# XGBoost models
for model in xgb_models:
    preds.append(model.predict(xgb.DMatrix(X)).flatten())

# LightGBM models
for model in lgb_models:
    pred = model.predict(X,num_iteration=model.best_iteration)
    preds.append(pred.reshape(len(X),1).flatten())

# One column per model. The names are derived from the number of models of each
# library ('C1'.. CatBoost, 'X1'.. XGBoost, 'L1'.. LightGBM), so the frame is
# still labelled correctly when NFOLDS is changed.
df = pd.DataFrame(preds).T
df.columns = (
    [f'C{i}' for i in range(1, len(cat_models) + 1)]
    + [f'X{i}' for i in range(1, len(xgb_models) + 1)]
    + [f'L{i}' for i in range(1, len(lgb_models) + 1)]
)

In [None]:
# Average the predictions of all models row by row
df['mean'] = df.mean(axis=1)
df.head(10)

In [None]:
class OptRounder(object):
    """Find three cut points that turn continuous scores into classes 0-3.

    The cut points are chosen by minimising the negative quadratic-weighted
    Cohen's kappa between the binned scores and the reference labels.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self):
        # Raw object returned by scipy's minimize(); an empty list until fit().
        self.res_ = []
        # Optimised cut points (``res_.x``); an empty list until fit().
        self.coef_ = []

    def get_res(self):
        """Return whatever the last ``fit`` stored in ``res_``."""
        return self.res_

    # objective function
    def func(self, coef, X, y):
        """Return minus the quadratic-weighted kappa of ``X`` binned by ``coef``.

        The sign is flipped because the optimiser minimises.
        """
        binned = self.bincut(coef, X)
        kappa = cohen_kappa_score(binned, y, weights='quadratic')
        return -kappa

    def bincut(self, coef, X):
        """Bin ``X`` into labels 0..3 using the sorted values of ``coef``.

        ``coef`` is sorted first, so its order does not matter. It must hold
        exactly three values, because four labels need five bin edges
        (-inf, the three cut points, +inf).
        """
        edges = [-np.inf] + list(np.sort(coef)) + [np.inf]
        return pd.cut(X, edges, labels=[0, 1, 2, 3])

    def fit(self, X, y):
        """Optimise the cut points for scores ``X`` against labels ``y``.

        Runs Nelder-Mead from the starting point [0.6, 1.5, 2.4] and stores
        the full result in ``res_`` and the cut points in ``coef_``.
        """
        pfunc = partial(self.func, X=X, y=y)
        self.res_ = sp.optimize.minimize(fun=pfunc,             # objective func
                                         x0=[0.6, 1.5, 2.4],    # initial coef
                                         method='nelder-mead')  # solver
        self.coef_ = self.res_.x

    def predict(self, X, coef=None):
        """Bin ``X`` into classes 0..3.

        Parameters
        ----------
        X
            Scores to classify (anything ``pd.cut`` accepts).
        coef
            Three cut points. When omitted, the cut points stored by ``fit``
            are used.

        Raises
        ------
        ValueError
            If ``coef`` is omitted and ``fit`` has not stored any cut points.
        """
        if coef is None:
            coef = self.coef_
            if len(coef) == 0:
                raise ValueError(
                    'OptRounder.predict() needs `coef` or a prior call to fit()')
        return self.bincut(coef, X)

In [None]:
# Step 3: optimise the rounding thresholds on the ensemble mean (1-D array)
optR = OptRounder()
mean_scores = df['mean'].values.ravel()
optR.fit(mean_scores, y)

res = optR.get_res()        # full optimisation result
coefficients = res.x        # optimised thresholds, reused by later cells

print('●Iterations performed\t:',res.nit)
print('●Optimized coefficients\t:',res.x)
print('●Cohen Kappa score\t:',-res.fun)

In [None]:
# Step 4: round the ensemble mean into classes with the optimised thresholds
train_classes = optR.predict(df['mean'].values, coefficients)
df['predict'] = train_classes.astype(int)

# Reference labels next to the predictions for inspection.
# NOTE(review): if `y` is a Series this assignment aligns on index —
# presumably it shares df's 0..n-1 index; verify.
df['y'] = y
df[['mean', 'predict', 'y']].head(10)

In [None]:
# Side-by-side histograms of the ensemble mean, the rounded class and `y`
hist_cols = ['mean', 'predict', 'y']
df[hist_cols].plot(kind='hist', subplots=True,
                   layout=(1, 3), figsize=(11, 3))

In [None]:
# Hexbin plot: column 'y' on the x axis, column 'predict' on the y axis
df.plot.hexbin(
    x='y',
    y='predict',
    gridsize=(3, 3),
    sharex=False,
    title="binning 'pred' vs 'y'",
)

In [None]:
# Predict the test features with every trained model
preds = []

# CatBoost
for model in cat_models:
    pred = model.predict(X_test)
    preds.append(pred)

# XGBoost (input is wrapped in a DMatrix)
for model in xgb_models:
    pred = model.predict(xgb.DMatrix(X_test)).flatten()
    preds.append(pred)

# LightGBM
for model in lgb_models:
    pred = model.predict(X_test, num_iteration=model.best_iteration)
    pred = pred.reshape(len(X_test), 1).flatten()
    preds.append(pred)

# One column per model, one row per test sample
df_s = pd.DataFrame(preds).T

# Ensemble value: row-wise mean over the model columns
df_s['mean'] = df_s.mean(axis=1)

# Classification with the thresholds optimised on the training predictions
test_classes = optR.predict(df_s['mean'].values, coefficients)
df_s['pred'] = test_classes.astype(int)

print(df_s.shape)
df_s[['mean', 'pred']].head(10)

In [None]:
# Histograms of the test-set ensemble mean and the rounded class
test_hist_cols = ['mean', 'pred']
df_s[test_hist_cols].plot(kind='hist', subplots=True,
                          layout=(1, 2), figsize=(7, 3))

In [None]:
# Load the sample submission and overwrite its target column
sample_path = os.path.join(DIR, 'sample_submission.csv')
submission = pd.read_csv(sample_path)

# NOTE(review): assumes the rows of df_s are in the same order as the sample
# submission (assignment aligns on the default integer index) — confirm.
submission['accuracy_group'] = df_s['pred']
submission.head(10)

In [None]:
# Write the submission file without the DataFrame index column
submission.to_csv('submission.csv', index=False)

## Compile data
Based on several kernels
- Hosseinali: https://www.kaggle.com/mhviraf/a-new-baseline-for-dsb-2019-catboost-model
- Bruno Aquino: https://www.kaggle.com/braquino/catboost-some-more-features

In [None]:
# Convert the raw data into processed features
#def get_data(user_sample, test_set=False):
#    '''
#    user_sample : DataFrame from train/test group by 'installation_id'
#    test_set    : related with the labels processing
#    '''
    # Constants and parameters declaration
#    user_assessments = []
#    last_type = 0
#    types_count = {'Clip':0, 'Activity':0, 'Assessment':0, 'Game':0}
#    time_first_activity = float(user_sample['timestamp'].values[0])
#    time_spent_each_title = {title:0 for title in title_list}
#    event_code_count = {code:0 for code in event_code_list}
#    accuracy_groups = {0:0, 1:0, 2:0, 3:0}
    
#    accumu_accuracy_group = 0
#    accumu_accuracy=0
#    accumu_win_n = 0 
#    accumu_loss_n = 0 
#    accumu_actions = 0
#    counter = 0
#    durations = []
    
    # group by 'game_session'
#    for i, session in user_sample.groupby('game_session', sort=False):
        # i      : game_session_id
        # session: DataFrame from user_sample group by 'game_session'
#        session_type = session['type'].iloc[0]  # Game/Assessment/Activity/Clip
#        session_title = session['title'].iloc[0]
        
#        if session_type != 'Assessment':
#            time_spent = int(session['game_time'].iloc[-1] / 1000)   # [sec]
#            time_spent_each_title[num2title[session_title]] += time_spent
        
#        if (session_type == 'Assessment') & (test_set or len(session)>1):
            # search for event_code 4100(4110)
#            all_4100 = session.query(f'event_code == \
#                                         {title2win_code[session_title]}')
            # numbers of wins and losses
#            win_n = all_4100['event_data'].str.contains('true').sum()
#            loss_n = all_4100['event_data'].str.contains('false').sum()

            # init features and then update
#            features = types_count.copy()
#            features.update(time_spent_each_title.copy())
#            features.update(event_code_count.copy())
#            features['session_title'] = session_title
#            features['accumu_win_n'] = accumu_win_n
#            features['accumu_loss_n'] = accumu_loss_n
#            accumu_win_n += win_n
#            accumu_loss_n += loss_n
            
#            features['day_of_the_week'] = (session['timestamp'].iloc[-1]). \
#                                            strftime('%A')    # Mod 2019-11-17

#            if durations == []:
#                features['duration_mean'] = 0
#            else:
#                features['duration_mean'] = np.mean(durations)
#            durations.append((session.iloc[-1, 2] - session.iloc[0, 2] ).seconds)

            # average of the all accuracy of this player
#            features['accuracy_ave'] = accumu_accuracy / counter \
#                                                if counter > 0 else 0
#            accuracy = win_n / (win_n + loss_n) \
#                                   if (win_n + loss_n) > 0 else 0
#            accumu_accuracy += accuracy
#            if accuracy == 0:
#                features['accuracy_group'] = 0
#            elif accuracy == 1:
#                features['accuracy_group'] = 3
#            elif accuracy == 0.5:
#                features['accuracy_group'] = 2
#            else:
#                features['accuracy_group'] = 1
#            features.update(accuracy_groups)
#            accuracy_groups[features['accuracy_group']] += 1
            # average of accuracy_groups of this player
#            features['accuracy_group_ave'] = \
#                    accumu_accuracy_group / counter if counter > 0 else 0
#            accumu_accuracy_group += features['accuracy_group']
            
            # how many actions the player has done in this game_session
#            features['accumu_actions'] = accumu_actions
            
            # if test_set, all sessions belong to the final dataset
            # elif train, the session needs to pass through this clause
#            if test_set or (win_n + loss_n) > 0:
#                user_assessments.append(features)
                
#            counter += 1
        
        # how many actions were made in each event_code
#        event_codes = Counter(session['event_code'])
#        for key in event_codes.keys():
#            event_code_count[key] += event_codes[key]

        # how many actions the player has done
#        accumu_actions += len(session)
#        if last_type != session_type:
#            types_count[session_type] += 1
#            last_type = session_type
            
    # if test_set, only the last assessment must be predicted,
    # the previous ones are scrapped
#    if test_set:
#        return user_assessments[-1]
#    return user_assessments

## Step 1 : Create Regressor Models
Create multiple training datasets using `kFold` and train a regression model on each one. I used **CatBoost**, **XGBoost**, and **LightGBM**.

### - CatBoost

### - XGBoost

## Step 2 : Predict each Model

## Step 3 : Optimize Rounding Coefficients
The rounding coefficients are optimized on the average of the models' predictions. Optimization uses `scipy.optimize.minimize()`.

## Step 4 : Final Classification

In [None]:
# final classification
#df['predict'] = optR.predict(df['mean'].values, coefficients).astype(int)

#df['y'] = y
#df[['mean','predict','y']].head(10)

In [None]:
#df[['mean','predict','y']].plot(subplots=True,layout=(1, 3),
#                                figsize=(11, 3),kind='hist')

In [None]:
# binning plot of 'pred' versus 'y'
#df.plot.hexbin(x='y', y='predict', gridsize=(3,3),
#               sharex=False, title = "binning 'pred' vs 'y'")

## Make submission

In [None]:
#preds = []
#for model in cat_models:        # CatBoost
#    pred = model.predict(X_test)
#    preds.append(pred)
#for model in xgb_models:        # XGBoost
#    pred = model.predict(xgb.DMatrix(X_test))
#    pred = pred.flatten()
#    preds.append(pred)
#for model in lgb_models:        # LightGBM
#    pred = model.predict(X_test,num_iteration=model.best_iteration)
#    pred = pred.reshape(len(X_test),1).flatten()
#    preds.append(pred)
#df_s = pd.DataFrame(preds).T

#df_s['mean'] = df_s.mean(axis = 'columns')

# Classification
#df_s['pred'] = optR.predict(df_s['mean'].values, coefficients).astype(int)

#print(df_s.shape)
#df_s[['mean','pred']].head(10)

In [None]:
#df_s[['mean','pred']].plot(subplots=True, layout=(1, 2),
#                           figsize=(7, 3), kind='hist')

In [None]:
#submission = pd.read_csv(os.path.join(DIR,'sample_submission.csv'))
#submission['accuracy_group'] = df_s['pred']
#submission.head(10)

In [None]:
#submission.to_csv('submission.csv', index=None)