- Adversarial validation based on **DSB2019_XGB_c08-01**

# Library

In [None]:
import numpy as np
import random
import math
import pandas as pd
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 10000)
pd.set_option("display.max_colwidth", 200)

from numba import jit

import copy
import gc
gc.collect()

from contextlib import contextmanager
import time
from datetime import datetime, timedelta, timezone
JST = timezone(timedelta(hours=+9), 'JST')
notebookstart = time.time()
from timeit import default_timer as timer

import matplotlib.pyplot as plt
import seaborn as sns 
import itertools
import pickle, gzip
import glob

import lightgbm as lgb
import xgboost as xgb

import category_encoders as ce
import json
import scipy as sp
from functools import partial
from sklearn import metrics
from sklearn.metrics import confusion_matrix, cohen_kappa_score


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

In [None]:
# Global seed; passed to seed_torch() in the helper section below.
SEED = 100
# Presumably the number of CV folds and the fold handled by this run -- neither
# is used in the visible part of the notebook; TODO confirm against later cells.
FOLDS = 5
N_FOLD = 0

# Input files of the Kaggle "data-science-bowl-2019" dataset.
# NOTE: despite the `_dir` suffix, these variables hold file paths.
train_dir = '/kaggle/input/data-science-bowl-2019/train.csv'
train_labels_dir = '/kaggle/input/data-science-bowl-2019/train_labels.csv'
specs_dir = '/kaggle/input/data-science-bowl-2019/specs.csv'
test_dir = '/kaggle/input/data-science-bowl-2019/test.csv'
sample_submission_dir = '/kaggle/input/data-science-bowl-2019/sample_submission.csv'

# Directory this notebook writes its artefacts to (e.g. cols_df.csv).
out_dir = '/kaggle/working'

# Not read anywhere in the visible part of the notebook.
DEBUG = False

# Functions: Helper

In [None]:
def seed_torch(s):
    """Seed the random number generators this notebook relies on.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    exports ``PYTHONHASHSEED``.

    Parameters
    ----------
    s : int
        Seed value.
    """
    # The hash seed of the *running* interpreter is fixed at start-up, so this
    # assignment only affects processes started after this point.
    os.environ['PYTHONHASHSEED'] = str(s)
    # The notebook imports and uses ``random``; seed it together with NumPy so
    # that code relying on the global ``random`` state is reproducible as well.
    random.seed(s)
    np.random.seed(s)
    # torch is not imported in this notebook; the calls are kept for reference.
    #     torch.manual_seed(s)
    #     torch.cuda.manual_seed(s)
    #     torch.backends.cudnn.benchmark = False
    #     torch.backends.cudnn.deterministic = True
# Seed once, right after the definition, so every later cell starts from a known state.
seed_torch(SEED)

In [None]:
def to_pickle(obj, filename):
    """Pickle ``obj`` into the file ``filename`` (an existing file is overwritten)."""
    with open(filename, mode='wb') as sink:
        pickle.dump(obj, sink)
        
def read_pickle(filename):
    """Load and return the object pickled in ``filename``."""
    # Unpickling can execute code stored in the file: only load trusted pickles.
    with open(filename, mode='rb') as source:
        return pickle.load(source)

In [None]:
def high_corr(df, th):
    """Display the column pairs of ``df`` whose absolute Pearson correlation is >= ``th``.

    The upper triangle and the diagonal of the correlation matrix are zeroed
    before thresholding, so each pair is reported once. Rows and columns
    without any qualifying entry are left out of the displayed table.
    Nothing is returned.
    """
    corr = df.corr(method='pearson')
    lower = corr*np.tri(len(corr), k=-1)
    strong = lower.where(lower.abs()>=th)
    present = strong.notnull()
    display(strong.loc[present.any(axis=1), present.any(axis=0)])

# Functions: Data import & general processing

In [None]:
# Largest "history_group" value of the frame most recently built by data_prep();
# the hst_* feature functions iterate over range(MAX_HISTORY_GROUP).
MAX_HISTORY_GROUP = 0

def data_prep(is_train=True):
    """Load the event log and tag every row with a ``history_group``.

    Parameters
    ----------
    is_train : bool
        True reads ``train_dir`` and ``train_labels_dir``; False reads ``test_dir``.

    Returns
    -------
    df, or (df, y) when ``is_train`` is True
        ``df`` is the event log sorted by installation_id and timestamp, with
        ``event_code`` cast to str, ``timestamp`` parsed to datetime and the
        extra columns ``flg_non_future`` and ``history_group``.
        ``y`` holds ``accuracy_group`` and ``sample_weight`` indexed by
        (installation_id, history_group).

    Side effects
    ------------
    Overwrites the module-level ``MAX_HISTORY_GROUP`` and prints/displays
    progress information.
    """
    print(f'\n{"*"*30}\n{"*"*10} Prepare X {"*"*10}\n{"*"*30}')
    ##### Import & Cast dtypes #####
    df = pd.read_csv(train_dir if is_train else test_dir)
    print(f'length:\n{len(df):,}')
    
    df['event_code'] = df['event_code'].astype(str)
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    print(f'dtypes:\n{df.dtypes}')
    display(df.head())
    

    if is_train:
        ##### Import labels #####
        train_labels_df = pd.read_csv(train_labels_dir)
        session_has_labels = list(set(train_labels_df['game_session'].values))
        print(f'Length of train_labels:\n\t{len(train_labels_df):,}')
        print(f'Unique game_session in train_labels:\n\t{len(session_has_labels):,}')
        
        ##### Extract the final assessment & Delete after the final assessment #####
        # Per installation: timestamp of the latest first event (event_count==1)
        # of a session that has a label.
        max_timestamp = df[
            (df['event_count']==1) & (df['game_session'].isin(session_has_labels))
                ].groupby('installation_id')[['timestamp']].max().reset_index()
        max_timestamp.columns = ['installation_id', 'max_timestamp']
        # The merge result carries a fresh 0..n-1 index; the assignment aligns on
        # index, which matches because df still has the default index from read_csv.
        # NOTE: despite its name, 'flg_non_future' stores that cut-off timestamp.
        df['flg_non_future'] = pd.merge(df[['installation_id']], max_timestamp, on='installation_id', how='left')['max_timestamp']
        
        print(f'All rows:\n\t{len(df):,}')
        # Drop installations without any labelled session and every event after the cut-off.
        df = df[(df['flg_non_future'].notnull()) & (df['timestamp']<=df['flg_non_future'])]
        print(f'After exclude future rows:\n\t{len(df):,}')
        
    else:
        df['flg_non_future'] = 0
        
    ##### Sort records along with "installation_id" & "timestamp" #####
    df.sort_values(['installation_id','timestamp'], inplace=True)
    
    
    ##### Create "history_group" #####
    if is_train:
        # Mark the first event of every labelled session ...
        df['history_group'] = 0
        df.loc[(df['event_count']==1) & (df['game_session'].isin(session_has_labels)), ['history_group']] = 1

        # ... then accumulate the marks from the newest event backwards, so that
        # history_group == number of labelled session starts at or after the row.
        # Rows with history_group >= k are therefore the history available up to
        # the k-th labelled session counted from the end.
        df.sort_values(['installation_id','timestamp'], inplace=True, ascending=[True,False])
        df['history_group'] = df.groupby(['installation_id'])['history_group'].cumsum()
        
    else:
        # Test data: the whole log of an installation forms a single group.
        df['history_group'] = 1
        

    df.sort_values(['installation_id','timestamp'], inplace=True)
    display(df.head())
    global MAX_HISTORY_GROUP
    MAX_HISTORY_GROUP = df["history_group"].max()
    print(f'Max assessment in history:\t{MAX_HISTORY_GROUP}')
    
    if is_train:
        print(f'\n{"*"*30}\n{"*"*10} Prepare y {"*"*10}\n{"*"*30}')
        # Attach the history_group of each labelled session (taken from its first event).
        y = pd.merge(
            train_labels_df[['installation_id','game_session','accuracy_group']],
            df.loc[(df['event_count']==1), ['installation_id','game_session','history_group']].drop_duplicates(),
            on=['installation_id','game_session'],
            how='left')
        y = y[['installation_id','history_group','accuracy_group']]

        # Weight = 1 / (number of label rows with a history_group for the installation).
        y['sample_weight'] = y.groupby('installation_id').transform('count')['history_group']
        y['sample_weight'] = 1/y['sample_weight']
        y = y.set_index(['installation_id','history_group']).sort_index()

        display(y.head())
        print(f'Shape of y:\n\t{y.shape}')
        
        return df, y
    else:
        return df

- Create fold information

In [None]:
def create_INS_ID_LIST_TRAIN(df, seed):
    """Return the unique ``installation_id`` values of ``df`` in a shuffled order.

    The ids are sorted first, so the resulting order depends only on the set of
    ids and on ``seed`` (a dedicated ``random.Random`` instance is used).
    """
    shuffled_ids = df['installation_id'].unique()
    shuffled_ids.sort()
    shuffler = random.Random(seed)
    shuffler.shuffle(shuffled_ids)
    return shuffled_ids

# Prepare Train

In [None]:
%%time
# Builds the truncated training event log and its labels; also sets MAX_HISTORY_GROUP.
train_df, y = data_prep(is_train=True)

# Prep features

In [None]:
# def import_FE(FE_dir_c, pkl_name, score_df_01dir, score_df_02dir, cutoff_01, cutoff_02):
#     drop_col_list = []
    
#     scores_df = pd.read_csv(score_df_01dir+'/scores_df.csv')
#     drop_col_list_ni01 = scores_df.loc[int(cutoff_01*1.2):, 'feature'].tolist()

#     scores_df = pd.read_csv(score_df_02dir+'/scores_df.csv')
#     drop_col_list_ni02 = scores_df.loc[int(cutoff_02):, 'feature'].tolist()

#     drop_col_list = list(set(drop_col_list+drop_col_list_ni01+drop_col_list_ni02))
    
#     FE_train = read_pickle(FE_dir_c+'/'+pkl_name)
#     FE_train.drop(columns=[c for c in FE_train.columns if c in drop_col_list], inplace=True)
#     return FE_train

## FE-c08

In [None]:
# Pre-computed training features stored as a pickle in another Kaggle dataset.
FE_dir_c = '/kaggle/input/dsb2019-fe-c08'
pkl_name = 'FE_XGB_c02-07_train.pkl'


FE = read_pickle(FE_dir_c+'/'+pkl_name)
# X is an alias of FE (same object), not a copy: in-place changes to X also change FE.
X = FE

display(X.head())
print(f'X.shape\t:{X.shape}')

## Drop features

In [None]:
def create_drop_col_list(score_df_01dir, type_01='gain>0', score_df_02dir=None, type_02='gain>0', cutoff_01=None, cutoff_02=None):
    """Collect the features to drop according to one or two ``scores_df.csv`` files.

    Each directory must contain a ``scores_df.csv`` with a ``feature`` column
    (and a ``gain_score`` column for the ``'gain>0'`` rule).

    Parameters
    ----------
    score_df_01dir : str
        Directory of the first score file.
    type_01, type_02 : str
        ``'cutoff'`` drops every feature from a given row label onwards;
        ``'gain>0'`` drops features whose ``gain_score`` is <= 0. Any other
        value selects nothing.
    score_df_02dir : str or None
        Directory of the optional second score file.
    cutoff_01 : number or None
        For ``type_01='cutoff'``: rows from ``int(cutoff_01*1.2)`` onwards are dropped.
    cutoff_02 : number or None
        For ``type_02='cutoff'``: rows from ``int(cutoff_02)`` onwards are dropped.

    Returns
    -------
    list
        Feature names to drop, without duplicates, in order of first appearance
        (first file, then second file), so the result is reproducible.

    Raises
    ------
    TypeError
        If a ``'cutoff'`` rule is requested without the matching cutoff value.
    """
    def _features_to_drop(score_dir, rule, first_dropped_row, cutoff_name):
        # Select the features one score file tells us to drop.
        scores_df = pd.read_csv(score_dir+'/scores_df.csv')
        if rule=='cutoff':
            if first_dropped_row is None:
                raise TypeError(f"{cutoff_name} must be given when the 'cutoff' rule is used")
            # Label-based slice on the default index written by read_csv.
            return scores_df.loc[int(first_dropped_row):, 'feature'].tolist()
        if rule=='gain>0':
            return scores_df.loc[scores_df['gain_score']<=0, 'feature'].tolist()
        return []

    # The first cutoff is widened by 20 %; the second one is used as given.
    widened_cutoff_01 = None if cutoff_01 is None else cutoff_01*1.2
    drop_col_list = _features_to_drop(score_df_01dir, type_01, widened_cutoff_01, 'cutoff_01')

    if score_df_02dir is not None:
        drop_col_list = drop_col_list + _features_to_drop(score_df_02dir, type_02, cutoff_02, 'cutoff_02')

    # dict.fromkeys de-duplicates while keeping a deterministic order
    # (a plain set() would order the strings differently from run to run).
    return list(dict.fromkeys(drop_col_list))

## FE-c08

In [None]:
# Directories that each hold a scores_df.csv used to decide which features to drop.
score_df_01dir = '/kaggle/input/dsb2019-xgb-fs-c08-ni01-03'
score_df_02dir = '/kaggle/input/dsb2019-xgb-fs-c08-ni02-03'
# Row cut-offs for the 'cutoff' rule (create_drop_col_list widens cutoff_01 by a factor 1.2).
cutoff_01 = 350
cutoff_02 = 170
type_01 = 'cutoff'
type_02 = 'cutoff'


drop_col_list = create_drop_col_list(score_df_01dir, type_01, score_df_02dir, type_02, cutoff_01, cutoff_02)
# In-place drop: X aliases FE, so FE loses the columns too. errors='ignore'
# skips listed features that are not present in X.
X.drop(columns=drop_col_list, inplace=True, errors='ignore')
display(X.head())
print(f'X.shape\t:{X.shape}')

In [None]:
# Persist the names of the surviving feature columns.
cols_df = pd.DataFrame({'col_name':X.columns.tolist()})

# The default index is written as well; readers select the 'col_name' column.
cols_df.to_csv(out_dir+'/cols_df.csv')

# Prepare test

In [None]:
# Outputs of the LightGBM / XGBoost training notebooks; each provides the list
# of feature columns its model was trained on (cols_df.csv).
lgb_dir = '/kaggle/input/dsb2019-lgb-c08-01'
xgb_dir = '/kaggle/input/dsb2019-xgb-c08-01'

LGB_FEAT = pd.read_csv(lgb_dir+'/cols_df.csv')['col_name'].tolist()
XGB_FEAT = pd.read_csv(xgb_dir+'/cols_df.csv')['col_name'].tolist()
# Undo the '[' -> '~' and ']' -> '|' renaming that xgb_ohe applies to column names.
XGB_FEAT = [c.replace('~','[').replace('|',']') for c in XGB_FEAT]

# Union of all features needed by either model plus three helper columns.
# dict.fromkeys removes duplicates while keeping first-appearance order, so the
# list is identical on every run; list(set(...)) ordered the strings differently
# from one kernel start to the next because of string hash randomisation.
FEATURE_ORDER = list(dict.fromkeys(LGB_FEAT + XGB_FEAT + ['title_from','event_code_from','event_id_from']))

print(f'# of lgb_feat:\t{len(LGB_FEAT)}')
print(f'# of xgb_feat:\t{len(XGB_FEAT)}')
print(f'# of FEATURE_ORDER:\t{len(FEATURE_ORDER)}')

In [None]:
def xgb_ohe(df):
    """One-hot encode the object/category columns of ``df``.

    The encoded columns are sorted by name and appended after the remaining
    columns; finally '[' and ']' in all column names are replaced by '~' and '|'.

    Note: the categorical columns are dropped from the passed-in ``df`` in
    place, so the caller's frame is modified as a side effect.

    Returns
    -------
    pandas.DataFrame
        New frame with the remaining columns followed by the one-hot columns.
    """
    categorical = [name for name in df.columns if df[name].dtype.name in ['object','category']]
    encoder = ce.OneHotEncoder(cols=categorical, handle_unknown='error', use_cat_names=True)
    encoded = encoder.fit_transform(df[categorical]).sort_index(axis=1)
    # Kept in place on purpose: existing callers may rely on the input being stripped.
    df.drop(columns=categorical, inplace=True)
    combined = pd.concat([df, encoded], axis=1)
    combined.columns = [label.replace('[','~').replace(']','|') for label in combined.columns]
    return combined

In [None]:
def create_ohe(df, cols, n_chank=100):
    """Per-installation mean and sum of the one-hot encoded event columns ``cols``.

    Installations are processed in chunks of ``n_chank`` ids. Each chunk is
    encoded with its own encoder, so a chunk only yields columns for the
    categories it contains; the chunks are stacked at the end.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``installation_id`` (sorted), with columns named
        ``<one-hot column>_mean`` / ``<one-hot column>_sum``.
    """
    started_at = time.time()
    all_ids = df['installation_id'].unique().tolist()
    total = len(all_ids)
    chunk_frames = [pd.DataFrame()]
    pending_ids = []

    for position, current_id in enumerate(all_ids, start=1):
        print('\r',end='',flush=True)
        print(f'{position} / {total}  {(time.time()-started_at)/60:.1f} mins', end='',flush=True)
        pending_ids.append(current_id)

        # Process only when the chunk is full or the last id has been reached.
        chunk_full = position%n_chank==0
        if not (chunk_full or position==total):
            continue

        chunk = df[df['installation_id'].isin(pending_ids)]
        encoder = ce.OneHotEncoder(cols=cols, handle_unknown='error', use_cat_names=True)
        encoded = encoder.fit_transform(chunk[['installation_id']+cols])
        stats = encoded.groupby(['installation_id']).agg([np.mean,np.sum]) # DSB2019_FE_02 -> mean&sum
        stats.columns = [name+'_'+func for name, func in stats.columns]
        chunk_frames.append(stats)

        pending_ids = []

    out_df = pd.concat(chunk_frames, axis=0)
    return out_df.reset_index().set_index('installation_id').sort_index()
def create_last(df):
    """Return, per installation, the row(s) of ``df`` carrying its latest timestamp.

    The result is indexed by ``installation_id``. If several rows share the
    latest timestamp of an installation, all of them are returned.
    """
    latest = df.groupby(['installation_id'])[['timestamp']].max().reset_index()
    matched = latest.merge(df, on=['installation_id','timestamp'], how='left')
    return matched.set_index('installation_id')
def create_assess(df):
    """Average assessment results per installation.

    Attempt rows are Assessment events with event_code '4100', or '4110' for
    'Bird Measurer (Assessment)'. An attempt counts as correct / incorrect when
    its ``event_data`` contains the text 'true' / 'false'. Per game session the
    accuracy and an accuracy group (3: accuracy 1.0, 2: accuracy 0.5,
    1: strictly between 0 and 0.5, 0: otherwise) are derived, then averaged
    over the sessions of each installation.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``installation_id`` with the columns ``num_correct_mean``,
        ``num_incorrect_mean``, ``accuracy_mean`` and ``accuracy_group_mean``.
    """
    is_assessment = df['type']=='Assessment'
    is_bird = df['title']=='Bird Measurer (Assessment)'
    not_bird = df['title']!='Bird Measurer (Assessment)'
    attempt_rows = (is_assessment & not_bird & (df['event_code']=='4100')) | (is_assessment & is_bird & (df['event_code']=='4110'))

    attempts = df.loc[attempt_rows, ['installation_id','game_session','event_data']]
    attempts = attempts.assign(
        num_correct=attempts['event_data'].str.contains('true')*1,
        num_incorrect=attempts['event_data'].str.contains('false')*1,
    )

    per_session = attempts.groupby(['installation_id','game_session'])[['num_correct','num_incorrect']].sum().reset_index()
    per_session['accuracy'] = per_session['num_correct']/(per_session['num_correct']+per_session['num_incorrect'])

    accuracy = per_session['accuracy']
    per_session['accuracy_group'] = np.select(
        [accuracy==1.0, accuracy==0.5, (accuracy>0.0)&(accuracy<0.5)],
        [3, 2, 1],
        default=0,
    )

    out_df = per_session.groupby(['installation_id'])[['num_correct','num_incorrect','accuracy','accuracy_group']].agg([np.mean]) # .agg([np.sum,np.mean])
    # Flatten the (column, function) pairs; the index already is installation_id.
    out_df.columns = [name+'_'+func for name, func in out_df.columns]
    return out_df
def create_assess_match_last(df):
    """Average assessment results restricted to the title of the latest assessment.

    The title of each installation's most recent Assessment event is looked up;
    only attempt rows (event_code '4100', or '4110' for 'Bird Measurer
    (Assessment)') with that same title are kept. Correct / incorrect counts
    and accuracy are computed per game session and averaged per installation.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``installation_id`` with the columns
        ``num_correct_match_last_mean``, ``num_incorrect_match_last_mean`` and
        ``accuracy_match_last_mean``.
    """
    # Title of the most recent Assessment event of every installation.
    latest_time = df[df['type']=='Assessment'].groupby(['installation_id'])['timestamp'].max().reset_index()
    latest_title = pd.merge(latest_time, df[['installation_id','timestamp','title']],
                            on=['installation_id','timestamp'], how='left')[['installation_id','title']]
    latest_title = latest_title.assign(match_last=1)

    is_assessment = df['type']=='Assessment'
    is_bird = df['title']=='Bird Measurer (Assessment)'
    not_bird = df['title']!='Bird Measurer (Assessment)'
    attempt_rows = (is_assessment & not_bird & (df['event_code']=='4100')) | (is_assessment & is_bird & (df['event_code']=='4110'))
    attempts = df.loc[attempt_rows, ['installation_id','game_session','event_data','title']]

    # Keep only the attempts whose title equals the latest assessment's title.
    attempts = pd.merge(attempts, latest_title, on=['installation_id','title'], how='left')
    attempts = attempts[attempts['match_last']==1]
    attempts = attempts.assign(
        num_correct_match_last=attempts['event_data'].str.contains('true')*1,
        num_incorrect_match_last=attempts['event_data'].str.contains('false')*1,
    )

    per_session = attempts.groupby(['installation_id','game_session'])[['num_correct_match_last','num_incorrect_match_last']].sum().reset_index()
    per_session['accuracy_match_last'] = per_session['num_correct_match_last']/(per_session['num_correct_match_last']+per_session['num_incorrect_match_last'])

    out_df = per_session.groupby(['installation_id'])[['num_correct_match_last','num_incorrect_match_last','accuracy_match_last']].agg([np.mean]) # .agg([np.sum,np.mean])
    # Flatten the (column, function) pairs; the index already is installation_id.
    out_df.columns = [name+'_'+func for name, func in out_df.columns]
    return out_df
def create_diff_timestamp(df):
    """Time spent per (installation, world, type, title, event_code, event_id, session).

    For all events whose ``type`` is not 'Clip', the gap to the previous event
    of the same installation and game session is computed; the first event of
    a session gets a gap of zero. The gaps are summed per key combination.

    Returns
    -------
    pandas.DataFrame
        One row per key combination with the key columns and ``timespend``.
        ``timespend`` sums the *seconds component* of each gap
        (``Series.dt.seconds``), i.e. whole days of a gap are not included.
    """
    keys = ['installation_id','world','type','title','event_code','event_id','game_session']
    wo_clip = df['type']!='Clip'

    gaps = df[wo_clip].groupby(['installation_id','game_session'])[['timestamp']].diff()
    # Fill the first event of each session with a real zero timedelta. An integer
    # 0 is not a valid timedelta fill value; on newer pandas it can turn the
    # column into object dtype, after which the ``.dt`` accessor fails.
    gaps = gaps.fillna(pd.Timedelta(0))
    gaps['timestamp'] = gaps['timestamp'].dt.seconds

    # Both pieces share the index of the non-Clip rows, so concat lines them up.
    keyed_gaps = pd.concat([df.loc[wo_clip, keys], gaps], axis=1)
    out_df = keyed_gaps.groupby(keys)[['timestamp']].sum()

    out_df.columns = ['timespend']
    return out_df.reset_index()
def create_timespend(diff_FE, col_comb):
    """Pivot ``timespend`` into per-installation mean and sum columns.

    For every column combination in ``col_comb`` a pivot table indexed by
    ``installation_id`` is built. Its columns are renamed to
    ``timespend_<values in reverse level order>_<function>``, e.g.
    ``timespend_Game_mean``. The tables are joined side by side.
    """
    value_col = 'timespend'
    out_df = pd.DataFrame()
    for group_cols in col_comb:
        pivoted = pd.pivot_table(diff_FE, index=['installation_id'], columns=group_cols, values=value_col, aggfunc=[np.mean,np.sum]) # [np.mean,np.sum])
        # Column keys are (function, value, ...) tuples; name them back to front.
        pivoted.columns = ['_'.join([value_col, *reversed(key)]) for key in pivoted.columns]
        out_df = pd.concat([out_df, pivoted], axis=1)
    return out_df
def create_timepast(df, n_from, n_to):
    """Pair the ``n_to``-th and ``n_from``-th most recent rows of every installation.

    Rows are numbered per installation by descending index value (1 = largest
    index). The row numbered ``n_to`` gets the suffix ``_to`` on all columns,
    the row numbered ``n_from`` the suffix ``_from``; they are joined on
    ``installation_id`` (left join from the ``_to`` rows).

    ``df`` must have an unnamed index, because the numbering is based on the
    'index' column produced by ``reset_index``.
    """
    numbered = df.reset_index()
    numbered['game_number'] = numbered.groupby('installation_id')['index'].rank(ascending=False)
    numbered = numbered.drop(columns=['index'])

    def _with_suffix(frame, suffix):
        # Suffix every column except the join key.
        return frame.rename(columns=lambda name: name if name=='installation_id' else name+suffix)

    to_rows = _with_suffix(numbered[numbered['game_number']==n_to], '_to')
    from_rows = _with_suffix(numbered[numbered['game_number']==n_from], '_from')

    paired = pd.merge(to_rows, from_rows, on='installation_id', how='left')
    return paired.set_index('installation_id')
def create_json(df, n_chank=100, session_wise=False):
    """Per-installation mean and sum of the fields found in ``event_data``.

    Installations are processed in chunks of ``n_chank`` ids. The JSON in
    ``event_data`` is flattened into columns; with ``session_wise=True`` the
    columns are first summed per game session. Finally mean and sum are taken
    per installation.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``installation_id``, ``game_session`` and ``event_data``.
    n_chank : int
        Number of installations per chunk.
    session_wise : bool
        Sum per (installation, session) before the per-installation aggregation.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``installation_id`` (sorted), columns named ``<field>_mean``
        / ``<field>_sum``. An input without rows yields an empty frame with
        that index name, matching ``create_json_ohe``.

    NOTE(review): the aggregations are applied to every flattened column,
    including text ones such as ``game_session``; this presumably relies on a
    pandas version that silently drops columns it cannot aggregate -- confirm
    against the pinned pandas version.
    """
    # pandas >= 1.0 provides json_normalize at top level; the pandas.io.json
    # location is deprecated there, so it is only used as a fallback.
    normalize = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize

    start = time.time()
    chunk_frames = []
    ins_id = []
    ins_id_list = df['installation_id'].unique().tolist()

    for i, i_id in enumerate(ins_id_list):
        print(f'{i+1} / {len(ins_id_list)}  {(time.time()-start)/60:.1f} mins\r', end='',flush=True)
        ins_id.append(i_id)

        # Process only when the chunk is full or the last id has been reached.
        if (i+1)%n_chank!=0 and (i+1)!=len(ins_id_list):
            continue

        # Reset the index so the flattened JSON rows line up with the id columns.
        df_tmp = df[df['installation_id'].isin(ins_id)].reset_index(drop=True)
        out_df_tmp = normalize(df_tmp['event_data'].apply(json.loads))
        out_df_tmp = pd.concat([df_tmp[['installation_id','game_session']], out_df_tmp], axis=1)
        ##### Turn on when session-wise #####
        if session_wise:
            out_df_tmp = out_df_tmp.groupby(['installation_id','game_session']).agg([np.sum])
            out_df_tmp.columns = [c[0] for c in out_df_tmp.columns]
            out_df_tmp.reset_index(inplace=True)
        #####

        out_df_tmp = out_df_tmp.groupby(['installation_id']).agg([np.mean,np.sum])
        # Flatten the (field, function) pairs; the index already is installation_id.
        out_df_tmp.columns = [c[0]+'_'+c[1] for c in out_df_tmp.columns]
        chunk_frames.append(out_df_tmp)

        ins_id = []

    if not chunk_frames:
        return pd.DataFrame(index=pd.Index([], name='installation_id'))

    # One concat at the end instead of re-copying the growing result per chunk.
    out_df = pd.concat(chunk_frames, axis=0)
    out_df = out_df.reset_index().set_index('installation_id').sort_index()
    return out_df
def create_json_ohe(df, n_chank=100, session_wise=False):
    """Per-installation mean and sum of one-hot encoded ``event_data`` fields.

    Only the JSON fields listed in ``OHE_cols`` are used. Their values are
    converted to strings and one-hot encoded per chunk of ``n_chank``
    installations; with ``session_wise=True`` the indicators are first summed
    per game session. Chunks without any of the listed fields are skipped.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``installation_id`` (sorted), columns named
        ``<one-hot column>_mean`` / ``<one-hot column>_sum``. Empty (with that
        index name) when no chunk produced features.

    NOTE(review): with ``session_wise=True`` the text column ``game_session``
    reaches the final mean/sum aggregation; this presumably relies on a pandas
    version that silently drops columns it cannot aggregate -- confirm.
    """
    # pandas >= 1.0 provides json_normalize at top level; the pandas.io.json
    # location is deprecated there, so it is only used as a fallback.
    normalize = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize

    start = time.time()
    chunk_frames = []
    ins_id = []
    ins_id_list = df['installation_id'].unique().tolist()
    
    OHE_cols = [
        "object_type",
        "layout.row1",
        "chests",
        "gate.side",
        "layout.row2",
        "target_distances",
        "diet",
        "holding_shell",
        "layout.right.pig",
        "object",
        "toy_earned",
        "source",
        "mode",
        "round_target.type",
        "location",
        "has_water",
        "shells",
        "dinosaur",
        "jar_filled",
        "bowls",
    ]
    
    
    for i, i_id in enumerate(ins_id_list):
        print(f'\r{i+1} / {len(ins_id_list)}  {(time.time()-start)/60:.1f} mins', end='',flush=True)
        ins_id.append(i_id)
        
        # Process only when the chunk is full or the last id has been reached.
        if (i+1)%n_chank!=0 and (i+1)!=len(ins_id_list):
            continue

        # Reset the index so the flattened JSON rows line up with the id columns.
        df_tmp = df.loc[df['installation_id'].isin(ins_id), ['installation_id','game_session','event_data']].reset_index(drop=True)
        out_df_tmp = normalize(df_tmp['event_data'].apply(json.loads))
        
        cols_in_tmp = [c for c in out_df_tmp.columns if c in OHE_cols]
        if len(cols_in_tmp)==0:
            # Nothing to encode in this chunk.
            ins_id = []
            continue
            
        out_df_tmp = out_df_tmp[cols_in_tmp]
        # Stringify so that lists / missing values become ordinary categories.
        out_df_tmp = out_df_tmp.astype(str)
        ce_ohe = ce.OneHotEncoder(cols=out_df_tmp.columns.tolist(), handle_unknown='error', use_cat_names=True)
        out_df_tmp = ce_ohe.fit_transform(out_df_tmp)
        out_df_tmp = pd.concat([df_tmp[['installation_id','game_session']], out_df_tmp], axis=1)
        
        ##### Turn on when session-wise #####
        if session_wise:
            out_df_tmp = out_df_tmp.groupby(['installation_id','game_session']).agg([np.sum])
            out_df_tmp.columns = [c[0] for c in out_df_tmp.columns]
            out_df_tmp.reset_index(inplace=True)
        #####
        
        out_df_tmp = out_df_tmp.groupby(['installation_id']).agg([np.mean,np.sum])
        # Flatten the (column, function) pairs; the index already is installation_id.
        out_df_tmp.columns = [c[0]+'_'+c[1] for c in out_df_tmp.columns]
        chunk_frames.append(out_df_tmp)
        
        ins_id = []
    
    if chunk_frames:
        # One concat at the end instead of re-copying the growing result per chunk.
        out_df = pd.concat(chunk_frames, axis=0)
    else:
        out_df = pd.DataFrame(index=[], columns=['installation_id'])
        out_df.set_index(['installation_id'], inplace=True)

    out_df = out_df.reset_index().set_index('installation_id').sort_index()
    return out_df
def create_ohe_session(df, cols, n_chank=100):
    """Per-installation mean and sum of one-hot columns, counted once per session.

    Works like ``create_ohe`` but first reduces each chunk to the distinct
    (installation_id, game_session, *cols) combinations, so a category counts
    at most once per game session.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``installation_id`` (sorted), columns named
        ``<one-hot column>_mean`` / ``<one-hot column>_sum``.
    """
    started_at = time.time()
    all_ids = df['installation_id'].unique().tolist()
    total = len(all_ids)
    key_cols = ['installation_id','game_session']+cols
    chunk_frames = [pd.DataFrame()]
    pending_ids = []

    for position, current_id in enumerate(all_ids, start=1):
        print('\r',end='',flush=True)
        print(f'{position} / {total}  {(time.time()-started_at)/60:.1f} mins', end='',flush=True)
        pending_ids.append(current_id)

        # Process only when the chunk is full or the last id has been reached.
        chunk_full = position%n_chank==0
        if not (chunk_full or position==total):
            continue

        chunk = df.loc[df['installation_id'].isin(pending_ids), key_cols]
        chunk = chunk.drop_duplicates(subset=key_cols)
        encoder = ce.OneHotEncoder(cols=cols, handle_unknown='error', use_cat_names=True)
        encoded = encoder.fit_transform(chunk[key_cols])
        stats = encoded.groupby(['installation_id']).agg([np.mean,np.sum]) # DSB2019_FE_02 -> mean&sum
        stats.columns = [name+'_'+func for name, func in stats.columns]
        chunk_frames.append(stats)

        pending_ids = []

    out_df = pd.concat(chunk_frames, axis=0)
    return out_df.reset_index().set_index('installation_id').sort_index()
def create_ohe_session_mean(df, cols, n_chank=100):
    """Per-installation mean of session-level one-hot sums.

    Each chunk is reduced to the distinct (installation_id, game_session,
    *cols) combinations and one-hot encoded; the indicators are summed per game
    session and those sums are averaged per installation.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``installation_id`` (sorted), columns named
        ``<one-hot column>_sum_mean``.
    """
    started_at = time.time()
    all_ids = df['installation_id'].unique().tolist()
    total = len(all_ids)
    key_cols = ['installation_id','game_session']+cols
    chunk_frames = [pd.DataFrame()]
    pending_ids = []

    for position, current_id in enumerate(all_ids, start=1):
        print('\r',end='',flush=True)
        print(f'{position} / {total}  {(time.time()-started_at)/60:.1f} mins', end='',flush=True)
        pending_ids.append(current_id)

        # Process only when the chunk is full or the last id has been reached.
        chunk_full = position%n_chank==0
        if not (chunk_full or position==total):
            continue

        chunk = df.loc[df['installation_id'].isin(pending_ids), key_cols]
        chunk = chunk.drop_duplicates(subset=key_cols)
        encoder = ce.OneHotEncoder(cols=cols, handle_unknown='error', use_cat_names=True)
        encoded = encoder.fit_transform(chunk[key_cols])

        # Stage 1: sum the indicators per game session.
        per_session = encoded.groupby(['installation_id','game_session']).agg([np.sum])
        per_session.columns = [name+'_'+func for name, func in per_session.columns]
        per_session = per_session.reset_index()

        # Stage 2: average the session sums per installation.
        per_installation = per_session.groupby(['installation_id']).agg([np.mean])
        per_installation.columns = [name+'_'+func for name, func in per_installation.columns]
        chunk_frames.append(per_installation)

        pending_ids = []

    out_df = pd.concat(chunk_frames, axis=0)
    return out_df.reset_index().set_index('installation_id').sort_index()


def hst_ctg_FE(df):
    """One-hot mean/sum features of ``type``, ``title`` and ``event_code`` per history group.

    For every k in 1..MAX_HISTORY_GROUP the rows with ``history_group >= k``
    are passed to ``create_ohe``. Columns whose name contains ' Level ' and
    columns not listed in ``FEATURE_ORDER`` are discarded.

    Returns
    -------
    pandas.DataFrame
        Indexed by (installation_id, history_group), sorted.
    """
    ohe_cols = ['type','title','event_code']
    wanted = set(FEATURE_ORDER) | {'installation_id','history_group'}
    per_group = [pd.DataFrame()]

    for group_no in range(1, MAX_HISTORY_GROUP+1):
        history = df[df['history_group']>=group_no].drop(columns=['history_group'])

        feats = create_ohe(history, ohe_cols, n_chank=100)
        del history

        feats = feats.drop(columns=[c for c in feats.columns if ' Level ' in c])

        feats['history_group'] = group_no
        feats = feats.drop(columns=[c for c in feats.columns if c not in wanted])
        per_group.append(feats)

        del feats; gc.collect();

    ctg_FE = pd.concat(per_group, axis=0)
    ctg_FE = ctg_FE.reset_index().set_index(['installation_id','history_group']).sort_index()
    display(ctg_FE.head())
    print(f'ctg_FE.shape:\t{ctg_FE.shape}')
    print(f'***** History group {MAX_HISTORY_GROUP}: Done *****')
    return ctg_FE
def hst_ctg_FE_03(df):
    """One-hot mean/sum features of the combined ``title``+``event_code`` per history group.

    Side effect: adds the column ``title_event_code`` (``title`` + '_' +
    ``event_code``) to the passed-in ``df``.

    For every k in 1..MAX_HISTORY_GROUP the rows with ``history_group >= k``
    are passed to ``create_ohe``. Columns whose name contains ' Level ' and
    columns not listed in ``FEATURE_ORDER`` are discarded.

    Returns
    -------
    pandas.DataFrame
        Indexed by (installation_id, history_group), sorted.
    """
    # Written into the caller's frame on purpose; later cells may read this column.
    df['title_event_code'] = df['title']+'_'+df['event_code']

    ohe_cols = ['title_event_code']
    wanted = set(FEATURE_ORDER) | {'installation_id','history_group'}
    per_group = [pd.DataFrame()]

    for group_no in range(1, MAX_HISTORY_GROUP+1):
        history = df[df['history_group']>=group_no].drop(columns=['history_group'])

        feats = create_ohe(history, ohe_cols, n_chank=100)
        del history

        feats = feats.drop(columns=[c for c in feats.columns if ' Level ' in c])

        feats['history_group'] = group_no
        feats = feats.drop(columns=[c for c in feats.columns if c not in wanted])
        per_group.append(feats)

        del feats; gc.collect();

    ctg_FE = pd.concat(per_group, axis=0)
    ctg_FE = ctg_FE.reset_index().set_index(['installation_id','history_group']).sort_index()
    display(ctg_FE.head())
    print(f'ctg_FE.shape:\t{ctg_FE.shape}')
    print(f'***** History group {MAX_HISTORY_GROUP}: Done *****')
    return ctg_FE
def hst_ctg_FE_04(df):
    """One-hot mean/sum features of ``event_id`` per history group.

    For every k in 1..MAX_HISTORY_GROUP the rows with ``history_group >= k``
    are passed to ``create_ohe``. Columns whose name contains ' Level ' and
    columns not listed in ``FEATURE_ORDER`` are discarded.

    Returns
    -------
    pandas.DataFrame
        Indexed by (installation_id, history_group), sorted.
    """
    ohe_cols = ['event_id']
    wanted = set(FEATURE_ORDER) | {'installation_id','history_group'}
    per_group = [pd.DataFrame()]

    for group_no in range(1, MAX_HISTORY_GROUP+1):
        history = df[df['history_group']>=group_no].drop(columns=['history_group'])

        feats = create_ohe(history, ohe_cols, n_chank=100)
        del history

        feats = feats.drop(columns=[c for c in feats.columns if ' Level ' in c])

        feats['history_group'] = group_no
        feats = feats.drop(columns=[c for c in feats.columns if c not in wanted])
        per_group.append(feats)

        del feats; gc.collect();

    ctg_FE = pd.concat(per_group, axis=0)
    ctg_FE = ctg_FE.reset_index().set_index(['installation_id','history_group']).sort_index()
    display(ctg_FE.head())
    print(f'ctg_FE.shape:\t{ctg_FE.shape}')
    print(f'***** History group {MAX_HISTORY_GROUP}: Done *****')
    return ctg_FE
def hst_last_FE(df):
    """``title`` and ``timestamp`` of the latest event per history group.

    For every k in 1..MAX_HISTORY_GROUP the rows with ``history_group >= k``
    are passed to ``create_last``; of its result only ``title`` and
    ``timestamp`` are considered, and columns not listed in ``FEATURE_ORDER``
    are discarded.

    Returns
    -------
    pandas.DataFrame
        Indexed by (installation_id, history_group), sorted.
    """
    wanted = set(FEATURE_ORDER) | {'installation_id','history_group'}
    per_group = [pd.DataFrame()]

    for group_no in range(1, MAX_HISTORY_GROUP+1):
        history = df[df['history_group']>=group_no].drop(columns=['history_group'])

        latest = create_last(history)[['title','timestamp']]
        del history

        latest['history_group'] = group_no
        latest = latest.drop(columns=[c for c in latest.columns if c not in wanted])
        per_group.append(latest)

        del latest; gc.collect();

    # last_FE_train['hour'] = (last_FE_train['timestamp'].dt.hour).astype(str)
    # last_FE_train['daysofweek'] = (last_FE_train['timestamp'].dt.dayofweek).astype(str)

    last_FE = pd.concat(per_group, axis=0)
    last_FE = last_FE.reset_index().set_index(['installation_id','history_group']).sort_index()
    display(last_FE.head())
    print(f'last_FE.shape:\t{last_FE.shape}')
    print(f'***** History group {MAX_HISTORY_GROUP}: Done *****')
    return last_FE
def hst_assess_FE(df):
    """Averaged assessment results (``create_assess``) per history group.

    For every k in 1..MAX_HISTORY_GROUP the rows with ``history_group >= k``
    are passed to ``create_assess``; columns not listed in ``FEATURE_ORDER``
    are discarded.

    Returns
    -------
    pandas.DataFrame
        Indexed by (installation_id, history_group), sorted.
    """
    wanted = set(FEATURE_ORDER) | {'installation_id','history_group'}
    per_group = [pd.DataFrame()]

    for group_no in range(1, MAX_HISTORY_GROUP+1):
        history = df[df['history_group']>=group_no].drop(columns=['history_group'])

        results = create_assess(history)
        del history

        results['history_group'] = group_no
        results = results.drop(columns=[c for c in results.columns if c not in wanted])
        per_group.append(results)

        del results; gc.collect();

    assess_FE = pd.concat(per_group, axis=0)
    assess_FE = assess_FE.reset_index().set_index(['installation_id','history_group']).sort_index()
    display(assess_FE.head())
    print(f'assess_FE.shape:\t{assess_FE.shape}')
    print(f'***** History group {MAX_HISTORY_GROUP}: Done *****')
    return assess_FE
def hst_assess_match_last_FE(df):
    """Same-title assessment results (``create_assess_match_last``) per history group.

    For every k in 1..MAX_HISTORY_GROUP the rows with ``history_group >= k``
    are passed to ``create_assess_match_last``; columns not listed in
    ``FEATURE_ORDER`` are discarded.

    Returns
    -------
    pandas.DataFrame
        Indexed by (installation_id, history_group), sorted.
    """
    wanted = set(FEATURE_ORDER) | {'installation_id','history_group'}
    per_group = [pd.DataFrame()]

    for group_no in range(1, MAX_HISTORY_GROUP+1):
        history = df[df['history_group']>=group_no].drop(columns=['history_group'])

        results = create_assess_match_last(history)
        del history

        results['history_group'] = group_no
        results = results.drop(columns=[c for c in results.columns if c not in wanted])
        per_group.append(results)

        del results; gc.collect();

    assess_match_last_FE = pd.concat(per_group, axis=0)
    assess_match_last_FE = assess_match_last_FE.reset_index().set_index(['installation_id','history_group']).sort_index()
    display(assess_match_last_FE.head())
    print(f'assess_match_last_FE.shape:\t{assess_match_last_FE.shape}')
    print(f'***** History group {MAX_HISTORY_GROUP}: Done *****')
    return assess_match_last_FE
def hst_diff_FE(df):
    """Time-spent features per history group.

    For every k in 1..MAX_HISTORY_GROUP the rows with ``history_group >= k``
    are turned into session-level time gaps (``create_diff_timestamp``). From
    those, two tables are built: mean/sum of ``timespend`` per ``type``, per
    ``title`` and per (``world``, ``type``) via ``create_timespend``, and the
    total ``timespend`` per installation. Columns not listed in
    ``FEATURE_ORDER`` are discarded from both.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The per-category table and the total table, both indexed by
        (installation_id, history_group) and sorted.
    """
    col_comb = [
        ['type'],
        ['title'],
        ['world','type'],
    ]
    wanted = set(FEATURE_ORDER) | {'installation_id','history_group'}

    category_parts = [pd.DataFrame()]
    total_parts = [pd.DataFrame()]
    for group_no in range(1, MAX_HISTORY_GROUP+1):
        history = df[df['history_group']>=group_no].drop(columns=['history_group'])

        session_time = create_diff_timestamp(history)
        by_category = create_timespend(session_time, col_comb)

        total = session_time.groupby('installation_id')[['timespend']].sum()
        total.columns = ['timespend_total']

        by_category['history_group'] = group_no
        total['history_group'] = group_no
        by_category = by_category.drop(columns=[c for c in by_category.columns if c not in wanted])
        total = total.drop(columns=[c for c in total.columns if c not in wanted])
        category_parts.append(by_category)
        total_parts.append(total)

        del history; gc.collect();

    diff_FE_1 = pd.concat(category_parts, axis=0)
    diff_FE_2 = pd.concat(total_parts, axis=0)
    diff_FE_1 = diff_FE_1.reset_index().set_index(['installation_id','history_group']).sort_index()
    diff_FE_2 = diff_FE_2.reset_index().set_index(['installation_id','history_group']).sort_index()

    display(diff_FE_1.head())
    display(diff_FE_2.head())
    print(f'diff_FE_1.shape:\t{diff_FE_1.shape}')
    print(f'diff_FE_2.shape:\t{diff_FE_2.shape}')
    print(f'***** History group {MAX_HISTORY_GROUP}: Done *****')
    return diff_FE_1, diff_FE_2
def hst_timepast_1(df):
    """Compare the last row with the second-to-last row per history group.

    For every k in 1..MAX_HISTORY_GROUP the rows with ``history_group >= k``
    are passed to ``create_timepast(..., 2, 1)``. For ``world``, ``type`` and
    ``title`` a 0/1 column ``<name>_match_from_to`` tells whether the two rows
    agree. Columns not listed in ``FEATURE_ORDER`` are discarded.

    Returns
    -------
    pandas.DataFrame
        Indexed by (installation_id, history_group), sorted.
    """
    compared_cols = ['world','type','title']
    wanted = set(FEATURE_ORDER) | {'installation_id','history_group'}
    per_group = [pd.DataFrame()]

    for group_no in range(1, MAX_HISTORY_GROUP+1):
        history = df[df['history_group']>=group_no].drop(columns=['history_group'])

        pairs = create_timepast(history,2,1)
        del history
        for name in compared_cols:
            pairs[name+'_match_from_to'] = (pairs[name+'_to']==pairs[name+'_from'])*1

        pairs['history_group'] = group_no
        pairs = pairs.drop(columns=[c for c in pairs.columns if c not in wanted])
        per_group.append(pairs)

        del pairs; gc.collect();

    timepast_1 = pd.concat(per_group, axis=0)
    timepast_1 = timepast_1.reset_index().set_index(['installation_id','history_group']).sort_index()
    display(timepast_1.head())
    print(f'timepast_1.shape:\t{timepast_1.shape}')
    print(f'***** History group {MAX_HISTORY_GROUP}: Done *****')
    return timepast_1
def hst_json(df, session_wise=False, col_surfix='_00'):
    """``event_data`` mean/sum features (``create_json``) per history group.

    For every k in 1..MAX_HISTORY_GROUP the rows with ``history_group >= k``
    are passed to ``create_json``. Columns whose name contains ' Level ' are
    dropped, ``col_surfix`` is appended to every feature name, and columns not
    listed in ``FEATURE_ORDER`` are discarded.

    Returns
    -------
    pandas.DataFrame
        Indexed by (installation_id, history_group), sorted.
    """
    id_cols = ['installation_id','history_group']
    wanted = set(FEATURE_ORDER) | set(id_cols)
    per_group = [pd.DataFrame()]

    for group_no in range(1, MAX_HISTORY_GROUP+1):
        history = df[df['history_group']>=group_no].drop(columns=['history_group'])

        feats = create_json(history, n_chank=100, session_wise=session_wise)
        del history

        feats = feats.drop(columns=[c for c in feats.columns if ' Level ' in c])

        feats['history_group'] = group_no
        feats.columns = [c if c in id_cols else c+col_surfix for c in feats.columns]
        feats = feats.drop(columns=[c for c in feats.columns if c not in wanted])
        per_group.append(feats)

        del feats; gc.collect();

    json_FE = pd.concat(per_group, axis=0)
    json_FE = json_FE.reset_index().set_index(['installation_id','history_group']).sort_index()
    display(json_FE.head())
    print(f'json_FE.shape:\t{json_FE.shape}')
    print(f'***** History group {MAX_HISTORY_GROUP}: Done *****')
    return json_FE
def hst_json_ohe(df, session_wise=True, col_surfix='_00'):
    """One-hot ``event_data`` features (``create_json_ohe``) per history group.

    For every k in 1..MAX_HISTORY_GROUP the rows with ``history_group >= k``
    are passed to ``create_json_ohe``. Columns whose name contains ' Level '
    are dropped, ``col_surfix`` is appended to every feature name, and columns
    not listed in ``FEATURE_ORDER`` are discarded.

    Returns
    -------
    pandas.DataFrame
        Indexed by (installation_id, history_group), sorted.
    """
    id_cols = ['installation_id','history_group']
    wanted = set(FEATURE_ORDER) | set(id_cols)
    per_group = [pd.DataFrame()]

    for group_no in range(1, MAX_HISTORY_GROUP+1):
        history = df[df['history_group']>=group_no].drop(columns=['history_group'])

        feats = create_json_ohe(history, n_chank=100, session_wise=session_wise)
        del history

        feats = feats.drop(columns=[c for c in feats.columns if ' Level ' in c])

        feats['history_group'] = group_no
        feats.columns = [c if c in id_cols else c+col_surfix for c in feats.columns]
        feats = feats.drop(columns=[c for c in feats.columns if c not in wanted])
        per_group.append(feats)

        del feats; gc.collect();

    json_ohe = pd.concat(per_group, axis=0)
    json_ohe = json_ohe.reset_index().set_index(['installation_id','history_group']).sort_index()

    display(json_ohe.head())
    print(f'json_ohe.shape:\t{json_ohe.shape}')
    print(f'***** History group {MAX_HISTORY_GROUP}: Done *****')
    return json_ohe
def hst_ohe_sw(df, col_surfix='_00'):
    cols = ['type','title']
    ctg_FE_session = pd.DataFrame()
    
    for i in range(MAX_HISTORY_GROUP):
        tmp = df[df['history_group']>=(i+1)].drop(columns=['history_group'])

        tmp = create_ohe_session(tmp, cols, n_chank=100)

        col_drop = [c for c in tmp.columns if ' Level ' in c]
        tmp.drop(columns=col_drop, inplace=True)

        tmp['history_group'] = i+1
        tmp.columns = [c+col_surfix if c not in ['installation_id','history_group'] else c for c in tmp.columns]
        tmp.drop(columns=[c for c in tmp.columns if c not in FEATURE_ORDER+['installation_id','history_group']], inplace=True)
        ctg_FE_session = pd.concat([ctg_FE_session, tmp], axis=0)

        del tmp; gc.collect();

    ctg_FE_session = ctg_FE_session.reset_index().set_index(['installation_id','history_group']).sort_index()
    display(ctg_FE_session.head())
    print(f'ctg_FE_session.shape:\t{ctg_FE_session.shape}')
    print(f'***** History group {MAX_HISTORY_GROUP}: Done *****')
    return ctg_FE_session
def hst_ohe_sw_mean(df, col_surfix='_00'):
    cols = ['event_code']

    ctg_FE_session_mean = pd.DataFrame()
    for i in range(MAX_HISTORY_GROUP):
        tmp = df[df['history_group']>=(i+1)].drop(columns=['history_group'])

        tmp = create_ohe_session_mean(tmp, cols, n_chank=100)

        col_drop = [c for c in tmp.columns if ' Level ' in c]
        tmp.drop(columns=col_drop, inplace=True)

        tmp['history_group'] = i+1
        tmp.columns = [c+col_surfix if c not in ['installation_id','history_group'] else c for c in tmp.columns]
        tmp.drop(columns=[c for c in tmp.columns if c not in FEATURE_ORDER+['installation_id','history_group']], inplace=True)
        ctg_FE_session_mean = pd.concat([ctg_FE_session_mean, tmp], axis=0)

        del tmp; gc.collect();

    ctg_FE_session_mean = ctg_FE_session_mean.reset_index().set_index(['installation_id','history_group']).sort_index()
    display(ctg_FE_session_mean.head())
    print(f'ctg_FE_session_mean.shape:\t{ctg_FE_session_mean.shape}')
    print(f'***** History group {MAX_HISTORY_GROUP}: Done *****')
    return ctg_FE_session_mean

In [None]:
%%time
# Build the test-side input frame; every hst_* feature builder below consumes it.
# `data_prep` is defined outside this section of the notebook.
test_df = data_prep(is_train=False)

In [None]:
if DEBUG:
    # Debug mode: keep only 50 installation ids so the feature cells run quickly.
    # Series.unique() yields ids in first-appearance order, so the chosen subset is
    # the same on every run (slicing list(set(...)) is not: the iteration order of
    # a set of strings changes between interpreter sessions).
    ids_list = test_df['installation_id'].unique().tolist()
    print(f'All test ids (before):\t{len(ids_list)}')

    test_df = test_df[test_df['installation_id'].isin(ids_list[:50])]
    ids_list = test_df['installation_id'].unique().tolist()
    print(f'All test ids (after):\t{len(ids_list)}')

In [None]:
%%time
# Run every feature builder on the test frame. Each result is later joined
# column-wise into `Test` (see the next cell), so the variable names here are
# load-bearing.
ctg_FE_test                    = hst_ctg_FE(test_df)
last_FE_test                   = hst_last_FE(test_df)
assess_FE_test                 = hst_assess_FE(test_df)
assess_match_last_FE_test      = hst_assess_match_last_FE(test_df)
diff_FE_1_test, diff_FE_2_test = hst_diff_FE(test_df)
timepast_1_test                = hst_timepast_1(test_df)
ctg_FE_03_test                 = hst_ctg_FE_03(test_df)
ctg_FE_04_test                 = hst_ctg_FE_04(test_df)

##### FE-c05 #####
# Same builder run twice: session-wise ('_00' columns) and not session-wise ('_01').
json_FE_00_test                = hst_json(test_df, session_wise=True, col_surfix='_00')
json_FE_01_test                = hst_json(test_df, session_wise=False, col_surfix='_01')

##### FE-c06 #####
json_ohe_00_test               = hst_json_ohe(test_df, session_wise=True, col_surfix='_00')
json_ohe_01_test               = hst_json_ohe(test_df, session_wise=False, col_surfix='_01')

##### FE-c07 #####
ctg_FE_sw_test                 = hst_ohe_sw(test_df, col_surfix='_sw_00')
ctg_FE_sw_mean_test            = hst_ohe_sw_mean(test_df, col_surfix='_sw_01')

In [None]:
# Feature frames joined column-wise onto the (NaN-filled) categorical counts,
# in exactly this order.
_test_feature_frames = [
    last_FE_test,
    assess_FE_test,
    assess_match_last_FE_test,
    diff_FE_1_test,
    diff_FE_2_test,
    timepast_1_test,
    ctg_FE_03_test,
    ctg_FE_04_test,
    ##### FE-c05 #####
    json_FE_00_test,
    json_FE_01_test,
    ##### FE-c06 #####
    json_ohe_00_test,
    json_ohe_01_test,
    ##### FE-c07 #####
    ctg_FE_sw_test,
    ctg_FE_sw_mean_test,
]

Test = ctg_FE_test.fillna(0)
for _frame in _test_feature_frames:
    Test = pd.concat([Test, _frame], axis=1, levels=['installation_id','history_group'])

Test.sort_index(inplace=True)

# Object columns are turned into pandas categoricals (category columns are re-cast as well).
cat_list = [c for c, dt in Test.dtypes.items() if dt.name in ['object','category']]
for c in cat_list:
    Test[c] = Test[c].astype('category')

##### XGB
# One-hot encode the categorical columns, then persist the full test matrix.
Test = xgb_ohe(Test)

to_pickle(Test, out_dir+'/Test_all.pkl')

In [None]:
# Test = read_pickle('/kaggle/input/dsb2019-ens-01/Test_all.pkl')
# Test = Test[X.columns]
# print(f'Test.shape\t:{Test.shape}')

In [None]:
print(f'Original y:\t{y.shape}')

# Adversarial target: rows of X are labelled 0, rows of Test are labelled 1.
# The column keeps the name 'accuracy_group' so the training code can be reused as is.
y_X = pd.DataFrame({'accuracy_group': np.zeros(len(X))}, index=X.index)
y_T = pd.DataFrame({'accuracy_group': np.ones(len(Test))}, index=Test.index)
y = pd.concat([y_X, y_T], axis=0)
y['sample_weight'] = 1

display(y.head())
print(f'Adversarial y:\t{y.shape}')


# Stack the features in the same order as the labels (X first, then Test).
X = pd.concat([X, Test], axis=0)

# Cast object / category columns of the stacked frame to pandas categoricals.
_categorical_like = [c for c in X.columns if X[c].dtype.name in ['object','category']]
for c in _categorical_like:
    X[c] = X[c].astype('category')

print(f'Adversarial X:\t{X.shape}')

In [None]:
# Persist the stacked features and adversarial labels. The training cells further
# down reload both files from these exact paths at the start of every trial.
# NOTE(review): the files are written to the filesystem root, not to `out_dir`.
to_pickle(X, '/X.pkl')
to_pickle(y, '/y.pkl')

# Training

In [None]:
class OptimizedRounder(object):
    """Search three cut-points that turn continuous predictions into classes 0-3.

    The cut-points are optimised with ``scipy.optimize.minimize`` so that the
    quadratic weighted kappa between the binned predictions and the labels is
    maximal.

    Parameters
    ----------
    method : str
        Passed through as ``method=`` to ``scipy.optimize.minimize``.
    """

    def __init__(self, method):
        # Placeholder until fit() stores the scipy optimisation result here.
        self.coef_ = 0
        self.method = method

    @staticmethod
    def _apply_thresholds(X, coef):
        """Bin ``X`` with the cut-points ``coef[0..2]`` and return a new array.

        Mirrors the if/elif chain ``x < c0 -> 0``, ``c0 <= x < c1 -> 1``,
        ``c1 <= x < c2 -> 2``, anything else (including NaN) -> 3. The conditions
        are evaluated in that order, so unsorted cut-points behave like the chain
        would. The result has the dtype of ``np.asarray(X)``; ``X`` is not modified.
        """
        preds = np.asarray(X)
        in_class = [
            preds < coef[0],
            (preds >= coef[0]) & (preds < coef[1]),
            (preds >= coef[1]) & (preds < coef[2]),
        ]
        return np.select(in_class, [0, 1, 2], default=3).astype(preds.dtype)

    def _kappa_loss(self, coef, X, y, w=None):
        """Negative quadratic weighted kappa of ``X`` binned with ``coef`` against ``y``.

        ``w`` is forwarded as ``sample_weight``. The sign is flipped because the
        optimiser minimises.
        """
        X_p = self._apply_thresholds(X, coef)
        ll = metrics.cohen_kappa_score(y, X_p, weights='quadratic', sample_weight=w)
        return -ll

    def fit(self, X, y, w=None):
        """Optimise the cut-points on predictions ``X`` and labels ``y``.

        The search starts from the module-level ``VALID_TH``; the raw optimiser
        result is stored in ``self.coef_``.
        """
        loss_partial = partial(self._kappa_loss, X=X, y=y, w=w)
        initial_coef = VALID_TH
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method=self.method)

    def predict(self, X, coef):
        """Return a copy of ``X`` binned into classes 0-3 with the cut-points ``coef``."""
        return self._apply_thresholds(X, coef)

    def coefficients(self):
        """Return the optimised cut-points (the ``'x'`` entry of the optimiser result).

        Only valid after ``fit`` has been called.
        """
        return self.coef_['x']

In [None]:
@jit
def qwk(a1, a2):
    """
    Quadratic weighted kappa between two rating vectors, compiled with numba's ``jit``.

    Source: https://www.kaggle.com/c/data-science-bowl-2019/discussion/114133#latest-660168

    The highest rating is fixed inside the function (``max_rat = 3``); it is not
    a parameter.

    :param a1: first sequence of ratings; cast to an int array
    :param a2: second sequence of ratings, indexed in parallel with ``a1``; cast to an int array
    :return: ``1 - o / e`` where ``o`` is the sum of squared rating differences and
             ``e`` the same quantity expected from the two rating histograms.
             The division is not guarded against ``e == 0``.
    """
    max_rat = 3
    a1 = np.asarray(a1, dtype=int)
    a2 = np.asarray(a2, dtype=int)

    # Rating histograms of each vector (one bin per rating 0..max_rat).
    hist1 = np.zeros((max_rat + 1, ))
    hist2 = np.zeros((max_rat + 1, ))

    # o: observed sum of squared differences between paired ratings.
    o = 0
    for k in range(a1.shape[0]):
        i, j = a1[k], a2[k]
        hist1[i] += 1
        hist2[j] += 1
        o +=  (i - j) * (i - j)

    # e: squared differences weighted by the product of the two histograms.
    e = 0
    for i in range(max_rat + 1):
        for j in range(max_rat + 1):
            e += hist1[i] * hist2[j] * (i - j) * (i - j)

    e = e / a1.shape[0]

    return 1 - o / e

def eval_qwk_lgb(y_true, y_pred):
    """
    Kappa eval function for multi-class scores.

    ``y_pred`` is reshaped to one row per distinct value of ``y_true``; the row
    with the highest score in each column becomes the predicted label, which is
    then scored against ``y_true`` with ``qwk``.

    :return: ``('kappa', score, True)``
    """
    n_classes = len(np.unique(y_true))
    class_scores = y_pred.reshape(n_classes, -1)
    predicted_labels = class_scores.argmax(axis=0)
    return 'kappa', qwk(y_true, predicted_labels), True

def eval_qwk_lgb_regr(y_true_ori, y_pred_ori, w=None):
    """
    Quadratic weighted kappa of regression outputs binned with ``VALID_TH``.

    Accepts two calling conventions:

    * ``(preds, dataset)`` - when the second argument has ``get_label()``, the
      labels are read from it and the FIRST argument holds the predictions;
    * ``(y_true, y_pred)`` - plain arrays otherwise.

    :param w: optional sample weights forwarded to ``cohen_kappa_score``
    :return: ``('kappa', score, True)``
    """
    try:
        y_true = y_pred_ori.get_label()
        y_pred = y_true_ori.copy()
    except AttributeError:
        # No get_label(): the arguments are plain (y_true, y_pred).
        y_true = y_true_ori.copy()
        y_pred = y_pred_ori.copy()

    # Build every class mask from the raw predictions BEFORE writing any class
    # id, so a value already rewritten to 1 or 2 can never be re-binned by a
    # later threshold (which sequential in-place assignment allows for some
    # cut-point choices).
    is_0 = y_pred <= VALID_TH[0]
    is_1 = np.logical_and(y_pred > VALID_TH[0], y_pred <= VALID_TH[1])
    is_2 = np.logical_and(y_pred > VALID_TH[1], y_pred <= VALID_TH[2])
    is_3 = y_pred > VALID_TH[2]

    y_pred[is_0] = 0
    y_pred[is_1] = 1
    y_pred[is_2] = 2
    y_pred[is_3] = 3

    return 'kappa', cohen_kappa_score(y_true, y_pred, weights='quadratic', sample_weight=w), True # return 'kappa', qwk(y_true, y_pred), True

def rmse(preds, data):
    """
    Root mean squared error between ``preds`` and ``data.get_label()``.

    :return: ``('rmse', value, False)`` - the flag marks the metric as lower-is-better
    """
    residual = preds - data.get_label()
    mean_squared = (residual**2/len(preds)).sum()
    return 'rmse', mean_squared**0.5, False

def lgb_metrics(preds, data):
    """
    Evaluate both custom metrics on ``(preds, data)``.

    :return: list with the kappa tuple from ``eval_qwk_lgb_regr`` followed by the
             tuple from ``rmse``
    """
    kappa_result = eval_qwk_lgb_regr(preds, data)
    rmse_result = rmse(preds, data)
    return [kappa_result, rmse_result]

In [None]:
class eval_qwk_lgb_regr_weight(object):
    """Callable kappa metric that applies train or validation sample weights.

    Parameters
    ----------
    trn_w, val_w : array-like or None
        Sample weights of the training / validation rows. Which one is used is
        decided per call by comparing lengths (see ``__call__``).
    """

    def __init__(self, trn_w=None, val_w=None):
        self.trn_w = trn_w
        self.val_w = val_w

    def __call__(self, y_true_ori, y_pred_ori):
        """
        Quadratic weighted kappa of predictions binned with ``VALID_TH``.

        Accepts ``(preds, dataset)`` when the second argument has ``get_label()``,
        or plain ``(y_true, y_pred)`` arrays otherwise.

        :return: ``('kappa', score, True)``
        """
        try:
            y_true = y_pred_ori.get_label()
            y_pred = y_true_ori.copy()
        except AttributeError:
            # No get_label(): the arguments are plain (y_true, y_pred).
            y_true = y_true_ori.copy()
            y_pred = y_pred_ori.copy()

#         ##### For log target #####
#         y_true = np.exp(y_true)-1
#         y_pred = np.exp(y_pred)-1
#         ##########################

        # Masks are computed from the raw predictions before any class id is
        # written, so an already-binned value cannot be re-binned by a later
        # threshold.
        is_0 = y_pred <= VALID_TH[0]
        is_1 = np.logical_and(y_pred > VALID_TH[0], y_pred <= VALID_TH[1])
        is_2 = np.logical_and(y_pred > VALID_TH[1], y_pred <= VALID_TH[2])
        is_3 = y_pred > VALID_TH[2]

        y_pred[is_0] = 0
        y_pred[is_1] = 1
        y_pred[is_2] = 2
        y_pred[is_3] = 3

        # The training weights are used when the first argument has as many
        # entries as trn_w; otherwise the validation weights apply. A missing
        # trn_w (the constructor default) falls through to val_w instead of
        # failing on len(None).
        # NOTE(review): train and validation sets of equal length cannot be
        # told apart by this test.
        if self.trn_w is not None and len(y_true_ori)==len(self.trn_w):
            w = self.trn_w
        else:
            w = self.val_w

        return 'kappa', cohen_kappa_score(y_true, y_pred, weights='quadratic', sample_weight=w), True

In [None]:
def save_importances(importances_, figsize=(8, 12)):
    """Bar-plot the gain of every feature, ordered by its mean gain.

    Adds a ``mean_gain`` column (mean of ``gain`` per ``feature``) to
    ``importances_`` in place and draws the plot on a new figure. Nothing is
    written to disk (the ``savefig`` call is commented out).
    """
    gain_by_feature = importances_.groupby('feature')['gain'].mean()
    importances_['mean_gain'] = importances_['feature'].map(gain_by_feature)
    ranked = importances_.sort_values('mean_gain', ascending=False)

    plt.figure(figsize=figsize)
    sns.barplot(x='gain', y='feature', data=ranked)
    plt.tight_layout()
#     plt.savefig('importances.png')

In [None]:
def save_iter_scores(iter_scores, figsize=(8, 12)):
    """Plot score against boosting iteration on a new figure and show it.

    Lines are coloured by ``fold`` and styled by ``trn_val``; the palette is the
    first ``FOLDS`` colours of seaborn's "muted" palette.
    """
    fold_palette = sns.color_palette("muted")[:FOLDS]

    plt.figure(figsize=figsize)
    sns.lineplot(
        x="iteration",
        y="score",
        hue='fold',
        style="trn_val",
        data=iter_scores,
        palette=fold_palette,
    )
    plt.tight_layout()
    plt.show()
#     plt.savefig('importances.png')

In [None]:
def plot_metric(self):
    """
    Plot training progress.
    Inspired by `plot_metric` from https://lightgbm.readthedocs.io/en/latest/_modules/lightgbm/plotting.html

    Reads ``self.models`` (each exposing ``model.model.evals_result_``) and
    ``self.eval_metric``, and draws one line per evaluation dataset.

    :return: None
    """
    per_model_results = []
    for model in self.models:
        history = model.model.evals_result_
        evals_result = pd.DataFrame()
        for k in history.keys():
            evals_result[k] = history[k][self.eval_metric]
        evals_result = evals_result.reset_index().rename(columns={'index': 'iteration'})
        per_model_results.append(evals_result)

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    if per_model_results:
        full_evals_results = pd.concat(per_model_results)
    else:
        full_evals_results = pd.DataFrame()

    full_evals_results = full_evals_results.melt(id_vars=['iteration']).rename(columns={'value': self.eval_metric,
                                                                                        'variable': 'dataset'})
    sns.lineplot(data=full_evals_results, x='iteration', y=self.eval_metric, hue='dataset')
    plt.title('Training progress')

In [None]:
# class ReduceLearningRateCallback(object):
#     def __init__(self, monitor_metric, reduce_every=100, ratio=0.5):
#         self._monitor_metric = monitor_metric
#         self._reduce_every = reduce_every
#         self._ratio = ratio
#         self._best_score = None
#         self._counter = 0

#     def _is_higher_score(self, metric_score, is_higher_better):
#         if self._best_score is None:
#             return True

#         if is_higher_better:
#             return self._best_score < metric_score
#         else:
#             return self._best_score > metric_score

#     def __call__(self, env):
#         evals = env.evaluation_result_list
#         lr = env.params['learning_rate']
#         for tra_val, name, score, is_higher_better in evals:
# #         for pp in evals:
# #             print(pp)
#             # Select the metric to check
#             if (env.iteration<100):
#                 return 
            
#             if (name!=self._monitor_metric) | (tra_val!='valid'):
#                 continue
#             # Even when the target metric is found, do nothing if it performs worse than the past score
#             if not self._is_higher_score(score, is_higher_better):
#                 self.counter += 1
# #                 print(f'\r{self.counter}', end='', flush=True)
#                 if self.counter==self._reduce_every:
#                     new_parameters = {'learning_rate':lr*self._ratio}
#                     env.model.reset_parameter(new_parameters)
#                     env.params.update(new_parameters)
#                     print(f'[{env.iteration}]\tReduce learning rate {lr} -> {env.params["learning_rate"]}')
#                     self.counter = 0
#                     return
#                 else:
#                     return
#             else:
#                 self._best_score = score
# #                 print(f'\r{self._best_score}', end='', flush=True)
#                 self.counter = 0
#                 return
            
#         raise ValueError('monitoring metric not found')

In [None]:
# Cut-points that bin a continuous prediction into classes 0-3. Used by the kappa
# eval helpers and as the starting point of OptimizedRounder.fit.
VALID_TH = [1.1, 1.6, 2.2]

def train_cv(params, shuffle=False, random_state=0, drop_cols=None, th_optimize=False):
    """Train one XGBoost booster per fold and collect out-of-fold predictions.

    Folds are contiguous slices of the module-level ``INS_ID_LIST_TRAIN``: the rows
    of the module-level ``X`` / ``y`` selected by the ids of a slice form that
    fold's validation set, all other ids form its training set. ``FOLDS`` sets
    the number of slices.

    Parameters
    ----------
    params : dict
        Passed to ``xgb.train``; also supplies ``n_estimators``,
        ``early_stopping_rounds`` and ``verbose``. The per-fold scores are read
        from the ``'auc'`` entry of the evaluation history, so ``eval_metric``
        must be ``'auc'``. ``params['seed']`` is incremented twice per fold in
        the caller's dict (successive calls therefore use different seeds).
    shuffle : bool
        If True, the ``accuracy_group`` target is permuted within the training
        part and within the validation part of every fold.
    random_state : int
        Seed of that permutation.
    drop_cols : list or None
        Feature columns removed before training.
    th_optimize : bool
        If True, fit an ``OptimizedRounder`` on the out-of-fold predictions.

    Returns
    -------
    tuple
        ``(clfs, iter_scores, optR, importances, trn_scores,
        (val_scores, score_valid, X.index, oof_valid, oof_opt_valid))``.
        ``optR`` and ``oof_opt_valid`` are ``None`` when ``th_optimize`` is False.
    """
    ##### XGB dataset
    def xgb_ohe(df):
        # One-hot encode every 'category' column and append the encoded columns
        # (sorted by name) after the remaining ones.
        # NOTE(review): a fresh encoder is fitted on each frame it is given, so
        # train and validation only end up with the same columns if they expose
        # the same categories - confirm.
        cols_ohe = [c for c in df.columns if df[c].dtype.name=='category']
        ce_ohe = ce.OneHotEncoder(cols=cols_ohe, handle_unknown='error', use_cat_names=True)
        out_df_tmp = ce_ohe.fit_transform(df[cols_ohe])
        out_df_tmp.sort_index(axis=1, inplace=True)
        df.drop(columns=cols_ohe, inplace=True)
        df = pd.concat([df, out_df_tmp], axis=1)
        return df

    oof_valid = pd.Series(np.zeros(len(X), dtype=float), index=X.index, name='accuracy_group')

    len_df = len(INS_ID_LIST_TRAIN)
    seq_len_df = np.arange(len_df)
    len_each = math.ceil(len_df/FOLDS)

    trn_scores = []
    val_scores = []
    best_iter = []
    clfs = []
    importances = pd.DataFrame()
    iter_scores = pd.DataFrame()

    for f in range(FOLDS):
        print(f'{"*"*5} Fold {f} {"*"*5}')
        evals_result    = {}
        params['seed'] += 1

        print(f'\r                                                               \r', end='',flush=True)
        ### Split ##########
        s, e = f*(len_each), (f+1)*len_each

        val_ = (seq_len_df>=s)&(seq_len_df<e)
        trn_ = ~val_

        trn_x, trn_y = X.loc[INS_ID_LIST_TRAIN[trn_], :], y.loc[INS_ID_LIST_TRAIN[trn_], :].copy()
        val_x, val_y = X.loc[INS_ID_LIST_TRAIN[val_], :], y.loc[INS_ID_LIST_TRAIN[val_], :].copy()

        # Row positions of this fold's validation rows inside X / oof_valid, taken
        # before any column is touched. Requires the index entries of X to be unique.
        val_pos = oof_valid.index.get_indexer(val_x.index)

        if drop_cols is not None:
            trn_x = trn_x.drop(columns=drop_cols)
            val_x = val_x.drop(columns=drop_cols)

        if shuffle:
            # Permute the target column only. Sampling the whole two-column label
            # frame yields a 2-D array that cannot be assigned to one column.
            trn_y['accuracy_group'] = trn_y['accuracy_group'].sample(frac=1, random_state=random_state).values
            val_y['accuracy_group'] = val_y['accuracy_group'].sample(frac=1, random_state=random_state).values
        #####################

#         ##### For log target #####
#         trn_y[['accuracy_group']] = np.log(trn_y[['accuracy_group']]+1)
#         val_y[['accuracy_group']] = np.log(val_y[['accuracy_group']]+1)
#         ##########################

        ##### XGB dataset
        trn_x = xgb_ohe(trn_x)
        val_x = xgb_ohe(val_x)
        dtrain = xgb.DMatrix(data=trn_x, label=trn_y[['accuracy_group']], weight=trn_y['sample_weight'], silent=True)
        dvalid = xgb.DMatrix(data=val_x, label=val_y[['accuracy_group']], weight=val_y['sample_weight'], silent=True)

        ##### Custom metric with weight #####
        # Only used when `feval` is re-enabled in the xgb.train call below.
        eqlrw = eval_qwk_lgb_regr_weight(trn_y['sample_weight'].values, val_y['sample_weight'].values)
        def lgb_metrics_weight(preds, data):
            return [eqlrw(preds, data), rmse(preds, data),]
        #####################################

        print(f'Shape of trn_x:\t{trn_x.shape}')

        params['seed'] +=1
        ##### XGB train
        clf = xgb.train(
            params                =params,
            dtrain                =dtrain,
            num_boost_round       =params['n_estimators'],
            evals                 =[(dtrain,'train'),(dvalid,'valid')],
#             feval                 =lgb_metrics_weight,
            maximize              =True,
            xgb_model             =None,
            early_stopping_rounds =params['early_stopping_rounds'],
            evals_result          =evals_result,
            verbose_eval          =params['verbose'],
        )

        clfs.append(clf)
        best_iter.append(clf.best_iteration)

        ##### XGB
        oof_valid_tmp = clf.predict(dvalid)

#         ##### For log target #####
#         oof_valid_tmp = np.exp(oof_valid_tmp)-1
#         ##########################

        # Write the predictions by explicit row position, in the same row order
        # as the validation matrix they were computed from.
        oof_valid.iloc[val_pos] = oof_valid_tmp

        # Train score is reported at the iteration where the validation AUC peaks.
        t = np.array(evals_result['train']['auc'])
        v = np.array(evals_result['valid']['auc'])
        trn_scores.append(t[v.argmax()])
        val_scores.append(v.max())

        ##### XGB
        imp_dict = clf.get_score(fmap='', importance_type='total_gain')
        imp_df = pd.DataFrame({
                'feature': list(imp_dict.keys()),
                'gain': list(imp_dict.values()),
                'fold': [f] * len(imp_dict),
                })
        # Left-join on the training columns so features missing from the
        # booster's score dict still get a row (filled with 0).
        imp_df = pd.merge(pd.DataFrame({'feature': trn_x.columns.tolist()}), imp_df, how='left', on='feature').fillna(0)

        importances = pd.concat([importances, imp_df], axis=0, sort=False).reset_index(drop=True)

        iter_scores_tmp = pd.DataFrame()
        for k in ['valid', 'train']:
            tmp = pd.DataFrame()
            tmp['score'] = evals_result[k]['auc']
            tmp['trn_val'] = k
            iter_scores_tmp = pd.concat([iter_scores_tmp, tmp], axis=0)
        iter_scores_tmp = iter_scores_tmp.reset_index().rename(columns={'index': 'iteration'})
        iter_scores_tmp['fold'] = f
        iter_scores_tmp['id'] = 'trn_val_'+str(f)
        iter_scores = pd.concat([iter_scores, iter_scores_tmp], axis=0).reset_index(drop=True)
        print(f'\n')

    print(f'\n{"-"*10}Train{"-"*10}')
    for i in np.arange(len(trn_scores)):
        print(f'\tFold {i}:\t{trn_scores[i]:.5f}')
    print(f'Average train:\t{np.average(trn_scores):.5f}')
    print(f'\n')

    print(f'\n{"-"*10}Validation{"-"*10}')
    for i in np.arange(len(val_scores)):
        print(f'\tFold {i}:\t{val_scores[i]:.5f}')
    score_valid = eval_qwk_lgb_regr(y[['accuracy_group']].values, oof_valid.values, y['sample_weight'])[1].item()
    print(f'Average valid:\t{np.average(val_scores):.5f}')
    print(f'Score valid:\t{score_valid:.5f}')

    # Explicit placeholders instead of the interpreter's `_`, which only exists
    # in an interactive session.
    optR, oof_opt_valid = None, None
    if th_optimize:
        optR = OptimizedRounder('nelder-mead') # 'Powell'
        optR.fit(oof_valid.values, y['accuracy_group'].values, y['sample_weight'].values)
        oof_opt_valid = optR.predict(oof_valid.values, optR.coefficients())
        score_valid = cohen_kappa_score(y['accuracy_group'], oof_opt_valid, weights='quadratic', sample_weight=y['sample_weight'])

    print(f'params\n{params}')
    return clfs, iter_scores, optR, importances, (trn_scores), (val_scores, score_valid, X.index, oof_valid, oof_opt_valid)

# Finetuned training

In [None]:
def create_INS_ID_LIST_TRAIN(df, seed):
    """Return the distinct installation ids of ``df`` in a seeded random order.

    ``installation_id`` may be a column or an index level of ``df``. The ids are
    sorted first so that the shuffle depends only on ``seed`` and on the set of
    ids, not on their order in ``df``.
    """
    shuffled_ids = df.reset_index()['installation_id'].unique()
    shuffled_ids.sort()
    rng = random.Random(seed)
    rng.shuffle(shuffled_ids)
    return shuffled_ids

In [None]:
def augmentation(_x, _y, _seed):
    """Keep one randomly chosen row per installation id.

    The rows of ``_x`` are shuffled with ``_seed`` and the first row of every
    ``installation_id`` is kept; ``_y`` is then aligned to the surviving rows.
    Neither input is modified.

    Parameters
    ----------
    _x : pandas.DataFrame
        Features indexed by ``(installation_id, history_group)``.
    _y : pandas.DataFrame
        Labels sharing the index of ``_x``.
    _seed : int
        Random state of the shuffle.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The sub-sampled features and the matching labels, both indexed by
        ``(installation_id, history_group)``.
    """
    # reset_index() returns a new frame, so the caller's `_x` keeps its index
    # (an in-place reset would silently alter the frame that was passed in).
    shuffled = _x.reset_index().sample(frac=1, random_state=_seed)
    one_per_id = shuffled.drop_duplicates(subset='installation_id')
    one_per_id = one_per_id.set_index(['installation_id','history_group'])
    return one_per_id, _y.loc[one_per_id.index, :]

In [None]:
# Actual importances: train the adversarial classifier on the true labels
# (shuffle=False) for DROP_DUP_TIMES x SEED_AVG_TIMES trials and collect the
# per-fold feature gains of every trial.
drop_cols = []

# params = {
#     'n_estimators': 2000,
#     'boosting_type': 'gbdt',
#     'objective': 'binary', # 'regression',
#     'metric': 'auc', # 'kappa',
#     'first_metric_only': True,
#     'subsample': 1.0,
#     'subsample_freq': 1,
#     'learning_rate': 0.01,
#     'feature_fraction': 0.8,
#     'max_depth': -1,
#     'lambda_l1': 1,  
#     'lambda_l2': 5,
#     'min_data_in_leaf': 15,
# #     'min_child_weight': 0.1,
#     'verbose': -1,
#     'early_stopping_rounds': 100,
# #     'importance_type': 'gain', 
# #     'eval_metric': 'kappa',
#     'seed': SEED,
#     'seed_avg_times':0,
# }
##### XGB param #####
# 'n_estimators', 'early_stopping_rounds' and 'verbose' are read by train_cv
# itself; the whole dict is also handed to xgb.train.
params = {
    'n_estimators': 2000,
    'objective': 'binary:logistic', # 'reg:squarederror', 
    'eval_metric': 'auc', # 'rmse',
    'learning_rate': 0.01,
    'max_depth': 10, 
    'subsample': 1.0, 
    'colsample_bytree': 0.8,
    'alpha': 1, # 'lambda_l1'
    'lambda': 5, # 'lambda_l2'
    'verbose': 10000,
    'early_stopping_rounds': 100,
#     'eval_metric': ['rmse','kappa'],
    'seed': SEED, 
    'silent': True,
    'nthread': 4,
}

SEED_AVG_TIMES = 5
DROP_DUP_TIMES = 5

actual_importances = pd.DataFrame()

start = time.time()
for j in range(DROP_DUP_TIMES):
    for i in range(SEED_AVG_TIMES):
        ### Augmentation ####
        # Reload the full frames every trial: X is filtered in place below and
        # both names are rebound to the sub-sampled versions.
        X = read_pickle('/X.pkl')
        y = read_pickle('/y.pkl')
        
        # Keep only the columns named in XGB_FEAT, after replacing '[' by '~'
        # and ']' by '|' in those names.
        X.drop(columns=[c for c in X.columns if c not in [c.replace('[','~').replace(']','|') for c in XGB_FEAT]+['installation_id','history_group']], inplace=True)
        
        # One row per installation id, drawn with a seed unique to (j, i).
        X, y = augmentation(X, y, SEED+i+j*SEED_AVG_TIMES)
        #####################
        
        print(f'{"*"*30}\n{"*"*10} Trial {i} {"*"*10}\n{"*"*30}')

        # train_cv reads X, y and INS_ID_LIST_TRAIN from the module namespace.
        INS_ID_LIST_TRAIN = create_INS_ID_LIST_TRAIN(X, SEED+i+j*SEED_AVG_TIMES)
        params['seed_avg_times'] = i
        clfs, iter_scores, optR, importances, trn_tup, val_tup = train_cv(params, shuffle=False, random_state=SEED, drop_cols=drop_cols, th_optimize=True)

        # NOTE(review): the trial label is the inner index only, so it repeats
        # for every value of j.
        importances['trial'] = i
        actual_importances = pd.concat([actual_importances, importances], axis=0, sort=False).reset_index(drop=True)

        print(f'{"*"*5} Time:\t{(time.time()-start)/60:.1f} mins {"*"*5}')
        print(f'\n')
    

actual_importances.to_csv(out_dir+'/actual_importances.csv', index=False)

In [None]:
# save_iter_scores(pd.concat(sa_iter_scores, axis=0).groupby(['iteration','trn_val','fold','id']).mean().reset_index(), figsize=(10, 6))

In [None]:
# save_importances(importances_=pd.concat(sa_importances, axis=0), figsize=(10,45))

# Shuffle importance

In [None]:
# Null importances: same procedure as the actual-importance run, but with
# shuffle=True so train_cv permutes the target before fitting. The collected
# gains are the reference distribution for the comparison below.
drop_cols = []

# params = {
#     'n_estimators': 2000,
#     'boosting_type': 'gbdt',
#     'objective': 'binary', # 'regression',
#     'metric': 'auc', # 'kappa',
#     'first_metric_only': True,
#     'subsample': 1.0,
#     'subsample_freq': 1,
#     'learning_rate': 0.01,
#     'feature_fraction': 0.8,
#     'max_depth': -1,
#     'lambda_l1': 1,  
#     'lambda_l2': 5,
#     'min_data_in_leaf': 15,
# #     'min_child_weight': 0.1,
#     'verbose': -1,
#     'early_stopping_rounds': 100,
# #     'importance_type': 'gain', 
# #     'eval_metric': 'kappa',
#     'seed': SEED,
#     'seed_avg_times':0,
# }
##### XGB param #####
# 'n_estimators', 'early_stopping_rounds' and 'verbose' are read by train_cv
# itself; the whole dict is also handed to xgb.train.
params = {
    'n_estimators': 2000,
    'objective': 'binary:logistic', # 'reg:squarederror', 
    'eval_metric': 'auc', # 'rmse',
    'learning_rate': 0.01,
    'max_depth': 10, 
    'subsample': 1.0, 
    'colsample_bytree': 0.8,
    'alpha': 1, # 'lambda_l1'
    'lambda': 5, # 'lambda_l2'
    'verbose': 10000,
    'early_stopping_rounds': 100,
#     'eval_metric': ['rmse','kappa'],
    'seed': SEED, 
    'silent': True,
    'nthread': 4,
}

SEED_AVG_TIMES = 5
DROP_DUP_TIMES = 5

null_importances = pd.DataFrame()

start = time.time()
for j in range(DROP_DUP_TIMES):
    for i in range(SEED_AVG_TIMES):
        ### Augmentation ####
        # Reload the full frames every trial: X is filtered in place below and
        # both names are rebound to the sub-sampled versions.
        X = read_pickle('/X.pkl')
        y = read_pickle('/y.pkl')
        
        # Keep only the columns named in XGB_FEAT, after replacing '[' by '~'
        # and ']' by '|' in those names.
        X.drop(columns=[c for c in X.columns if c not in [c.replace('[','~').replace(']','|') for c in XGB_FEAT]+['installation_id','history_group']], inplace=True)
        
        # One row per installation id, drawn with a seed unique to (j, i).
        X, y = augmentation(X, y, SEED+i+j*SEED_AVG_TIMES)
        #####################
        
        print(f'{"*"*30}\n{"*"*10} Trial {i} {"*"*10}\n{"*"*30}')

        # train_cv reads X, y and INS_ID_LIST_TRAIN from the module namespace.
        INS_ID_LIST_TRAIN = create_INS_ID_LIST_TRAIN(X, SEED+i+j*SEED_AVG_TIMES)
        params['seed_avg_times'] = i
        clfs, iter_scores, optR, importances, trn_tup, val_tup = train_cv(params, shuffle=True, random_state=SEED, drop_cols=drop_cols, th_optimize=True)

        # NOTE(review): the trial label is the inner index only, so it repeats
        # for every value of j.
        importances['trial'] = i
        null_importances = pd.concat([null_importances, importances], axis=0, sort=False).reset_index(drop=True)

        print(f'{"*"*5} Time:\t{(time.time()-start)/60:.1f} mins {"*"*5}')
        print(f'\n')
    

null_importances.to_csv(out_dir+'/null_importances.csv', index=False)

# Compare Actual vs Null

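- Calculate $\log\left(1 + \frac{\mathrm{median}(Actual)}{1 + \mathrm{median}(Null)}\right)$ for every feature, where $Actual$ and $Null$ are the gains collected without and with target shuffling.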

In [None]:
# Median gain per feature, computed with one pass over each importance table
# (instead of one boolean-mask scan of both tables per feature).
actual_median_gain = actual_importances.groupby('feature', sort=False)['gain'].agg(lambda g: np.median(g.values))
null_median_gain   =   null_importances.groupby('feature', sort=False)['gain'].agg(lambda g: np.median(g.values))

feature_scores = []

# Features are visited in their first-appearance order in `actual_importances`.
for _f in actual_importances['feature'].unique():
    # A feature without any null-importance row scores NaN.
    f_null_imps_gain = null_median_gain.get(_f, np.nan)
    f_act_imps_gain  = actual_median_gain.get(_f, np.nan)

    # Score = log(1 + actual / (1 + null)); 0 when the actual gain is 0.
    gain_score = np.log(1 + f_act_imps_gain / (1 + f_null_imps_gain))

    feature_scores.append((_f, gain_score))

scores_df = pd.DataFrame(feature_scores, columns=['feature', 'gain_score']).sort_values('gain_score', ascending=False)
scores_df.to_csv(out_dir+'/scores_df.csv', index=False)

fig, ax = plt.subplots(1, 1, figsize=(10,120))
plt.tight_layout()
sns.barplot(x='gain_score', y='feature', data=scores_df, ax=ax)
ax.set_title('Feature scores wrt gain importances', fontweight='bold', fontsize=14)
plt.show();

In [None]:
def chk_null_imp(feature, ax):
    """Compare the actual gains of ``feature`` with its null-importance distribution.

    Draws, on ``ax``, a normalised histogram of the feature's gains in the
    module-level ``null_importances`` and one translucent red vertical line per
    gain recorded in the module-level ``actual_importances``.
    """
    null_gain = null_importances.loc[null_importances['feature']==feature, 'gain']
    actual_gain = actual_importances.loc[actual_importances['feature']==feature, 'gain']

    sns.distplot(null_gain, kde=False, norm_hist=True, label='Null importance',   ax=ax, color='blue')
    # Only the values are needed; Series.iteritems() no longer exists in pandas 2.x.
    for _val in actual_gain.values:
        ax.axvline(x=_val, color='red', alpha=0.25)

    ax.set_title(feature, fontsize=12)
    ax.legend()

In [None]:
# Features in descending gain-score order (the order of scores_df).
ordered_features = scores_df['feature'].unique().tolist()
v, h = 1, 5

# One figure row of `h` panels per chunk of features; at most 12 rows are drawn.
n_rows = min(math.ceil(len(ordered_features)/h), 12)

for j in range(n_rows):
    s = j*h
    e = s + h
    fig, ax = plt.subplots(v, h, figsize=(30,2))
    for i, _f in enumerate(ordered_features[s:e]):
        chk_null_imp(_f, ax[i])
#     for k in np.arange(i+1,h):
#         fig.delaxes(ax[k])
    plt.show();

In [None]:
# The 40 features with the highest gain score (scores_df is sorted in descending
# order when it is built).
scores_df.head(40)

In [None]:
def _numeric_columns(df):
    """Return df's columns whose dtype name is 'int64' or 'float64', in column order."""
    return [c for c in df.columns if df[c].dtype.name in ('int64', 'float64')]


def _plot_hist(values, bins, norm_hist, name, color, ax):
    """Draw one log-scaled histogram panel; its legend reports the NaN count and share."""
    null_count = values.isnull().sum()
    label = name+' (NaN: {}, {:.1f}%)'.format(null_count, null_count/len(values)*100)
    sns.distplot(values.dropna(), bins=bins, kde=False, norm_hist=norm_hist,
                 label=label, ax=ax, color=color)
    ax.set_yscale('log')
    ax.legend()


def chk_distribution(X_train, X_test, chk_fold=0, figsize=(8,5), chk_cols=None):
    """Plot per-column histograms of X_train against X_test, one figure per column.

    Parameters
    ----------
    X_train, X_test : DataFrames whose int64/float64 columns must be the same set.
    chk_fold : int or None
        If an int, X_train is split by row position into a contiguous block
        ``chk_fold`` of size ceil(len(X_train)/FOLDS) ("Valid") and the rest
        ("Train"); three panels (Train / Valid / Test) with raw counts are
        drawn. Rows are selected through the module-level INS_ID_LIST_TRAIN /
        INS_ID_LIST_TEST labels.
        If None, two panels (Train / Test) with normalised histograms are drawn.
    figsize : figure size passed to ``plt.subplots``.
    chk_cols : optional list of columns to plot instead of all numeric columns.

    Raises
    ------
    ValueError
        If the int64/float64 column sets of the two frames differ.
    """
    train_num = _numeric_columns(X_train)
    test_num = _numeric_columns(X_test)
    # Symmetric comparison: previously only "train minus test" was checked, so a
    # numeric column present only in X_test slipped through and failed later.
    if set(train_num) != set(test_num):
        raise ValueError("Columns are not equal between train and test")

    # Plot order follows X_test's column order unless chk_cols is given.
    num_list = chk_cols if chk_cols is not None else test_num

    use_fold = chk_fold is not None
    v, h = (3 if use_fold else 2), 1

    if use_fold:
        # Positional split of X_train into the checked fold and the remainder;
        # it does not depend on the column, so it is computed once.
        len_df = len(X_train)
        seq_len_df = np.arange(len_df)
        len_each = math.ceil(len_df/FOLDS)
        s, e = chk_fold*len_each, (chk_fold+1)*len_each
        in_fold = (seq_len_df>=s)&(seq_len_df<e)

        # NOTE(review): assumes INS_ID_LIST_TRAIN holds one label per X_train row,
        # in row order - confirm against where it is built.
        # Plain positional indexing: `arr[[idx]]` is read as a 2-D selection by
        # NumPy >= 1.23 instead of the old `arr[idx]`.
        train_ids = np.asarray(INS_ID_LIST_TRAIN)
        trn_idx = train_ids[seq_len_df[~in_fold]]
        val_idx = train_ids[seq_len_df[in_fold]]

    for col in num_list:
        fig, ax = plt.subplots(v, h, figsize=figsize, sharex=True, sharey=True)
        fig.subplots_adjust(wspace=0.5,hspace=0.0)

        # Shared bin edges so every panel is directly comparable.
        max_value = max(X_train[col].max(), X_test[col].max())
        min_value = min(X_train[col].min(), X_test[col].min())
        bins = np.linspace(min_value, max_value, 101)

        if use_fold:
            panels = [
                (X_train.loc[trn_idx, col], 'Train', 'blue'),
                (X_train.loc[val_idx, col], 'Valid', 'blue'),
                (X_test.loc[INS_ID_LIST_TEST, col], 'Test', 'orange'),
            ]
        else:
            panels = [
                (X_train.loc[:, col], 'Train', 'blue'),
                (X_test.loc[:, col], 'Test', 'orange'),
            ]

        # Fold view shows raw counts; the two-panel view shows normalised histograms.
        for axis, (values, name, color) in zip(ax, panels):
            _plot_hist(values, bins, norm_hist=not use_fold, name=name, color=color, ax=axis)

        ax[0].set_title(col, fontsize=16)
        print(f'{col}')
        plt.show()
        # Release the figure so a long column list does not accumulate open figures.
        plt.close(fig)

In [None]:
# Split the rows of y by their 'accuracy_group' value (0 vs 1) and keep the
# first component of each index entry as a plain Python list.
group0_rows = y.loc[y['accuracy_group'] == 0]
group1_rows = y.loc[y['accuracy_group'] == 1]

X_idx = np.array(group0_rows.index.tolist())[:, 0].tolist()
T_ids = np.array(group1_rows.index.tolist())[:, 0].tolist()

In [None]:
# Compare the distributions of the features whose gain_score is at least 2,
# with the X_idx rows of X as "train" and the T_ids rows as "test".
high_score_mask = scores_df['gain_score'] >= 2
disp_feats = scores_df.loc[high_score_mask, 'feature'].tolist()

chk_distribution(
    X.loc[X_idx, disp_feats],
    X.loc[T_ids, disp_feats],
    chk_fold=None,
    figsize=(8,5),
    chk_cols=None,
)