In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc, sys, os, math, random
import datetime
import json
from pandas.io.json import json_normalize

import warnings
warnings.filterwarnings('ignore')

# Plot styling: seaborn "darkgrid" theme for every figure in the notebook.
sns.set_style('darkgrid')

# pandas display option
# Widen the notebook display limits so wide/long feature frames are not truncated.
pd.set_option('display.max_columns', 500)
# NOTE(review): 'display.max_row' and 'max_colwidth' are abbreviated option keys;
# confirm they still resolve to 'display.max_rows' / 'display.max_colwidth' on the
# installed pandas version.
pd.set_option('display.max_row', 1500)
pd.set_option('max_colwidth', 150)
# Render floats with two decimals.
pd.set_option('display.float_format', '{:.2f}'.format)
#pd.options.display.float_format = '{:,.3f}'.format

In [None]:
def seed_everything(seed=0):
    """Seed every global random source used by this notebook.

    Sets NumPy's global RNG, the ``PYTHONHASHSEED`` environment variable and
    Python's ``random`` module to ``seed``.
    """
    seed_text = str(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = seed_text
    random.seed(seed)


# Fix the seed once for the whole run.
seed_everything(42)

In [None]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """Downcast numeric columns of ``df`` in place to the narrowest dtype that fits.

    Signed-integer columns are cast to the smallest of int8/int16/int32/int64
    whose range contains the column's min and max (bounds inclusive).
    Float columns are cast to float16, float32 or float64 by the same range
    test.  Every other dtype (object/string, bool, unsigned int, datetime,
    categorical, nullable extension types) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are reassigned in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.

    Notes
    -----
    The float test only checks the value *range*; float16 keeps roughly three
    significant digits, so precision can be lost on float columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain numpy signed-int / float columns are safe to downcast here.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'if':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column whose max is exactly 127 fits int8.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN (empty / all-NaN column).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    # Guard the percentage against a zero-byte starting size.
    decreased = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Memory usage of dataframe is {:.2f} MB --> {:.2f} MB (Decreased by {:.1f}%)'.format(
        start_mem, end_mem, decreased))
    return df

In [None]:
# feature engineering functions
def get_object_columns(df, columns):
    """Count events per installation for each value of one categorical column.

    ``columns`` is the name of a single column of ``df``.  The result has one
    row per ``installation_id`` and one column per distinct value of that
    column, holding the number of ``event_id`` entries (0 where the
    combination never occurs).
    """
    per_pair = df.groupby(['installation_id', columns])['event_id'].count()
    counts_long = per_pair.reset_index()
    wide = counts_long.pivot_table(index='installation_id', columns=[columns], values='event_id')
    # Flatten to a plain list of labels so the column axis carries no name.
    wide.columns = list(wide.columns)
    wide = wide.fillna(0)
    return wide.reset_index()

# Load data

In [None]:
%%time

# Read the raw event logs and the training labels from the competition input folder.
train = pd.read_csv('../input/data-science-bowl-2019/train.csv')
test = pd.read_csv('../input/data-science-bowl-2019/test.csv')
train_labels = pd.read_csv('../input/data-science-bowl-2019/train_labels.csv')
#specs = pd.read_csv('../input/data-science-bowl-2019/specs.csv', converters={'args': json.loads})

# Keep only installations that appear in train_labels; the rest have no target.
train = train[train['installation_id'].isin(train_labels['installation_id'].unique())]

# Downcast numeric columns to cut memory before feature engineering.
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

In [None]:
# Quick look at the first rows of the filtered training events.
train.head()

# Feature engineering

## extract timestamp

In [None]:
# Derive calendar features from each event's timestamp (columns added in place).
#   _is_weekend   : 1 for Saturday/Sunday, else 0
#   _phase_of_day : 'Morning' for hours 6-11, 'Evening' for hours 12-18, 'Night' otherwise
# The 'Evening' window starts at 12 so the noon hour no longer falls through to 'Night'.
for df in [train, test]:
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    hour = df['timestamp'].dt.hour
    # dayofweek: Monday=0 ... Saturday=5, Sunday=6
    df['_is_weekend'] = np.where(df['timestamp'].dt.dayofweek >= 5, 1, 0)
    df['_phase_of_day'] = np.where(
        hour.isin(range(6, 12)), 'Morning',
        np.where(hour.isin(range(12, 19)), 'Evening', 'Night'))

## pick correct value

In [None]:
# Flag every assessment attempt as correct / incorrect (columns added in place).
# An attempt is event_code 4110 for 'Bird Measurer (Assessment)' and 4100 for every
# other title, and the row must be of type 'Assessment'.
for df in [train, test]:
    # The title/event_code alternatives are parenthesised as a unit: `&` binds tighter
    # than `|`, so without the grouping the type filter would only apply to the second
    # alternative.
    is_assessment = (
        ((df['title'].eq('Bird Measurer (Assessment)') & df['event_code'].eq(4110)) |
         (~df['title'].eq('Bird Measurer (Assessment)') & df['event_code'].eq(4100))) &
        df['type'].eq('Assessment'))
    df['correct'] = 0
    # regex=False: the needle is a literal fragment of the event_data JSON text.
    df.loc[is_assessment & df['event_data'].str.contains('"correct":true', regex=False), 'correct'] = 1
    df['incorrect'] = 0
    df.loc[is_assessment & df['event_data'].str.contains('"correct":false', regex=False), 'incorrect'] = 1

## accuracy group 

In [None]:
def accuracy_group(x):
    '''
    Map a session's attempt totals to its accuracy group.

    `x` is a mapping/row exposing 'correct' and 'incorrect' counts.
    0: no correct attempt was recorded
    3: solved with no incorrect attempt
    2: solved with exactly one incorrect attempt
    1: solved with any other number of incorrect attempts
    '''
    if x['correct'] == 0:
        return 0
    misses = x['incorrect']
    return 3 if misses == 0 else (2 if misses == 1 else 1)

# Sum the per-event correct/incorrect flags within each (installation, session)
# and turn the totals into an accuracy group, row by row.
train_correct = train.groupby(['installation_id','game_session'])[['correct','incorrect']].sum().reset_index()
train_correct['accuracy_group'] = train_correct.apply(accuracy_group, axis=1)

# Same aggregation for the test events.
test_correct = test.groupby(['installation_id','game_session'])[['correct','incorrect']].sum().reset_index()
test_correct['accuracy_group'] = test_correct.apply(accuracy_group, axis=1)

## summary events by session

In [None]:
## drop assessment events
#train = train.query("(type!='Assessment') | (event_count==1)")
#test = test.query("(type!='Assessment') | (event_count==1)")

In [None]:
# For the listed event codes, replace the numeric event_code by the row's event_id,
# so those events are counted per event_id rather than per code.
# NOTE(review): this writes event_id values into the event_code column, so the column
# (and the pivoted column labels below) mixes two kinds of values — confirm this is
# supported on the installed pandas version.
as_event_id = [4070,4035,4030,4020,3020,3021,3120,3121,2030]
is_event_in = (train['event_code'].isin(as_event_id))
train.loc[is_event_in, 'event_code'] = train[is_event_in]['event_id']
is_event_in = (test['event_code'].isin(as_event_id))
test.loc[is_event_in, 'event_code'] = test[is_event_in]['event_id']

# for df in [train, test]:
#     df['event_code'] = list(map(lambda x, y: str(x) + '_' + str(y), df['title'], df['event_code']))

# Distinct event codes seen in train; used below as the list of count columns.
event_codes = list(train['event_code'].unique())

# One row per (installation, session, type), one column per event code, cell = number of events.
# NOTE(review): the row order produced by unstack() defines what "previous session"
# means for the cumulative features computed later — confirm it is chronological.
group_cols = ['installation_id','game_session','type','event_code']
train_session_events = train.groupby(group_cols, sort=False)['event_id'].count().unstack().reset_index().fillna(0)
#train_session_events.loc[train_session_events['type'] == 'Assessment', event_codes] = 0
test_session_events = test.groupby(group_cols, sort=False)['event_id'].count().unstack().reset_index().fillna(0)
# Add zero-filled columns for train event codes that never occur in test.
for c in [c for c in event_codes if c not in test_session_events.columns]: test_session_events[c] = 0
#test_session_events.loc[test_session_events['type'] == 'Assessment', event_codes] = 0

In [None]:
# Attach the per-session correct/incorrect totals and accuracy_group.
# No `on=` is given, so pandas joins on the columns the two frames share.
train_session_events = pd.merge(train_session_events, train_correct)
test_session_events = pd.merge(test_session_events, test_correct)

# The label frames are no longer needed once merged.
del train_correct, test_correct

## cumulative by installation

In [None]:
# Running index of each session within its installation (0 for the first row).
train_session_events['_cumcount_inst_session'] = train_session_events.groupby(['installation_id'], sort=False).cumcount()
test_session_events['_cumcount_inst_session'] = test_session_events.groupby(['installation_id'], sort=False).cumcount()

# Replace each session's own event counts by the counts accumulated over the
# installation's earlier rows: cumulative sum minus the current row.
cumsum_events = train_session_events.groupby(['installation_id'], sort=False)[event_codes].cumsum()
cumsum_events -= train_session_events[event_codes]
train_session_events = train_session_events.drop(event_codes, axis=1).join(cumsum_events)
# Total number of events recorded in earlier rows, across all event codes.
train_session_events['_cumcount_inst_event'] = train_session_events[event_codes].sum(axis=1)

# Same transformation for test.
cumsum_events = test_session_events.groupby(['installation_id'], sort=False)[event_codes].cumsum()
cumsum_events -= test_session_events[event_codes]
test_session_events = test_session_events.drop(event_codes, axis=1).join(cumsum_events)
test_session_events['_cumcount_inst_event'] = test_session_events[event_codes].sum(axis=1)

del cumsum_events

## events + last-data

In [None]:
# Event-level columns that make no sense on a one-row-per-session frame.
drop_event_cols = ['event_id','event_data','event_code','correct','incorrect']

# Keep the LAST event row of every session (its remaining columns describe the session's
# final event), then overwrite the timestamp with that of the session's FIRST event.
# NOTE(review): `.values` assigns positionally, which assumes keep='last' and keep='first'
# list the sessions in the same order — confirm.
train_session_last = train.drop_duplicates(subset=['installation_id','game_session'], keep='last').drop(drop_event_cols, axis=1)
train_session_last['timestamp'] = train.drop_duplicates(subset=['installation_id','game_session'], keep='first')['timestamp'].values
# Left-join the session attributes onto the cumulative event features (join keys = shared columns).
train_sessions = pd.merge(train_session_events, train_session_last, how='left')

# Same for test.
test_session_last = test.drop_duplicates(subset=['installation_id','game_session'], keep='last').drop(drop_event_cols, axis=1)
test_session_last['timestamp'] = test.drop_duplicates(subset=['installation_id','game_session'], keep='first')['timestamp'].values
test_sessions = pd.merge(test_session_events, test_session_last, how='left')

# Free the intermediate frames.
del train_session_last, test_session_last
del train_session_events, test_session_events

## cumulative session by installation

In [None]:
def _add_installation_history(sessions):
    """Add per-installation history features to a one-row-per-session frame, in place.

    For every row, the features summarise the installation's EARLIER rows only:

    * ``_cumsum_inst_<c>``  – running sum minus the current row, for
      event_count, game_time, correct and incorrect;
    * ``_cummean_inst_<c>`` – that sum divided by (row position within the
      installation + 1), for event_count and game_time;
    * ``_cummax_inst_<c>``  – running maximum over the earlier rows (NaN on an
      installation's first row), for event_count and game_time;
    * ``_cumsum_attempt`` / ``_cummean_inst_accuracy`` – earlier attempts and
      the share of them that were correct (NaN when there were none).
    """
    by_inst = sessions.groupby(['installation_id'], sort=False)

    for c in ['event_count','game_time','correct','incorrect']:
        sessions[f'_cumsum_inst_{c}'] = by_inst[c].cumsum() - sessions[c]

    for c in ['event_count','game_time']:
        sessions[f'_cummean_inst_{c}'] = sessions[f'_cumsum_inst_{c}'] / (by_inst[c].cumcount() + 1)
        # Shift WITHIN each installation so the previous maximum can never be taken
        # from a different installation's rows; the first row of a group becomes NaN.
        running_max = by_inst[c].cummax()
        sessions[f'_cummax_inst_{c}'] = running_max.groupby(sessions['installation_id'], sort=False).shift(1)

    sessions['_cumsum_attempt'] = sessions['_cumsum_inst_correct'] + sessions['_cumsum_inst_incorrect']
    sessions['_cummean_inst_accuracy'] = sessions['_cumsum_inst_correct'] / sessions['_cumsum_attempt']


_add_installation_history(train_sessions)
_add_installation_history(test_sessions)

## assessment accuracy

In [None]:
# Restrict the session frames to assessment sessions; these rows become the modelling rows.
train_asmt_sessions = train_sessions[train_sessions['type'] == 'Assessment']
test_asmt_sessions = test_sessions[test_sessions['type'] == 'Assessment']

# How many earlier rows of the same assessment title this installation already has
# (0 for the first occurrence, counted in row order).
# NOTE(review): columns are assigned on a filtered selection of *_sessions; pandas may
# emit SettingWithCopyWarning here (warnings are silenced at the top of the notebook).
group_cols = ['installation_id','title']
train_asmt_sessions['_cumcount_inst_title'] = train_asmt_sessions.groupby(group_cols, sort=False)['game_session'].cumcount()
test_asmt_sessions['_cumcount_inst_title'] = test_asmt_sessions.groupby(group_cols, sort=False)['game_session'].cumcount()

In [None]:
## kappa 0.36
# Aggregate accuracy by title: share of each accuracy group among the labelled
# training assessments of that title.
acc_group_pct = train_labels.groupby('title').apply(lambda x: x['accuracy_group'].value_counts(normalize=True)).unstack()
# Columns are renamed by position.
# NOTE(review): this assumes the unstacked columns are exactly groups 0..3 in that order — confirm.
acc_group_pct.columns = [f'_pct_title_acc_group_{i}' for i in range(4)]
acc_group_pct = acc_group_pct.reset_index()

# Share of assessments of the title that ended in groups 1-3.
acc_group_pct['_pct_title_correct'] = acc_group_pct[[f'_pct_title_acc_group_{i}' for i in range(1,4)]].sum(axis=1)

# Attach the title-level shares to every assessment row (join keys = shared columns).
train_asmt_sessions = pd.merge(train_asmt_sessions, acc_group_pct, how='left')
test_asmt_sessions = pd.merge(test_asmt_sessions, acc_group_pct, how='left')

In [None]:
# Session accuracy = correct / (correct + incorrect); sessions without attempts (0/0 -> NaN) become 0.
train_asmt_sessions['accuracy'] = ((train_asmt_sessions['correct']) / (train_asmt_sessions['correct'] + train_asmt_sessions['incorrect'])).fillna(0)
test_asmt_sessions['accuracy'] = ((test_asmt_sessions['correct']) / (test_asmt_sessions['correct'] + test_asmt_sessions['incorrect'])).fillna(0)

# Accuracy of the installation's previous assessment row (NaN for the first one).
train_asmt_sessions['_prev_asmt_accuracy'] = train_asmt_sessions.groupby(['installation_id'], sort=False)['accuracy'].shift(1)
test_asmt_sessions['_prev_asmt_accuracy'] = test_asmt_sessions.groupby(['installation_id'], sort=False)['accuracy'].shift(1)

# Accuracy of the installation's previous assessment row of the SAME title.
train_asmt_sessions['_prev_title_accuracy'] = train_asmt_sessions.groupby(['installation_id','title'], sort=False)['accuracy'].shift(1)
test_asmt_sessions['_prev_title_accuracy'] = test_asmt_sessions.groupby(['installation_id','title'], sort=False)['accuracy'].shift(1)

## accuracy by title

In [None]:
key_cols = ['installation_id', 'game_session']

# Pool train and test assessment rows, then remove the last assessment row of every
# test installation, so only rows other than the flagged ones feed the statistics below.
subset = pd.concat([train_asmt_sessions, test_asmt_sessions])
last_asmt_sessions = test_asmt_sessions.drop_duplicates(subset=['installation_id'], keep='last')[key_cols] # test only
last_asmt_sessions['last_session'] = 1
subset = pd.merge(subset, last_asmt_sessions, on=key_cols, how='left')
# NOTE(review): inplace fillna on a selected column may not write back on every pandas
# version; the filter below still keeps NaN rows, because NaN != 1.
subset['last_session'].fillna(0, inplace=True)
subset = subset[subset['last_session'] != 1]
del last_asmt_sessions
subset.shape

In [None]:
# kappa 0.36
# Title-level statistics of accuracy / correct / incorrect over the pooled subset.
aggmap = {
    'accuracy': ['mean','std'],
    'correct': ['mean','std'],
    'incorrect': ['mean','max','std']
}
title_asmt = subset.groupby(['title']).agg(aggmap)
# Flatten the (column, statistic) pairs into names such as '_mean_title_accuracy'.
title_asmt.columns = [x[0] if not x[1] else f"_{x[1]}_title_{x[0]}" for x in title_asmt.columns]
title_asmt = title_asmt.reset_index()

# Attach the title statistics to every assessment row.
train_asmt_sessions = pd.merge(train_asmt_sessions, title_asmt, on=['title'], how='left')
test_asmt_sessions = pd.merge(test_asmt_sessions, title_asmt, on=['title'], how='left')

In [None]:
# kappa 0.38
# Share of each accuracy group per (title, nth occurrence of that title for the installation).
# The columns get their own '_pct_title_cumcnt_acc_group_*' names and the merge keys are
# explicit: reusing the title-level '_pct_title_acc_group_*' names already present on the
# session frames would turn them into join keys and the merge would add no columns at all.
acc_cumcnt_title_pct = (
    subset.groupby(['title','_cumcount_inst_title'])['accuracy_group']
    .value_counts(normalize=True)
    .unstack()
    # Guarantee one column per accuracy group 0..3, in order, even if a group never occurs.
    .reindex(columns=range(4)))
acc_cumcnt_title_pct.columns = [f'_pct_title_cumcnt_acc_group_{i}' for i in range(4)]
acc_cumcnt_title_pct = acc_cumcnt_title_pct.reset_index()

train_asmt_sessions = pd.merge(train_asmt_sessions, acc_cumcnt_title_pct,
                               on=['title','_cumcount_inst_title'], how='left')
test_asmt_sessions = pd.merge(test_asmt_sessions, acc_cumcnt_title_pct,
                              on=['title','_cumcount_inst_title'], how='left')

del acc_cumcnt_title_pct

In [None]:
## kappa 0
# One-hot encode each row's accuracy group, then accumulate per installation and subtract
# the current row: the '_cumsum_inst_acc_group_<g>' columns count how often the
# installation reached group g in its earlier assessment rows.
acc_onehot = pd.get_dummies(train_asmt_sessions['accuracy_group'], prefix='_cumsum_inst_acc_group')
train_inst_acc_onehot = train_asmt_sessions[['installation_id']].join(acc_onehot)
train_asmt_sessions = train_asmt_sessions.join(train_inst_acc_onehot.groupby(['installation_id'], sort=False).cumsum() - acc_onehot)
# '_pct_cumsum_inst_acc_group_<g>': share of earlier rows in group g (0/0 -> NaN on the first row).
for c in acc_onehot.columns:
    train_asmt_sessions['_pct' + c] = train_asmt_sessions[c] / train_asmt_sessions[acc_onehot.columns].sum(axis=1)

# Same for test.
# NOTE(review): get_dummies only creates columns for groups present in the frame, so
# test may end up with fewer columns than train — confirm.
acc_onehot = pd.get_dummies(test_asmt_sessions['accuracy_group'], prefix='_cumsum_inst_acc_group')
test_inst_acc_onehot = test_asmt_sessions[['installation_id']].join(acc_onehot)
test_asmt_sessions = test_asmt_sessions.join(test_inst_acc_onehot.groupby(['installation_id'], sort=False).cumsum() - acc_onehot)
for c in acc_onehot.columns:
    test_asmt_sessions['_pct' + c] = test_asmt_sessions[c] / test_asmt_sessions[acc_onehot.columns].sum(axis=1)

## time durations

In [None]:
# ## kappa 0.42
# Elapsed time between consecutive rows of the same installation, in seconds.
# `.dt.total_seconds()` is used so the whole gap is measured: `.dt.seconds` would return
# only the seconds component of the timedelta (0-86399) and drop whole days.

# Gap between consecutive ASSESSMENT rows.
train_asmt_sessions['_diff_asmt_seconds'] = train_asmt_sessions.groupby(['installation_id'], sort=False)['timestamp'].diff().dt.total_seconds()
test_asmt_sessions['_diff_asmt_seconds'] = test_asmt_sessions.groupby(['installation_id'], sort=False)['timestamp'].diff().dt.total_seconds()

# Gap between consecutive rows of ANY session type, joined back on the session keys.
time_diff = pd.Series(train_sessions.groupby(['installation_id'], sort=False)['timestamp'].diff().dt.total_seconds(), name='_diff_session_seconds')
train_time_diff = train_sessions[['installation_id','game_session']].join(time_diff)
train_asmt_sessions = pd.merge(train_asmt_sessions, train_time_diff, how='left')

time_diff = pd.Series(test_sessions.groupby(['installation_id'], sort=False)['timestamp'].diff().dt.total_seconds(), name='_diff_session_seconds')
test_time_diff = test_sessions[['installation_id','game_session']].join(time_diff)
test_asmt_sessions = pd.merge(test_asmt_sessions, test_time_diff, how='left')

In [None]:
# Game-time increment between consecutive events of the same session (NaN on a session's first event).
train['_diff_game_time'] = train.groupby(['installation_id','game_session'], sort=False)['game_time'].diff()
# Assessment events are excluded from the timing statistic by forcing their increment to 0.
train.loc[train['type'] == 'Assessment', '_diff_game_time'] = 0
# 1 where the increment is not 0.
# NOTE(review): NaN != 0 is True, so the first event of a non-assessment session is flagged too — confirm intended.
train['_isnot_diff'] = (train['_diff_game_time'] != 0).astype(int)

# Same for test.
test['_diff_game_time'] = test.groupby(['installation_id','game_session'], sort=False)['game_time'].diff()
test.loc[test['type'] == 'Assessment', '_diff_game_time'] = 0
test['_isnot_diff'] = (test['_diff_game_time'] != 0).astype(int)

In [None]:
## kappa 0.44
col_name = '_cummean_diff_event_seconds'

# Running sum of game-time increments per installation, excluding the current event,
# divided by the running count of flagged (non-zero increment) events.
time_cummean = train.groupby(['installation_id'], sort=False)['_diff_game_time'].cumsum()
time_cummean = (time_cummean - train['_diff_game_time']) / train.groupby(['installation_id'], sort=False)['_isnot_diff'].cumsum()
# Keep the value at each session's last event and attach it to the assessment rows.
subset = train[['installation_id','game_session']].join(pd.Series(time_cummean,name=col_name)).drop_duplicates(subset=['installation_id','game_session'], keep='last')
train_asmt_sessions = pd.merge(train_asmt_sessions, subset, how='left')

# Same for test.
time_cummean = test.groupby(['installation_id'], sort=False)['_diff_game_time'].cumsum()
time_cummean = (time_cummean - test['_diff_game_time']) / test.groupby(['installation_id'], sort=False)['_isnot_diff'].cumsum()
subset = test[['installation_id','game_session']].join(pd.Series(time_cummean,name=col_name)).drop_duplicates(subset=['installation_id','game_session'], keep='last')
test_asmt_sessions = pd.merge(test_asmt_sessions, subset, how='left')

del time_cummean

In [None]:
# The raw event frames (and the loop variable that still references one of them) are no longer needed.
del train, test, df

## aggregate by installation

In [None]:
## kappa 0.41 + 0.04 = 0.45

group_cols = ['installation_id','game_session','type']

# One row per (installation, session), one column per session type (1 for the session's
# own type, 0 elsewhere), then accumulated per installation.  The running count is NOT
# reduced by the current row, so it includes the current session.
subset = train_sessions.groupby(group_cols, sort=False)['game_session'].count().unstack().fillna(0)
subset = subset.groupby(['installation_id'], sort=False).cumsum()
subset.columns = [f"_cumcount_inst_type_{x}" for x in subset.columns]
train_inst_cumsum = subset.reset_index()

# Same for test.
subset = test_sessions.groupby(group_cols, sort=False)['game_session'].count().unstack().fillna(0)
subset = subset.groupby(['installation_id'], sort=False).cumsum()
subset.columns = [f"_cumcount_inst_type_{x}" for x in subset.columns]
test_inst_cumsum = subset.reset_index()

In [None]:
## cappa 0.41

def diff_inst(df, cols, prefix='_is_diff_prev'):
    """Flag rows whose value differs from the installation's previous row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'installation_id', 'game_session' and every name in ``cols``.
        It is not modified.
    cols : list of str
        Columns to compare against the previous row of the same installation.
    prefix : str
        Prefix of the generated flag columns (``f'{prefix}_{col}'``).

    Returns
    -------
    pandas.DataFrame
        'installation_id', 'game_session' and one 0/1 flag per entry of ``cols``.
        The first row of every installation has no predecessor and is flagged 1.
    """
    pick_cols = ['installation_id','game_session']
    # Work on an explicit copy so the new columns are never written to a slice of `df`.
    subset = df[pick_cols + cols].copy()
    by_inst = subset.groupby(['installation_id'], sort=False)
    for col in cols:
        previous = by_inst[col].shift(1)
        subset[f'{prefix}_{col}'] = (subset[col] != previous).astype(int)
    return subset.drop(columns=cols)

# subset = diff_inst(train_sessions, ['type','title'])
# train_inst_sessions = pd.merge(train_inst_cumsum, subset, how='left')

# subset = diff_inst(test_sessions, ['type','title'])
# test_inst_sessions = pd.merge(test_inst_cumsum, subset, how='left')


# train_inst_sessions['_cumsum_inst_diff_title'] = train_inst_sessions.groupby(['installation_id'], sort=False)['_is_diff_prev_title'].cumsum()
# train_inst_sessions['_cumsum_inst_diff_title'] -= train_inst_sessions['_is_diff_prev_title']
# test_inst_sessions['_cumsum_inst_diff_title'] = test_inst_sessions.groupby(['installation_id'], sort=False)['_is_diff_prev_title'].cumsum()
# test_inst_sessions['_cumsum_inst_diff_title'] -= test_inst_sessions['_is_diff_prev_title']


# Flag assessments whose title differs from the installation's previous assessment,
# and left-join the flag onto the cumulative per-type session counts.
subset = diff_inst(train_asmt_sessions, ['title'], prefix='_is_diff_prev_asmt')
train_inst_sessions = pd.merge(train_inst_cumsum, subset, how='left')

subset = diff_inst(test_asmt_sessions, ['title'], prefix='_is_diff_prev_asmt')
test_inst_sessions = pd.merge(test_inst_cumsum, subset, how='left')

# Free the inputs that have been folded into *_inst_sessions.
del train_inst_cumsum, test_inst_cumsum
del train_sessions, test_sessions

In [None]:
# Attach the installation-level features to each assessment row via the session keys.
train_asmt_sessions = pd.merge(train_asmt_sessions, train_inst_sessions, on=['installation_id','game_session'], how='left')
test_asmt_sessions = pd.merge(test_asmt_sessions, test_inst_sessions, on=['installation_id','game_session'], how='left')

In [None]:
# The installation-level frames are merged; release them.
del train_inst_sessions, test_inst_sessions

## Prepare

In [None]:
# Preview the engineered (non event-count) columns.
# NOTE(review): head() is applied before sort_values(), so only the first five rows are
# sorted — confirm that is the intended preview.
train_asmt_sessions[[c for c in train_asmt_sessions.columns if c not in event_codes]].head().sort_values(by=['installation_id','timestamp']).head()

In [None]:
# Build the modelling matrices.
#   X_test  : the last assessment row of every test installation (the row to predict)
#   X_train : all train assessment rows PLUS the earlier test assessment rows
X_train = train_asmt_sessions.copy()
X_test = test_asmt_sessions.copy()

key_cols = ['installation_id','game_session']
# Mark the last assessment row of each test installation.
test_last_asmt_sessions = test_asmt_sessions.drop_duplicates(subset=['installation_id'], keep='last')[key_cols].copy()
test_last_asmt_sessions['last_session'] = 1
X_test = pd.merge(X_test, test_last_asmt_sessions, on=key_cols, how='left')

is_last = X_test['last_session'] == 1
# Earlier test rows, aligned to X_train's columns so the training feature set is unchanged.
test_history = X_test[~is_last].drop(['last_session'], axis=1).reindex(columns=X_train.columns)
# The combined frame must be assigned back (appending does not modify X_train in place).
# ignore_index keeps a 0..n-1 index, which the fold indices used during training rely on.
X_train = pd.concat([X_train, test_history], ignore_index=True)
X_test = X_test[is_last].drop(['last_session'], axis=1)

In [None]:
# Bucket the running assessment count into (up to) five quantile bins learned on train.
# duplicates='drop' tolerates repeated quantile edges instead of raising.
_, bins = pd.qcut(X_train['_cumcount_inst_type_Assessment'], 5, retbins=True, duplicates='drop')
# Open both ends so the smallest training value and any out-of-range test value get a bin.
bins[0], bins[-1] = -np.inf, np.inf

# labels=False returns the ordinal bin number, so a given bin has the same code in train
# and test (numbering bins by order of first appearance would differ between the frames).
X_train['_cumcount_inst_asmt_bin'] = pd.cut(X_train['_cumcount_inst_type_Assessment'], bins=bins, labels=False)
X_test['_cumcount_inst_asmt_bin'] = test_asmt_group = pd.cut(X_test['_cumcount_inst_type_Assessment'], bins=bins, labels=False)

In [None]:
# Grouping key for group-aware cross-validation, and the skeleton of the submission file.
train_group = X_train['installation_id']
# NOTE(review): `submission` is a column selection of X_test and gets a new column assigned
# later; consider `.copy()` if a SettingWithCopyWarning matters.
submission = X_test[['installation_id']]

# Target: the regression is trained on the REVERSED scale |accuracy_group - 3|
# (3 -> 0, 0 -> 3); the original labels are kept in y_train_org.
y_train_org = X_train.pop('accuracy_group')
y_train = np.abs(y_train_org - 3) # for regression
#y_train = y_train_org
X_test = X_test.drop('accuracy_group', axis=1)

X_train.shape, X_test.shape

In [None]:
# Identifier, raw and target-derived columns that must not be used as features.
drop_cols = [
    'installation_id',
    'game_session',
    'type',
    'game_time',
    'timestamp',
    'event_count',
    'accuracy',
    'correct',
    'incorrect'
]

# Also drop near-constant columns: those whose most frequent value covers at least 99% of the rows.
# NOTE(review): `.iloc[0]` raises on a column with no countable values (e.g. all NaN), and a
# column already listed above can be appended a second time — confirm both are acceptable.
for col in X_train.columns.values:
    counts = X_train[col].value_counts().iloc[0]
    if (counts / X_train.shape[0]) >= 0.99:
        drop_cols.append(col)

# The same list is removed from both matrices so their feature sets stay aligned.
X_train.drop(drop_cols, inplace=True, axis=1)
X_test.drop(drop_cols, inplace=True, axis=1)
    
drop_cols

In [None]:
# Alphabetical list of the remaining engineered features (event-count columns omitted).
np.sort([c for c in X_train.columns if c not in event_codes]).tolist()

In [None]:
# Disabled cleanup, kept as a bare string so it is not executed.
'''
del train_asmt_sessions, test_asmt_sessions
'''

gc.collect()

# Show the ten largest names in the notebook namespace according to sys.getsizeof.
# NOTE(review): eval() is applied to names returned by dir(), i.e. only to identifiers
# that already exist in this namespace.
print(pd.DataFrame([[val for val in dir()], [sys.getsizeof(eval(val)) for val in dir()]],
                   index=['name','size']).T.sort_values('size', ascending=False).reset_index(drop=True)[:10])

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every object-typed feature with one encoder fitted on the union of the
# train and test values, so both matrices share the same codes.
# enc_cols collects the encoded column names; it is later passed to LightGBM as the
# list of categorical features.
enc_cols = []
# Series.items() is used for the (name, dtype) pairs: it exists on every pandas version,
# whereas Series.iteritems() was removed in pandas 2.0.
for f, t in X_train.dtypes.items():
    if t == object:
        enc_cols.append(f)
        le = LabelEncoder()
        le.fit(list(set(X_train[f].unique()).union(set(X_test[f].unique()))))
        X_train[f] = le.transform(X_train[f].values).astype(int)
        X_test[f] = le.transform(X_test[f].values).astype(int)
print(enc_cols)

# Predict

In [None]:
from numba import jit
from functools import partial
import scipy as sp

@jit
def qwk(a1, a2):
    """Quadratic weighted kappa between two rating sequences with values 0..3.

    `a1` and `a2` are converted to integer arrays of equal length.  Returns
    1 - o / e, where `o` is the sum of squared rating differences and `e` is the
    same quantity expected from the two rating histograms.
    """
    # Highest rating value; histograms therefore have max_rat + 1 slots.
    max_rat = 3
    a1 = np.asarray(a1, dtype=int)
    a2 = np.asarray(a2, dtype=int)

    hist1 = np.zeros((max_rat + 1, ))
    hist2 = np.zeros((max_rat + 1, ))

    # Observed disagreement: squared difference summed over all pairs,
    # while filling the rating histograms of both sequences.
    o = 0
    for k in range(a1.shape[0]):
        i, j = a1[k], a2[k]
        hist1[i] += 1
        hist2[j] += 1
        o +=  (i - j) * (i - j)

    # Expected disagreement from the outer product of the two histograms.
    e = 0
    for i in range(max_rat + 1):
        for j in range(max_rat + 1):
            e += hist1[i] * hist2[j] * (i - j) * (i - j)

    e = e / a1.shape[0]

    # NOTE(review): e is 0 when both sequences hold one and the same single rating;
    # the division is not guarded.
    return 1 - o / e

class OptimizedRounder(object):
    """Learn three cut points that turn continuous predictions into classes 0..3.

    ``fit`` searches (Nelder-Mead) for the thresholds that maximise the
    quadratic weighted kappa between the labels and the bucketed predictions;
    ``predict`` buckets predictions with a given (or the fitted) set of
    thresholds.
    """

    def __init__(self):
        # Placeholder until fit() stores the optimiser result.
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        """Negative kappa of `y` against `X` bucketed by the sorted thresholds `coef`."""
        X_p = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])
        return -qwk(y, X_p)

    def fit(self, X, y):
        """Optimise the thresholds on predictions `X` and labels `y`; result is kept in coef_."""
        # Import the submodule explicitly: `scipy.optimize` is only reachable as an
        # attribute of the top-level package if something has already imported it.
        from scipy import optimize

        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [0.5, 1.5, 2.5]
        self.coef_ = optimize.minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef=None):
        """Bucket `X` into classes 0..3.

        `coef` holds the three thresholds (sorted internally); when omitted, the
        thresholds found by fit() are used.
        """
        if coef is None:
            coef = self.coefficients()
        return pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])

    def coefficients(self):
        """Return the fitted thresholds (the 'x' entry of the optimiser result)."""
        return self.coef_['x']


def div_by_sum(x):
    """Scale `x` by its own total, i.e. return x / x.sum()."""
    total = x.sum()
    return x / total


def print_divider(text):
    """Print `text` framed by dashes, with a blank line before and after."""
    banner = f'\n---------- {text} ----------\n'
    print(banner)


In [None]:
from sklearn.metrics import cohen_kappa_score
from collections import Counter

def eval_qwk_lgb_regr(y_true, y_pred):
    """Quadratic-weighted-kappa eval function for ``lgb.train(feval=...)``.

    LightGBM calls a custom eval function as ``feval(preds, eval_data)``.
    Despite the parameter names (kept for backward compatibility), the FIRST
    argument therefore holds the raw predictions and the SECOND the
    ``lgb.Dataset`` whose ``get_label()`` gives the true labels.

    The predictions are bucketed into classes 0..3 at the percentiles that
    reproduce the class distribution of the global ``y_train``, then compared
    with the labels.

    Returns
    -------
    tuple
        ``('cappa', kappa, True)`` — metric name, value, higher-is-better flag.
    """
    preds = np.asarray(y_true, dtype=float)
    labels = np.asarray(y_pred.get_label())

    # Class frequencies of the training target.
    dist = Counter(y_train)
    total = len(y_train)

    # Cumulative class shares -> percentile cut points of the PREDICTIONS.
    acum = 0
    bounds = []
    for i in range(3):
        acum += dist[i] / total
        bounds.append(np.percentile(preds, acum * 100))

    # x <= bounds[0] -> 0, <= bounds[1] -> 1, <= bounds[2] -> 2, else 3.
    classes = np.searchsorted(bounds, preds, side='left').reshape(labels.shape)

    return 'cappa', qwk(labels, classes), True
#    return 'cappa', cohen_kappa_score(y_true, y_pred, weights='quadratic'), True

In [None]:
from collections import Counter, defaultdict
from sklearn.utils import check_random_state

class RepeatedStratifiedGroupKFold():
    """Repeated K-fold splitter that keeps groups intact and balances label counts.

    Every group is assigned greedily to the fold where it keeps the per-label
    distribution across folds most even; the procedure is repeated
    ``n_repeats`` times with a freshly shuffled group order.
    """

    def __init__(self, n_splits=5, n_repeats=1, random_state=None):
        # n_splits: folds per repeat; n_repeats: number of repeats;
        # random_state: passed to sklearn's check_random_state in split().
        self.n_splits = n_splits
        self.n_repeats = n_repeats
        self.random_state = random_state
        
    # Implementation based on this kaggle kernel:
    #    https://www.kaggle.com/jakubwasikowski/stratified-group-k-fold-cross-validation
    def split(self, X, y=None, groups=None):
        """Yield (train_indices, test_indices) for n_repeats * n_splits folds.

        `y` must hold non-negative integer labels (they index count arrays) and
        `groups` the group id of each sample; `X` is not read.  The indices are
        positions in `groups`.
        """
        k = self.n_splits
        def eval_y_counts_per_fold(y_counts, fold):
            # Score of tentatively adding a group's label counts to `fold`: mean over
            # labels of the std, across folds, of each fold's share of that label.
            # The counts are added, scored, then removed again.
            y_counts_per_fold[fold] += y_counts
            std_per_label = []
            for label in range(labels_num):
                label_std = np.std(
                    [y_counts_per_fold[i][label] / y_distr[label] for i in range(k)]
                )
                std_per_label.append(label_std)
            y_counts_per_fold[fold] -= y_counts
            return np.mean(std_per_label)
            
        # One RNG for all repeats, so each repeat shuffles the groups differently.
        rnd = check_random_state(self.random_state)
        for repeat in range(self.n_repeats):
            # Label counts per group and overall.
            labels_num = np.max(y) + 1
            y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
            y_distr = Counter()
            for label, g in zip(y, groups):
                y_counts_per_group[g][label] += 1
                y_distr[label] += 1

            y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
            groups_per_fold = defaultdict(set)
        
            groups_and_y_counts = list(y_counts_per_group.items())
            rnd.shuffle(groups_and_y_counts)

            # Place the groups with the most uneven label counts first; each group goes
            # to the fold with the lowest imbalance score.
            for g, y_counts in sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])):
                best_fold = None
                min_eval = None
                for i in range(k):
                    fold_eval = eval_y_counts_per_fold(y_counts, i)
                    if min_eval is None or fold_eval < min_eval:
                        min_eval = fold_eval
                        best_fold = i
                y_counts_per_fold[best_fold] += y_counts
                groups_per_fold[best_fold].add(g)

            # Emit one (train, test) pair per fold; the fold's groups form the test part.
            all_groups = set(groups)
            for i in range(k):
                train_groups = all_groups - groups_per_fold[i]
                test_groups = groups_per_fold[i]

                train_indices = [i for i, g in enumerate(groups) if g in train_groups]
                test_indices = [i for i, g in enumerate(groups) if g in test_groups]

                yield train_indices, test_indices

In [None]:
%%time

import lightgbm as lgb
from sklearn.model_selection import KFold,StratifiedKFold,GroupKFold
from sklearn.model_selection import StratifiedShuffleSplit,GroupShuffleSplit,TimeSeriesSplit

# LightGBM booster parameters: gradient-boosted regression evaluated with RMSE.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    #'eval_metric': 'cappa',
    'learning_rate': 0.005,
    'bagging_fraction': 0.90,
    'feature_fraction': 0.75,
    #'random_state': 42,
}

# Keyword arguments forwarded to lgb.train.
# NOTE(review): 'verbose_eval' and 'early_stopping_rounds' are passed as lgb.train keyword
# arguments — confirm the installed LightGBM version still accepts them.
fit_params = {
    'num_boost_round': 5000,
    'verbose_eval': 1000,
    'early_stopping_rounds': 100,
}

# Accumulators, averaged over all trained models via the divisor `n` below.
fi_split = np.zeros(X_train.shape[1])
fi_gain = np.zeros(X_train.shape[1])
oof_pred = np.zeros(len(X_train))
pred_test = np.zeros(len(X_test))
coff_avg = np.zeros(3)
# Number of out-of-fold predictions collected per training row.
pred_n = np.zeros(len(X_train))

NTIMES=5
NFOLDS=5

#for t in range(NTIMES):
# The outer loop runs once; the repeats are produced by the splitter (n_repeats=NTIMES).
for t in range(1):
    print_divider(f'Time: {t+1}')

    fold_enums = [
        ('RepeatedStratifiedGroupKFold', list(RepeatedStratifiedGroupKFold(n_splits=NFOLDS, n_repeats=NTIMES).split(X_train, y_train, groups=train_group)))
        #('StratifiedKFold', StratifiedKFold(n_splits=NFOLDS, shuffle=True).split(X_train, y_train)),
        #('StratifiedShuffleSplit', StratifiedShuffleSplit(n_splits=NFOLDS).split(X_train, y_train)),
        #('GroupKFold', GroupKFold(n_splits=NFOLDS).split(X_train, groups=train_group)),
        #('GroupShuffleSplit', GroupShuffleSplit(n_splits=NFOLDS).split(X_train, groups=train_group)),
        #('TimeSeriesSplit', TimeSeriesSplit(n_splits=NFOLDS).split(X_train, groups=train_asmt_group))
    ]
    
    for fold_enum in fold_enums:
        print_divider(f'{fold_enum[0]}')
    
        for fold_idx, (idx_trn, idx_val) in enumerate(fold_enum[1]):

            print_divider(f'Fold: {fold_idx+1}')
            # Total number of models trained; used to average the accumulators.
            n = len(fold_enums) * NTIMES * NFOLDS

            # NOTE(review): X_train is sliced by position (.iloc) but y_train by label
            # (y_train[idx]); they agree only while y_train has a 0..n-1 index — confirm.
            X_trn, X_val = X_train.iloc[idx_trn], X_train.iloc[idx_val]
            y_trn, y_val = y_train[idx_trn], y_train[idx_val]

            d_trn = lgb.Dataset(X_trn, y_trn)
            d_val = lgb.Dataset(X_val, y_val)
            model = lgb.train(params, d_trn,
                              valid_sets=[d_trn, d_val],
                              valid_names=['train', 'valid'],
                              #feval=eval_qwk_lgb_regr, 
                              categorical_feature=enc_cols,
                              **fit_params)
            # Normalised importances, averaged over models.
            fi_split += div_by_sum(model.feature_importance(importance_type='split')) / n
            fi_gain += div_by_sum(model.feature_importance(importance_type='gain')) / n
            pred_train = model.predict(X_trn)
            pred_test += model.predict(X_test) / n
            
            # Out-of-fold predictions are summed here and divided by pred_n afterwards.
            oof_pred[idx_val] += model.predict(X_val)
            pred_n[idx_val] += 1

            # Rounding thresholds are fitted on the TRAINING-fold predictions of this model.
            optr = OptimizedRounder()
            optr.fit(pred_train, y_trn)
            coff_avg += optr.coefficients() / n
            print('\nround coefficients:', optr.coefficients())

            del X_trn, y_trn, X_val, y_val
            gc.collect()

# Average importances per feature, in X_train column order.
feature_importances = pd.DataFrame()
#feature_importances['feature'] = np.array(model.feature_name())
feature_importances['feature'] = np.array(X_train.columns)
feature_importances['average_split'] = fi_split
feature_importances['average_gain'] = fi_gain

In [None]:
# Average the out-of-fold predictions per row, bucket them with the averaged thresholds
# and score them against the (reversed-scale) training target.
oof_pred_round = optr.predict(oof_pred / pred_n, coff_avg)
qwk(y_train, oof_pred_round)

In [None]:
# Disabled alternative: bucket the test predictions with the averaged rounder thresholds.
_='''
sub_preds = optr.predict(pred_test, coff_avg)
'''
# Class shares of the (reversed-scale) training target.
dist = Counter(y_train)
for k in dist:
    dist[k] /= len(y_train)

# Cut points: percentiles of the test predictions at the cumulative training class shares,
# so the predicted class distribution mirrors the training distribution.
acum = 0
bound = {}
for i in range(3):
    acum += dist[i]
    bound[i] = np.percentile(pred_test, acum * 100)
print(bound)

def classify(x):
    """Map one prediction to class 0..3 using the cut points in the global `bound`."""
    if x <= bound[0]:
        return 0
    elif x <= bound[1]:
        return 1
    elif x <= bound[2]:
        return 2
    else:
        return 3
    
sub_preds = np.array(list(map(classify, pred_test)))

In [None]:
# Bar chart of the 30 features with the highest average split importance.
feature_importances.sort_values(by='average_split', ascending=False, inplace=True)
plt.figure(figsize=(12, 10))
sns.barplot(data=feature_importances.head(30), x='average_split', y='feature')
plt.title('TOP feature importance');

In [None]:
# The model was trained on |accuracy_group - 3|; apply the same mapping to turn the
# predicted classes back into accuracy groups, then write the submission file.
submission['accuracy_group'] = np.abs(sub_preds.astype('int') - 3)  # reverse regression value
#submission['accuracy_group'] = sub_preds.astype('int')
submission.to_csv('submission.csv', index=False)

# Compare the label distribution of the training target with the predicted one.
fig, ax = plt.subplots(1, 2, figsize=(13,4))

y_train_org.value_counts().plot.bar(title='y_train', ax=ax[0])
submission['accuracy_group'].value_counts().plot.bar(title='predict', ax=ax[1], color='limegreen')
submission['accuracy_group'].value_counts()