Inspired by:

* https://www.kaggle.com/artgor/quick-and-dirty-regression
* https://www.kaggle.com/pestipeti/memory-efficient-faster-way-to-extract-json-data
* https://www.kaggle.com/naveenasaithambi/optimizedrounder-improved

## Import Libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from matplotlib import pyplot
import shap
import warnings
import os
from time import time
import scipy as sp
from tqdm.auto import tqdm
from collections import Counter
from scipy import stats
import lightgbm as lgb
import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import cohen_kappa_score, mean_squared_error, classification_report
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
import gc
import json
import copy
from functools import partial

from extract_json_script import extract_event_data

# Silence library warnings for the whole notebook run.
warnings.filterwarnings("ignore")
# Enable tqdm's pandas integration.
tqdm.pandas()
# Use fully-qualified option names: the bare 'max_rows' pattern is ambiguous on
# pandas versions that also define 'styler.render.max_rows' and raises OptionError.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 500)
# Fixed seed for NumPy's global random generator.
np.random.seed(47)

## Flags

In [None]:
# DEV: when True, the merged train/test feature tables are written to CSV files.
DEV = False
# FEATURE_SELECTION: when True, correlated and distribution-shifted features are
# removed from the feature list before modelling.
FEATURE_SELECTION = True

# Data Preparation 1

In [None]:
# Column dtypes used when loading the raw CSV files.
DTYPES_RAW = {
    'event_id': 'category',
    'game_session': 'object',
    'installation_id': 'object',
    'event_count': np.uint16,
    'event_code': np.uint16,
    'game_time': np.uint32,
    'type': 'category',
    'world': 'category',
    'title': 'category',
}

_RAW_DIR = '../input/data-science-bowl-2019/'
_EVENT_COLUMNS = list(DTYPES_RAW.keys()) + ['timestamp']


def _read_events(csv_name):
    """Load one raw event log, restricted to the typed columns plus the parsed timestamp."""
    return pd.read_csv(
        _RAW_DIR + csv_name,
        parse_dates=['timestamp'],
        usecols=_EVENT_COLUMNS,
        dtype=DTYPES_RAW,
        engine='c',
    )


train_df = _read_events('train.csv')
test_df = _read_events('test.csv')
train_labels_df = pd.read_csv(_RAW_DIR + 'train_labels.csv', dtype=DTYPES_RAW)

In [None]:
## Keep only the `installation_id`s that appear in the training labels
print(train_df.shape)
labelled_ids = train_labels_df['installation_id'].unique()
has_labels = train_df['installation_id'].isin(labelled_ids)
train_df = train_df[has_labels]
print(train_df.shape)

In [None]:
# Add assessment counter
def assess_count(df):
    """
    Add a per-installation `counter` column: the number of assessment attempt
    events (type 'Assessment', event_code 4100 or 4110) that precede each row
    in the frame's current row order.

    The input frame is modified in place (it gains the `Assessment` and
    `counter` columns); the returned frame is a copy without `Assessment`.
    """
    df['Assessment'] = 0
    is_attempt = (df['type'] == 'Assessment') & df['event_code'].isin([4100, 4110])
    # label-based assignment, exactly as the attempts are located by index
    df.loc[df.index[is_attempt], 'Assessment'] = 1
    running_total = df.groupby(['installation_id'])['Assessment'].cumsum()
    # subtract the row's own flag so that the counter only covers earlier rows
    df['counter'] = running_total - df['Assessment']
    return df.drop(['Assessment'], axis=1)

# Currently disabled:
# train_df = assess_count(train_df.sort_values(by=['timestamp']))
# test_df = assess_count(test_df.sort_values(by=['timestamp']))

In [None]:
def attempts_count(df):
    """
    Store the number of rows with event_code 4100 or 4110 in a constant
    `attempts` column. The frame is modified in place and returned.
    """
    attempt_rows = df['event_code'].isin([4100, 4110])
    df['attempts'] = int(attempt_rows.sum())
    return df

In [None]:
def attempts_flag(df):
    """
    Mark rows with event_code 4100 or 4110 by writing 1 into the `attempt`
    column (other rows are left unset). The frame is modified in place and returned.
    """
    attempt_index = df.index[df['event_code'].isin([4100, 4110])]
    df.loc[attempt_index, 'attempt'] = 1
    return df

# Flag attempt events (event_code 4100 / 4110) on both raw event tables.
train_df = attempts_flag(train_df)
test_df = attempts_flag(test_df)

In [None]:
# Row limit forwarded to extract_event_data (None = no limit passed).
nrows = None

def event_data(fname):
    """
    Aggregate values extracted from the JSON `event_data` column per game session.

    `fname` ('train' or 'test') selects both the CSV handed to
    `extract_event_data` and the module-level `<fname>_df` frame that supplies
    the installation/session ids (joined on the row index).

    Returns one row per (installation_id, game_session) with the maximum
    `level` and `round` and the summed `correct` and `misses`.
    NOTE(review): the exact output of `extract_event_data` is defined in an
    external script; it is assumed to be row-aligned with the raw CSV.
    """
    agg_dict = {'level': np.max, 'round': np.max, 'correct': np.sum, 'misses':np.sum}
    id_columns = ['installation_id', 'game_session']

    extras_df = extract_event_data(filename=f'{fname}.csv', type_defaults=list(agg_dict), nrows=nrows)
    session_ids = globals()[f'{fname}_df'].reindex(id_columns, axis=1)
    event_df = pd.merge(session_ids, extras_df, left_index=True, right_index=True)

    return event_df.groupby(id_columns, as_index=False).agg(agg_dict)

In [None]:
# Collapse the raw event logs to one row per game session.
# (previous variant: {'counter': np.max, 'game_time': np.max, 'event_count': np.max, 'timestamp': np.max})
agg_dict = {'attempt': np.sum, 'game_time': np.max, 'event_count': np.max, 'timestamp': np.max}
_session_keys = ['installation_id', 'game_session', 'title', 'type', 'world']

comp_train_df = train_df.groupby(_session_keys, observed=True, as_index=False).agg(agg_dict)
comp_test_df = test_df.groupby(_session_keys, observed=True, as_index=False).agg(agg_dict)

In [None]:
# Attach the per-session aggregates returned by event_data(); the left join
# keeps every session row even when no extras were produced for it.
comp_train_df = comp_train_df.merge(event_data('train'), on=['installation_id', 'game_session'], how='left')
comp_test_df = comp_test_df.merge(event_data('test'), on=['installation_id', 'game_session'], how='left') 

gc.collect()

### Activities associated with each `Assessment`

In [None]:
def backfill(df):
    """
    Back-fill the `counter` column and drop the rows that remain without a value.

    Each missing `counter` takes the next non-missing value further down the
    frame; rows after the last known value stay missing and are removed.

    The input frame is left untouched (the fill is done on a copy), which is
    the safe pattern when this is used inside `groupby(...).apply`.
    `Series.bfill()` replaces the deprecated `fillna(method='backfill')`.

    :param df: DataFrame with a `counter` column
    :return: new DataFrame with `counter` back-filled and unfilled rows dropped
    """
    filled = df.assign(counter=df['counter'].bfill())
    return filled.dropna(subset=['counter'], axis=0)

### `comp_train_df`

In [None]:
# Bring each labelled assessment's session timestamp into the labels table.
train_labels_df = train_labels_df.merge(comp_train_df[['installation_id', 'game_session', 'timestamp']], on=['installation_id', 'game_session'], how='left')
# Number the labelled assessments 1, 2, ... per installation in timestamp order
# (the cumsum runs on the sorted frame and is aligned back by index on assignment).
train_labels_df['Assessment'] = 1
train_labels_df['counter'] = train_labels_df.sort_values('timestamp').groupby(['installation_id'])['Assessment'].cumsum()
train_labels_df = train_labels_df.drop(['Assessment'], axis=1)

In [None]:
# Only labelled assessment sessions receive a counter from this join ...
comp_train_df = comp_train_df.merge(train_labels_df[['installation_id', 'game_session', 'counter']], on=['installation_id', 'game_session'], how='left')
# ... then, per installation and in time order, every earlier session inherits
# the counter of the next labelled assessment (sessions after the last one are dropped).
comp_train_df = comp_train_df.sort_values(by=['installation_id', 'timestamp'])
comp_train_df = comp_train_df.groupby('installation_id', as_index=False).apply(lambda df: backfill(df)).reset_index(drop=True) 

In [None]:
# Number of (installation_id, counter) groups in the train table.
print(len(comp_train_df.groupby(['installation_id','counter'])))

### `comp_test_df`

In [None]:
# Flag Assessment sessions (other rows stay NaN) and number them per installation
# in timestamp order; non-assessment rows keep a NaN counter at this point.
comp_test_df.loc[comp_test_df['type'] == 'Assessment', 'Assessment'] = 1
comp_test_df['counter'] = comp_test_df.sort_values('timestamp').groupby(['installation_id'])['Assessment'].cumsum()
comp_test_df = comp_test_df.drop(['Assessment'], axis=1)

In [None]:
# Per installation and in time order, give every session the counter of the
# next assessment; sessions after the last assessment are dropped by backfill().
comp_test_df = comp_test_df.sort_values(by=['installation_id', 'timestamp'])
comp_test_df = comp_test_df.groupby('installation_id', as_index=False).apply(lambda df: backfill(df)).reset_index(drop=True) 

In [None]:
# Number of (installation_id, counter) groups, and the shape of the table that
# holds the last row of each installation.
print(len(comp_test_df.groupby(['installation_id','counter'])))
print(comp_test_df.groupby(['installation_id'], as_index=False).last().shape)

### Merging target labels

In [None]:
## For train
_labels = train_labels_df[['game_session', 'accuracy_group']]
comp_train_assess_df = _labels.merge(right=comp_train_df, on='game_session', how='left')

## For test -- random placeholder target labels
# .copy() gives an independent frame, so the column assignment below does not
# write onto a slice of comp_test_df.
_labels = comp_test_df.query("type == 'Assessment'")[['game_session']].copy()
# One draw per row: without `size`, randint returns a single scalar that would
# be broadcast to every test assessment.
_labels['accuracy_group'] = np.random.randint(low=0, high=4, size=len(_labels))
comp_test_assess_df = _labels.merge(right=comp_test_df, on='game_session', how='left')

## Aggregating on Assessment Level

In [None]:
def cumulative(cumfunc_dict, column):
    """
    Apply a running (cumulative) function to `column`, selected by its name.

    The second element of `column.name` is looked up in `cumfunc_dict`; the
    matching function is called on the column. If anything fails (no such key,
    a name that is not indexable, or an error inside the function), the column
    is returned unchanged.
    """
    try:
        func_key = column.name[1]
        running_func = cumfunc_dict[func_key]
        result = running_func(column)
    except Exception:
        result = column
    return result

def filter_activity(df):
    """
    Split `df` by session type.

    Returns a pair `(clips, others)`: the rows whose `type` is 'Clip' and all
    remaining rows, each with a fresh 0..n-1 index.
    """
    is_clip = df['type'] == 'Clip'
    clips = df[is_clip].reset_index(drop=True)
    others = df[~is_clip].reset_index(drop=True)
    return clips, others

In [None]:
def aggregate_activity(df):
    """
    Build cumulative per-assessment activity features for every installation.

    Steps:
      1. Split the session table into clips and non-clips (`filter_activity`).
      2. Pivot each part to one row per (installation_id, counter) with one
         column per (value, aggregation, world) combination.
      3. Left-join the clip features onto the non-clip features and drop
         columns that are entirely empty.
      4. Within each installation, turn the per-counter values into running
         values: max columns use a running max; sum, count and mean columns use
         a running sum. Other columns (including the keys) are left unchanged.

    NOTE(review): 'mean' columns are accumulated with a running *sum*, as in
    the original implementation -- confirm that this is intended.

    :param df: session-level table with `installation_id`, `counter`, `type`,
        `world`, `title` and the aggregated numeric columns
    :return: DataFrame with `installation_id`, `counter` and tuple-named features
    """
    comp_df_clip, comp_df_noclip = filter_activity(df)
    
    agg_dict_clip = {'title': [pd.Series.nunique, pd.Series.count]} 
    agg_dict_noclip = {'game_time': [np.max, np.sum, np.mean], 'event_count': [np.max, np.sum, np.mean], 'correct': [np.sum], 'misses': [np.sum], 'attempt': [np.sum]}
    pivot_df_noclip = pd.pivot_table(comp_df_noclip, index=['installation_id', 'counter'], values=list(agg_dict_noclip.keys()), columns=['world'], aggfunc=agg_dict_noclip)
    pivot_df_clip = pd.pivot_table(comp_df_clip, index=['installation_id', 'counter'], values=list(agg_dict_clip.keys()), columns=['world'], aggfunc=agg_dict_clip)
    
    # MultiIndex columns -> plain tuples (value, aggregation name, world)
    pivot_df_noclip.columns = pivot_df_noclip.columns.to_flat_index()
    pivot_df_clip.columns = pivot_df_clip.columns.to_flat_index()
    
    pivot_df_noclip = pivot_df_noclip.reset_index().sort_values(by=['installation_id', 'counter'])
    pivot_df_clip = pivot_df_clip.reset_index().sort_values(by=['installation_id', 'counter'])
    
    aggregate_activity_df = pivot_df_noclip \
                            .merge(pivot_df_clip, on=['installation_id', 'counter'], how='left')
    
    aggregate_activity_df = aggregate_activity_df.dropna(axis=1, how='all')
    
    # The label produced for np.max depends on the numpy/pandas version
    # ('amax' on older stacks, 'max' on newer ones), so both are registered;
    # otherwise the running max would be skipped silently.
    cumfunc_dict = {
        'amax': pd.Series.cummax,
        'max': pd.Series.cummax,
        'sum': pd.Series.cumsum,
        'count': pd.Series.cumsum,
        'mean': pd.Series.cumsum,
    }
    cumfunc = partial(cumulative, cumfunc_dict)
    # group_keys=False keeps the original row index: on pandas 2.x the default
    # would add `installation_id` as an index level next to the column of the
    # same name, which makes later merges on `installation_id` ambiguous.
    aggregate_activity_df = aggregate_activity_df.groupby(['installation_id'], group_keys=False).apply(lambda df: df.apply(cumfunc))

    return aggregate_activity_df

In [None]:
# Shapes of the session-level train/test tables.
print(comp_train_df.shape)
print(comp_test_df.shape)

In [None]:
# NOTE: drop_cols is only referenced by the commented-out .drop() calls below.
drop_cols = ['game_session', 'type', 'world', 'level', 'round', 'correct', 'misses']

# Cumulative activity features per (installation_id, counter), joined with the
# title and game_session of the assessment rows of that counter.
aggregate_activity_test_df = aggregate_activity(comp_test_df) \
                                .merge(comp_test_assess_df[['installation_id', 'counter', 'title', 'game_session']], on=['installation_id', 'counter'], how='left')
#                                 .drop(columns=drop_cols)

gc.collect()

# Same construction for the train table.
aggregate_activity_train_df = aggregate_activity(comp_train_df) \
                                .merge(comp_train_assess_df[['installation_id', 'counter', 'title', 'game_session']], on=['installation_id', 'counter'], how='left')
#                                 .drop(columns=drop_cols)

gc.collect()

### Adding mode as feature

In [None]:
## Most frequent accuracy group per assessment title (a sense of what 'average' performance is)
def _first_mode(values):
    """Return the smallest most-frequent value, or NaN when there is none."""
    modes = values.mode()
    return modes.iloc[0] if len(modes) else np.nan

# pd.Series.mode returns *all* modes, so a tie would put an array into the
# mapping; _first_mode always yields a single value per title.
# observed=True skips title categories that have no label rows.
mode_per_title = train_labels_df.groupby('title', observed=True)['accuracy_group'].agg(_first_mode)
aggregate_activity_train_df['title_mode'] = aggregate_activity_train_df['title'].map(mode_per_title)
aggregate_activity_test_df['title_mode'] = aggregate_activity_test_df['title'].map(mode_per_title)

In [None]:
# `title` was only needed to derive `title_mode`; remove it from both tables.
aggregate_activity_train_df = aggregate_activity_train_df.drop(['title'], axis=1)
aggregate_activity_test_df = aggregate_activity_test_df.drop(['title'], axis=1)

In [None]:
# Shapes of the aggregated activity feature tables.
print(aggregate_activity_train_df.shape)
print(aggregate_activity_test_df.shape)

In [None]:
# Column names of this feature family, kept to attribute selected features later.
aggregate_activity_features = set(aggregate_activity_train_df.columns)

In [None]:
# Release the first-stage frames before the raw data is loaded again below.
del train_df, test_df, train_labels_df, comp_test_df, comp_train_df
gc.collect()

# Data Preparation 2

In [None]:
def read_data():
    """
    Load the five competition CSV files from the Kaggle input directory,
    printing a progress line and the resulting shape for each.

    :return: (train, test, train_labels, specs, sample_submission) DataFrames
    """
    input_dir = '/kaggle/input/data-science-bowl-2019/'

    def _load(filename, label):
        # `label` is the display name used in the shape message
        print('Reading {} file....'.format(filename))
        frame = pd.read_csv(input_dir + filename)
        print('{} file have {} rows and {} columns'.format(label, frame.shape[0], frame.shape[1]))
        return frame

    train = _load('train.csv', 'Training.csv')
    test = _load('test.csv', 'Test.csv')
    train_labels = _load('train_labels.csv', 'Train_labels.csv')
    specs = _load('specs.csv', 'Specs.csv')
    sample_submission = _load('sample_submission.csv', 'Sample_submission.csv')
    return train, test, train_labels, specs, sample_submission

In [None]:
def encode_title(train, test, train_labels):
    """
    Integer-encode titles and worlds and derive the lookup tables used for
    feature extraction. `train`, `test` and `train_labels` are modified in place.

    Added/changed columns: `title_event_code` ("<title>_<event_code>"),
    `title` and `world` replaced by integer codes, `timestamp` converted to
    datetime, and `hour` taken from the timestamp.

    :return: tuple of
        train, test, train_labels,
        win_code (title code -> attempt event code: 4110 for
            'Bird Measurer (Assessment)', 4100 for every other title),
        list_of_user_activities, list_of_event_code,
        activities_labels (title code -> title text),
        assess_titles (titles of Assessment rows), list_of_event_id,
        all_title_event_code
    """
    def _pooled_uniques(column):
        # distinct values of `column` across train and test
        return list(set(train[column].unique()).union(set(test[column].unique())))

    # combined "<title>_<event_code>" key, built from the textual titles
    for frame in (train, test):
        frame['title_event_code'] = [
            str(title) + '_' + str(code)
            for title, code in zip(frame['title'], frame['event_code'])
        ]
    all_title_event_code = list(set(train["title_event_code"].unique()).union(test["title_event_code"].unique()))

    list_of_user_activities = _pooled_uniques('title')
    list_of_event_code = _pooled_uniques('event_code')
    list_of_event_id = _pooled_uniques('event_id')
    list_of_worlds = _pooled_uniques('world')

    # title <-> integer code, world -> integer code
    title_codes = np.arange(len(list_of_user_activities))
    activities_map = dict(zip(list_of_user_activities, title_codes))
    activities_labels = dict(zip(title_codes, list_of_user_activities))
    activities_world = dict(zip(list_of_worlds, np.arange(len(list_of_worlds))))

    train_assess_titles = set(train[train['type'] == 'Assessment']['title'].value_counts().index)
    test_assess_titles = set(test[test['type'] == 'Assessment']['title'].value_counts().index)
    assess_titles = list(train_assess_titles.union(test_assess_titles))

    # replace the text values with their integer codes
    for frame in (train, test):
        frame['title'] = frame['title'].map(activities_map)
        frame['world'] = frame['world'].map(activities_world)
    train_labels['title'] = train_labels['title'].map(activities_map)

    # every title is graded on event 4100, except Bird Measurer which uses 4110
    win_code = dict(zip(activities_map.values(), np.full(len(activities_map), 4100, dtype='int')))
    win_code[activities_map['Bird Measurer (Assessment)']] = 4110

    # datetime conversion and hour of day
    for frame in (train, test):
        frame['timestamp'] = pd.to_datetime(frame['timestamp'])
        frame['hour'] = frame['timestamp'].dt.hour

    return train, test, train_labels, win_code, list_of_user_activities, list_of_event_code, activities_labels, assess_titles, list_of_event_id, all_title_event_code

In [None]:
# Fixed duration assigned to each Clip title (used as the clip's session length).
clip_time = {
    'Welcome to Lost Lagoon!': 19,
    'Tree Top City - Level 1': 17,
    'Ordering Spheres': 61,
    'Costume Box': 61,
    '12 Monkeys': 109,
    'Tree Top City - Level 2': 25,
    "Pirate's Tale": 80,
    'Treasure Map': 156,
    'Tree Top City - Level 3': 26,
    'Rulers': 126,
    'Magma Peak - Level 1': 20,
    'Slop Problem': 60,
    'Magma Peak - Level 2': 22,
    'Crystal Caves - Level 1': 18,
    'Balancing Act': 72,
    'Lifting Heavy Things': 118,
    'Crystal Caves - Level 2': 24,
    'Honey Cake': 142,
    'Crystal Caves - Level 3': 19,
    'Heavy, Heavier, Heaviest': 61,
}

In [None]:
def cnt_miss(df):
    """
    Sum the `misses` field over the JSON strings in `df['event_data']`.

    Every row must hold a JSON object with a `misses` key; an empty frame gives 0.
    """
    return sum(json.loads(raw_event)['misses'] for raw_event in df['event_data'])

In [None]:
# this is the function that converts the raw data into processed features
def get_data(user_sample, test_set=False):
    '''
    Turn the event log of ONE installation_id into per-assessment feature rows.

    The sessions of `user_sample` are walked in order of first appearance
    (`groupby('game_session', sort=False)`). Running statistics are updated
    after every session; for each Assessment session a snapshot of what was
    accumulated *before* that session is emitted as one feature dict, together
    with the session's own `accuracy_group`.

    Relies on the module-level lookups returned by `encode_title`
    (`assess_titles`, `list_of_event_code`, `list_of_event_id`,
    `activities_labels`, `all_title_event_code`, `win_code`) and on
    `clip_time` / `cnt_miss`.

    :param user_sample: DataFrame holding the rows of a single installation_id
    :param test_set: if True, every Assessment session yields a row; otherwise
        only Assessment sessions with more than one event and at least one
        graded attempt do
    :return: list of feature dicts, one per retained assessment, in session order
    '''
    # type of the previously processed session (0 = nothing processed yet)
    last_activity = 0

    user_activities_count = {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}
    # maintained below, but currently not exported as features
    game_time_dict = {'Clip_gametime':0, 'Game_gametime':0, 'Activity_gametime':0, 'Assessment_gametime':0}
    # the "mean" statistics are running averages: new = (old + latest) / 2
    Assessment_mean_event_count = 0
    Game_mean_event_count = 0
    Activity_mean_event_count = 0
    mean_game_round = 0
    mean_game_duration = 0
    mean_game_level = 0
    accumulated_game_miss = 0

    accuracy_groups = {0:0, 1:0, 2:0, 3:0}
    all_assessments = []
    accumulated_accuracy_group = 0
    accumulated_accuracy = 0
    accumulated_correct_attempts = 0
    accumulated_uncorrect_attempts = 0
    accumulated_actions = 0
    counter = 0
    durations = []
    clip_durations = []
    Activity_durations = []
    Game_durations = []

    last_accuracy_title = {'acc_' + title: -1 for title in assess_titles}
    event_code_count = {ev: 0 for ev in list_of_event_code}
    event_id_count = {eve: 0 for eve in list_of_event_id}
    title_count = {eve: 0 for eve in activities_labels.values()}
    title_event_code_count = {t_eve: 0 for t_eve in all_title_event_code}

    # number of sessions processed so far
    sessions_count = 0

    def session_seconds(session):
        # `.seconds` is the seconds component of the timedelta (whole days are
        # not included), as in the original implementation
        stamps = session['timestamp']
        return (stamps.iloc[-1] - stamps.iloc[0]).seconds

    def mean_std(values):
        # (0, 0) until at least one value has been collected
        if not values:
            return 0, 0
        return np.mean(values), np.std(values)

    def update_counters(counts, col, session):
        # add the session's value counts of `col` to the running totals;
        # titles are counted under their text label
        for key, n_rows in Counter(session[col]).items():
            label = activities_labels[key] if col == 'title' else key
            counts[label] += n_rows
        return counts

    # iterates through each session of one installation_id
    for i, session in user_sample.groupby('game_session', sort=False):
        # i = game_session id
        # session = DataFrame with the rows of that single game_session
        session_type = session['type'].iloc[0]
        session_title = session['title'].iloc[0]
        session_title_text = activities_labels[session_title]

        if session_type == 'Clip':
            clip_durations.append(clip_time[session_title_text])

        if session_type == 'Activity':
            Activity_durations.append(session_seconds(session))
            Activity_mean_event_count = (Activity_mean_event_count + session['event_count'].iloc[-1])/2.0

        if session_type == 'Game':
            Game_durations.append(session_seconds(session))
            Game_mean_event_count = (Game_mean_event_count + session['event_count'].iloc[-1])/2.0

            game_s = session[session.event_code == 2030]
            accumulated_game_miss += cnt_miss(game_s)

            # round / duration / level are read from the session's last event
            # when present; a missing or unusable value leaves the mean as is
            try:
                last_event = json.loads(session['event_data'].iloc[-1])
            except (TypeError, ValueError):
                last_event = {}

            try:
                mean_game_round = (mean_game_round + last_event["round"])/2.0
            except (KeyError, TypeError):
                pass

            try:
                mean_game_duration = (mean_game_duration + last_event["duration"])/2.0
            except (KeyError, TypeError):
                pass

            try:
                mean_game_level = (mean_game_level + last_event["level"])/2.0
            except (KeyError, TypeError):
                pass

        # for each assessment, and only this kind of session, the features below
        # are processed and a record is generated
        if session_type == 'Assessment' and (test_set or len(session) > 1):
            # the graded attempts of this assessment (event_code 4100 or 4110)
            all_attempts = session.query(f'event_code == {win_code[session_title]}')
            # number of correct and incorrect attempts
            true_attempts = all_attempts['event_data'].str.contains('true').sum()
            false_attempts = all_attempts['event_data'].str.contains('false').sum()

            # snapshot of everything accumulated before this session
            features = user_activities_count.copy()
            features.update(last_accuracy_title)
            features.update(event_code_count)
            features.update(event_id_count)
            features.update(title_count)
            features.update(title_event_code_count)
            # features.update(game_time_dict.copy())

            features['installation_session_count'] = sessions_count
            features['hour'] = session['hour'].iloc[-1]
            features['Assessment_mean_event_count'] = Assessment_mean_event_count
            features['Game_mean_event_count'] = Game_mean_event_count
            features['Activity_mean_event_count'] = Activity_mean_event_count
            features['mean_game_round'] = mean_game_round
            features['mean_game_duration'] = mean_game_duration
            features['mean_game_level'] = mean_game_level
            features['accumulated_game_miss'] = accumulated_game_miss

            # variety = number of distinct values seen so far in each counter
            variety_features = [('var_event_code', event_code_count),
                                ('var_event_id', event_id_count),
                                ('var_title', title_count),
                                ('var_title_event_code', title_event_code_count)]

            for name, dict_counts in variety_features:
                arr = np.array(list(dict_counts.values()))
                features[name] = np.count_nonzero(arr)

            # get installation_id for aggregated features
            features['installation_id'] = session['installation_id'].iloc[-1]
            # title (integer code) of the assessment
            features['session_title'] = session_title
            features['game_session'] = i
            # attempt history of this player before the current assessment
            features['accumulated_correct_attempts'] = accumulated_correct_attempts
            features['accumulated_uncorrect_attempts'] = accumulated_uncorrect_attempts
            accumulated_correct_attempts += true_attempts
            accumulated_uncorrect_attempts += false_attempts

            # time spent so far, per session kind (`duration_*` = assessments)
            duration_sources = (('duration', durations),
                                ('Clip_duration', clip_durations),
                                ('Activity_duration', Activity_durations),
                                ('Game_duration', Game_durations))
            for prefix, values in duration_sources:
                features[prefix + '_mean'], features[prefix + '_std'] = mean_std(values)

            durations.append(session_seconds(session))
            Assessment_mean_event_count = (Assessment_mean_event_count + session['event_count'].iloc[-1])/2.0
            # mean accuracy over the previous assessments
            features['accumulated_accuracy'] = accumulated_accuracy/counter if counter > 0 else 0
            accuracy = true_attempts/(true_attempts+false_attempts) if (true_attempts+false_attempts) != 0 else 0
            accumulated_accuracy += accuracy
            last_accuracy_title['acc_' + session_title_text] = accuracy
            # accuracy of the current assessment, categorised
            if accuracy == 0:
                features['accuracy_group'] = 0
            elif accuracy == 1:
                features['accuracy_group'] = 3
            elif accuracy == 0.5:
                features['accuracy_group'] = 2
            else:
                features['accuracy_group'] = 1
            # how many times this player was in each accuracy group before
            features.update(accuracy_groups)
            accuracy_groups[features['accuracy_group']] += 1
            # mean of the previous accuracy groups of this player
            features['accumulated_accuracy_group'] = accumulated_accuracy_group/counter if counter > 0 else 0
            accumulated_accuracy_group += features['accuracy_group']
            # number of events so far (updated at the end of each session)
            features['accumulated_actions'] = accumulated_actions

            # test set: every assessment is kept;
            # train set: only assessments with at least one graded attempt
            if test_set or true_attempts + false_attempts > 0:
                all_assessments.append(features)

            counter += 1

        sessions_count += 1

        game_time_dict[session_type+'_gametime'] = (game_time_dict[session_type+'_gametime'] + (session['game_time'].iloc[-1]/1000.0))/2.0
        # per-value event counters, visible to the NEXT assessment
        event_code_count = update_counters(event_code_count, "event_code", session)
        event_id_count = update_counters(event_id_count, "event_id", session)
        title_count = update_counters(title_count, 'title', session)
        title_event_code_count = update_counters(title_event_code_count, 'title_event_code', session)

        # counts how many actions the player has done so far
        accumulated_actions += len(session)
        # count a session type only when it differs from the previous session's
        # type. (The original assigned to a misspelled `last_activitiy`, so the
        # comparison never saw the previous type and every session was counted.)
        if last_activity != session_type:
            user_activities_count[session_type] += 1
            last_activity = session_type

    # Every retained assessment is returned for both train and test; selecting
    # the assessment to predict for a test installation is left to the caller.
    return all_assessments

In [None]:
def get_train_and_test(train, test):
    """
    Run `get_data` over every installation of `train` and `test` (in order of
    first appearance, with a progress bar) and collect the feature rows.

    :return: (reduce_train, reduce_test, categoricals) -- the two feature
        DataFrames and the list of categorical feature names
    """
    def _compile(frame, is_test):
        rows = []
        per_installation = frame.groupby('installation_id', sort = False)
        n_installations = frame['installation_id'].nunique()
        for _ins_id, user_sample in tqdm(per_installation, total=n_installations):
            rows.extend(get_data(user_sample, is_test))
        return rows

    reduce_train = pd.DataFrame(_compile(train, False))
    reduce_test = pd.DataFrame(_compile(test, True))
    categoricals = ['session_title']
    return reduce_train, reduce_test, categoricals

In [None]:
# Load the raw CSV files again (all columns this time) and build the encodings /
# lookup tables that get_data() reads as module-level names.
_train, _test, _train_labels, specs, sample_submission = read_data()
train, test, train_labels, win_code, list_of_user_activities, list_of_event_code, activities_labels, assess_titles, list_of_event_id, all_title_event_code = encode_title(_train, _test, _train_labels)

In [None]:
# One feature row per assessment for train and test.
reduce_train, reduce_test, categoricals = get_train_and_test(train, test)

In [None]:
# Column names of this feature family, captured before the merge below.
reduce_features = set(reduce_train.columns)

In [None]:
# Free memory held by intermediates of the feature extraction.
gc.collect()

# Merging Datasets

In [None]:
# Join keys shared by both feature tables, and the label column.
keys = ['installation_id', 'game_session']
target = ['accuracy_group']

In [None]:
# Default (inner) join: only assessments present in both feature tables are kept.
reduce_train = reduce_train.merge(aggregate_activity_train_df, on=keys)
reduce_test = reduce_test.merge(aggregate_activity_test_df, on=keys)

In [None]:
# Shapes after merging the two feature families.
print(reduce_train.shape)
print(reduce_test.shape)

In [None]:
# Cell output: the columns that are still object/category typed.
list(reduce_train.select_dtypes(include=['object', 'category']).columns)

### `to_csv`

In [None]:
# In DEV mode, persist the merged feature tables in the working directory.
if DEV:
    reduce_train.to_csv('final_train.csv', index=False)
    reduce_test.to_csv('final_test.csv', index=False)

gc.collect()

# Feature Selection

In [None]:
# Drop columns whose values sum to zero (they carry no information).
# numeric_only=True keeps the string key columns out of the sum: on
# pandas >= 2.0, summing them together with numeric columns raises TypeError.
# (The row mask of the original expression had no effect on the selected
# column names and is therefore omitted.)
_column_sums = reduce_train.sum(axis=0, numeric_only=True)
_all_zero_columns = set(_column_sums.index[_column_sums == 0])
_not_features = set(target + keys)
features = [
    x for x in reduce_train.columns
    if x not in _all_zero_columns and x not in _not_features
]
print(len(features))

In [None]:
def remove_correlated(features, threshold=0.995):
    """
    Greedily pick features to drop because they are highly correlated with a
    feature that is kept.

    For each feature not yet marked (in the given order), every other unmarked
    feature whose Pearson correlation with it in `reduce_train` exceeds
    `threshold` is marked for removal. Only positive correlations count, and
    an undefined (NaN) correlation never triggers a removal.

    :return: list of feature names to remove, in the order they were marked
    """
    to_remove = []
    marked = set()  # same content as to_remove, for fast membership tests
    for feat_a in tqdm(features):
        if feat_a in marked:
            continue
        for feat_b in features:
            if feat_b == feat_a or feat_b in marked:
                continue
            c = np.corrcoef(reduce_train[feat_a], reduce_train[feat_b])[0][1]
            if c > threshold:
                to_remove.append(feat_b)
                marked.add(feat_b)

    return to_remove

In [None]:
def stract_hists(feature, train=reduce_train, test=reduce_test, adjust=False, plot=False):
    """
    Compare the train and test distributions of one feature.

    Both columns are clipped to [0, 95th percentile of train], turned into
    10-bin histograms normalised by row count, and the mean squared error
    between the two histograms is returned.

    NOTE: the `train` / `test` defaults are bound when this function is
    defined, i.e. to the frames that exist at that moment.
    NOTE(review): each histogram derives its bin edges from its own data
    range, so the bins of train and test need not coincide.

    :param feature: column name present in both frames
    :param train: training feature frame
    :param test: test feature frame (never modified)
    :param adjust: if True, rescale the test values so that their mean matches
        the train mean before comparing
    :param plot: if True, print the error and draw both histograms
    :return: mean squared error between the two normalised histograms
    """
    n_bins = 10
    train_data = train[feature]
    test_data = test[feature]
    if adjust:
        # Work on a rescaled copy: an in-place `*=` would write the scaling
        # through to the caller's `test` frame.
        test_data = test_data * (train_data.mean() / test_data.mean())
    upper_cap = np.percentile(train_data, 95)
    train_data = np.clip(train_data, 0, upper_cap)
    test_data = np.clip(test_data, 0, upper_cap)
    train_hist = np.histogram(train_data, bins=n_bins)[0] / len(train_data) # or use pd.cut or any other normalisation method
    test_hist = np.histogram(test_data, bins=n_bins)[0] / len(test_data)
    msre = mean_squared_error(train_hist, test_hist)
    if plot:
        print(msre)
        plt.bar(range(n_bins), train_hist, color='blue', alpha=0.5, label='train')
        plt.bar(range(n_bins), test_hist, color='red', alpha=0.5, label='test')
        plt.legend()
        plt.show()
    return msre

# stract_hists('Magma Peak - Level 1_2000', adjust=False, plot=True)

In [None]:
def remove_errored():
    """
    Find features whose train/test means differ too much and build a
    mean-adjusted copy of the test set.

    For every column of `reduce_test` outside target, keys and categoricals:
      * if the histogram comparison (`stract_hists`) or the ratio computation
        raises, the feature is excluded;
      * if train_mean / test_mean is above 10 or below 0.1, it is excluded;
      * otherwise the column of the returned copy is multiplied by that ratio.

    :return: (to_exclude, ajusted_test) -- list of excluded feature names and
        the adjusted copy of `reduce_test`
    """
    to_exclude = []
    ajusted_test = reduce_test.copy()
    skipped = set(target + keys + categoricals)
    for feature in ajusted_test.columns:
        if feature in skipped:
            continue
        train_mean = reduce_train[feature].mean()
        test_mean = ajusted_test[feature].mean()
        try:
            # called for its side effect of raising on unusable features
            # (e.g. non-finite values after rescaling)
            stract_hists(feature, adjust=True)
            ajust_factor = train_mean / test_mean
            if ajust_factor > 10 or ajust_factor < 0.1:
                to_exclude.append(feature)
            else:
                ajusted_test[feature] *= ajust_factor
        except Exception:
            # `Exception` rather than a bare except, so that an interrupt
            # (Ctrl-C) is not swallowed and turned into an excluded feature
            to_exclude.append(feature)

    return to_exclude, ajusted_test

In [None]:
if FEATURE_SELECTION:
    to_remove = remove_correlated(features, threshold=0.990)
    to_exclude, ajusted_test = remove_errored()
    # build the drop set once instead of concatenating two lists per feature
    _dropped = set(to_exclude) | set(to_remove)
    features = [x for x in features if x not in _dropped]
    # report how many surviving features come from each feature family
    from_1 = set(features) & aggregate_activity_features
    from_2 = set(features) & reduce_features
    print(f"{len(from_1)} features from aggregate_activity_train_df: ", from_1)
    # second family: the columns produced by get_data (reduce_train before the merge)
    print(f"{len(from_2)} features from reduce_train: ", from_2)

else:
    # the adjusted test set is needed even when no feature is removed
    _, ajusted_test = remove_errored()

reduce_train[features].shape

In [None]:
# Free memory held by intermediates of the feature selection.
gc.collect()

# Model

In [None]:
def regr_to_label(y_pred):
    """Bin regression outputs into accuracy groups 0-3.

    The cut points are the percentiles of ``y_pred`` that reproduce the
    cumulative class shares of ``train_labels['accuracy_group']`` (a notebook
    global), so the predicted label distribution mirrors the training one.

    :param y_pred: array-like of raw regression predictions
    :return: integer array of labels with the same shape as ``y_pred``
    """
    y_pred = np.asarray(y_pred)

    shares = train_labels['accuracy_group'].value_counts(normalize=True).sort_index()
    # Only the first three cumulative shares are cut points. The last one is
    # 1.0 up to rounding, and a value slightly above 1.0 would make
    # np.percentile raise, so it is dropped and the rest is clipped.
    acums = np.clip(shares.cumsum().to_numpy()[:3], 0.0, 1.0)
    bound = np.percentile(y_pred, acums * 100)

    # Vectorised form of the if/elif chain: the first matching condition wins,
    # anything above the last bound (or NaN) falls through to class 3.
    conditions = [y_pred <= bound[0], y_pred <= bound[1], y_pred <= bound[2]]
    return np.select(conditions, [0, 1, 2], default=3).reshape(y_pred.shape)


def eval_qwk_lgb_regr(y_true, y_pred):
    """Quadratic weighted kappa of regression outputs against true labels.

    The raw predictions are first binned with ``regr_to_label`` and reshaped
    to match ``y_true``.

    :param y_true: ground-truth labels (must expose ``.shape``)
    :param y_pred: raw regression predictions
    :return: the quadratic-weighted Cohen kappa score
    """
    binned = regr_to_label(y_pred)
    binned = binned.reshape(y_true.shape)
    return cohen_kappa_score(y_true, binned, weights='quadratic')

In [None]:
class OptimizedRounder(object):
    """
    An optimizer for rounding thresholds
    to maximize Quadratic Weighted Kappa (QWK) score
    # https://www.kaggle.com/naveenasaithambi/optimizedrounder-improved
    """
    def __init__(self):
        # Replaced by the scipy optimisation result once fit() has run.
        self.coef_ = 0

    @staticmethod
    def _apply_thresholds(X, coef):
        """
        Bin raw predictions into labels 0-3

        :param X: The raw predictions
        :param coef: Three thresholds in any order (they are sorted here)
        """
        edges = [-np.inf] + list(np.sort(coef)) + [np.inf]
        return pd.cut(X, edges, labels=[0, 1, 2, 3])

    def _kappa_loss(self, coef, X, y):
        """
        Get loss according to
        using current coefficients

        :param coef: A list of coefficients that will be used for rounding
        :param X: The raw predictions
        :param y: The ground truth labels
        """
        X_p = self._apply_thresholds(X, coef)

        return -cohen_kappa_score(y, X_p, weights='quadratic')

    def fit(self, X, y):
        """
        Optimize rounding thresholds

        The search starts from the percentiles of X that reproduce the
        cumulative class shares of y (previously read from the notebook
        global train_labels, which made the class depend on outside state).

        :param X: The raw predictions
        :param y: The ground truth labels
        """
        loss_partial = partial(self._kappa_loss, X=X, y=y)
#         initial_coef = [0.5, 1.5, 2.5]
        shares = pd.Series(np.asarray(y).ravel()).value_counts(normalize=True)
        # Absent classes count as zero so three thresholds always come out.
        shares = shares.reindex([0, 1, 2, 3], fill_value=0.0)
        # Clip guards against cumulative sums that exceed 1.0 by rounding,
        # which np.percentile rejects.
        acums = np.clip(shares.cumsum().to_numpy()[:3], 0.0, 1.0)
        initial_coef = np.percentile(X, acums * 100)

        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef=None):
        """
        Make predictions with specified thresholds

        :param X: The raw predictions
        :param coef: A list of coefficients that will be used for rounding;
                     defaults to the thresholds found by fit()
        """
        if coef is None:
            coef = self.coefficients()
        return self._apply_thresholds(X, coef)


    def coefficients(self):
        """
        Return the optimized coefficients
        """
        return self.coef_['x']

In [None]:
## Implement a generalised pipeline, with optional StratifiedKFold

class Base_Model(object):
    """Cross-validated training pipeline shared by the concrete models.

    Subclasses implement ``train_model`` and ``convert_dataset`` (and
    ``convert_x`` when the model needs a special prediction input). Calling
    the instance runs ``fit`` and stores ``oof_pred``, ``y_pred``, ``score``
    and ``model`` on it.

    :param train_df: training frame holding the features and 'accuracy_group'
    :param test_df: frame to predict on
    :param features: list of feature column names
    :param evaluator: callable ``(y_true, y_pred) -> score`` for the OOF score
    :param params: model parameters handed to ``train_model``
    :param categoricals: categorical feature names (defaults to none)
    :param n_splits: number of stratified folds; ``None`` skips CV set-up so
        the instance can be used only for its conversion/training helpers
    :param verbose: print the out-of-fold score and enable training logs
    """

    def __init__(self, train_df, test_df, features, evaluator, params=None, categoricals=None, n_splits=5, verbose=True):
        self.train_df = train_df
        self.test_df = test_df
        self.features = features
        self.n_splits = n_splits
        # A fresh list per instance instead of one shared mutable default.
        self.categoricals = [] if categoricals is None else categoricals
        self.target = 'accuracy_group'
        self.cv = self.get_cv() if n_splits is not None else None
        self.verbose = verbose
        self.params = params # if params not None else self.get_params()
        self.evaluator = evaluator

    def __call__(self):
        """Run ``fit`` and keep its results on the instance; returns ``self``."""
        self.oof_pred, self.y_pred, self.score, self.model = self.fit()
        return self

    def train_model(self, train_set, val_set):
        """Train and return a model exposing ``predict``; subclass hook."""
        raise NotImplementedError

    def get_cv(self):
        """Return the stratified, shuffled ``(train_idx, val_idx)`` splits.

        The splits are materialised into a list so they can be iterated more
        than once (a bare generator is exhausted after the first ``fit``).
        """
        cv = StratifiedKFold(n_splits=self.n_splits, shuffle=True)
        return list(cv.split(self.train_df, self.train_df[self.target]))

    def get_params(self):
        """Return default parameters; subclass hook."""
        raise NotImplementedError

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        """Wrap the fold data into the model's dataset type; subclass hook."""
        raise NotImplementedError

    def convert_x(self, x):
        """Convert a feature frame into the model's prediction input."""
        return x

    def fit(self):
        """Train one model per fold and aggregate the predictions.

        :return: ``(oof_pred, y_pred, loss_score, model)`` - out-of-fold
            predictions for the training rows, test predictions averaged over
            the folds, the evaluator score on the OOF predictions and the
            model of the last fold (``None`` when there were no folds)
        """
        oof_pred = np.zeros((len(self.train_df), ))
        y_pred = np.zeros((len(self.test_df), ))
        x_all = self.train_df[self.features]
        y_all = self.train_df[self.target]
        model = None
        for train_idx, val_idx in self.cv:
            # Fold indices are positional, so both features and target must be
            # sliced with .iloc; label-based [] breaks on a non-default index.
            x_train, x_val = x_all.iloc[train_idx], x_all.iloc[val_idx]
            y_train, y_val = y_all.iloc[train_idx], y_all.iloc[val_idx]
            train_set, val_set = self.convert_dataset(x_train, y_train, x_val, y_val)
            model = self.train_model(train_set, val_set)
            conv_x_val = self.convert_x(x_val)
            oof_pred[val_idx] = model.predict(conv_x_val).reshape(oof_pred[val_idx].shape)
            # Converted after training on purpose: convert_x may depend on
            # state that convert_dataset/train_model set up for this fold.
            x_test = self.convert_x(self.test_df[self.features])
            y_pred += model.predict(x_test).reshape(y_pred.shape) / self.n_splits

        loss_score = self.evaluator(y_all, oof_pred)

        if self.verbose:
            print(f'oof {self.evaluator.__name__} score is {loss_score}')
        return oof_pred, y_pred, loss_score, model

In [None]:
class Lgb_Model(Base_Model):
    """Base_Model specialisation that trains with ``lgb.train``."""

    def train_model(self, train_set, val_set):
        """Train a booster, evaluating on both the train and validation sets."""
        # Log every 100 iterations when verbose, stay silent otherwise.
        if self.verbose:
            verbosity = 100
        else:
            verbosity = 0
        booster = lgb.train(
            self.params,
            train_set,
            valid_sets=[train_set, val_set],
            verbose_eval=verbosity,
        )
        return booster

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        """Wrap the fold data in ``lgb.Dataset`` objects."""
        cat_cols = self.categoricals
        wrapped_train = lgb.Dataset(x_train, y_train, categorical_feature=cat_cols)
        wrapped_val = lgb.Dataset(x_val, y_val, categorical_feature=cat_cols)
        return wrapped_train, wrapped_val

In [None]:
class Xgb_Model(Base_Model):
    """Base_Model specialisation that trains with the native ``xgb.train`` API."""

    # Parameter names that the native API expects as keyword arguments of
    # xgb.train rather than as entries of the params dict.
    _ROUND_KEYS = ('num_boost_round', 'n_estimators')
    _EARLY_STOP_KEYS = ('early_stopping_rounds', 'early_stopping_round')

    def train_model(self, train_set, val_set):
        """Train a booster, evaluating on the train and validation matrices.

        The number of boosting rounds and the early-stopping patience are
        taken out of ``self.params`` and forwarded as ``num_boost_round`` /
        ``early_stopping_rounds``. Left inside the params dict they are not
        applied by ``xgb.train``, which then runs its default number of
        rounds regardless of the configured value.
        """
        verbosity = 100 if self.verbose else 0
        # Work on a copy so the caller's dict is not modified.
        booster_params = dict(self.params or {})
        train_kwargs = {}

        rounds = [booster_params.pop(key, None) for key in self._ROUND_KEYS]
        rounds = [value for value in rounds if value is not None]
        if rounds:
            train_kwargs['num_boost_round'] = int(rounds[0])

        patience = [booster_params.pop(key, None) for key in self._EARLY_STOP_KEYS]
        patience = [value for value in patience if value is not None]
        if patience:
            # Early stopping monitors the last entry of evals, i.e. 'val'.
            train_kwargs['early_stopping_rounds'] = int(patience[0])

        return xgb.train(
            booster_params,
            train_set,
            evals=[(train_set, 'train'), (val_set, 'val')],
            verbose_eval=verbosity,
            **train_kwargs,
        )

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        """Wrap the fold data in ``xgb.DMatrix`` objects."""
        train_set = xgb.DMatrix(x_train, y_train)
        val_set = xgb.DMatrix(x_val, y_val)
        return train_set, val_set

    def convert_x(self, x):
        """Boosters predict on a ``DMatrix``, not on a raw frame."""
        return xgb.DMatrix(x)

In [None]:
class Mlr_Model(Base_Model):
    """Base_Model specialisation using a standardised LogisticRegression."""

    def train_model(self, train_set, val_set):
        """Fit the classifier on ``train_set`` (features plus a last label column)."""
        clf = LogisticRegression(**(self.params or {}))
        # The label is passed as a 1-D vector; a column vector makes
        # scikit-learn emit a conversion warning.
        return clf.fit(train_set[:, :-1], train_set[:, -1])

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        """Standardise the features and append the label as the last column.

        The scaler is fitted on the training fold only and reused for the
        validation fold and, through ``convert_x``, for every prediction
        input, so all data goes through the same transformation.
        """
        self._scaler = StandardScaler()
        X_train_std = self._scaler.fit_transform(x_train.to_numpy())
        X_val_std = self._scaler.transform(x_val.to_numpy())

        y_train = np.expand_dims(y_train, 1)
        y_val = np.expand_dims(y_val, 1)

        train_set = np.concatenate([X_train_std, y_train], axis=1)
        val_set = np.concatenate([X_val_std, y_val], axis=1)

        return train_set, val_set

    def convert_x(self, x):
        """Scale prediction inputs with the scaler of the current fold.

        Falls back to returning ``x`` unchanged when no scaler has been
        fitted yet.
        """
        scaler = getattr(self, '_scaler', None)
        if scaler is None:
            return x
        return scaler.transform(np.asarray(x))

## Hyper-parameter optimisation (Optional)

In [None]:
def ensemble_hyperopt(param_space, x_train, x_val, y_train, y_val, features, categoricals, num_eval):
    """Tune a LightGBM + XGBoost blend with hyperopt's TPE search.

    Keys of a sampled parameter dict are routed by their first letter
    ("l" -> LightGBM, "x" -> XGBoost) and lose their first five characters
    (the model prefix). The objective is the validation MSE of the fixed
    0.80 / 0.20 blend.

    :param param_space: hyperopt search space with prefixed keys
    :param x_train, x_val: feature frames of the hold-out split
    :param y_train, y_val: targets of the hold-out split
    :param features: unused, kept for signature compatibility
    :param categoricals: categorical feature names for LightGBM
    :param num_eval: maximum number of hyperopt evaluations
    :return: ``(trials, best_param)``
    """
    weights = {'lbg': 0.80, 'xgb': 0.20}

    def _params_for(sampled, first_letter):
        # Select one model's entries and drop the 5-character prefix.
        return {name[5:]: value for name, value in sampled.items() if name.startswith(first_letter)}

    def objective_function(params):
        lbg_params = _params_for(params, "l")
        xbg_params = _params_for(params, "x")
        # Both wrappers are built without data or CV; only their
        # conversion/training helpers are used here.
        shared_kwargs = dict(
            train_df=None,
            test_df=None,
            features=None,
            evaluator=mean_squared_error,
            categoricals=categoricals,
            n_splits=None,
            verbose=False,
        )
        lgb_model = Lgb_Model(params=lbg_params, **shared_kwargs)
        xgb_model = Xgb_Model(params=xbg_params, **shared_kwargs)
        train_set_lgb, val_set_lgb = lgb_model.convert_dataset(x_train, y_train, x_val, y_val)
        train_set_xgb, val_set_xgb = xgb_model.convert_dataset(x_train, y_train, x_val, y_val)

        lgb_model.model = lgb_model.train_model(train_set_lgb, val_set_lgb)
        xgb_model.model = xgb_model.train_model(train_set_xgb, val_set_xgb)

        lgb_part = lgb_model.model.predict(x_val).reshape(y_val.shape)
        xgb_part = xgb_model.model.predict(xgb.DMatrix(x_val, y_val)).reshape(y_val.shape)
        regr_pred = (lgb_part * weights['lbg']) + (xgb_part * weights['xgb'])
        return mean_squared_error(y_val, regr_pred)

    trials = Trials()
    best_param = fmin(
        objective_function,
        param_space,
        algo=tpe.suggest,
        max_evals=num_eval,
        trials=trials,
    )

    return trials, best_param

In [None]:
def hyperopt(param_space, x_train, x_val, y_train, y_val, features, categoricals, num_eval):
    """Tune a single LightGBM model with hyperopt's TPE search.

    Each sampled parameter dict is used to train one LightGBM model on the
    training split; the objective is its MSE on the validation split.

    :param param_space: hyperopt search space of LightGBM parameters
    :param x_train, x_val: feature frames of the hold-out split
    :param y_train, y_val: targets of the hold-out split
    :param features: unused, kept for signature compatibility
    :param categoricals: categorical feature names for LightGBM
    :param num_eval: maximum number of hyperopt evaluations
    :return: ``(trials, best_param)``
    """

    def objective_function(params):
        # The wrapper is built without data or CV; only its
        # conversion/training helpers are used here.
        candidate = Lgb_Model(
            train_df=None,
            test_df=None,
            features=None,
            evaluator=mean_squared_error,
            params=params,
            categoricals=categoricals,
            n_splits=None,
            verbose=False,
        )
        fit_set, eval_set = candidate.convert_dataset(x_train, y_train, x_val, y_val)
        candidate.model = candidate.train_model(fit_set, eval_set)
        val_pred = candidate.model.predict(x_val).reshape(y_val.shape)
        return mean_squared_error(y_val, val_pred)

    trials = Trials()
    best_param = fmin(
        objective_function,
        param_space,
        algo=tpe.suggest,
        max_evals=num_eval,
        trials=trials,
    )

    return trials, best_param

In [None]:
def process_params(param):
    """Cast tuned parameter values to the types the model expects.

    Values for the known keys below are converted with ``float`` or ``int``
    (hyperopt reports quantised values as floats). Keys that are not listed
    are copied through unchanged instead of raising ``KeyError``, so extra
    entries such as ``boosting`` or ``objective`` survive the round trip.

    :param param: dict of parameter name -> value
    :return: a new dict with the same keys and cast values
    """
    dtypes = {
        'bagging_fraction': float,
        'feature_fraction': float,
        'learning_rate': float,
        'max_depth': int,
        'n_estimators': int,
        'num_iterations': int,
        'num_leaves': int,
        'lambda_l1': float,
        'lambda_l2': float,
        'cat_smooth': int
    }

    return {
        key: dtypes[key](value) if key in dtypes else value
        for key, value in param.items()
    }

In [None]:
# Hyperopt search-space nodes. Quantised nodes are wrapped in scope.int so
# the sampled values are integers.
# Note: max_depth, boosting and num_iterations are defined here but are not
# part of the `learnable_params` dict assembled further down.
max_depth = scope.int(hp.quniform('max_depth', 5, 100, 1))
bagging_fraction = hp.uniform('bagging_fraction', 0.5, 1.0)
feature_fraction = hp.uniform('feature_fraction', 0.1, 1.0)
lambda_l1 = hp.uniform('lambda_l1', 0.0, 100.0)
lambda_l2 = hp.uniform('lambda_l2', 0.0, 100.0)
cat_smooth = scope.int(hp.quniform('cat_smooth', 10, 100, 1))
boosting = hp.choice('boosting', ['gbdt', 'dart'])
num_iterations = scope.int(hp.quniform('num_iterations', 50, 500, 25))
# Sampled on a log scale between 0.01 and 1.
learning_rate = hp.loguniform('learning_rate', np.log(0.01), np.log(1))
num_leaves = scope.int(hp.quniform('num_leaves', 5, 100, 1))
n_estimators = scope.int(hp.quniform('n_estimators', 100, 10000, 50))

In [None]:
# Map every column of `train` whose name contains a comma to the same name
# without commas, then rename those columns in both reduced frames.
# NOTE(review): the `features` list and `ajusted_test` are not renamed here -
# confirm that no selected feature name contains a comma.
comma_names = {col: col.replace(',', '') for col in train.filter(like=',', axis=1).columns}
reduce_train = reduce_train.rename(columns=comma_names)
reduce_test = reduce_test.rename(columns=comma_names)

In [None]:
# Hold out 15% of the training rows for the hyper-parameter search.
split_pct = 0.15
# Only used by the commented-out manual split below.
split_size = int(split_pct * len(reduce_train))

# indices = np.random.randint(low=0, high=len(reduce_train) -1, size=(split_size,))
# X_train = reduce_train.drop(index=indices)
# X_val = reduce_train.filter(items=indices, axis=0)

# NOTE(review): no random_state/stratify is passed, so the split is neither
# stratified by accuracy_group nor pinned to an explicit seed.
x_train, x_val, y_train, y_val = train_test_split(
    reduce_train[features],
    reduce_train['accuracy_group'],
    test_size=split_pct
)

In [None]:
# Search space for the single-model LightGBM tuning: the tunable nodes ...
learnable_params= {
    'learning_rate': learning_rate,
    'n_estimators': n_estimators,
    'num_leaves': num_leaves,
    'bagging_fraction': bagging_fraction,
    'feature_fraction': feature_fraction,
    'lambda_l1': lambda_l1,
    'lambda_l2': lambda_l2,
    'cat_smooth': cat_smooth
}

# ... merged over the fixed settings. Because learnable_params comes last in
# the merge, its 'n_estimators' node replaces the static value of 13000.
static_params = {'objective': 'regression', 'metric': 'mse', 'n_estimators': 13000, 'early_stopping_round': 10}
param_hyperopt = {**static_params, **learnable_params}

# Evaluation budget for the (currently disabled) search calls below.
max_eval = 200
# _, para = hyperopt(param_hyperopt, x_train, y_train.astype(int), x_val, y_val.astype(int), categoricals, max_eval)
# _, para = hyperopt(param_space, x_train, x_val, y_train, y_val, features, categoricals, max_eval)

In [None]:
# Search space for the ensemble tuning. Keys carry a 5-character model
# prefix; ensemble_hyperopt routes them by first letter ("l" / "x") and
# strips the first five characters.
other_params= {
    'lgb__lambda_l2': hp.uniform('lambda_l2', 0.0, 100.0),
    'xgb__lambda': hp.uniform('lambda', 0.0, 100.0),
}

# Fixed settings as [LightGBM value, XGBoost value] pairs.
common_params = {
    'objective': ['mean_squared_error', 'reg:squarederror'],
    'n_estimators': [6700, 5000],
    'early_stopping_round': [100, 100],
    'max_depth': [-1, 0],
    'learning_rate': [0.01, 0.01]
}

# Despite the name these strings are prepended to the keys (prefixes).
# NOTE(review): 'lbg__' here vs 'lgb__' in other_params - both start with "l"
# and are five characters long, so both end up in the LightGBM parameters.
suffixes = ['lbg__', 'xgb__']
common_params = {suffixes[i] + k: v[i] for k, v in common_params.items() for i in range(2)}
param_hyperopt = {**common_params, **other_params}

# Evaluation budget for the (currently disabled) search call below.
max_eval = 100
# _, para = ensemble_hyperopt(param_hyperopt, x_train, x_val, y_train, y_val, features, categoricals, max_eval)
## TODO: Split para into params_lgb and params_xgb 

## Training

In [None]:
# Parameters handed to lgb.train through Lgb_Model.
params_lgb = {
    'n_estimators':5000,
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'subsample': 0.75,
    'subsample_freq': 1,
    'learning_rate': 0.01,
    'feature_fraction': 0.9,
    'max_depth': 15,
    'lambda_l1': 1,  
    'lambda_l2': 1,
    'early_stopping_rounds': 100
 }
    
    
# Parameters handed to xgb.train through Xgb_Model.
# NOTE(review): verify that 'n_estimators' is honoured on that path - the
# native xgb.train API takes the number of rounds as num_boost_round.
params_xgb = {
    'colsample_bytree': 0.8,                 
    'learning_rate': 0.01,
    'max_depth': 10,
    'objective':'reg:squarederror',
    'min_child_weight':3,
    'gamma':0.25,
    'n_estimators':5000
}

In [None]:
# params_lgb = {
#     'objective': 'mean_squared_error',
#     'learning_rate': 0.01,
#     'n_estimators': 6700,
#     'early_stopping_round': 100,
#     'feature_fraction': 0.8
    
# }

# params_xgb = {
#     'objective':'reg:squarederror',
#     'learning_rate': 0.01,
#     'n_estimators': 6700,
#     'early_stopping_round': 100,
#     'max_depth': 0,
#     'colsample_bytree': 0.8,
# }

In [None]:
# Echo the parameter sets that will be used for training.
print(params_lgb)
print(params_xgb)

In [None]:
# Out-of-fold scoring uses QWK on predictions binned by regr_to_label.
evaluator = eval_qwk_lgb_regr
# Plain quadratic weighted kappa for predictions that are already labels.
qwk = lambda y_true, y_pred: cohen_kappa_score(y_true, y_pred, weights='quadratic')
# Calling the freshly built instance runs the cross-validated fit and returns
# the instance; test predictions are made on the mean-adjusted test frame.
xgb_model = Xgb_Model(reduce_train, ajusted_test, features=features, categoricals=categoricals, params=params_xgb, evaluator=evaluator, verbose=True)()
lgb_model = Lgb_Model(reduce_train, ajusted_test, features=features, categoricals=categoricals, params=params_lgb, evaluator=evaluator, verbose=True)()

In [None]:
# Blend the fold-averaged test predictions: 80% LightGBM, 20% XGBoost.
# (The key 'lbg' is spelled this way wherever the weights are read.)
weights = {'lbg': 0.80, 'xgb': 0.20}
regr_pred = (lgb_model.y_pred * weights['lbg']) + (xgb_model.y_pred * weights['xgb']) 

In [None]:
# regr_pred = lgb_model.y_pred
# regr_pred = xgb_model.y_pred

### Performance on training set

In [None]:
# Predict the whole training set with the model kept from the last CV fold.
# NOTE(review): that model was trained on most of these rows, so scores and
# thresholds derived from `preds` are in-sample; the out-of-fold predictions
# (`oof_pred`) would be the unbiased alternative.
preds_lgb = lgb_model.model.predict(reduce_train[features])
preds_xgb = xgb_model.model.predict(xgb.DMatrix(reduce_train[features], reduce_train['accuracy_group']))

# Same 80/20 blend as for the test predictions.
preds = (preds_lgb * weights['lbg']) + (preds_xgb * weights['xgb']) 

### Using `OptimizedRounder`

In [None]:
# Fit the rounding thresholds on the blended training-set predictions and
# report the QWK obtained with them on those same rows.
optR = OptimizedRounder()
optR.fit(preds.reshape(-1,), reduce_train['accuracy_group'].values.reshape(-1,))
coefficients = optR.coefficients()
print("OptimizedRounder qwk = ", qwk(reduce_train['accuracy_group'].values, optR.predict(preds.reshape(-1, ), coefficients)))

In [None]:
# Per-class metrics (as a dict) for the OptimizedRounder labels on the training set.
print(classification_report(reduce_train['accuracy_group'].values, optR.predict(preds.reshape(-1, ), coefficients), output_dict=True))

In [None]:
# For comparison: QWK on the same predictions binned by regr_to_label.
print("eval_qwk_lgb_regr qwk = ", eval_qwk_lgb_regr(reduce_train['accuracy_group'].values, preds.reshape(-1, )))

In [None]:
# Per-class metrics (as a dict) for the regr_to_label labels on the training set.
print(classification_report(reduce_train['accuracy_group'].values, regr_to_label(preds.reshape(-1, )), output_dict=True))

# Submission

In [None]:
# final_pred = regr_to_label(regr_pred)

In [None]:
# Turn the blended test predictions into labels with the fitted thresholds.
final_pred = optR.predict(regr_pred.reshape(-1, ), coefficients)

In [None]:
# Sanity check: one label per row of the test predictions.
print(final_pred.shape)

In [None]:
# Store the labels in the target column of reduce_test and keep only the
# key and target columns for the submission step.
reduce_test[target[0]] = final_pred
final_pred_df = reduce_test[keys + target]

In [None]:
def mode0(x):
    """Return the first mode of ``x`` (the smallest one when several tie)."""
    return pd.Series.mode(x)[0]


def mode1(x):
    """Return the second mode of ``x``, or the only mode if there is no tie."""
    # Compute the modes once instead of up to three times.
    modes = pd.Series.mode(x)
    return modes[1] if len(modes) > 1 else modes[0]


def last(x):
    """Return the last element of ``x`` by position."""
    return x.iloc[-1]


def first(x):
    """Return the first element of ``x`` by position."""
    return x.iloc[0]
# One row per installation_id with six candidate aggregates of its predicted
# labels, in this column order: max, mode0, min, last, first, mode1.
final_pred_df = final_pred_df.groupby('installation_id').agg({'accuracy_group': [np.max, mode0, np.min, last, first, mode1]})

In [None]:
# Histogram of every aggregate column, to compare their label distributions.
final_pred_df.hist(figsize=(10, 10));

In [None]:
# Column 3 of the aggregate frame is `last`: the prediction of each
# installation's last row.
# NOTE(review): values are assigned positionally - this assumes the rows of
# sample_submission are in the same installation_id order as the groupby
# output; confirm.
sample_submission['accuracy_group'] = final_pred_df.iloc[:, 3].values.astype(int)
sample_submission.to_csv('submission.csv', index=False)

In [None]:
# Share of each submitted label, printed and plotted.
print(sample_submission['accuracy_group'].value_counts(normalize=True))
sample_submission['accuracy_group'].hist();