In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor
from xgboost import plot_importance
from catboost import CatBoostRegressor
from matplotlib import pyplot
import shap

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.
from time import time
from tqdm import tqdm_notebook as tqdm
from collections import Counter
from scipy import stats
import lightgbm as lgb
from sklearn.metrics import cohen_kappa_score, mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold,GroupKFold
import gc
import json
pd.set_option('display.max_columns', 1000)

# Objective

* In the previous notebook we built our baseline model, including a feature-selection step.
* It reached a Cohen's kappa score of 0.456 on the leaderboard (LB), with a local CV score of 0.529.
* In this notebook we add more features and remove others that I think overfit the training set, then check whether the local CV score improves.
* Next, we check whether this improvement carries over to the LB.

# Notes
* Compare the distribution of the target variable with the distribution of the out-of-fold predictions. A good model should have roughly the same distribution for both.

In [None]:
def cnt_miss(df):
    """Return the sum of the ``misses`` field over all rows of ``df``.

    Every entry of ``df['event_data']`` is parsed as JSON and its ``'misses'``
    value is added to the total. A frame without rows yields 0.
    """
    return sum(json.loads(raw_event)['misses'] for raw_event in df['event_data'])

def get_4020_acc(df,counter_dict,session_title_text):
    """Fold the event-code-4020 accuracy of one session into ``counter_dict``.

    Among the rows of ``df`` whose ``event_code`` is 4020, rows whose
    ``event_data`` contains the substring ``'true'`` count as correct and rows
    containing ``'false'`` count as incorrect. The resulting ratio (0 when
    there are no such rows) is averaged with the value already stored under
    ``"acc_4020_" + session_title_text``.

    ``counter_dict`` is modified in place and also returned.
    """
    key = "acc_4020_" + session_title_text
    events_4020 = df.loc[df.event_code == 4020, 'event_data']

    n_true = events_4020.str.contains('true').sum()
    n_false = events_4020.str.contains('false').sum()
    n_total = n_true + n_false

    if n_total != 0:
        session_accuracy = n_true / n_total
    else:
        session_accuracy = 0

    # pairwise running average: each new session weighs as much as all history
    counter_dict[key] = (counter_dict[key] + session_accuracy) / 2.0
    return counter_dict

def get_4020_acc_old(df,counter_dict):
    """Older variant of ``get_4020_acc`` that updates all four assessments at once.

    Rows of ``df`` with ``event_code`` 4020 are split by assessment title
    (looked up through the module-level ``activities_map``). For each title the
    share of rows whose ``event_data`` contains ``'true'`` among those
    containing ``'true'`` or ``'false'`` is averaged with the value stored
    under ``"<title>_4020_accuracy"``.

    ``counter_dict`` is modified in place and also returned.
    """
    events_4020 = df[df.event_code == 4020]
    assessment_names = (
        'Cauldron Filler (Assessment)',
        'Bird Measurer (Assessment)',
        'Mushroom Sorter (Assessment)',
        'Chest Sorter (Assessment)',
    )

    for assessment_name in assessment_names:
        key = assessment_name + "_4020_accuracy"
        title_events = events_4020.loc[events_4020.title == activities_map[assessment_name], 'event_data']

        n_true = title_events.str.contains('true').sum()
        n_false = title_events.str.contains('false').sum()
        n_total = n_true + n_false

        if n_total != 0:
            title_accuracy = n_true / n_total
        else:
            title_accuracy = 0
        counter_dict[key] = (counter_dict[key] + title_accuracy) / 2.0

    return counter_dict

In [None]:
def eval_qwk_lgb_regr(y_true, y_pred):
    """
    Quadratic-weighted kappa for regression-style predictions.

    The continuous predictions are cut into the classes 0-3 so that the
    predicted class shares follow the ``accuracy_group`` shares observed in the
    module-level ``reduce_train``; the cut points are percentiles of ``y_pred``.

    Returns the tuple ``('cappa', score, True)``.
    """
    labels = reduce_train['accuracy_group']
    n_rows = len(reduce_train)
    share = {group: count / n_rows for group, count in Counter(labels).items()}
    # side effect kept from the original: draws the label histogram on every call
    labels.hist()

    # cumulative share of classes 0..2 -> percentile thresholds on y_pred
    thresholds = []
    cumulative = 0
    for group in range(3):
        cumulative += share.get(group, 0)
        thresholds.append(np.percentile(y_pred, cumulative * 100))

    preds = np.asarray(y_pred)
    # the first matching condition wins, anything above the last threshold is class 3
    classes = np.select(
        [preds <= thresholds[0], preds <= thresholds[1], preds <= thresholds[2]],
        [0, 1, 2],
        default=3,
    )
    y_pred = classes.reshape(y_true.shape)

    return 'cappa', cohen_kappa_score(y_true, y_pred, weights='quadratic'), True

In [None]:
def cohenkappa(ypred, y):
    """Quadratic-weighted Cohen's kappa as a custom evaluation function.

    Parameters
    ----------
    ypred : numpy array of flattened class scores; it is reshaped to
        ``(4, n_samples)`` and the highest-scoring class is taken per sample.
        NOTE(review): this assumes a class-major flat layout with exactly four
        classes - confirm against the training API that calls this.
    y : dataset object exposing ``get_label()``.

    Returns
    -------
    tuple ``("cappa", score, True)``; the last element marks the metric as
    "higher is better".
    """
    y_true = y.get_label().astype("int")
    y_hat = ypred.reshape((4, -1)).argmax(axis = 0)
    # The original called the undefined `cohenkappascore` with the undefined
    # `y_pred`, so every call raised NameError.
    score = cohen_kappa_score(y_true, y_hat, weights = 'quadratic')
    return "cappa", score, True

In [None]:
def read_data():
    """Load the five competition CSV files and report the shape of each one.

    Returns the frames in the order
    ``train, test, train_labels, specs, sample_submission``.
    """
    data_dir = '/kaggle/input/data-science-bowl-2019/'
    # (file name on disk, label used in the shape message)
    sources = [
        ('train.csv', 'Training.csv'),
        ('test.csv', 'Test.csv'),
        ('train_labels.csv', 'Train_labels.csv'),
        ('specs.csv', 'Specs.csv'),
        ('sample_submission.csv', 'Sample_submission.csv'),
    ]

    frames = []
    for file_name, label in sources:
        print('Reading {} file....'.format(file_name))
        frame = pd.read_csv(data_dir + file_name)
        print('{} file have {} rows and {} columns'.format(label, frame.shape[0], frame.shape[1]))
        frames.append(frame)

    train, test, train_labels, specs, sample_submission = frames
    return train, test, train_labels, specs, sample_submission

In [None]:
def encode_title(train, test, train_labels):
    """Integer-encode titles and worlds and collect the lookup tables.

    ``train``, ``test`` and ``train_labels`` are modified in place:
    a ``title_event_code`` column ("<title>_<event_code>", built from the
    original text title) is added to train and test, ``title`` and ``world``
    are replaced by integer codes, and ``timestamp`` is parsed to datetime.

    Returns, in this order: train, test, train_labels, win_code,
    list_of_user_activities, list_of_event_code, activities_labels,
    assess_titles, list_of_event_id, all_title_event_code, activities_map.
    """
    def combined_values(column):
        # sorted union of the distinct values of `column` in train and test
        return sorted(set(train[column].unique()) | set(test[column].unique()))

    def assessment_titles(frame):
        # titles that occur in rows of type 'Assessment'
        return set(frame.loc[frame['type'] == 'Assessment', 'title'].value_counts().index)

    # combined "<title>_<event_code>" key, built while titles are still text
    for frame in (train, test):
        frame['title_event_code'] = [
            str(title) + '_' + str(code)
            for title, code in zip(frame['title'], frame['event_code'])
        ]
    all_title_event_code = combined_values('title_event_code')

    # vocabularies shared by train and test
    list_of_user_activities = combined_values('title')
    list_of_event_code = combined_values('event_code')
    list_of_event_id = combined_values('event_id')
    list_of_worlds = combined_values('world')

    # text <-> integer lookup tables
    title_codes = np.arange(len(list_of_user_activities))
    activities_map = dict(zip(list_of_user_activities, title_codes))
    activities_labels = dict(zip(title_codes, list_of_user_activities))
    activities_world = dict(zip(list_of_worlds, np.arange(len(list_of_worlds))))
    assess_titles = sorted(assessment_titles(train) | assessment_titles(test))

    # swap the text values for their integer codes
    for frame in (train, test):
        frame['title'] = frame['title'].map(activities_map)
        frame['world'] = frame['world'].map(activities_world)
    train_labels['title'] = train_labels['title'].map(activities_map)

    # every title gets code 4100, except 'Bird Measurer (Assessment)' which gets 4110
    win_code = dict(zip(activities_map.values(), np.full(len(activities_map), 4100, dtype='int')))
    win_code[activities_map['Bird Measurer (Assessment)']] = 4110

    # text -> datetime
    for frame in (train, test):
        frame['timestamp'] = pd.to_datetime(frame['timestamp'])

    return (train, test, train_labels, win_code, list_of_user_activities,
            list_of_event_code, activities_labels, assess_titles,
            list_of_event_id, all_title_event_code, activities_map)

In [None]:
# Fixed duration assigned to each Clip title; get_data appends the value for a
# clip session to its list of clip durations.
# NOTE(review): the unit is presumably seconds - confirm against the data source.
clip_time = {'Welcome to Lost Lagoon!':19,'Tree Top City - Level 1':17,'Ordering Spheres':61, 'Costume Box':61,
        '12 Monkeys':109,'Tree Top City - Level 2':25, 'Pirate\'s Tale':80, 'Treasure Map':156,'Tree Top City - Level 3':26,
        'Rulers':126, 'Magma Peak - Level 1':20, 'Slop Problem':60, 'Magma Peak - Level 2':22, 'Crystal Caves - Level 1':18,
        'Balancing Act':72, 'Lifting Heavy Things':118,'Crystal Caves - Level 2':24, 'Honey Cake':142, 'Crystal Caves - Level 3':19,
        'Heavy, Heavier, Heaviest':61}

In [None]:
# Hand-picked groups of event ids. The groups overlap (for example 'd122731b'
# appears in both the "press done" and the "correctness" group).
press_done_btn_event_id = ['d122731b', '17113b36', '25fa8af4', '93b353f2', '070a5291', '392e14df']
incorrect_feedback_id = ['df4fe8b6', 'd88e8f25', 'c277e121', '160654fd', 'ea296733', '5859dfb6', 'e04fb33d', '28a4eb9a', '7423acbc', 'e57dd7af', '04df9b66', '2230fab4', 'c51d8688', '1af8be29', '89aace00', '763fc34e', '5290eab1', '90ea0bac', '8b757ab8', 'e5734469', '9de5e594', 'd45ed6a1', 'ac92046e', 'ad2fc29c', '5de79a6a', '88d4a5be', '907a054b', 'e37a2b78', '31973d56', '44cb4907', '0330ab6a', '3bf1cf26']
correct_feedback_id = ['2b9272f4', '47026d5f', '3afde5dd', 'e720d930', '3ddc79c3', '709b1251', '4d911100', '45d01abe', '6f4adc4b', 'cf7638f3', 'd3268efa', 'ecaab346', 'e5c9df6f', '77ead60d', 'a8a78786', '9b4001e4', '3afb49e6', 'b5053438', '250513af', '55115cbd', 'c7fe2a55', 'c74f40cd', 'e4f1efe6', '73757a5e', 'cb6010f8', 'e3ff61fb', '7525289a', 'daac11b0', 'a8876db3', '9d29771f', '1f19558b', '58a0de5c']
correctness_of_the_action_event_id = ['8fee50e2', '30614231', '5f0eb72c', 'd122731b', 'a5e9da97', '0db6d71d', '91561152', '14de4c5d', '8af75982', 'c0415e5c', '3bb91dda', '17113b36', '25fa8af4', '93b353f2', '4ef8cdd3', '2dc29e21', '74e5f8a7', '262136f4', '804ee27f', '070a5291', '392e14df', '5c3d2b2f', '86c924c4', 'e7561dd2']
exit_game_event_id = ['a8cc6fec', '1b54d27f', 'b738d3d3', '3393b68b', '2b058fe3', '9565bea6', '17ca3959', 'a5be6304', '222660ff', '4074bac2', 'b2e5b0f1', '003cd2ee']
help_btn_event_id = ['93edfe2e', '6043a2b4', '05ad839b', '37937459', '6aeafed4', '6f8106d9', '77c76bc5', 'f54238ee', '4e5fc6f5', '47f43a44', '85d1b0de', '47efca07', '731c0cbe', 'e080a381', '92687c59', 'eb2c19cd', '6f4bd64e', '08ff79ad', 'cb1178ad', '67aa2ada', '19967db1', 'e7e44842', '8d748b58', 'd3640339']
beat_round_event_id = ['2b9272f4', '47026d5f', '56817e2b', '3afde5dd', '28520915', 'e720d930', '3ddc79c3', '53c6e11a', '709b1251', '4d911100', '45d01abe', '6f4adc4b', '08fd73f3', '6c930e6e', 'b74258a0', '37c53127', 'cf7638f3', 'd3268efa', 'ecaab346', 'e9c52111', 'e5c9df6f', 'f5b8c21a', '86ba578b', '77ead60d', 'a8a78786', '9b4001e4', '3afb49e6', 'b5053438', '250513af', '4d6737eb', 'b012cd7f', '00c73085', '55115cbd', 'c7fe2a55', 'c74f40cd', 'e4f1efe6', '73757a5e', 'cb6010f8', 'e3ff61fb', '7525289a', 'ca11f653', '895865f3', 'daac11b0', 'a8876db3', '9d29771f', '9ed8f6da', '1f19558b', '58a0de5c', '36fa3ebe', '1c178d24', 'f6947f54', '16dffff1', '83c6c409']
#skip_btn_event_id = ['53c6e11a', 'c1cac9a2', 'd2659ab4', '3bb91ced', '9ed8f6da']

# De-duplicated union of all groups above; the order of a list built from a set
# is arbitrary. get_data keeps one counter per id in this list.
imp_event_id_list = list(set().union(press_done_btn_event_id,incorrect_feedback_id,correct_feedback_id,correctness_of_the_action_event_id,exit_game_event_id,help_btn_event_id,beat_round_event_id))
# Displayed as the cell output: number of distinct tracked event ids.
len(imp_event_id_list)

In [None]:
def get_data(user_sample, test_set=False):
    """Turn the event log of one installation into per-assessment feature rows.

    The sessions of ``user_sample`` are visited in order of first appearance.
    Running statistics (counters, last-seen values, running means) are updated
    after every session; each time an assessment session is reached, a snapshot
    of the statistics gathered *before* that session is stored as one feature
    dict together with the ``accuracy_group`` label of that session.

    Reads these module-level names: ``assess_titles``, ``list_of_event_code``,
    ``imp_event_id_list``, ``all_title_event_code``, ``activities_labels``,
    ``clip_time``, ``win_code``, ``cnt_miss`` and ``get_4020_acc``.

    Parameters
    ----------
    user_sample : DataFrame with the rows of a single ``installation_id``.
    test_set : when True, every assessment session yields a row (even if it
        contains no attempt) and only the last row is returned.

    Returns
    -------
    list of feature dicts (training mode) or a single feature dict (test mode).

    Raises
    ------
    IndexError : in test mode when ``user_sample`` has no assessment session.
    """
    def update_counters(counter, values):
        """Add the number of occurrences of each tracked key in ``values``."""
        for key, occurrences in Counter(values).items():
            # untracked keys (e.g. event ids outside imp_event_id_list) are
            # skipped on purpose instead of relying on a swallowed KeyError
            if key in counter:
                counter[key] += occurrences
        return counter

    # ---- running state -------------------------------------------------
    last_activity = 0
    user_activities_count = {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}
    count_assessment_title = {'cnt_' + title: 0 for title in assess_titles}
    game_time_dict = {'Clip_gametime':0, 'Game_gametime':0, 'Activity_gametime':0, 'Assessment_gametime':0}
    assess_4020_acc_dict = {'acc_4020_' + title: 0 for title in assess_titles}

    accumulated_game_miss = 0
    mean_game_round = 0
    Assessment_mean_event_count = 0
    Game_mean_event_count = 0
    Activity_mean_event_count = 0

    last_activity_event_count = 0
    last_game_event_count = 0
    last_assessment_event_count = 0

    prev_assessment_timestamp = None  # None until the first assessment is seen
    session_title_of_last_assessment = -1
    session_title_of_last_clip = -1
    session_title_of_last_activity = -1
    session_title_of_last_game = -1

    duration_of_last_assessment = 0
    duration_of_last_game = 0
    duration_of_last_activity = 0
    last_assessment_3020_cnt = 0
    last_assessment_3121_cnt = 0
    last_game_miss = 0
    last_game_round = 0
    last_game_level = 0
    last_GAME_duration = 0

    accuracy_groups = {0:0, 1:0, 2:0, 3:0}
    all_assessments = []
    accumulated_accuracy_group = 0
    accumulated_accuracy = 0
    accumulated_correct_attempts = 0
    accumulated_uncorrect_attempts = 0
    accumulated_actions = 0
    counter = 0

    durations = []
    # Clip durations, ref: https://www.kaggle.com/khoongweihao/data-science-bowl-2019-regression-to-convert-lb
    clip_durations = []

    last_accuracy_title = {'acc_' + title: -1 for title in assess_titles}
    event_code_count = {ev: 0 for ev in list_of_event_code}
    event_id_count = {eve: 0 for eve in imp_event_id_list}
    title_event_code_count = {t_eve: 0 for t_eve in all_title_event_code}

    # iterate over the sessions of this installation_id; each `session` is a
    # DataFrame holding the rows of a single game_session
    for _, session in user_sample.groupby('game_session', sort=False):
        session_type = session['type'].iloc[0]
        session_title = session['title'].iloc[0]
        session_title_text = activities_labels[session_title]
        session_event_count = session['event_count'].iloc[-1]
        # Positional column 2 is kept from the original code.
        # NOTE(review): presumably the timestamp column - confirm the layout.
        # `.seconds` is the seconds component only (whole days are dropped).
        session_seconds = (session.iloc[-1, 2] - session.iloc[0, 2]).seconds

        if session_type == 'Clip':
            session_title_of_last_clip = session_title
            clip_durations.append(clip_time[session_title_text])

        if session_type == "Activity":
            duration_of_last_activity = session_seconds
            session_title_of_last_activity = session_title
            Activity_mean_event_count = (Activity_mean_event_count + session_event_count)/2.0
            last_activity_event_count = session_event_count

        if session_type == "Game":
            duration_of_last_game = session_seconds
            session_title_of_last_game = session_title
            Game_mean_event_count = (Game_mean_event_count + session_event_count)/2.0
            last_game_event_count = session_event_count

            misses_cnt = cnt_miss(session[session.event_code == 2030])
            accumulated_game_miss += misses_cnt
            last_game_miss = misses_cnt

            # round / level / duration are read from the last event of the
            # session when present; a missing field leaves the state untouched
            try:
                last_event = json.loads(session['event_data'].iloc[-1])
            except (TypeError, ValueError):
                last_event = None
            if isinstance(last_event, dict):
                game_round = last_event.get("round")
                if isinstance(game_round, (int, float)):
                    mean_game_round = (mean_game_round + game_round)/2.0
                    last_game_round = game_round
                if "level" in last_event:
                    last_game_level = last_event["level"]
                # Fixed: the original read the undefined name `g_session`, so
                # the NameError was swallowed and this stayed 0 forever.
                if "duration" in last_event:
                    last_GAME_duration = last_event["duration"]

        # only assessment sessions produce a feature row
        if session_type == 'Assessment' and (test_set or len(session) > 1):
            # rows carrying the assessment's attempt code (win_code)
            all_attempts = session.query(f'event_code == {win_code[session_title]}')
            true_attempts = all_attempts['event_data'].str.contains('true').sum()
            false_attempts = all_attempts['event_data'].str.contains('false').sum()

            # snapshot of the statistics gathered so far; the insertion order
            # defines the column order of the resulting DataFrame
            features = user_activities_count.copy()
            features.update(last_accuracy_title)
            features.update(event_code_count)
            features.update(event_id_count)
            features.update(title_event_code_count)
            features.update(game_time_dict)
            features.update(assess_4020_acc_dict)

            features['accumulated_game_miss'] = accumulated_game_miss
            features['last_game_miss'] = last_game_miss
            features['mean_game_round'] = mean_game_round
            features['last_game_round'] = last_game_round
            features['last_game_level'] = last_game_level
            features['last_GAME_duration'] = last_GAME_duration
            features['Assessment_mean_event_count'] = Assessment_mean_event_count
            features['Game_mean_event_count'] = Game_mean_event_count
            features['Activity_mean_event_count'] = Activity_mean_event_count

            features['last_assessment_event_count'] = last_assessment_event_count
            features['last_activity_event_count'] = last_activity_event_count
            features['last_game_event_count'] = last_game_event_count

            features['last_assessment_3020_cnt'] = last_assessment_3020_cnt
            features['last_assessment_3121_cnt'] = last_assessment_3121_cnt

            features['session_title_of_last_assessment'] = session_title_of_last_assessment
            features['session_title_of_last_clip'] = session_title_of_last_clip
            features['session_title_of_last_activity'] = session_title_of_last_activity
            features['session_title_of_last_game'] = session_title_of_last_game

            features['duration_of_last_assessment'] = duration_of_last_assessment
            features['duration_of_last_game'] = duration_of_last_game
            features['duration_of_last_activity'] = duration_of_last_activity

            current_assess_timestamp = session['timestamp'].iloc[0]
            if prev_assessment_timestamp is None:
                features['duration_between_prev_and_curr_assessment'] = 0
            else:
                # Fixed: `.seconds` dropped whole days, so a gap of two days and
                # five seconds was reported as 5; total_seconds() keeps the days.
                gap = current_assess_timestamp - prev_assessment_timestamp
                features['duration_between_prev_and_curr_assessment'] = int(gap.total_seconds())
            prev_assessment_timestamp = session['timestamp'].iloc[-1]

            features['prev_assessment_count'] = count_assessment_title['cnt_' + session_title_text]
            count_assessment_title['cnt_' + session_title_text] += 1
            session_title_of_last_assessment = session_title

            assess_4020_acc_dict = get_4020_acc(session, assess_4020_acc_dict, session_title_text)

            # installation_id is kept for grouping; title and world describe
            # the assessment being predicted
            features['installation_id'] = session['installation_id'].iloc[-1]
            features['session_title'] = session_title
            features['session_world'] = session['world'].iloc[0]

            # attempt history before this assessment
            features['accumulated_correct_attempts'] = accumulated_correct_attempts
            features['accumulated_uncorrect_attempts'] = accumulated_uncorrect_attempts
            accumulated_correct_attempts += true_attempts
            accumulated_uncorrect_attempts += false_attempts

            features['duration_mean'] = np.mean(durations) if durations else 0
            features['Clip_duration_mean'] = np.mean(clip_durations) if clip_durations else 0

            durations.append(session_seconds)
            duration_of_last_assessment = session_seconds

            # mean accuracy over the previous assessments
            features['accumulated_accuracy'] = accumulated_accuracy/counter if counter > 0 else 0

            Assessment_mean_event_count = (Assessment_mean_event_count + session_event_count)/2.0
            last_assessment_event_count = session_event_count

            total_attempts = true_attempts + false_attempts
            accuracy = true_attempts/total_attempts if total_attempts != 0 else 0
            accumulated_accuracy += accuracy
            last_accuracy_title['acc_' + session_title_text] = accuracy

            # label: 0 -> 0, 1 -> 3, 0.5 -> 2, anything else -> 1
            if accuracy == 0:
                features['accuracy_group'] = 0
            elif accuracy == 1:
                features['accuracy_group'] = 3
            elif accuracy == 0.5:
                features['accuracy_group'] = 2
            else:
                features['accuracy_group'] = 1
            # how often the player landed in each group before this assessment
            features.update(accuracy_groups)
            accuracy_groups[features['accuracy_group']] += 1

            features['accumulated_accuracy_group'] = accumulated_accuracy_group/counter if counter > 0 else 0
            accumulated_accuracy_group += features['accuracy_group']
            # number of events seen before this session
            features['accumulated_actions'] = accumulated_actions

            # test rows are always kept; training rows need at least one attempt
            if test_set or total_attempts > 0:
                all_assessments.append(features)

            counter += 1

        # counted for every session, not only assessments, despite the names
        last_assessment_3020_cnt = len(session[session.event_code==3020])
        last_assessment_3121_cnt = len(session[session.event_code==3121])

        event_code_count = update_counters(event_code_count, session["event_code"])
        event_id_count = update_counters(event_id_count, session["event_id"])
        title_event_code_count = update_counters(title_event_code_count, session['title_event_code'])

        # pairwise running average of the final game_time (converted from
        # milliseconds by /1000) per session type
        gametime_key = session_type + '_gametime'
        session_game_time = session['game_time'].iloc[-1]/1000.0
        if game_time_dict[gametime_key] == 0:
            game_time_dict[gametime_key] += session_game_time
        else:
            game_time_dict[gametime_key] = (game_time_dict[gametime_key] + session_game_time)/2.0

        accumulated_actions += len(session)
        if last_activity != session_type:
            user_activities_count[session_type] += 1
            # Fixed: the original assigned to the misspelled `last_activitiy`,
            # so `last_activity` never changed and every session was counted.
            # A run of consecutive sessions of one type now counts once.
            last_activity = session_type

    # in test mode only the last assessment is the one to predict
    if test_set:
        return all_assessments[-1]
    return all_assessments

In [None]:
def get_train_and_test(train, test):
    """Build the per-assessment feature tables for train and test.

    ``get_data`` is applied to every ``installation_id`` group. Training
    groups contribute one row per assessment, test groups exactly one row.

    Returns ``(reduce_train, reduce_test, categoricals)`` where
    ``categoricals`` is the list of categorical feature names.
    """
    train_groups = train.groupby('installation_id', sort = False)
    test_groups = test.groupby('installation_id', sort = False)

    compiled_train = []
    # the progress-bar totals come from the data instead of the former
    # hard-coded 17000 / 1000
    for _, user_sample in tqdm(train_groups, total = train_groups.ngroups):
        compiled_train += get_data(user_sample)

    compiled_test = [
        get_data(user_sample, test_set = True)
        for _, user_sample in tqdm(test_groups, total = test_groups.ngroups)
    ]

    reduce_train = pd.DataFrame(compiled_train)
    reduce_test = pd.DataFrame(compiled_test)
    categoricals = ['session_title']
    return reduce_train, reduce_test, categoricals

In [None]:
class Base_Model(object):
    """Template for a cross-validated model on the ``accuracy_group`` target.

    Constructing an instance immediately runs the whole cross-validation:
    ``self.y_pred`` holds the fold-averaged test predictions, ``self.score``
    the out-of-fold kappa and ``self.model`` the model of the last fold.

    Subclasses implement ``train_model``, ``get_params`` and
    ``convert_dataset`` (and may override ``convert_x``).
    """

    def __init__(self, train_df, test_df, features, categoricals=None, n_splits=5, verbose=True):
        self.train_df = train_df
        self.test_df = test_df
        self.features = features
        self.n_splits = n_splits
        # None instead of a shared mutable [] default
        self.categoricals = [] if categoricals is None else categoricals
        self.target = 'accuracy_group'
        self.cv = self.get_cv()
        self.verbose = verbose
        self.params = self.get_params()
        self.y_pred, self.score, self.model = self.fit()

    def train_model(self, train_set, val_set):
        """Fit and return a model; implemented by subclasses."""
        raise NotImplementedError

    def get_cv(self):
        """Return the fold generator, grouped by ``installation_id``.

        The result is a one-shot generator; it is consumed by ``fit``.
        """
        cv = GroupKFold(n_splits=self.n_splits)
        return cv.split(self.train_df, self.train_df[self.target],self.train_df['installation_id'])

    def get_params(self):
        """Return the model hyper-parameters; implemented by subclasses."""
        raise NotImplementedError

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        """Wrap the fold data in the library's dataset type; implemented by subclasses."""
        raise NotImplementedError

    def convert_x(self, x):
        """Convert a feature frame for ``predict``; identity by default."""
        return x

    def fit(self):
        """Run the cross-validation and return ``(y_pred, loss_score, model)``."""
        # Sized from the frames given to the constructor; the original used the
        # notebook globals reduce_train / reduce_test here.
        oof_pred = np.zeros((len(self.train_df), ))
        y_pred = np.zeros((len(self.test_df), ))
        target = self.train_df[self.target]
        for fold, (train_idx, val_idx) in enumerate(self.cv):
            x_train, x_val = self.train_df[self.features].iloc[train_idx], self.train_df[self.features].iloc[val_idx]
            # fold indices are positions, so index by position (.iloc) rather
            # than by label
            y_train, y_val = target.iloc[train_idx], target.iloc[val_idx]
            train_set, val_set = self.convert_dataset(x_train, y_train, x_val, y_val)
            model = self.train_model(train_set, val_set)
            conv_x_val = self.convert_x(x_val)
            oof_pred[val_idx] = model.predict(conv_x_val).reshape(oof_pred[val_idx].shape)
            x_test = self.convert_x(self.test_df[self.features])
            # average of the per-fold test predictions
            y_pred += model.predict(x_test).reshape(y_pred.shape) / self.n_splits
            print('Partial score of fold {} is: {}'.format(fold, eval_qwk_lgb_regr(y_val, oof_pred[val_idx])[1]))
        _, loss_score, _ = eval_qwk_lgb_regr(target, oof_pred)
        if self.verbose:
            print('Our oof cohen kappa score is: ', loss_score)

        # importance plots of the last fold's model only
        # NOTE(review): lgb.plot_importance ties this base class to LightGBM models.
        fig, (ax, ax1) = plt.subplots(1, 2, figsize=[11, 7])
        lgb.plot_importance(model, ax=ax, max_num_features=20, importance_type='split')
        lgb.plot_importance(model, ax=ax1, max_num_features=20, importance_type='gain')
        ax.set_title('Importance by splits')
        ax1.set_title('Importance by gain')
        plt.tight_layout()
        plt.savefig('feature_importance.png')

        return y_pred, loss_score, model

In [None]:
class Lgb_Model(Base_Model):
    """LightGBM regression implementation of the ``Base_Model`` hooks."""

    def train_model(self, train_set, val_set):
        """Train a booster, evaluating on both the training and validation set."""
        if self.verbose:
            log_period = 100
        else:
            log_period = 0
        booster = lgb.train(
            self.params,
            train_set,
            valid_sets=[train_set, val_set],
            verbose_eval=log_period,
        )
        return booster

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        """Wrap the fold data in ``lgb.Dataset`` objects (train first, then validation)."""
        train_set, val_set = (
            lgb.Dataset(x_part, y_part, categorical_feature=self.categoricals)
            for x_part, y_part in ((x_train, y_train), (x_val, y_val))
        )
        return train_set, val_set

    def get_params(self):
        """Return the LightGBM hyper-parameters."""
        return {
            'n_estimators': 5000,
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'metric': 'rmse',
            'subsample': 0.75,
            'subsample_freq': 1,
            'learning_rate': 0.01,
            'feature_fraction': 0.9,
            'max_depth': 15,
            'lambda_l1': 1,
            'lambda_l2': 1,
            'early_stopping_rounds': 100,
        }

In [None]:
# Load the raw competition files.
train, test, train_labels, specs, sample_submission = read_data()

# Encode titles/worlds and collect the lookup tables that get_data reads.
(
    train,
    test,
    train_labels,
    win_code,
    list_of_user_activities,
    list_of_event_code,
    activities_labels,
    assess_titles,
    list_of_event_id,
    all_title_event_code,
    activities_map,
) = encode_title(train, test, train_labels)

# Build one feature row per assessment for train and test.
reduce_train, reduce_test, categoricals = get_train_and_test(train, test)

In [None]:
# Replace every non-alphanumeric character in the column names with "_".
reduce_train.columns, reduce_test.columns = (
    ["".join(ch if ch.isalnum() else "_" for ch in str(col)) for col in frame.columns]
    for frame in (reduce_train, reduce_test)
)

In [None]:
def stract_hists(feature, train=reduce_train, test=reduce_test, adjust=False, plot=False):
    """Measure how differently one feature is distributed in train and test.

    Both columns are clipped to ``[0, p95]`` (``p95`` being the 95th percentile
    of the train column), binned into 10 bins, normalised by their number of
    rows and compared with mean squared error.

    Parameters
    ----------
    feature : column label present in both ``train`` and ``test``.
    train, test : frames the column is read from. The defaults are the
        ``reduce_train`` / ``reduce_test`` objects that exist when this cell
        is executed.
    adjust : if True, the test column is rescaled so that its mean equals the
        train mean before the comparison. The rescaling works on a new Series;
        ``test`` itself is left untouched.
    plot : if True, print the error and show both histograms as overlaid bars.

    Returns
    -------
    The mean squared error between the two normalised histograms.
    """
    n_bins = 10
    train_data = train[feature]
    test_data = test[feature]
    if adjust:
        # Out-of-place multiply on purpose: `test_data *= ...` operates on the
        # Series taken from `test` and can write the rescaled values back into
        # the caller's frame (by default the shared `reduce_test`).
        test_data = test_data * (train_data.mean() / test_data.mean())
    # Upper clipping bound: 95th percentile of the train column.
    clip_upper = np.percentile(train_data, 95)
    train_data = np.clip(train_data, 0, clip_upper)
    test_data = np.clip(test_data, 0, clip_upper)
    # NOTE(review): no explicit `range` is passed, so each histogram picks its
    # bin edges from its own data; the two sets of edges may differ.
    train_hist = np.histogram(train_data, bins=n_bins)[0] / len(train_data)
    test_hist = np.histogram(test_data, bins=n_bins)[0] / len(test_data)
    msre = mean_squared_error(train_hist, test_hist)
    if plot:
        print(msre)
        plt.bar(range(n_bins), train_hist, color='blue', alpha=0.5)
        plt.bar(range(n_bins), test_hist, color='red', alpha=0.5)
        plt.show()
    return msre
#stract_hists('Magma Peak - Level 1_2000', adjust=False, plot=True)

In [None]:
# call feature engineering function
features = reduce_train.loc[(reduce_train.sum(axis=1) != 0), (reduce_train.sum(axis=0) != 0)].columns # delete useless columns
features = [x for x in features if x not in ['accuracy_group', 'installation_id']]

In [None]:
# Greedy removal of near-duplicate features: walking the feature list in order,
# every later feature whose Pearson correlation with a kept feature exceeds
# 0.995 is added to `to_remove`.
#
# Correlation is symmetric, so each unordered pair is evaluated once (feat_b is
# always taken from the features that come after feat_a). A pair where feat_b
# comes first was already examined when feat_b was the outer feature.
counter = 0
to_remove = []
_removed_lookup = set()  # mirrors `to_remove` for O(1) membership tests
for idx_a, feat_a in enumerate(features):
    # feat_a can never be removed inside its own inner loop, so this check is
    # done once per outer iteration.
    if feat_a in _removed_lookup:
        continue
    col_a = reduce_train[feat_a]
    for feat_b in features[idx_a + 1:]:
        if feat_a == feat_b or feat_b in _removed_lookup:
            continue
        c = np.corrcoef(col_a, reduce_train[feat_b])[0][1]
        if c > 0.995:
            counter += 1
            to_remove.append(feat_b)
            _removed_lookup.add(feat_b)
            print('{}: FEAT_A: {} FEAT_B: {} - Correlation: {}'.format(counter, feat_a, feat_b, c))

In [None]:
# Rescale each test feature so that its mean matches the train mean.
# Features whose train/test mean ratio is outside [0.1, 10], or for which the
# comparison fails, are collected in `to_exclude` instead of being rescaled.
to_exclude = []
ajusted_test = reduce_test.copy()
_not_adjusted = {'accuracy_group', 'installation_id', 'session_title'}
for feature in ajusted_test.columns:
    if feature in _not_adjusted:
        continue
    train_mean = reduce_train[feature].mean()
    test_mean = ajusted_test[feature].mean()
    try:
        # `error` is only reported; the threshold on it is disabled below.
        error = stract_hists(feature, adjust=True)
        ajust_factor = train_mean / test_mean
        if ajust_factor > 10 or ajust_factor < 0.1:# or error > 0.01:
            to_exclude.append(feature)
            print(feature, train_mean, test_mean, error)
        else:
            ajusted_test[feature] *= ajust_factor
    except Exception as err:
        # Best effort: a feature that cannot be compared is excluded. Catching
        # Exception (not a bare except) lets KeyboardInterrupt / SystemExit
        # propagate, and the cause is printed instead of being hidden.
        to_exclude.append(feature)
        print(feature, train_mean, test_mean, repr(err))

In [None]:
# Features left after dropping the ones listed in `to_exclude` and `to_remove`.
# The exclusion set is built once instead of concatenating both lists again for
# every feature being tested.
_excluded_features = set(to_exclude) | set(to_remove)
temp_features = [x for x in features if x not in _excluded_features]

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Training data restricted to the features that survived the previous filters.
temp = reduce_train[temp_features]
print(temp.shape)

# Cut-off 0.85 * (1 - 0.85), i.e. the variance of a Bernoulli(0.85) variable.
variance_cutoff = .85 * (1 - .85)
sel = VarianceThreshold(threshold=variance_cutoff)
sel.fit(temp)

# Column labels that pass the variance threshold.
kept_positions = sel.get_support(indices=True)
var_thre_col = temp.columns[kept_positions]

len(var_thre_col)

In [None]:
# Features of `temp_features` that did not pass the variance threshold
# (displayed as the cell output).
drop_col = [
    name
    for name in temp_features
    if name not in var_thre_col
]
drop_col

In [None]:
# Final feature list: drop everything listed in `to_exclude` or `to_remove`.
# The exclusion set is built once rather than re-concatenating both lists for
# every feature.
_excluded_features = set(to_exclude) | set(to_remove)
features = [x for x in features if x not in _excluded_features]
reduce_train[features].shape

In [None]:

# Build the LightGBM model from the train features and the mean-adjusted test set.
# NOTE(review): the constructor presumably trains and predicts (y_pred is read
# from the instance right after) - confirm against Base_Model.
lgb_model = Lgb_Model(
    reduce_train,
    ajusted_test,
    features,
    categoricals=categoricals,
)


In [None]:
# Test-set predictions exposed by the model wrapper (presumably continuous
# regression outputs, since the objective is 'regression' - confirm).
final_pred = lgb_model.y_pred

In [None]:
# Share of each accuracy_group value in the training set.
dist = Counter(reduce_train['accuracy_group'])
n_train_rows = len(reduce_train)
for group in dist:
    dist[group] = dist[group] / n_train_rows
reduce_train['accuracy_group'].hist()

# Cut points: percentiles of the predictions taken at the cumulative train
# shares of groups 0, 1 and 2.
acum = 0
bound = {}
for group in range(3):
    acum = acum + dist[group]
    bound[group] = np.percentile(final_pred, acum * 100)
print(bound)

def classify(x):
    """Return the first group (0, 1, 2) whose bound is >= x, or 3 if none is."""
    for group in range(3):
        if x <= bound[group]:
            return group
    return 3

# NOTE: `final_pred` is overwritten with the class labels here, so re-running
# this cell alone would derive the bounds from already-classified values.
final_pred = np.array([classify(pred) for pred in final_pred])

sample_submission['accuracy_group'] = final_pred.astype(int)
sample_submission.to_csv('submission.csv', index=False)
sample_submission['accuracy_group'].value_counts(normalize=True)

In [None]:
import shap

# SHAP values of the trained model on the training features, shown as a
# bar-type summary plot.
explainer = shap.TreeExplainer(lgb_model.model)
train_features = reduce_train[features]
shap_values = explainer.shap_values(train_features)
shap.summary_plot(shap_values, train_features, plot_type="bar")

In [None]:
# Two side-by-side LightGBM importance plots: by split count and by gain.
fig, (ax, ax1) = plt.subplots(1, 2, figsize=[15, 50])
for axis, imp_type, title in (
    (ax, 'split', 'Importance by splits'),
    (ax1, 'gain', 'Importance by gain'),
):
    lgb.plot_importance(lgb_model.model, ax=axis, importance_type=imp_type)
    axis.set_title(title)
plt.tight_layout()
#plt.savefig('feature_importance.png')