In [None]:
import os
import gc
import pandas as pd
pd.set_option('display.max_columns', 500)
import numpy as np
import matplotlib.pylab as plt
import itertools

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score

from lightgbm import LGBMRegressor

# Directory holding the competition CSV files (Kaggle-style input path).
DATA_DIR = r'/kaggle/input/data-science-bowl-2019'

# Full paths of the individual CSV files, all located directly under DATA_DIR.
SPECS_CSV_PATH = os.path.join(DATA_DIR, 'specs.csv')
TRAIN_CSV_PATH = os.path.join(DATA_DIR, 'train.csv')
TEST_CSV_PATH = os.path.join(DATA_DIR, 'test.csv')

In [None]:
def calculate_accuracy_group(x):
    '''
    Maps the attempt counts of one assessment to its accuracy group.

    x: pandas.Series (or any mapping) with 'num_correct' and 'num_incorrect'

    returns: int
        3 - exactly one correct attempt and no incorrect attempt
        2 - exactly one correct attempt and exactly one incorrect attempt
        1 - exactly one correct attempt and any other number of incorrect attempts
        0 - 'num_correct' is anything other than 1
    '''
    # without exactly one correct attempt the number of misses is irrelevant
    if x['num_correct'] != 1:
        return 0

    misses = x['num_incorrect']
    if misses == 0:
        return 3
    if misses == 1:
        return 2
    return 1
    
def std_from_cumsums(x):
    '''
    Sample standard deviation reconstructed from running totals.

    x: pandas.Series (read positionally via .iloc) holding, in this order,
        the number of observations N, the sum of the values and
        the sum of the squared values

    returns: 0 when N <= 1, otherwise sqrt((sq_sum - s**2/N) / (N - 1))
    '''
    N = x.iloc[0]
    if N <= 1:
        return 0

    s = x.iloc[1]
    sq_sum = x.iloc[2]

    # sq_sum and s**2/N are nearly equal when the values barely vary, so rounding
    # can push their difference just below zero; clamp it so that sqrt yields 0
    # instead of NaN (a NaN input still propagates through max unchanged)
    variance = max((sq_sum - s**2/N)/(N-1), 0)

    return np.sqrt(variance)

# Series with the 'event_id' column of specs.csv; used further down to make sure
# every known event_id gets a num_<event_id> column.
EVENT_IDS = pd.read_csv(SPECS_CSV_PATH, usecols=['event_id']).event_id
    
# title -> event_ids treated as carrying a success flag in their event_data
# (the keys matched by the regexes in aggregate_add_correct_true/false:
# correct, launched, jar_filled, filled).
# Titles without any such event_id are simply absent from the mapping.
title_to_event_id_with_correct = {
    'Sandcastle Builder (Activity)': ['9ee1c98c'],
    'Scrub-A-Dub': ['5c3d2b2f'],
    'Dino Drink': ['74e5f8a7', '6f8106d9'],
    'Watering Hole (Activity)': ['2fb91ec1'],
    'All Star Sorting': ['2dc29e21'],
    'Fireworks (Activity)': ['e694a35b'],
    'Mushroom Sorter (Assessment)': ['5f0eb72c', '25fa8af4'],
    'Air Show': ['28f975ea', '14de4c5d'],
    'Crystals Rule': ['86c924c4'],
    'Bird Measurer (Assessment)': ['4a4c3d21', '17113b36', '8fee50e2', '070a5291'],
    'Bubble Bath': ['3bb91dda'],
    'Bottle Filler (Activity)': ['90efca10'],
    'Dino Dive': ['c0415e5c'],
    'Chow Time': ['4ef8cdd3'],
    'Cauldron Filler (Assessment)': ['30614231', '392e14df', '91561152'],
    'Pan Balance': ['804ee27f', 'a5e9da97', 'e7561dd2'],
    'Happy Camel': ['8af75982'],
    'Cart Balancer (Assessment)': ['d122731b'],
    'Chest Sorter (Assessment)': ['0db6d71d', '93b353f2'],
    'Leaf Leader': ['262136f4']
 }

# potentially expand the mapping above with event_ids found in the private test set
#relevant_columns = ['event_id', 'event_data', 'title']
#tmp_test = pd.read_csv(TEST_CSV_PATH, usecols=relevant_columns)
#
#tmp_test['correct_true'] = tmp_test.event_data.str.contains('"(?:correct|launched|jar_filled|filled)":true').astype(int)
#tmp_test['correct_false'] = tmp_test.event_data.str.contains('"(?:correct|launched|jar_filled|filled)":false').astype(int)
#
#for title in tmp_test.title.unique():
#    lst_ev_id = []
#    title_tmp_test = tmp_test.loc[tmp_test.title == title]
#    for ev_id in title_tmp_test.event_id.unique():
#        if title_tmp_test.loc[title_tmp_test.event_id == ev_id].correct_true.any() or title_tmp_test.loc[title_tmp_test.event_id == ev_id].correct_false.any():
#            lst_ev_id.append(ev_id)
#    if lst_ev_id:
#        if title not in title_to_event_id_with_correct.keys():
#            title_to_event_id_with_correct[title] = lst_ev_id
#        else:
#            for event_id in lst_ev_id:
#                if event_id not in title_to_event_id_with_correct[title]:
#                    title_to_event_id_with_correct[title].append(event_id) 
#
#del tmp_test
#gc.collect()

# sorted, de-duplicated list of every event_id listed in title_to_event_id_with_correct
# (i.e. the event_ids expected to contain correct:true or correct:false)
event_ids_with_correct = sorted(set(itertools.chain(*title_to_event_id_with_correct.values())))

# title -> event_ids treated as carrying a "duration" field in their event_data
# (presumably collected the same way as the commented-out derivation below: event_ids
# for which at least one event has duration > 0 - TODO confirm).
title_to_event_id_with_duration = {
    'Sandcastle Builder (Activity)': ['1bb5fbdb', 'c58186bf', '9ee1c98c', '84538528'],
    'Scrub-A-Dub': ['5a848010', '4a09ace1', 'f7e47413', '08fd73f3', '2b9272f4', '37c53127', 'ac92046e', 'c1cac9a2'],
    'Dino Drink': ['7f0836bf', '4d6737eb', '89aace00', '4d911100', '16dffff1', '9ed8f6da'],
    'Watering Hole (Activity)': ['bd701df8', '71e712d8'],
    'All Star Sorting': ['9e4c8c7b', '2dc29e21', 'd45ed6a1', 'ca11f653', '1f19558b', 'd02b7a8e'],
    'Fireworks (Activity)': ['e694a35b', 'b88f38da'],
    'Flower Waterer (Activity)': ['bbfe0445', 'de26c3a6', '598f4598', 'fcfdffb6'],
    'Mushroom Sorter (Assessment)': ['a52b92d5', 'c74f40cd', '83c6c409', '5f0eb72c', 'c7128948', '6c930e6e', '88d4a5be', '0d18d96c'],
    'Air Show': ['a1bbe385', 'd88ca108', '58a0de5c', 'f5b8c21a', 'e04fb33d', 'd2659ab4'],
    'Crystals Rule': ['3babcb9b', 'e720d930', '3323d7e9', '8b757ab8', 'a1192f43'],
    'Bug Measurer (Activity)': ['71fe8f75', '363c86c9', '022b4259'],
    'Bird Measurer (Assessment)': ['bdf49a58', '4a4c3d21', 'e37a2b78', '7525289a', 'f6947f54', '8fee50e2', 'a76029ee', 'd38c2fd7'],
    'Bubble Bath': ['99abe2bb', '99ea62f3', '15eb4a7d', '55115cbd', '5859dfb6', 'd06f75b5', '895865f3'],
    'Bottle Filler (Activity)': ['df4940d3', '90efca10', 'd3f1e122', 'e9c52111'],
    'Dino Dive': ['ab3136ba', '709b1251', '6088b756', '28a4eb9a', '00c73085'],
    'Chow Time': ['0d1da71f', 'd185d3ea', '4ef8cdd3', '2230fab4', '56817e2b', 'cb6010f8'],
    'Cauldron Filler (Assessment)': ['30614231', '2dcad279', '392e14df', '28520915', 'b5053438', '5290eab1', '3edf6747', '5348fd84'],
    'Chicken Balancer (Activity)': ['84b0e0c8', '499edb7c', 'cdd22e43', '46cd75b4'],
    'Pan Balance': ['15f99afc', '804ee27f', 'c51d8688', 'e7561dd2', 'bc8f2793', '1c178d24', 'cf7638f3'],
    'Happy Camel': ['d51b1749', '3bb91ced', '69fdac0a', '36fa3ebe', 'a8a78786', '3bf1cf26', 'c189aaf2'],
    'Cart Balancer (Assessment)': ['5c2f29ca', '828e68f9', 'ecaab346', 'b74258a0', '9d4e7b25', '3d63345e', '31973d56'],
    'Egg Dropper (Activity)': ['b80e5e84'],
    'Chest Sorter (Assessment)': ['3ccd3f02', '562cec5f', '3d0b9317', '0db6d71d', '9ce586dd', 'df4fe8b6', 'e4f1efe6', '38074c54'],
    'Leaf Leader': ['2a512369', '86ba578b', 'b012cd7f', 'e5c9df6f', 'e57dd7af', '53c6e11a']
    }

#relevant_columns = ['event_id', 'event_data', 'title']
#tmp_test = pd.read_csv(TEST_CSV_PATH, usecols=relevant_columns)
#
#tmp_test['duration'] = tmp_test.event_data.str.extract('"duration":(\d*)').fillna(0).astype(int)
#
#for title in tmp_test.title.unique():
#    ev_ids = []
#    title_tmp_test = tmp_test.loc[tmp_test.loc[:, 'title'] == title]
#    for ev_id in title_tmp_test.event_id.unique():
#        if (title_tmp_test[title_tmp_test.loc[:, 'event_id'] == ev_id].duration > 0).any():
#            ev_ids.append(ev_id)
#    if ev_ids:
#        if title not in title_to_event_id_with_duration.keys():
#            title_to_event_id_with_duration[title] = ev_ids
#        else:
#            for event_id in ev_ids:
#                if event_id not in title_to_event_id_with_duration[title]:
#                    title_to_event_id_with_duration[title].append(event_id)
#
#del tmp_test
#gc.collect()

# sorted, de-duplicated list of every event_id listed in title_to_event_id_with_duration
event_ids_with_duration = sorted(set(itertools.chain(*title_to_event_id_with_duration.values())))

# title -> session type; the type is one of "Clip", "Activity", "Game" or "Assessment"
title_to_type = {
     "Welcome to Lost Lagoon!": "Clip",
     "Magma Peak - Level 1": "Clip",
     "Sandcastle Builder (Activity)": "Activity",
     "Scrub-A-Dub": "Game",
     "Magma Peak - Level 2": "Clip",
     "Dino Drink": "Game",
     "Tree Top City - Level 1": "Clip",
     "Ordering Spheres": "Clip",
     "Watering Hole (Activity)": "Activity",
     "Slop Problem": "Clip",
     "All Star Sorting": "Game",
     "Costume Box": "Clip",
     "Fireworks (Activity)": "Activity",
     "12 Monkeys": "Clip",
     "Tree Top City - Level 2": "Clip",
     "Flower Waterer (Activity)": "Activity",
     "Pirate's Tale": "Clip",
     "Mushroom Sorter (Assessment)": "Assessment",
     "Air Show": "Game",
     "Treasure Map": "Clip",
     "Tree Top City - Level 3": "Clip",
     "Crystals Rule": "Game",
     "Rulers": "Clip",
     "Bug Measurer (Activity)": "Activity",
     "Bird Measurer (Assessment)": "Assessment",
     "Bubble Bath": "Game",
     "Bottle Filler (Activity)": "Activity",
     "Dino Dive": "Game",
     "Crystal Caves - Level 1": "Clip",
     "Chow Time": "Game",
     "Cauldron Filler (Assessment)": "Assessment",
     "Balancing Act": "Clip",
     "Crystal Caves - Level 2": "Clip",
     "Crystal Caves - Level 3": "Clip",
     "Chicken Balancer (Activity)": "Activity",
     "Lifting Heavy Things": "Clip",
     "Pan Balance": "Game",
     "Happy Camel": "Game",
     "Honey Cake": "Clip",
     "Cart Balancer (Assessment)": "Assessment",
     "Heavy, Heavier, Heaviest": "Clip",
     "Egg Dropper (Activity)": "Activity",
     "Chest Sorter (Assessment)": "Assessment",
     "Leaf Leader": "Game"
     }

# titles split by type, each list keeping the insertion order of title_to_type
GAMES = [title for title, value in title_to_type.items() if value == "Game"]
ACTIVITIES = [title for title, value in title_to_type.items() if value == "Activity"]
ASSESSMENTS = [title for title, value in title_to_type.items() if value == "Assessment"]
CLIPS = [title for title, value in title_to_type.items() if value == "Clip"]
# every non-clip title: games first, then activities, then assessments
PLAYABLES = GAMES + ACTIVITIES + ASSESSMENTS

# Activity titles that have event_ids with correct:true/false in their event_data.
# Games and assessments need no such filter: each of them is looked up directly in
# title_to_event_id_with_correct below, so every one must be a key of that mapping.
ACTIVITIES_WITH_CORRECT = [title for title in ACTIVITIES if title in title_to_event_id_with_correct]

# event_ids with correct t/f, flattened per session type (title order, then the
# order inside each title's list). Comprehensions keep the loop variables out of
# the notebook's global namespace.
ACTIVITY_EVENT_ID_WITH_CORRECT = [
    event_id
    for title in ACTIVITIES_WITH_CORRECT
    for event_id in title_to_event_id_with_correct[title]
]

GAME_EVENT_ID_WITH_CORRECT = [
    event_id
    for title in GAMES
    for event_id in title_to_event_id_with_correct[title]
]

ASSESSMENT_EVENT_ID_WITH_CORRECT = [
    event_id
    for title in ASSESSMENTS
    for event_id in title_to_event_id_with_correct[title]
]
        
# Event codes for which a num_<code> column must always exist:
# aggregate_add_event_code adds a zero column for any code missing from the data.
EVENT_CODES = [
    2000, 3010, 3110, 4070, 4090, 4030, 4035, 4021, 4020, 4010, 2080,
    2083, 2040, 2020, 2030, 3021, 3121, 2050, 3020, 3120, 2060, 2070,
    4031, 4025, 5000, 5010, 2081, 2025, 4022, 2035, 4040, 4100, 2010,
    4110, 4045, 4095, 4220, 2075, 4230, 4235, 4080, 4050
    ]
        
def filter_0_asessment_ids(frame):
    '''
    Filters installation_ids which never took an assessment

    frame: pandas.DataFrame with 'installation_id' and 'type' columns

    returns: pandas.DataFrame, the rows of every installation_id that has at
        least one row of type 'Assessment', in their original order and with a
        fresh 0..n-1 index; the input frame is not modified
    '''
    # installation_ids owning at least one assessment row
    assessed_ids = frame.loc[frame.type == 'Assessment', 'installation_id'].dropna().unique()

    # vectorised membership test; no helper column is written into the caller's frame
    has_assessment = frame.installation_id.isin(assessed_ids)

    return frame[has_assessment].reset_index(drop=True)


def aggregate_add_num_title(frame):
    '''
    Produces title-presence indicators and their per-type totals

    frame: pandas.DataFrame with 'installation_id', 'game_session' and 'title' columns

    returns: pandas.DataFrame indexed by (installation_id, game_session) with
        - one num_<title> column per title (spaces replaced by '_'), 1 if the
          session contains events of that title and 0 otherwise; titles of
          title_to_type missing from the data get an all-zero column
        - num_activity / num_game / num_assessment / num_clip: row sums per type
        - num_playable: activities + games + assessments
        - num_session: playables + clips
        - num_activity_with_correct / num_playable_with_correct: the same sums
          restricted to activities listed in ACTIVITIES_WITH_CORRECT
    '''
    def num_columns_for(titles):
        # column names as they look after the prefix/space normalisation below
        return ['num_' + title.replace(' ', '_') for title in titles]

    # size() counts the events of each (installation, session, title) natively;
    # clipping at 1 turns that count into a presence flag without calling a
    # Python function once per group
    counts = frame.groupby(['installation_id', 'game_session', 'title']).size().clip(upper=1)
    counts = counts.unstack(fill_value=0).add_prefix('num_')
    counts.columns = counts.columns.str.replace(' ', '_')
    counts.columns.name = None

    num_activity_columns = num_columns_for(ACTIVITIES)
    num_game_columns = num_columns_for(GAMES)
    num_assessment_columns = num_columns_for(ASSESSMENTS)
    num_clip_columns = num_columns_for(CLIPS)

    # titles that never occur in frame still need their (all-zero) column
    for column in num_activity_columns + num_game_columns + num_assessment_columns + num_clip_columns:
        if column not in counts.columns:
            counts[column] = 0

    # totals per session type
    counts['num_activity'] = counts.loc[:, num_activity_columns].sum(axis=1)
    counts['num_game'] = counts.loc[:, num_game_columns].sum(axis=1)
    counts['num_assessment'] = counts.loc[:, num_assessment_columns].sum(axis=1)
    counts['num_clip'] = counts.loc[:, num_clip_columns].sum(axis=1)

    counts['num_playable'] = counts.loc[:, ['num_activity', 'num_game', 'num_assessment']].sum(axis=1)
    counts['num_session'] = counts.loc[:, ['num_playable', 'num_clip']].sum(axis=1)

    # totals restricted to titles that report correct/incorrect outcomes
    counts['num_activity_with_correct'] = counts.loc[:, num_columns_for(ACTIVITIES_WITH_CORRECT)].sum(axis=1)
    counts['num_playable_with_correct'] = counts.loc[:, ['num_activity_with_correct',
                                                         'num_game', 'num_assessment']].sum(axis=1)

    return counts


def aggregate_add_sum_time_played(frame):
    '''
    Builds the time_played pivot and its per-type totals

    frame: pandas.DataFrame exposing 'installation_id', 'game_session', 'title'
        (as columns or index levels) and a 'time_played' column

    returns: pandas.DataFrame indexed by (installation_id, game_session) with one
        time_played_<title> column per title (spaces replaced by '_') plus
        'activity_time', 'game_time' and 'assessment_time' totals
    '''
    def time_columns_for(titles):
        return ['time_played_' + title.replace(' ', '_') for title in titles]

    # one column per title found in the data; clip titles show up here as a by-product
    per_title = frame.groupby(['installation_id', 'game_session', 'title'])['time_played'].sum()
    pivot = per_title.unstack(fill_value=0).add_prefix('time_played_')
    pivot.columns = pivot.columns.str.replace(' ', '_')
    pivot.columns.name = None

    # every playable title gets a column, even when it never occurs in the data
    for column in time_columns_for(PLAYABLES):
        if column not in pivot.columns:
            pivot[column] = 0

    # totals per session type
    totals = (
        ('activity_time', ACTIVITIES),
        ('game_time', GAMES),
        ('assessment_time', ASSESSMENTS),
    )
    for total_name, titles in totals:
        pivot[total_name] = pivot.loc[:, time_columns_for(titles)].sum(axis=1)

    return pivot


def aggregate_add_num_event_id(frame):
    '''
    Counts the events of each event_id within every game_session

    frame: pandas.DataFrame with 'installation_id', 'game_session' and 'event_id' columns

    returns: pandas.DataFrame indexed by (installation_id, game_session) with one
        num_<event_id> column per event_id; event_ids of EVENT_IDS that are
        missing from frame get an all-zero column
    '''
    keys = ['installation_id', 'game_session', 'event_id']
    pivot = frame.groupby(keys).size().unstack(fill_value=0)
    pivot = pivot.add_prefix('num_')
    pivot.columns.name = None

    # guarantee a column for every event_id known from the specs
    for event_id in EVENT_IDS.tolist():
        column = 'num_' + event_id
        if column not in pivot.columns:
            pivot[column] = 0

    return pivot


def aggregate_add_correct_true(frame):
    '''
    Counts successful outcomes per event_id within every game_session

    frame: pandas.DataFrame with 'installation_id', 'game_session', 'event_id'
        and 'event_data' columns; a 'correct_true' 0/1 column is written into it

    returns: pandas.DataFrame indexed by (installation_id, game_session) with
        - num_event_id_<event_id>_correct for every id in event_ids_with_correct
        - num_playable_correct: sum over all of those columns
        - num_correct: sum over five fixed assessment event_ids
        - num_activity_correct / num_game_correct / num_assessment_correct
    '''
    def correct_columns_for(event_ids):
        return ['num_event_id_' + event_id + '_correct' for event_id in event_ids]

    # 1 where event_data reports a successful outcome under any of the flag keys
    is_success = frame.event_data.str.contains('"(?:correct|launched|jar_filled|filled)":true')
    frame['correct_true'] = is_success.astype(int)

    flagged_events = frame.loc[frame.event_id.isin(event_ids_with_correct)]
    pivot = flagged_events.groupby(['installation_id', 'game_session', 'event_id'])['correct_true'].sum()
    pivot = pivot.unstack(fill_value=0).add_prefix('num_event_id_').add_suffix('_correct')
    pivot.columns.name = None

    # event_ids absent from the data still get an all-zero column
    for column in correct_columns_for(event_ids_with_correct):
        if column not in pivot.columns:
            pivot[column] = 0

    # overall total first, while the frame holds per-event_id columns only
    pivot['num_playable_correct'] = pivot.sum(axis=1)

    assessment_attempt_columns = [
        'num_event_id_25fa8af4_correct',
        'num_event_id_392e14df_correct',
        'num_event_id_93b353f2_correct',
        'num_event_id_d122731b_correct',
        'num_event_id_17113b36_correct',
    ]
    pivot['num_correct'] = pivot.loc[:, assessment_attempt_columns].sum(axis=1)

    # totals per session type
    per_type = (
        ('num_activity_correct', ACTIVITY_EVENT_ID_WITH_CORRECT),
        ('num_game_correct', GAME_EVENT_ID_WITH_CORRECT),
        ('num_assessment_correct', ASSESSMENT_EVENT_ID_WITH_CORRECT),
    )
    for total_name, event_ids in per_type:
        pivot[total_name] = pivot.loc[:, correct_columns_for(event_ids)].sum(axis=1)

    return pivot


def aggregate_add_correct_false(frame):
    '''
    Counts failed outcomes per event_id within every game_session

    frame: pandas.DataFrame with 'installation_id', 'game_session', 'event_id'
        and 'event_data' columns; a 'correct_false' 0/1 column is written into it

    returns: pandas.DataFrame indexed by (installation_id, game_session) with
        - num_event_id_<event_id>_incorrect for every id in event_ids_with_correct
        - num_playable_incorrect: sum over all of those columns
        - num_incorrect: sum over five fixed assessment event_ids
        - num_activity_incorrect / num_game_incorrect / num_assessment_incorrect
    '''
    def incorrect_columns_for(event_ids):
        return ['num_event_id_' + event_id + '_incorrect' for event_id in event_ids]

    # 1 where event_data reports a failed outcome under any of the flag keys
    is_failure = frame.event_data.str.contains('"(?:correct|launched|jar_filled|filled)":false')
    frame['correct_false'] = is_failure.astype(int)

    flagged_events = frame.loc[frame.event_id.isin(event_ids_with_correct)]
    pivot = flagged_events.groupby(['installation_id', 'game_session', 'event_id'])['correct_false'].sum()
    pivot = pivot.unstack(fill_value=0).add_prefix('num_event_id_').add_suffix('_incorrect')
    pivot.columns.name = None

    # event_ids absent from the data still get an all-zero column
    for column in incorrect_columns_for(event_ids_with_correct):
        if column not in pivot.columns:
            pivot[column] = 0

    # overall total first, while the frame holds per-event_id columns only
    pivot['num_playable_incorrect'] = pivot.sum(axis=1)

    assessment_attempt_columns = [
        'num_event_id_25fa8af4_incorrect',
        'num_event_id_392e14df_incorrect',
        'num_event_id_93b353f2_incorrect',
        'num_event_id_d122731b_incorrect',
        'num_event_id_17113b36_incorrect',
    ]
    pivot['num_incorrect'] = pivot.loc[:, assessment_attempt_columns].sum(axis=1)

    # totals per session type
    per_type = (
        ('num_activity_incorrect', ACTIVITY_EVENT_ID_WITH_CORRECT),
        ('num_game_incorrect', GAME_EVENT_ID_WITH_CORRECT),
        ('num_assessment_incorrect', ASSESSMENT_EVENT_ID_WITH_CORRECT),
    )
    for total_name, event_ids in per_type:
        pivot[total_name] = pivot.loc[:, incorrect_columns_for(event_ids)].sum(axis=1)

    return pivot
    

def aggregate_add_duration(frame):
    '''
    Sums and averages the "duration" found in event_data per event_id and game_session

    frame: pandas.DataFrame with 'installation_id', 'game_session', 'event_id'
        and 'event_data' columns; an integer 'duration' column is written into it
        (0 where event_data has no numeric duration)

    returns: pandas.DataFrame indexed by (installation_id, game_session) with
        - sum_<event_id>_duration and mean_<event_id>_duration for every event_id
          present in frame, plus all-zero columns for the ids of
          event_ids_with_duration that are missing
        - sum_sum_duration / sum_mean_duration: row totals over the columns of
          event_ids_with_duration
    '''
    # Raw string: '\d' is an invalid escape sequence in a normal string literal.
    # \d+ rather than \d*: a "duration" key not followed by digits then yields NaN
    # (-> 0) instead of an empty string that astype(int) cannot convert.
    frame['duration'] = frame.event_data.str.extract(r'"duration":(\d+)', expand=False) \
        .fillna(0).astype(int)

    # one grouping serves both aggregations
    grouped = frame.groupby(['installation_id', 'game_session', 'event_id'])['duration']

    sums = grouped.sum().unstack(fill_value=0)
    sums = sums.add_prefix('sum_').add_suffix('_duration')
    sums.columns.name = None

    means = grouped.mean().unstack(fill_value=0)
    means = means.add_prefix('mean_').add_suffix('_duration')
    means.columns.name = None

    # add columns not added
    sum_columns = ['sum_' + event_id + '_duration' for event_id in event_ids_with_duration]
    for col_name in sum_columns:
        if col_name not in sums.columns:
            sums[col_name] = 0

    mean_columns = ['mean_' + event_id + '_duration' for event_id in event_ids_with_duration]
    for col_name in mean_columns:
        if col_name not in means.columns:
            means[col_name] = 0

    # row totals restricted to the event_ids known to carry a duration
    sums['sum_sum_duration'] = sums.loc[:, sum_columns].sum(axis=1)
    means['sum_mean_duration'] = means.loc[:, mean_columns].sum(axis=1)

    return sums.join(means)

def aggregate_add_event_code(frame):
    '''
    Counts the events of each event_code within every game_session

    frame: pandas.DataFrame with 'installation_id', 'game_session' and 'event_code' columns

    returns: pandas.DataFrame indexed by (installation_id, game_session) with one
        num_<event_code> column per code; codes of EVENT_CODES that are missing
        from frame get an all-zero column
    '''
    keys = ['installation_id', 'game_session', 'event_code']
    code_counts = frame.groupby(keys).size().unstack(fill_value=0).add_prefix('num_')
    code_counts.columns.name = None

    # guarantee a column for every known event code
    expected_columns = ['num_' + str(event_code) for event_code in EVENT_CODES]
    for column in expected_columns:
        if column not in code_counts.columns:
            code_counts[column] = 0

    return code_counts

def aggregate_add_squares(aggregate):
    '''
    Squares a fixed set of time, correct/incorrect and duration columns

    aggregate: pandas.DataFrame containing all of the source columns listed below

    returns: pandas.DataFrame with the same index as aggregate and one
        square_<column> column per source column, in the listed order
    '''
    source_columns = [
        # time
        'activity_time',
        'game_time',
        'assessment_time',
        'time_played',
        # correct t/f
        'num_activity_correct',
        'num_game_correct',
        'num_assessment_correct',
        'num_playable_correct',
        'num_activity_incorrect',
        'num_game_incorrect',
        'num_assessment_incorrect',
        'num_playable_incorrect',
        # duration
        'sum_sum_duration',
        'sum_mean_duration',
    ]
    new_names = {column: 'square_' + column for column in source_columns}

    squared = aggregate.loc[:, source_columns] ** 2
    return squared.rename(columns=new_names)


def aggregate_add_cumsums(aggregate):
    '''
    Running totals per installation_id for the count, time, correct/incorrect,
    duration and square columns of aggregate

    aggregate: pandas.DataFrame exposing 'installation_id' (index level or column)
        and containing every source column named below; the cumulative sums follow
        the current row order of aggregate

    returns: pandas.DataFrame with the same index as aggregate, one cumsum_* column
        per source column
    '''
    # source column -> name of its cumulative counterpart (insertion order is the
    # column order of the result)
    cumsum_orders = {}

    # num_title
    for title in title_to_type.keys():
        cumsum_orders['num_' + title.replace(' ', '_')] = 'cumsum_' + title.replace(' ', '_')

    for name in ('activity', 'game', 'assessment', 'clip',
                 'playable', 'session',
                 'activity_with_correct', 'playable_with_correct'):
        cumsum_orders['num_' + name] = 'cumsum_' + name

    # time_played (clips have no time_played column)
    for title, value in title_to_type.items():
        if value != "Clip":
            cumsum_orders['time_played_' + title.replace(' ', '_')] = 'cumsum_time_played_' + title.replace(' ', '_')

    for name in ('activity_time', 'game_time', 'assessment_time', 'time_played'):
        cumsum_orders[name] = 'cumsum_' + name

    # num_event_ids
    for event_id in EVENT_IDS.tolist():
        cumsum_orders['num_' + event_id] = 'cumsum_' + event_id

    # num_event_codes
    for event_code in EVENT_CODES:
        cumsum_orders['num_' + str(event_code)] = 'cumsum_' + str(event_code)

    # correct t/f
    for event_id in event_ids_with_correct:
        cumsum_orders['num_event_id_' + event_id + '_correct'] = 'cumsum_event_id_' + event_id + '_correct'
        cumsum_orders['num_event_id_' + event_id + '_incorrect'] = 'cumsum_event_id_' + event_id + '_incorrect'

    for name in ('playable_correct', 'playable_incorrect',
                 'activity_correct', 'game_correct', 'assessment_correct',
                 'activity_incorrect', 'game_incorrect', 'assessment_incorrect'):
        cumsum_orders['num_' + name] = 'cumsum_' + name

    # duration
    for title, event_ids in title_to_event_id_with_duration.items():
        for event_id in event_ids:
            cumsum_orders['sum_' + event_id + '_duration'] = 'cumsum_sum_' + event_id + '_duration'
            cumsum_orders['mean_' + event_id + '_duration'] = 'cumsum_mean_' + event_id + '_duration'

    # the two totals drop their leading 'sum_' instead of gaining a prefix
    cumsum_orders['sum_sum_duration'] = 'cumsum_sum_duration'
    cumsum_orders['sum_mean_duration'] = 'cumsum_mean_duration'

    # squares: time, correct t/f, duration
    for name in ('activity_time', 'game_time', 'assessment_time', 'time_played',
                 'num_activity_correct', 'num_game_correct',
                 'num_assessment_correct', 'num_playable_correct',
                 'num_activity_incorrect', 'num_game_incorrect',
                 'num_assessment_incorrect', 'num_playable_incorrect',
                 'sum_sum_duration', 'sum_mean_duration'):
        cumsum_orders['square_' + name] = 'cumsum_square_' + name

    # actual function
    # cumsum() already runs down the rows of each group; the explicit axis keyword
    # of the grouped cumsum is deprecated in recent pandas, so it is left out
    cumsums = aggregate.loc[:, list(cumsum_orders.keys())].groupby('installation_id').cumsum()
    cumsums = cumsums.rename(columns=cumsum_orders)

    return cumsums


def get_aggregate(frame_path):
    '''
    Reads an event log CSV and condenses it to one row per game_session

    frame_path: str, path of the CSV file to read; it must provide the columns
        listed in relevant_columns below

    returns: Pandas.DataFrame, useful attributes from frame 
        grouped by ('installation_id', 'game_session') 
        sorted by ('installation_id', 'last_timestamp')
    '''
    # read file
    relevant_columns = ['event_id', 'game_session', 'timestamp', 'event_data', 'installation_id', 
                    'game_time', 'event_code', 'title', 'type', 'world']
    
    frame = pd.read_csv(frame_path, usecols=relevant_columns)
    
    # filter installation_ids which never took an assessment
    frame = filter_0_asessment_ids(frame)
    
    # create aggregate dataframe
    # one row per session, ordered by the session's latest timestamp within each
    # installation_id; the per-installation cumulative sums computed at the end
    # follow this row order
    aggregate = frame.groupby(['installation_id', 'game_session'])[['timestamp']].max() \
        .sort_values(by=['installation_id', 'timestamp']).rename(columns={'timestamp': 'last_timestamp'})
    
    # add columns
    grouped = frame.groupby(['installation_id', 'game_session'])
    
    # first title/type and last world seen in each session
    aggregate['title'] = grouped[['title']].first()
    aggregate['type'] = grouped[['type']].first()
    aggregate['world'] = grouped[['world']].last()
    
    # add num_title columns
    aggregate = aggregate.join(aggregate_add_num_title(frame))

    # add time_played columns
    # time_played of a session is its largest game_time; the pivot helper is fed
    # `aggregate` (not `frame`) because it needs the per-session 'time_played'
    # and 'title' columns created above
    aggregate['time_played'] = grouped[['game_time']].max()
    aggregate = aggregate.join(aggregate_add_sum_time_played(aggregate))
    
    # add num_event_id columns
    aggregate = aggregate.join(aggregate_add_num_event_id(frame))
    
    # add num_event_code columns
    aggregate = aggregate.join(aggregate_add_event_code(frame))
    
    # add correct true/false pivots
    # (these helpers, like the duration one below, add a helper column to frame)
    aggregate = aggregate.join(aggregate_add_correct_true(frame))
    aggregate = aggregate.join(aggregate_add_correct_false(frame))
    
    # duration
    aggregate = aggregate.join(aggregate_add_duration(frame))
    
    # delete frame to save RAM
    del frame
    gc.collect()
    
    # fill NaN
    # sessions missing from one of the joined pivots get 0 in that pivot's columns
    aggregate = aggregate.fillna(0)
    
    # squares
    aggregate = aggregate.join(aggregate_add_squares(aggregate))
    
    # cumsums
    # must come after the squares: the cumulative sums also cover the square_* columns
    aggregate = aggregate.join(aggregate_add_cumsums(aggregate))
    
    
    return aggregate

        
class SliceIndex:
    '''
    Index bookkeeping for one slice of rows that ends at last_idx.

    first_idx: first index of the slice
    last_idx: last index of the slice
    feature_idx: index identifying the slice among the produced features
    allowed_idx: last_idx - 1, computed once at construction
    '''

    def __init__(self, first_idx, last_idx, feature_idx):
        self.first_idx = first_idx
        self.last_idx = last_idx
        self.feature_idx = feature_idx

        # rows after allowed_idx must not be used, to avoid leaks
        self.allowed_idx = self.calculate_allowed_idx()

    def calculate_allowed_idx(self):
        '''Returns the index directly before last_idx.'''
        return self.last_idx - 1

    def is_valid(self):
        '''
        Slice is valid if installation_id has at least one more title before last_idx
        '''
        return self.first_idx < self.last_idx

    # plain accessors

    def get_first_idx(self):
        '''Returns first_idx.'''
        return self.first_idx

    def get_last_idx(self):
        '''Returns last_idx.'''
        return self.last_idx

    def get_feature_idx(self):
        '''Returns feature_idx.'''
        return self.feature_idx

    def get_allowed_idx(self):
        '''Returns the allowed_idx stored at construction.'''
        return self.allowed_idx
    

def aggregate_to_features(aggregate):
    '''
    Turn the aggregate table into one feature row per assessment.

    aggregate: pandas.DataFrame whose index levels / columns provide
               installation_id, game_session, title, type, world, num_correct,
               num_incorrect and every 'cumsum_*' column listed in
               feature_orders below (presumably one row per game session,
               as produced by get_aggregate - confirm against that function)

    returns: pandas.DataFrame with one row per row of type 'Assessment',
             columns sorted alphabetically. The history features of a row are
             copied from the aggregate row directly before the assessment
             (SliceIndex.get_allowed_idx); an assessment that is the first
             row of its installation_id gets 0 for all of them.
    '''
    aggregate = aggregate.reset_index()

    columns = ['installation_id',
               'next_assessment_session',
               'next_assessment_title',
               'assessment_world',
               'num_correct',
               'num_incorrect'
            ]

    # one list per identifying/target column, filled row by row below
    data = {column: [] for column in columns}

    valid_slice_indices = []
    feature_idx = 0

    # A single groupby pass replaces building a boolean mask over the whole
    # table for every installation_id. sort=False keeps the groups in order of
    # first appearance and rows keep their order inside a group, so the output
    # row order is the same as with the mask-based loop.
    for installation_id, aggregate_id in aggregate.groupby('installation_id', sort=False, observed=True):

        first_idx = aggregate_id.index[0]
        assessment_indices = aggregate_id.loc[aggregate_id.type == 'Assessment'].index.tolist()
        # for each assessment (index) create features
        for assess_idx in assessment_indices:

            current_slice = SliceIndex(first_idx, assess_idx, feature_idx)
            feature_idx += 1
            last_idx = current_slice.get_last_idx()

            data['installation_id'].append(installation_id)
            data['next_assessment_session'].append(aggregate_id.loc[last_idx, 'game_session'])
            data['next_assessment_title'].append(aggregate_id.loc[last_idx, 'title'])
            data['assessment_world'].append(aggregate_id.loc[last_idx, 'world'])

            data['num_correct'].append(aggregate_id.loc[last_idx, 'num_correct'])
            data['num_incorrect'].append(aggregate_id.loc[last_idx, 'num_incorrect'])

            # slices without any earlier row have no history to copy
            if current_slice.is_valid():
                valid_slice_indices.append(current_slice)

    features = pd.DataFrame(data)

    # aggregate column name -> feature column name
    feature_orders = {
            # num_type
            'cumsum_activity': 'num_activity',
            'cumsum_game': 'num_game',
            'cumsum_assessment': 'num_assessment',
            'cumsum_clip': 'num_clip',
            
            'cumsum_playable': 'num_playable',
            'cumsum_session': 'num_session',
            
            'cumsum_activity_with_correct': 'num_activity_with_correct',
            'cumsum_playable_with_correct': 'num_playable_with_correct',
            
            # time
            'cumsum_time_played': 'time_played',
            'cumsum_square_time_played': 'square_time_played',
            'cumsum_activity_time': 'sum_activity_time',
            'cumsum_square_activity_time': 'sum_square_activity_time',
            'cumsum_game_time': 'sum_game_time',
            'cumsum_square_game_time': 'sum_square_game_time',
            'cumsum_assessment_time': 'sum_assessment_time',
            'cumsum_square_assessment_time': 'sum_square_assessment_time',
            
            # correct t/f
            'cumsum_activity_correct': 'sum_activity_correct',
            'cumsum_game_correct': 'sum_game_correct',
            'cumsum_assessment_correct': 'sum_assessment_correct',
            'cumsum_playable_correct': 'sum_playable_correct',
            
            'cumsum_square_num_activity_correct': 'sum_square_activity_correct',
            'cumsum_square_num_game_correct': 'sum_square_game_correct',
            'cumsum_square_num_assessment_correct': 'sum_square_assessment_correct',
            'cumsum_square_num_playable_correct': 'sum_square_playable_correct',
            
            'cumsum_activity_incorrect': 'sum_activity_incorrect',
            'cumsum_game_incorrect': 'sum_game_incorrect',
            'cumsum_assessment_incorrect': 'sum_assessment_incorrect',
            'cumsum_playable_incorrect': 'sum_playable_incorrect',
            
            'cumsum_square_num_activity_incorrect': 'sum_square_activity_incorrect',
            'cumsum_square_num_game_incorrect': 'sum_square_game_incorrect',
            'cumsum_square_num_assessment_incorrect': 'sum_square_assessment_incorrect',
            'cumsum_square_num_playable_incorrect': 'sum_square_playable_incorrect',
            
            # duration
            'cumsum_sum_duration': 'sum_sum_duration',
            'cumsum_mean_duration': 'sum_mean_duration',
            
            'cumsum_square_sum_sum_duration': 'sum_square_sum_sum_duration',
            'cumsum_square_sum_mean_duration': 'sum_square_sum_mean_duration'
            
            }
    
    # num title
    for title in title_to_type.keys():
        feature_orders['cumsum_' + title.replace(' ', '_')] = 'num_' + title.replace(' ', '_')
    
    # time
    for playable in PLAYABLES:
        feature_orders['cumsum_time_played_' + playable.replace(' ', '_')] = 'time_played_' + playable.replace(' ', '_')
    
    # sum event id
    for _, event_id in EVENT_IDS.items():
        feature_orders['cumsum_' + event_id] = 'sum_' + event_id
        
    # sum event code
    for event_code in EVENT_CODES:
        feature_orders['cumsum_' + str(event_code)] = 'sum_' + str(event_code)
        
    # sum event id correct
    for title, event_ids in title_to_event_id_with_correct.items():
        for event_id in event_ids:
            feature_orders['cumsum_event_id_' + event_id + '_correct'] = 'sum_event_id_' + event_id + '_correct'
            feature_orders['cumsum_event_id_' + event_id + '_incorrect'] = 'sum_event_id_' + event_id + '_incorrect'

    # duration
    for title, event_ids in title_to_event_id_with_duration.items():
        for event_id in event_ids:
            feature_orders['cumsum_sum_' + event_id + '_duration'] = 'sum_sum_' + event_id + '_duration'
            feature_orders['cumsum_mean_' + event_id + '_duration'] = 'sum_mean_' + event_id + '_duration'

    # aggregate row to read from / features row to write to, for every valid slice
    allowed_indices = list(map(lambda x: x.get_allowed_idx(), valid_slice_indices))
    feature_indices = list(map(lambda x: x.get_feature_idx(), valid_slice_indices))
    
    aggregate_columns = list(feature_orders.keys())
    feature_columns = list(map(lambda x: feature_orders[x], aggregate_columns))
    
    # pull the history rows, then relabel index/columns so they line up with features
    allowed_data = aggregate.loc[allowed_indices, aggregate_columns]
    allowed_data.index = pd.Index(feature_indices)
    allowed_data.columns = pd.Index(feature_columns)
    
    features = features.join(allowed_data)
    
    # rows of invalid slices were not in allowed_data and come out of the join as NaN
    features.loc[:, feature_columns] = features.loc[:, feature_columns].fillna(0)

    # targets; 0/0 (no attempts recorded) gives NaN and is mapped to 0
    features['accuracy'] = \
        features.loc[:, 'num_correct'] / (features.loc[:, 'num_correct'] + features.loc[:, 'num_incorrect'])
    features['accuracy'] = features.loc[:, 'accuracy'].fillna(value=0)
    
    features['accuracy_group'] = \
        features.loc[:, ['num_correct', 'num_incorrect']].apply(calculate_accuracy_group, axis=1)

    # NOTE(review): in the ratios below fillna only covers 0/0; a non-zero sum
    # over a zero count would give inf - presumably cannot happen, confirm.
    
    # mean times
    features['mean_activity_time'] = (features.loc[:, 'sum_activity_time'] / features.loc[:, 'num_activity']).fillna(value=0)
    features['mean_game_time'] = (features.loc[:, 'sum_game_time'] / features.loc[:, 'num_game']).fillna(value=0)
    features['mean_assessment_time'] = (features.loc[:, 'sum_assessment_time'] / features.loc[:, 'num_assessment']).fillna(value=0)
    
    features['mean_time_played'] = (features.loc[:, 'time_played'] / features.loc[:, 'num_playable']).fillna(value=0)
    # the misspelt column name is referenced by later cells, keep it as is
    features['wierd_mean_time_played'] = (features.loc[:, 'time_played'] / features.loc[:, 'num_session']).fillna(value=0)
    
    # std times (std_from_cumsums expects the columns in the order: N, sum, sum of squares)
    features['std_activity_time'] = \
        features.loc[:, ['num_activity', 'sum_activity_time', 'sum_square_activity_time']].apply(std_from_cumsums, axis=1)
    features['std_game_time'] = \
        features.loc[:, ['num_game', 'sum_game_time', 'sum_square_game_time']].apply(std_from_cumsums, axis=1)
    features['std_assessment_time'] = \
        features.loc[:, ['num_assessment', 'sum_assessment_time', 'sum_square_assessment_time']].apply(std_from_cumsums, axis=1)
    features['std_time_played'] = \
        features.loc[:, ['num_playable', 'time_played', 'square_time_played']].apply(std_from_cumsums, axis=1)

    # mean activity correct/incorrect
    features['mean_activity_correct'] = (features.loc[:, 'sum_activity_correct'] / features.loc[:, 'num_activity_with_correct']).fillna(value=0)
    features['mean_activity_incorrect'] = (features.loc[:, 'sum_activity_incorrect'] / features.loc[:, 'num_activity_with_correct']).fillna(value=0)
    
    # std activity correct/incorrect
    features['std_activity_correct'] = \
        features.loc[:, ['num_activity_with_correct', 'sum_activity_correct', 'sum_square_activity_correct']].apply(std_from_cumsums, axis=1)
    features['std_activity_incorrect'] = \
        features.loc[:, ['num_activity_with_correct', 'sum_activity_incorrect', 'sum_square_activity_incorrect']].apply(std_from_cumsums, axis=1)
    
    # mean game correct/incorrect
    features['mean_game_correct'] = (features.loc[:, 'sum_game_correct'] / features.loc[:, 'num_game']).fillna(value=0)
    features['mean_game_incorrect'] = (features.loc[:, 'sum_game_incorrect'] / features.loc[:, 'num_game']).fillna(value=0)
    
    # std game correct/incorrect
    features['std_game_correct'] = \
        features.loc[:, ['num_game', 'sum_game_correct', 'sum_square_game_correct']].apply(std_from_cumsums, axis=1)
    features['std_game_incorrect'] = \
        features.loc[:, ['num_game', 'sum_game_incorrect', 'sum_square_game_incorrect']].apply(std_from_cumsums, axis=1)
    
    # mean assessment correct/incorrect
    features['mean_assessment_correct'] = (features.loc[:, 'sum_assessment_correct'] / features['num_assessment']).fillna(value=0)
    features['mean_assessment_incorrect'] = (features.loc[:, 'sum_assessment_incorrect'] / features['num_assessment']).fillna(value=0)
    
    # std assessment correct/incorrect
    features['std_assessment_correct'] = \
        features.loc[:, ['num_assessment', 'sum_assessment_correct', 'sum_square_assessment_correct']].apply(std_from_cumsums, axis=1)
    features['std_assessment_incorrect'] = \
        features.loc[:, ['num_assessment', 'sum_assessment_incorrect', 'sum_square_assessment_incorrect']].apply(std_from_cumsums, axis=1)
    
    # mean playable correct/incorrect
    features['mean_playable_correct'] = (features.loc[:, 'sum_playable_correct'] / features.loc[:, 'num_playable_with_correct']).fillna(value=0)
    features['mean_playable_incorrect'] = (features.loc[:, 'sum_playable_incorrect'] / features.loc[:, 'num_playable_with_correct']).fillna(value=0)
    
    # std playable correct/incorrect
    # NOTE(review): N here is num_playable while the means above divide by
    # num_playable_with_correct (the activity features use the *_with_correct
    # count for both) - confirm which count is intended before changing it.
    features['std_playable_correct'] = \
        features.loc[:, ['num_playable', 'sum_playable_correct', 'sum_square_playable_correct']].apply(std_from_cumsums, axis=1)
    features['std_playable_incorrect'] = \
        features.loc[:, ['num_playable', 'sum_playable_incorrect', 'sum_square_playable_incorrect']].apply(std_from_cumsums, axis=1)
        
    # duration
    features['mean_sum_duration'] = (features.loc[:, 'sum_sum_duration'] / features['num_playable']).fillna(value=0)
    features['mean_mean_duration'] = (features.loc[:, 'sum_mean_duration'] / features['num_playable']).fillna(value=0)
        
    features['std_sum_duration'] = \
        features.loc[:, ['num_playable', 'sum_sum_duration', 'sum_square_sum_sum_duration']].apply(std_from_cumsums, axis=1)
    features['std_mean_duration'] = \
        features.loc[:, ['num_playable', 'sum_mean_duration', 'sum_square_sum_mean_duration']].apply(std_from_cumsums, axis=1)
    
    # deterministic (alphabetical) column order
    features = features.reindex(sorted(features.columns), axis=1)
    
    return features

# column encoding
def one_hot_encode_column(frame_train, frame_test, column_name):
    '''
    Replace a categorical column by one-hot columns in both frames.

    The encoder is fitted on frame_train only; categories that appear only in
    frame_test encode to an all-zero row (handle_unknown='ignore').

    frame_train: pandas.DataFrame containing column_name
    frame_test: pandas.DataFrame containing column_name
    column_name: str

    returns: (frame_train, frame_test, encoder) - new frames in which
             column_name is dropped and columns '<column_name>_0',
             '<column_name>_1', ... are appended, plus the fitted encoder
    '''
    # scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the
    # old keyword, so try the new spelling first and fall back for old versions
    try:
        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

    oh_columns_train = pd.DataFrame(encoder.fit_transform(frame_train[[column_name]]))
    oh_columns_train.index = frame_train.index
    oh_columns_train = oh_columns_train.add_prefix(column_name + '_')

    oh_columns_test = pd.DataFrame(encoder.transform(frame_test[[column_name]]))
    oh_columns_test.index = frame_test.index
    oh_columns_test = oh_columns_test.add_prefix(column_name + '_')

    frame_train = frame_train.drop([column_name], axis=1)
    frame_test = frame_test.drop([column_name], axis=1)

    frame_train = pd.concat([frame_train, oh_columns_train], axis=1)
    frame_test = pd.concat([frame_test, oh_columns_test], axis=1)

    return frame_train, frame_test, encoder


def label_encode_column(frame_train, frame_test, column_name):
    '''
    Integer-encode column_name in both frames, in place.

    The encoder learns its labels from frame_train only and is then applied
    to frame_test.

    returns: (frame_train, frame_test, encoder) - the same frame objects that
             were passed in, plus the fitted LabelEncoder
    '''
    encoder = LabelEncoder()

    # train first (fit + encode), test second (encode only)
    steps = ((frame_train, encoder.fit_transform),
             (frame_test, encoder.transform))
    for frame, encode in steps:
        frame[column_name] = encode(frame[column_name])

    return frame_train, frame_test, encoder

# get features

def get_features(frame_path):
    '''
    Run the full pipeline for one input: get_aggregate followed by
    aggregate_to_features.

    frame_path: passed unchanged to get_aggregate

    returns: the result of aggregate_to_features (pandas.DataFrame)
    '''
    return aggregate_to_features(get_aggregate(frame_path))


def get_features_test(frame_path):
    '''
    Build the features for frame_path and split them per installation_id.

    frame_path: passed unchanged to get_features

    returns: (test_train, test_pred)
             test_train - every row except the last one of each installation_id
             test_pred  - the last row of each installation_id
    '''
    features = get_features(frame_path)
    per_installation = features.groupby('installation_id')

    # counting from the end of each group, the last row gets 0
    rows_from_end = per_installation.cumcount(ascending=False)

    earlier_rows = features[rows_from_end > 0]
    final_rows = per_installation.tail(1)

    return earlier_rows, final_rows


def get_features_train(frame_path):
    '''
    Training-side entry point; a plain pass-through to get_features.

    frame_path: passed unchanged to get_features

    returns: the result of get_features (pandas.DataFrame)
    '''
    return get_features(frame_path)

# feature selection

def get_train_test(train_features, test_features, selected_feature_columns, selected_target_columns):
    '''
    Cut the model inputs and the training target out of the feature tables.

    train_features: pandas.DataFrame
    test_features: pandas.DataFrame
    selected_feature_columns: str or array-like of column labels
    selected_target_columns: str or array-like of column labels

    returns: (X_train, X_test, y_train)
    '''
    def pick(frame, column_labels):
        # label-based selection of all rows
        return frame.loc[:, column_labels]

    return (pick(train_features, selected_feature_columns),
            pick(test_features, selected_feature_columns),
            pick(train_features, selected_target_columns))


In [None]:
# Build the feature table (one row per assessment) from the training events.
train_features = get_features_train(TRAIN_CSV_PATH)
# presumably to release intermediates left over from feature building
gc.collect()

# sanity check: table size, then a preview as the cell output
print(train_features.shape)
train_features.head()

In [None]:
# Same pipeline on the test events, split per installation_id:
#   test_train_features - all rows but the last one of each installation_id
#   test_features       - the last row of each installation_id (to be predicted)
test_train_features, test_features = get_features_test(TEST_CSV_PATH)
gc.collect()

print(test_features.shape)
test_features.head()

In [None]:
# concatenate test_train_features to train_features so the earlier test-set
# assessments are used as additional training rows
# NOTE: train_features is rebound here; running this cell a second time
# appends test_train_features again.
train_features = pd.concat([train_features, test_train_features], ignore_index=True)

print(train_features.shape)
train_features.head()

In [None]:
# encode categorical columns (encoders are fitted on train_features only;
# the returned encoder objects are not needed afterwards)
# NOTE: 'next_assessment_title' is dropped by the first call, so this cell
# cannot be re-run without re-creating train_features / test_features first.
train_features, test_features, _ = one_hot_encode_column(train_features, test_features, 'next_assessment_title')
train_features, test_features, _ = label_encode_column(train_features, test_features, 'assessment_world')

test_features.head()

In [None]:
# Candidate model inputs: every column except identifiers and anything
# derived from the outcome of the assessment being predicted.
feature_cols = test_features.columns.drop(['installation_id', 
                                           'next_assessment_session', # identifier, not a feature
                                           'num_correct', # outcome of the target assessment: prevent leaks
                                           'num_incorrect', 
                                           'accuracy', 
                                           'accuracy_group'])

len(feature_cols)

In [None]:
# names of the per-event-code sum columns, as created in aggregate_to_features
event_code_columns = ['sum_' + str(event_code) for event_code in EVENT_CODES]

# leave the event-code sums and 'wierd_mean_time_played' out of the model inputs
# (the spelling must match the column name created in aggregate_to_features)
selected_feature_columns = list(feature_cols.drop(event_code_columns).drop('wierd_mean_time_played'))
len(selected_feature_columns)

In [None]:
# Model inputs for train/test and the training target ('accuracy_group').
X_train, X_test, y_train = get_train_test(train_features, 
                                          test_features, 
                                          selected_feature_columns, 
                                          'accuracy_group')

# sanity check of the resulting shapes, then a preview of the inputs
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
X_train.head()

In [None]:
def float_to_accuracy_group(y, bounds=(0.9935, 1.488, 2.077)):
    '''
    Bucket continuous values into accuracy groups 0 .. len(bounds).

    y: array-like of floats (e.g. regression outputs); it is left untouched
    bounds: increasing, inclusive upper edges of every group except the last;
            the defaults are the previously hard-coded cut points

    returns: new numpy.ndarray of floats with the group of every entry of y
             (NaN entries stay NaN)
    '''
    raw = np.asarray(y, dtype=float)

    # right=True: bounds[i-1] < value <= bounds[i]  ->  group i
    groups = np.digitize(raw, bounds, right=True).astype(float)

    # digitize would sort NaN into the top group; keep it recognisable instead
    groups[np.isnan(raw)] = np.nan

    return groups


def lgbm_regression_kappa(y_true, y_pred):
    '''
    Custom eval metric for LGBMRegressor.fit: quadratic-weighted Cohen's kappa
    between the true groups and the bucketed predictions.

    y_true: array-like of true accuracy groups
    y_pred: array-like of raw regression outputs

    returns: (metric name, metric value, is_higher_better)
    '''
    # Bucket a private copy: the array belongs to the caller (LightGBM may
    # hand in a buffer it reuses) and must not be overwritten by the metric.
    y_groups = float_to_accuracy_group(np.array(y_pred, dtype=float))

    return 'kappa', cohen_kappa_score(y_true, y_groups, weights='quadratic'), True


def get_regressors_cv(X, y, n_splits, n_seeds):
    '''
    Fit one LGBMRegressor per fold of a shuffled StratifiedKFold, repeated
    for n_seeds different shuffles.

    X: pandas.DataFrame of model inputs
    y: pandas.Series with the target
    n_splits: number of folds per shuffle
    n_seeds: number of differently seeded shuffles

    returns: (models, scores) - the n_seeds * n_splits fitted models and the
             best validation kappa of each, in fitting order
    '''
    lgbm_params = {'n_estimators': 2000,
                   'n_jobs':-1}

    models, scores = [], []

    for seed_idx in range(n_seeds):
        splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=(11 * seed_idx) ** 2)

        for fit_rows, holdout_rows in splitter.split(X, y):
            fit_X, holdout_X = X.iloc[fit_rows], X.iloc[holdout_rows]
            fit_y, holdout_y = y.iloc[fit_rows], y.iloc[holdout_rows]

            model = LGBMRegressor(**lgbm_params)

            # the held-out fold drives early stopping through the kappa metric
            model.fit(X=fit_X,
                      y=fit_y,
                      eval_names=['validation'],
                      eval_set=[(holdout_X, holdout_y)],
                      eval_metric=lgbm_regression_kappa,
                      early_stopping_rounds=200,
                      verbose=-1
                     )

            models.append(model)
            scores.append(model.best_score_['validation']['kappa'])

    score_array = np.array(scores)
    print('mean validation score:', score_array.mean(), '+/-', score_array.std(), 'std')

    return models, scores


def models_weighted_predict(models, weights, X):
    '''
    Weighted average of the predictions of several models.

    models: sequence of objects with a predict(X) method
    weights: sequence with one weight per model (same order)
    X: model inputs; X.shape[0] is the number of predictions

    returns: numpy.ndarray, sum(weights[i] * prediction_i) / sum(weights)
    '''
    weighted_total = np.zeros(X.shape[0])

    for position in range(len(models)):
        weighted_total += weights[position] * models[position].predict(X)

    return weighted_total / sum(weights)

In [None]:
# 5 folds x 5 shuffles -> 25 models, each with its best validation kappa
lgbm_models, lgbm_scores = get_regressors_cv(X_train, y_train, 5, 5)

In [None]:
# Average the models' predictions, weighting each model by its validation kappa.
y_pred = models_weighted_predict(lgbm_models, lgbm_scores, X_test)
# Bucket the averaged predictions into groups 0-3; this overwrites the
# 'accuracy_group' column test_features already carries from feature building.
test_features['accuracy_group'] = float_to_accuracy_group(y_pred).astype(int)
submission = test_features[['installation_id', 'accuracy_group']]

print(submission.shape)
submission.head()

In [None]:
# how many installation_ids were assigned to each predicted accuracy group
submission['accuracy_group'].value_counts().plot(kind='barh')

In [None]:
# write the two submission columns without the DataFrame index
submission.to_csv('submission.csv', index=False)