In [None]:
import pandas as pd
import numpy as np
import os
import time
import copy
from tqdm import tqdm_notebook as tqdm
from collections import Counter
from scipy import stats
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import KFold, StratifiedKFold,GroupKFold
import gc
import json
import lightgbm as lgb
import seaborn as sns
from functools import partial
import xgboost as xgb
import scipy as sp
from numba import jit
pd.set_option('display.max_columns', 1000)
import matplotlib.pyplot as plt
import random
# Switch between the Kaggle kernel input directory and the current working directory.
kaggle = True
dirs = "/kaggle/input/data-science-bowl-2019/" if kaggle else "./"

In [None]:
import random
import os
def SeedEverything(seed):
    """Seed the global ``random`` and ``numpy.random`` generators with ``seed``.

    The seed is also written to ``os.environ['PYTHONHASHSEED']``.
    Returns ``None``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)


SeedEverything(1993)

In [None]:
# Debug switch: when True, read_data() loads only the first 100000 rows of train.csv and test.csv.
DEBUG = False

In [None]:
def read_data():
    """Load the competition CSV files from ``dirs`` and enrich ``specs``.

    When the module-level ``DEBUG`` flag is true, only the first 100000 rows
    of ``train.csv`` and ``test.csv`` are read. For every ``event_id`` found
    in ``train`` the first observed ``event_code``/``title``/``type``/``world``
    is collected (event ``27253bdc`` gets fixed placeholder values) and
    outer-merged into ``specs`` on ``event_id``.

    Returns:
        tuple: ``(train, test, train_labels, specs, sample_submission)``.
    """
    def _load(file_name, label, **read_kwargs):
        # Read one CSV and report its shape using the notebook's log wording.
        print('Reading {} file....'.format(file_name))
        frame = pd.read_csv(dirs+file_name, **read_kwargs)
        print('{} file have {} rows and {} columns'.format(label, frame.shape[0], frame.shape[1]))
        return frame

    row_limit = {'nrows': 100000} if DEBUG else {}
    train = _load('train.csv', 'Training.csv', **row_limit)
    test = _load('test.csv', 'Test.csv', **row_limit)
    train_labels = _load('train_labels.csv', 'Train_labels.csv')
    specs = _load('specs.csv', 'Specs.csv')
    sample_submission = _load('sample_submission.csv', 'Sample_submission.csv')

    # One metadata record per event_id, taken from its first row in train.
    records = []
    for event_id, events in train.groupby('event_id'):
        record = {'event_id': event_id, 'event_code': events['event_code'].iloc[0]}
        if event_id == '27253bdc':
            record.update({'title': "ALL Clips", 'type': "Clip", 'world': "ALL worlds"})
        else:
            record.update({
                'title': events['title'].iloc[0],
                'type': events['type'].iloc[0],
                'world': events['world'].iloc[0],
            })
        records.append(record)
    title_code = pd.DataFrame(records)
    specs = pd.merge(specs, title_code, how='outer', right_on='event_id', left_on='event_id')
    return train, test, train_labels, specs, sample_submission

# Load every table once; `specs` comes back with the per-event_id metadata merged in by read_data().
train, test, train_labels, specs, sample_submission = read_data()

In [None]:
import pandas as pd
# External table read from a hard-coded path (does not use `dirs`).
# NOTE(review): `media_sequence` is only passed to len() in the visible part of the notebook — confirm it is still needed.
media_sequence = pd.read_csv("../input/dsb2019-external-data/media_sequence.csv")

In [None]:
# Number of rows in the external media-sequence table (shown as the cell output).
len(media_sequence)

In [None]:
# Keep only the installations that appear in train_labels.
train = train[train.installation_id.isin(train_labels.installation_id.unique())].reset_index(drop=True)
specs2 = pd.read_csv(dirs+'specs.csv')

# Integer-encode the free-text `args` and `info` columns of the specs table.
# Series.unique() returns values in order of first appearance, so the codes are
# identical on every run; wrapping the values in set() made the numbering follow
# string-hash order, which changes between interpreter sessions.
list_of_event_args = list(specs2['args'].unique())
event_args_map = dict(zip(list_of_event_args, np.arange(len(list_of_event_args))))
specs2["args"]=specs2["args"].map(event_args_map)

list_of_event_info = list(specs2['info'].unique())
event_info_map = dict(zip(list_of_event_info, np.arange(len(list_of_event_info))))
specs2["info"]=specs2["info"].map(event_info_map)

# Prefixed code names ('args_<code>', 'info_<code>') ordered by descending frequency,
# and position -> name lookup tables for them.
args_list=specs2["args"].value_counts().add_prefix('args_').index.tolist()
args_label=dict(zip(np.arange(len(args_list)), args_list))
info_list=specs2["info"].value_counts().add_prefix('info_').index.tolist()
info_label=dict(zip(np.arange(len(info_list)), info_list))

# Attach the encoded specs columns to every event row.
train=pd.merge(train,specs2,on=["event_id"],how="left")
test=pd.merge(test,specs2,on=["event_id"],how="left")

In [None]:
def encode_title(train, test):
    """Build the vocabularies and encoders used by the feature extraction.

    The title, event-code, event-id and world vocabularies are taken from
    ``train`` only; the assessment titles are the union of the assessment
    titles seen in ``train`` and ``test``. The ``timestamp`` column of both
    frames is converted to datetime in place.

    Returns:
        tuple: ``(train, test, win_code, list_of_user_activities,
        list_of_event_code, list_of_event_id, list_of_worlds, title_enc,
        world_enc, assess_titles)``. ``win_code`` maps every title to 4100,
        except 'Bird Measurer (Assessment)' which maps to 4110.
    """
    def _sorted_unique(column):
        # Sorted distinct values of one train column.
        return sorted(set(train[column]))

    list_of_user_activities = _sorted_unique('title')
    list_of_event_code = _sorted_unique('event_code')
    list_of_event_id = _sorted_unique('event_id')
    list_of_worlds = _sorted_unique('world')

    # Ordinal encoders: position of each value in its sorted vocabulary.
    title_enc = dict(zip(list_of_user_activities, np.arange(len(list_of_user_activities))))
    world_enc = dict(zip(list_of_worlds, np.arange(len(list_of_worlds))))

    train_assessments, test_assessments = (
        set(frame.loc[frame['type'] == 'Assessment', 'title'].value_counts().index)
        for frame in (train, test)
    )
    assess_titles = sorted(train_assessments.union(test_assessments))

    win_code = dict(zip(list_of_user_activities, np.full(len(list_of_user_activities), 4100, dtype='int')))
    win_code['Bird Measurer (Assessment)'] = 4110

    # convert text into datetime
    for frame in (train, test):
        frame['timestamp'] = pd.to_datetime(frame['timestamp'])
    return train, test,win_code, list_of_user_activities, list_of_event_code, list_of_event_id,list_of_worlds,title_enc,world_enc,assess_titles

# Build the vocabularies/encoders; these module-level names are read by get_data() below.
train, test,win_code, list_of_user_activities, list_of_event_code, list_of_event_id,list_of_worlds,title_enc,world_enc,assess_titles=encode_title(train,test)

| code | argument | method_in_session | method_between_session | notes (备注) | meaning (含义) |
| --- | --- | --- | --- | --- | --- |
| 2030 | misses | ema | ema | minmaxscale | beatround |
| 2030 | round | max | ema | minmaxscale | |
| 2030 | duration | ema | ema | minmaxscale | |
| 4020/4025 | correct | count | ema | norm | one action (一次操作) |
| 4020/4025 | round | max | ema | norm | |
| 4100/4110 | correct | count | | | |
| 4080 | duration | ema | ema | norm | |
| 4040 | duration | ema | ema | norm | one drag (一次拖动) |

Resulting dict: `event_id: [argument1, argument2, ...]`

In [None]:
# NOTE(review): `code`, `argument` and `method_in_session` are not referenced anywhere else
# in the visible part of the notebook and have different lengths (7, 8 and 5 entries);
# they look like leftovers superseded by `interesting_args`. `(4040)` is a plain int, not a tuple.
code=[2030,2030,2030,(4020,4025),(4020,4025),(4100,4110),(4040)]
argument=['"misses"','"round"','"duration"','"correct"','"round"','"correct"','"duration"','"duration"']
method_in_session=['ema','ema','ema','count_true','count']

# Each entry: [event code or tuple of codes, argument name in double quotes (as searched for in
# the specs `args` text), aggregation within a session, aggregation between sessions].
interesting_args=[
    [2030,'"misses"',"ema","ema"],
    [2030,'"round"',"max","ema"],
    [2030,'"duration"',"ema","ema"],
    [(4020,4025),'"correct"',"count_true","ema"],
    [(4020,4025),'"correct"',"count_false","ema"],
    [(4020,4025),'"round"',"max","ema"],
    [(4100,4110),'"correct"',"count_true","ema"],
    [(4100,4110),'"correct"',"count_false","ema"],
    [4040,'"duration"',"ema","ema"]
]




# Event codes that get_event_data_dict() inspects; specs rows with any other code are skipped.
useful_codes={2030,4020,4025,4040,4100,4110}
def get_event_data_dict(specs,interesting_args,useful_codes):
    """Map each relevant ``event_id`` to the arguments that should be logged for it.

    Args:
        specs: DataFrame with ``event_id``, ``event_code``, ``title`` and
            ``args`` columns (``args`` is the raw text of the event spec).
        interesting_args: list of ``[code_or_codes, quoted_arg_name,
            method_in_session, method_between_session]`` entries, where the
            first item is an event code or a tuple of event codes.
        useful_codes: collection of event codes worth inspecting; rows with
            any other code are skipped.

    Returns:
        tuple: ``(event_id_to_args, key_set)``. ``event_id_to_args`` maps an
        event id to a list of ``[arg_name, key, method_in_session,
        method_between_session]``; ``key_set`` is the set of all keys created.
    """
    event_id_to_args={}
    key_set=set()
    columns=zip(specs['event_code'],specs['event_id'],specs['title'],specs['args'])
    for event_code,event_id,title,spec_args in columns:
        if event_code not in useful_codes:
            continue
        if not isinstance(spec_args,str):
            # A row without an args text (e.g. NaN introduced by an outer merge)
            # cannot be searched for argument names; skip it instead of raising TypeError.
            continue
        collect_args=[]
        for code_spec,quoted_name,in_method,between_method in interesting_args:
            codes=code_spec if isinstance(code_spec,tuple) else (code_spec,)
            if event_code not in codes or quoted_name not in spec_args:
                continue
            arg_name=quoted_name[1:-1]  # strip the surrounding double quotes
            # NOTE(review): for a tuple of codes the title and the codes are joined without
            # a '_' separator (e.g. 'Title4020_4025_...'); kept as-is so feature names stay stable.
            code_label='_'.join(str(c) for c in code_spec) if isinstance(code_spec,tuple) else '_'+str(code_spec)
            key=title+code_label+'_'+arg_name+'_'+in_method+'_'+between_method
            collect_args.append([arg_name,key,in_method,between_method])
            key_set.add(key)
        if collect_args:
            event_id_to_args[event_id]=collect_args
    return event_id_to_args,key_set
# Build the event_id -> logged-arguments lookup from the enriched specs table.
event_id_to_args,key_set=get_event_data_dict(specs,interesting_args,useful_codes)
print(len(key_set))
# Display the generated keys as the cell output.
key_set

In [None]:
# Weight kept on the previous value when an 'ema' statistic is updated
# within a session / between sessions.
ema_momentum_in_session=0.75
ema_momentum_between_session=0.75
class data_logger(object):
    """Accumulate per-installation statistics parsed from ``event_data``.

    ``event_id_to_args`` maps an event id to a list of
    ``[arg_name, key, method_in_session, method_between_session]``. Supported
    methods are ``'ema'``, ``'max'``, ``'min'``, ``'sum'`` and ``'mean'``;
    ``'count_true'``/``'count_false'`` are additionally supported as
    in-session methods and also produce an ``..._accuracy`` statistic.
    """
    def __init__(self,event_id_to_args):
        self.event_id_to_args=event_id_to_args
        self.installation_status={}

    @staticmethod
    def _combine(method,current,new_val,momentum):
        """Fold ``new_val`` into ``current`` according to ``method``."""
        if method=='ema':
            return momentum*current+(1-momentum)*new_val
        if method=='max':
            return max(current,new_val)
        if method=='min':
            return min(current,new_val)
        if method=='sum':
            return current+new_val
        if method=='mean':
            # 'mean' keeps the raw observations in a list; it is averaged when read.
            current.append(new_val)
            return current
        raise NotImplementedError

    def log_data(self,session):
        """Update the installation statistics with the events of one session.

        Args:
            session: DataFrame with ``event_id`` and ``event_data`` (JSON text)
                columns holding the rows of a single game session.

        Raises:
            NotImplementedError: if an unknown aggregation method has to
                combine two values.
        """
        session_status={}
        update_method={}
        do_count_ids=set()
        for event_id,data_str in zip(session['event_id'],session['event_data']):
            args=self.event_id_to_args.get(event_id)
            if args is None:
                continue
            event_data=json.loads(data_str)
            for arg_name,key,in_method,between_method in args:
                if in_method in ('count_true','count_false'):
                    # Counted in bulk below, once per event id.
                    do_count_ids.add(event_id)
                    continue
                new_val=event_data[arg_name]
                if key not in session_status:
                    # Keep the first observation for 'mean' as well; starting from an
                    # empty list silently dropped it.
                    session_status[key]=[new_val] if in_method=='mean' else new_val
                else:
                    session_status[key]=self._combine(in_method,session_status[key],new_val,ema_momentum_in_session)
                update_method[key]=between_method  #inter session method

        # Count true/false outcomes. Several event ids can share one key (e.g. codes
        # 4020 and 4025 of the same title), so the counts are summed per key instead of
        # letting the id that happens to be iterated last overwrite the others.
        count_groups={}  # accuracy key -> {'true': n, 'false': n, 'keys': {count key: n}}
        for count_id in sorted(do_count_ids):
            matched=session.loc[session['event_id']==count_id,'event_data']
            for arg_name,key,in_method,between_method in self.event_id_to_args[count_id]:
                if in_method not in ('count_true','count_false'):
                    continue
                outcome=in_method[len('count_'):]  # 'true' or 'false'
                hits=int(matched.str.contains(outcome).sum())
                # The accuracy key replaces the last two '_'-separated parts of the count key.
                accuracy_key='_'.join(key.split("_")[:-2]+['accuracy'])
                group=count_groups.setdefault(accuracy_key,{'true':0,'false':0,'keys':{}})
                group[outcome]+=hits
                group['keys'][key]=group['keys'].get(key,0)+hits
                update_method[key]=between_method
        for accuracy_key,group in count_groups.items():
            session_status.update(group['keys'])
            attempts=group['true']+group['false']
            if attempts:
                # Without any attempt there is no accuracy to record for this session.
                session_status[accuracy_key]=group['true']/attempts
                update_method[accuracy_key]="ema"

        #update installation_status
        for key,value in session_status.items():
            if type(value) is list:
                value=np.mean(value)
            method=update_method[key]
            if key not in self.installation_status:
                # The first session value must be kept for 'mean' too.
                self.installation_status[key]=[value] if method=='mean' else value
            else:
                self.installation_status[key]=self._combine(method,self.installation_status[key],value,ema_momentum_between_session)

    def get_data(self):
        """Return a ``{key: value}`` snapshot, averaging keys stored as lists ('mean')."""
        return {key: (np.mean(value) if type(value) is list else value)
                for key,value in self.installation_status.items()}

In [None]:
# Title -> 1-based position for each world. get_data() uses these positions for the
# `<world>_max`, `<world>_cnt`, `<world>_cover` and `played_last_game` features
# (the latter checks whether position - 1 was already played).
tree_top_city = {
    "Tree Top City - Level 1": 1,
    "Ordering Spheres": 2,
    "All Star Sorting": 3,
    "Costume Box": 4,
    "Fireworks (Activity)": 5,
    "12 Monkeys": 6,
    "Tree Top City - Level 2": 7,
    "Flower Waterer (Activity)": 8,
    "Pirate's Tale": 9,
    "Mushroom Sorter (Assessment)": 10,
    "Air Show": 11,
    "Treasure Map": 12,
    "Tree Top City - Level 3": 13,
    "Crystals Rule": 14,
    "Rulers": 15,
    "Bug Measurer (Activity)": 16,
    "Bird Measurer (Assessment)": 17,
}
# Same position lookup for the Magma Peak titles.
magma_peak = {
    "Magma Peak - Level 1": 1,
    "Sandcastle Builder (Activity)": 2,
    "Slop Problem": 3,
    "Scrub-A-Dub": 4,
    "Watering Hole (Activity)": 5,
    "Magma Peak - Level 2": 6,
    "Dino Drink": 7,
    "Bubble Bath": 8,
    "Bottle Filler (Activity)": 9,
    "Dino Dive": 10,
    "Cauldron Filler (Assessment)": 11,
}
# Same position lookup for the Crystal Caves titles.
crystal_caves = {
    "Crystal Caves - Level 1": 1,
    "Chow Time": 2,
    "Balancing Act": 3,
    "Chicken Balancer (Activity)": 4,
    "Lifting Heavy Things": 5,
    "Crystal Caves - Level 2": 6,
    "Honey Cake": 7,
    "Happy Camel": 8,
    "Cart Balancer (Assessment)": 9,
    "Leaf Leader": 10,
    "Crystal Caves - Level 3": 11,
    "Heavy, Heavier, Heaviest": 12,
    "Pan Balance": 13,
    "Egg Dropper (Activity)": 14,
    "Chest Sorter (Assessment)": 15,
}

In [None]:
# Clip title -> fixed value appended to `clip_durations` in get_data() for every Clip session
# (presumably the clip length in seconds — TODO confirm the unit).
# get_data() indexes this dict directly, so a Clip title missing here raises KeyError.
clip_time = {'Welcome to Lost Lagoon!':19,'Tree Top City - Level 1':17,'Ordering Spheres':61, 'Costume Box':61,
        '12 Monkeys':109,'Tree Top City - Level 2':25, 'Pirate\'s Tale':80, 'Treasure Map':156,'Tree Top City - Level 3':26,
        'Rulers':126, 'Magma Peak - Level 1':20, 'Slop Problem':60, 'Magma Peak - Level 2':22, 'Crystal Caves - Level 1':18,
        'Balancing Act':72, 'Lifting Heavy Things':118,'Crystal Caves - Level 2':24, 'Honey Cake':142, 'Crystal Caves - Level 3':19,
        'Heavy, Heavier, Heaviest':61}

In [None]:
def get_data(user_sample, test_set=False):
    """Build the feature rows for the assessment sessions of one installation.

    Sessions are visited in order of first appearance. For every assessment
    session a feature dict is created from the history accumulated *before*
    that session; afterwards the session itself is added to the history.

    Reads the module-level names ``assess_titles``, ``list_of_user_activities``,
    ``list_of_worlds``, ``list_of_event_code``, ``list_of_event_id``,
    ``win_code``, ``event_id_to_args``, ``clip_time``, ``tree_top_city``,
    ``magma_peak`` and ``crystal_caves``.

    Args:
        user_sample: DataFrame with all events of a single installation; the
            ``timestamp`` column must already be datetime.
        test_set: when true, assessments without any attempt are kept and only
            the feature dict of the last assessment is returned.

    Returns:
        list of feature dicts (train mode), or a single feature dict (test mode).

    Raises:
        IndexError: in test mode when the installation has no assessment session.
        KeyError: when a Clip title is missing from ``clip_time``, or a
            title/world is missing from the vocabularies.
    """
    def seconds_between(start, end):
        # total_seconds() keeps whole days; Timedelta.seconds would drop them
        # (and wrap around for negative differences).
        return (end - start).total_seconds()

    def update_counters(counter: dict, col: str, session):
        # Add the per-value row counts of session[col] to the known keys of `counter`.
        num_of_session_count = Counter(session[col])
        for k in num_of_session_count.keys():
            if counter.get(k) is not None:
                counter[k] += num_of_session_count[k]
        return counter

    user_activities_count = {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}
    accuracy_groups = {"acc_group_0":0, "acc_group_1":0, "acc_group_2":0, "acc_group_3":0}
    game_time_dict = {'Clip_gametime':0, 'Game_gametime':0, 'Activity_gametime':0, 'Assessment_gametime':0}
    accumulated_accuracy_group = 0
    accumulated_accuracy = 0
    accumulated_correct_attempts = 0
    accumulated_uncorrect_attempts = 0
    accumulated_actions = 0

    last_world='NONE'
    last_activity_type='Clip'
    time_last_activity=None

    give_up={"give_up_"+assess:0 for assess in assess_titles}

    durations ={'ema_duration_'+eve:0 for eve in list_of_user_activities}

    last_accuracy_title = {'acc_' + title: -1 for title in assess_titles}

    world_count={world:0 for world in list_of_worlds}
    event_code_count = {ev: 0 for ev in list_of_event_code}
    event_id_count = {eve: 0 for eve in list_of_event_id}
    title_count = {eve: 0 for eve in list_of_user_activities}
    counter = 0
    assess_durations=[]
    tree_top_city_list = []
    magma_peak_list = []
    crystal_caves_list = []
    clip_durations = []
    # (feature prefix, title -> position map, positions played so far) per world.
    world_progress = [
        ('tree_top_city', tree_top_city, tree_top_city_list),
        ('magma_peak', magma_peak, magma_peak_list),
        ('crystal_caves', crystal_caves, crystal_caves_list),
    ]

    installation_logger=data_logger(event_id_to_args)

    all_assessments = []
    for session_id, session in user_sample.groupby('game_session', sort=False):
        session_type = session['type'].iloc[0]
        session_title = session['title'].iloc[0]
        session_world = session['world'].iloc[0]
        # Read the timestamps by name rather than by column position.
        session_start = session['timestamp'].iloc[0]
        session_end = session['timestamp'].iloc[-1]

        if session_type == 'Clip':
            clip_durations.append((clip_time[session_title]))

        if session_type == 'Assessment' and (test_set or len(session)>1):
            features={}
            features['installation_id'] = session['installation_id'].iloc[-1]
            features['session_id'] = session_id
            features['session_title'] = session_title
            features['session_world'] = session_world

            # Attempts are the rows carrying the title's "win" event code.
            all_attempts = session.query(f'event_code == {win_code[session_title]}')
            true_attempts = all_attempts['event_data'].str.contains('true').sum()
            false_attempts = all_attempts['event_data'].str.contains('false').sum()
            accuracy = true_attempts/(true_attempts+false_attempts) if (true_attempts+false_attempts) != 0 else 0
            if accuracy == 0:
                features['accuracy_group'] = 0
            elif accuracy == 1:
                features['accuracy_group'] = 3
            elif accuracy == 0.5:
                features['accuracy_group'] = 2
            else:
                features['accuracy_group'] = 1
            features.update(accuracy_groups.copy())
            accuracy_groups["acc_group_"+str(features['accuracy_group'])] += 1
            features['accumulated_correct_attempts'] = accumulated_correct_attempts
            features['accumulated_uncorrect_attempts'] = accumulated_uncorrect_attempts
            features['ratio']=accumulated_correct_attempts/(accumulated_correct_attempts+accumulated_uncorrect_attempts) if (accumulated_correct_attempts+accumulated_uncorrect_attempts)!=0 else 0
            accumulated_correct_attempts += true_attempts
            accumulated_uncorrect_attempts += false_attempts

            features['accumulated_accuracy'] = accumulated_accuracy/counter if counter > 0 else 0
            accumulated_accuracy += accuracy
            features['accumulated_accuracy_group'] = accumulated_accuracy_group/counter if counter > 0 else 0
            accumulated_accuracy_group += features['accuracy_group']

            features.update(last_accuracy_title.copy())
            last_accuracy_title['acc_' + session_title] = accuracy

            features['accumulated_actions'] = accumulated_actions
            features['last_world']=last_world
            features['last_activity_type']=last_activity_type
            features['time_to_last_activity']=seconds_between(time_last_activity, session_start) if time_last_activity is not None else 0

            features.update(event_code_count.copy())

            features.update(installation_logger.get_data())

            features.update(title_count.copy())
            features.update(durations.copy())
            features.update(user_activities_count.copy())
            features.update(event_id_count.copy())
            features.update(give_up.copy())
            features.update(world_count.copy())
            features.update(game_time_dict.copy())

            features['assess_duration_mean'] = np.mean(assess_durations) if assess_durations else 0
            assess_durations.append(seconds_between(session_start, session_end))

            if true_attempts+false_attempts==0:
                give_up["give_up_"+session_title]+=1

            # Progress of the installation through each world's title sequence.
            for prefix, world_map, played in world_progress:
                if played:
                    features[prefix+'_max'] = np.max(played)
                    features[prefix+'_cnt'] = len(played)
                    features[prefix+'_cover'] = float(len(set(played))) / len(world_map)
                else:
                    features[prefix+'_max'] = 0
                    features[prefix+'_cnt'] = 0
                    features[prefix+'_cover'] = 0

            # 1 when the title directly preceding this assessment in its world was played.
            features['played_last_game'] = 0
            for prefix, world_map, played in world_progress:
                if session_title in world_map:
                    if world_map[session_title] - 1 in played:
                        features['played_last_game'] = 1
                    break

            if clip_durations:
                features['Clip_duration_mean'] = np.mean(clip_durations)
                features['Clip_duration_std'] = np.std(clip_durations)
            else:
                features['Clip_duration_mean'] = 0
                features['Clip_duration_std'] = 0

            # Train rows need at least one attempt; test rows are always kept.
            if test_set or true_attempts+false_attempts > 0:
                all_assessments.append(features)
            counter += 1

        #log event data
        installation_logger.log_data(session)
        #update counters
        event_code_count = update_counters(event_code_count, "event_code", session)
        event_id_count = update_counters(event_id_count, "event_id", session)

        title_count[session_title]+=1

        session_duration=seconds_between(session_start, session_end)
        duration_key='ema_duration_'+session_title
        # A stored 0 means "no session yet": start from the first duration, then smooth.
        durations[duration_key]= session_duration if durations[duration_key]==0 else (0.2*session_duration+0.8*durations[duration_key])
        accumulated_actions += len(session)

        user_activities_count[session_type] += 1
        world_count[session_world]+=1
        last_world = session_world
        last_activity_type = session_type
        time_last_activity= session_end

        game_time_dict[session_type+'_gametime'] = (game_time_dict[session_type+'_gametime'] + (session['game_time'].iloc[-1]/1000.0))/2.0

        # sequence features update
        for prefix, world_map, played in world_progress:
            if session_title in world_map:
                played.append(world_map[session_title])

    # if it's the test_set, only the last assessment must be predicted, the previous are scrapped
    if test_set:
        return all_assessments[-1]
    # in the train_set, all assessments go to the dataset
    return all_assessments

def get_train_and_test(train, test):
    """Run ``get_data`` per installation and collect the rows into two DataFrames.

    Every train installation contributes all of its assessment rows; every
    test installation contributes the single row returned in test mode.

    Returns:
        tuple: ``(reduce_train, reduce_test)``.
    """
    compiled_train = []
    for _, user_sample in tqdm(train.groupby('installation_id', sort = False)):
        compiled_train.extend(get_data(user_sample))
    compiled_test = [
        get_data(user_sample, test_set = True)
        for _, user_sample in tqdm(test.groupby('installation_id', sort = False))
    ]
    return pd.DataFrame(compiled_train), pd.DataFrame(compiled_test)

reduce_train, reduce_test = get_train_and_test(train, test)

# Replace every non-alphanumeric character in the column names (and in the
# assessment titles) with '_'.
for _frame in (reduce_train, reduce_test):
    _frame.columns = ["".join(ch if ch.isalnum() else "_" for ch in str(col)) for col in _frame.columns]
assess_titles = ["".join(ch if ch.isalnum() else "_" for ch in str(title)) for title in assess_titles]


In [None]:
type_enc={'Clip':0, 'Activity': 1, 'Assessment': 2, 'Game':3}

# Integer-encode the categorical feature columns of both frames with the same lookups.
_column_encoders = (
    ('session_title', title_enc),
    ('session_world', world_enc),
    ('last_world', world_enc),
    ('last_activity_type', type_enc),
)
for _frame in (reduce_train, reduce_test):
    for _column, _encoder in _column_encoders:
        _frame[_column] = _frame[_column].replace(_encoder).astype(int)

In [None]:
# Frequency-encode session_title: each title code is replaced by the number of
# train rows carrying it; the same lookup is then applied to the test frame.
freq = reduce_train['session_title'].value_counts()
frequency_enc = dict(zip(freq.index, freq))
for _frame in (reduce_train, reduce_test):
    _frame['session_title'] = _frame['session_title'].replace(frequency_enc).astype(int)

# Snapshots of the encoded feature frames.
reduce_train_true = reduce_train.copy()
reduce_test_true = reduce_test.copy()

Get score

In [None]:
# Drop the raw event frames and collect garbage before they are re-read in the next cell.
del train,test
gc.collect()

In [None]:
def read_data():
    """Load the competition CSV files from ``dirs``.

    When the module-level ``DEBUG`` flag is true, only the first 100000 rows
    of ``train.csv`` and ``test.csv`` are read. This definition replaces the
    earlier ``read_data`` of the notebook and returns ``specs`` as read from disk.

    Returns:
        tuple: ``(train, test, train_labels, specs, sample_submission)``.
    """
    def _load(file_name, label, **read_kwargs):
        # Read one CSV and report its shape using the notebook's log wording.
        print('Reading {} file....'.format(file_name))
        frame = pd.read_csv(dirs+file_name, **read_kwargs)
        print('{} file have {} rows and {} columns'.format(label, frame.shape[0], frame.shape[1]))
        return frame

    row_limit = {'nrows': 100000} if DEBUG else {}
    train = _load('train.csv', 'Training.csv', **row_limit)
    test = _load('test.csv', 'Test.csv', **row_limit)
    train_labels = _load('train_labels.csv', 'Train_labels.csv')
    specs = _load('specs.csv', 'Specs.csv')
    sample_submission = _load('sample_submission.csv', 'Sample_submission.csv')
    return train, test, train_labels, specs, sample_submission
train, test, train_labels, specs, sample_submission = read_data()
# Keep only the installations that appear in train_labels.
train = train[train.installation_id.isin(train_labels.installation_id.unique())].reset_index(drop=True)

# Integer-encode the free-text `args` and `info` columns of the specs table.
# Series.unique() returns values in order of first appearance, so the codes (and the
# 'info_<code>' names built from them) are identical on every run; wrapping the values
# in set() made the numbering follow string-hash order, which changes between sessions.
list_of_event_args = list(specs['args'].unique())
event_args_map = dict(zip(list_of_event_args, np.arange(len(list_of_event_args))))
specs["args"]=specs["args"].map(event_args_map)

list_of_event_info = list(specs['info'].unique())
event_info_map = dict(zip(list_of_event_info, np.arange(len(list_of_event_info))))
specs["info"]=specs["info"].map(event_info_map)

# Prefixed code names ordered by descending frequency, and position -> name lookups.
args_list=specs["args"].value_counts().add_prefix('args_').index.tolist()
args_label=dict(zip(np.arange(len(args_list)), args_list))
info_list=specs["info"].value_counts().add_prefix('info_').index.tolist()
info_label=dict(zip(np.arange(len(info_list)), info_list))

# Attach the encoded specs columns to every event row.
train=pd.merge(train,specs,on=["event_id"],how="left")
test=pd.merge(test,specs,on=["event_id"],how="left")
print(train.shape)

In [None]:
#Credits go to Andrew Lukyanenko

def encode_title(train, test, train_labels):
    """Integer-encode titles and worlds and build the lookup tables for feature extraction.

    Vocabularies are the sorted union of the values seen in ``train`` and
    ``test``. The ``title``/``world`` columns of ``train`` and ``test`` and the
    ``title`` column of ``train_labels`` are replaced by their integer codes in
    place, a ``title_event_code`` column ('<title code>_<event code>') is added,
    and ``timestamp`` is converted to datetime.

    Returns:
        tuple: ``(train, test, train_labels, win_code, list_of_user_activities,
        list_of_event_code, activities_labels, assess_titles, list_of_event_id,
        all_title_event_code)``. ``win_code`` maps every title code to 4100,
        except 'Bird Measurer (Assessment)' which maps to 4110.

    Raises:
        KeyError: if 'Bird Measurer (Assessment)' is not among the titles.
    """
    def _union_sorted(column):
        # Sorted distinct values of `column` across train and test.
        return sorted(set(train[column].unique()).union(set(test[column].unique())))

    list_of_user_activities = _union_sorted('title')
    list_of_event_code = _union_sorted('event_code')
    list_of_event_id = _union_sorted('event_id')
    list_of_worlds = _union_sorted('world')

    # title <-> code and world -> code lookups
    title_codes = np.arange(len(list_of_user_activities))
    activities_map = dict(zip(list_of_user_activities, title_codes))
    activities_labels = dict(zip(title_codes, list_of_user_activities))
    activities_world = dict(zip(list_of_worlds, np.arange(len(list_of_worlds))))

    # Assessment titles are collected before the title column is encoded.
    train_assessments, test_assessments = (
        set(frame.loc[frame['type'] == 'Assessment', 'title'].value_counts().index)
        for frame in (train, test)
    )
    assess_titles = sorted(train_assessments.union(test_assessments))

    # replace the text titles/worlds with their numeric codes
    for frame in (train, test):
        frame['title'] = frame['title'].map(activities_map)
    for frame in (train, test):
        frame['world'] = frame['world'].map(activities_world)
    train_labels['title'] = train_labels['title'].map(activities_map)

    # every title wins on event code 4100, except Bird Measurer which uses 4110
    win_code = dict(zip(activities_map.values(), np.full(len(activities_map), 4100, dtype='int')))
    win_code[activities_map['Bird Measurer (Assessment)']] = 4110

    for frame in (train, test):
        frame['title_event_code'] = [str(title) + '_' + str(event_code)
                                     for title, event_code in zip(frame['title'], frame['event_code'])]
    all_title_event_code = sorted(set(train["title_event_code"].unique()).union(test["title_event_code"].unique()))

    # convert text into datetime
    for frame in (train, test):
        frame['timestamp'] = pd.to_datetime(frame['timestamp'])
    return train, test, train_labels, win_code, list_of_user_activities, list_of_event_code, activities_labels, assess_titles, list_of_event_id, all_title_event_code

# Get the useful lookup dicts/lists produced by the encoding step.
train, test, train_labels, win_code, list_of_user_activities, list_of_event_code, activities_labels, assess_titles, list_of_event_id, all_title_event_code = encode_title(train, test, train_labels)

# Names of the columns to be treated as categorical features.
categoricals = ['session_title']

In [None]:
import json
def cnt_miss(df):
    """Return the sum of the ``misses`` field over the JSON ``event_data`` of every row of ``df``."""
    return sum(json.loads(payload)['misses'] for payload in df['event_data'])

def update_counters(counter: dict, col: str,session):
    """Add the per-value occurrence counts of ``session[col]`` to ``counter``.

    ``counter`` is updated in place and also returned. Every value found in
    ``session[col]`` must already be a key of ``counter``; otherwise the
    ``+=`` raises ``KeyError``.
    """
    for value, occurrences in Counter(session[col]).items():
        counter[value] += occurrences
    return counter

def update_counters_event(counter: dict, col: str,session):
    """Add value counts of ``session[col]`` to ``counter`` under prefixed keys.

    Keys are ``"<col>_<value>"``. ``counter`` is updated in place and also
    returned; a prefixed key that is missing from ``counter`` raises
    ``KeyError``.
    """
    prefixed_counts = session[col].value_counts().add_prefix(col+'_').to_dict()
    for name, occurrences in prefixed_counts.items():
        counter[name] += occurrences
    return counter

def _assessment_history_frame(history, session, session_title,
                              true_attempts, false_attempts, accuracy, accuracy_group):
    """Snapshot ``history`` as a DataFrame and tag every row with one assessment's ids and outcome."""
    frame = pd.DataFrame(history)
    frame['installation_id'] = session['installation_id'].iloc[-1]
    frame["game_session"] = session['game_session'].iloc[-1]
    frame["session_title"] = session_title
    frame["true_attempts"] = true_attempts
    frame["false_attempts"] = false_attempts
    frame["accuracy"] = accuracy
    frame["accuracy_group"] = accuracy_group
    return frame


def get_data123(user_sample,lab, test_set=False):
    """Build, for each assessment of one installation, the history of earlier ``lab`` sessions.

    Sessions are visited in the order they appear in ``user_sample``. Every
    session whose ``type`` equals ``lab`` contributes one feature dict (title,
    final event count, final game time, event-code counts and ``info`` counts)
    to a running history. Whenever an assessment session is reached, the
    history collected so far is copied into a DataFrame and labelled with that
    assessment's ids and results.

    :param user_sample: events of a single installation_id
    :param lab: session type whose sessions form the history (callers pass
        "Game" or "Activity")
    :param test_set: when True, an assessment that consists of a single event
        and has no attempts is emitted with the sentinel value 666 in all four
        target columns
    :return: list of DataFrames, one per emitted assessment

    Relies on the module-level ``list_of_event_code``, ``info_list`` and
    ``win_code`` objects.
    """
    # Sentinel written to the target columns of test assessments to predict.
    unlabeled = 666

    all_assessments=[]
    compiled_data=[]
    for _, session in user_sample.groupby('game_session', sort=False):
        # session is a DataFrame holding exactly one game_session
        session_type = session['type'].iloc[0]
        session_title = session['title'].iloc[0]

        if session_type==lab:
            event_code_count = {ev: 0 for ev in list_of_event_code}
            event_info_count = {eve: 0 for eve in info_list}

            features = {}
            features["game_title"]=session_title
            features["game_event_count"]=session['event_count'].iloc[-1]
            features["game_game_time"]=session['game_time'].iloc[-1]
            features.update(update_counters(event_code_count, "event_code",session))
            features.update(update_counters_event(event_info_count, "info",session))
            compiled_data.append(features)

        if session_type == 'Assessment' and (test_set or len(session)>1):
            # rows carrying this title's "attempt" event code (see win_code)
            all_attempts = session.query(f'event_code == {win_code[session_title]}')
            # an attempt counts as a win/loss when its event_data contains
            # the substring 'true' / 'false'
            true_attempts = all_attempts['event_data'].str.contains('true').sum()
            false_attempts = all_attempts['event_data'].str.contains('false').sum()
            n_attempts = true_attempts + false_attempts

            accuracy = true_attempts/n_attempts if n_attempts != 0 else 0
            if accuracy == 0:
                accuracy_group = 0
            elif accuracy == 1:
                accuracy_group = 3
            elif accuracy == 0.5:
                accuracy_group = 2
            else:
                accuracy_group = 1

            if n_attempts > 0:
                # labelled assessment (train or test)
                all_assessments.append(_assessment_history_frame(
                    compiled_data, session, session_title,
                    true_attempts, false_attempts, accuracy, accuracy_group))
            elif test_set and len(session)==1:
                # single-event test assessment without attempts: the one to predict
                all_assessments.append(_assessment_history_frame(
                    compiled_data, session, session_title,
                    unlabeled, unlabeled, unlabeled, unlabeled))

    return all_assessments

In [None]:
# Collect, per training installation, the "Game" session history attached to
# each assessment.
compiled_data = []
for i, (ins_id, user_sample) in tqdm(enumerate(train.groupby('installation_id', sort=False)), total=train.installation_id.nunique(), desc='Installation_id', position=0):
    compiled_data += get_data123(user_sample,"Game")
reduce_train = pd.concat(compiled_data)
reduce_train.shape

# Same for the test installations; test_set=True also emits the assessments
# that get the 666 sentinel in their target columns.
compiled_data = []
for ins_id, user_sample in tqdm(test.groupby('installation_id', sort=False), total=test.installation_id.nunique(), desc='Installation_id', position=0):
    compiled_data += get_data123(user_sample,"Game", test_set=True)
reduce_test_all = pd.concat(compiled_data)
reduce_test_all.shape

# Rows flagged with accuracy == 666 are the ones to predict; every other test
# row carries a real label and is appended to the training rows.
reduce_test=reduce_test_all[reduce_test_all["accuracy"]==666].copy().reset_index(drop=True)
reduce_train=pd.concat([reduce_train,reduce_test_all[reduce_test_all["accuracy"]!=666]]).reset_index(drop=True)
reduce_train.shape

In [None]:
# Preview the first rows of the assembled training history frame.
reduce_train.head()

In [None]:
# Preview the first rows of the test rows that still need a prediction.
reduce_test.head()

In [None]:
# Model inputs: every column except the identifiers and the target-derived columns.
feat=[f for f in reduce_train.columns if f not in ['installation_id', 'game_session','true_attempts', 'false_attempts', 'accuracy', 'accuracy_group']]
print(feat)

In [None]:
from numba import jit
@jit
def qwk(a1, a2):
    """
    Quadratic weighted kappa between two rating sequences with ratings 0..3.

    Source: https://www.kaggle.com/c/data-science-bowl-2019/discussion/114133#latest-660168

    :param a1: first sequence of ratings (cast to int)
    :param a2: second sequence of ratings (cast to int), same length as a1
    :return: 1 - observed / expected squared disagreement
    """
    # highest rating value; histograms below have max_rat + 1 bins
    max_rat = 3
    a1 = np.asarray(a1, dtype=int)
    a2 = np.asarray(a2, dtype=int)

    hist1 = np.zeros((max_rat + 1, ))
    hist2 = np.zeros((max_rat + 1, ))

    # o: observed sum of squared rating differences
    o = 0
    for k in range(a1.shape[0]):
        i, j = a1[k], a2[k]
        hist1[i] += 1
        hist2[j] += 1
        o +=  (i - j) * (i - j)

    # e: squared differences weighted by the product of the two rating histograms
    e = 0
    for i in range(max_rat + 1):
        for j in range(max_rat + 1):
            e += hist1[i] * hist2[j] * (i - j) * (i - j)

    e = e / a1.shape[0]

    # NOTE(review): e is 0 when either input holds a single distinct rating,
    # which makes the division below undefined.
    return 1 - o / e

def eval_qwk(y_true, y_pred):
    """Bin raw regression outputs into classes 0..3 and score them with ``qwk``.

    A prediction ``p`` is mapped to class 0 when ``p <= coeff[0]``, to class 1
    when ``coeff[0] < p <= coeff[1]``, to class 2 when
    ``coeff[1] < p <= coeff[2]`` and to class 3 otherwise.

    Unlike the earlier in-place version, ``y_pred`` is not modified and may be
    any array-like (list, Series, ndarray).

    :param y_true: true accuracy groups
    :param y_pred: raw regression predictions
    :return: quadratic weighted kappa of the binned predictions
    """
    # Earlier threshold sets that were tried:
    #   [1.12232214, 1.73925866, 2.22506454]
    #   [1.23795619, 1.74348425, 2.23639873]
    #   [0.94892782, 1.69, 2.16]
    coeff=[1.12934881,1.69659649,2.204893]

    # right=True makes each bin closed on its upper edge, i.e. "p <= threshold"
    y_class = np.digitize(np.asarray(y_pred, dtype=float), coeff, right=True)

    return qwk(y_true, y_class)
import lightgbm as lgb
def lgb_model(reduce_train,reduce_test,feature,random_state):
    """Train a stratified K-fold LightGBM regressor on ``accuracy_group``.

    :param reduce_train: training frame containing ``feature`` columns and
        ``accuracy_group``
    :param reduce_test: frame to predict; must contain ``feature`` columns
    :param feature: list of column names used as model inputs
    :param random_state: seed for both LightGBM and the fold shuffling
    :return: ``(oof_train, oof_test)`` - out-of-fold predictions aligned with
        the rows of ``reduce_train`` (by position), and the test predictions
        averaged over the folds
    """
    params = {
    'boosting_type': 'gbdt',
    'metric': 'rmse',
    'objective': 'regression',
    'eval_metric': 'cappa',
    #'metric': 'None',
    'num_threads':-1,
    'seed': random_state,
    'learning_rate':0.05,
    'max_depth': 11,
    'lambda_l1': 1,
    'lambda_l2': 1,
    'bagging_fraction': 0.75,
    'bagging_freq': 5,
    'colsample_bytree':0.6,
    'verbose': 100
    }
    # Additional parameters:
    early_stop = 50
    verbose_eval = 100
    num_rounds = 10000
    n_splits = 5

    # The splitter and the test averaging below share the same n_splits.
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    scores=[]
    y_train_all = reduce_train['accuracy_group']
    oof_train = np.zeros((reduce_train.shape[0]))
    oof_test = np.zeros((reduce_test.shape[0]))
    X_test = reduce_test[feature].values
    for train_index,valid_index in kf.split(reduce_train, y_train_all):
        # split() yields positions, so select by position (.iloc); this stays
        # correct even if the caller did not reset the frame's index.
        train_target = y_train_all.iloc[train_index]
        val_target = y_train_all.iloc[valid_index]

        X_train = reduce_train.iloc[train_index][feature].values
        X_val = reduce_train.iloc[valid_index][feature].values

        d_train = lgb.Dataset(X_train, label=train_target)
        d_valid = lgb.Dataset(X_val, label=val_target)
        watchlist = [d_train, d_valid]

        print('training LGB:')
        model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=num_rounds,
                      valid_sets=watchlist,
                      verbose_eval=verbose_eval,
                      early_stopping_rounds=early_stop)

        val_pred = model.predict(X_val, num_iteration=model.best_iteration)
        test_pred = model.predict(X_test, num_iteration=model.best_iteration)
        # score on a copy so the stored out-of-fold values stay raw regression outputs
        scores.append(eval_qwk(val_target,copy.deepcopy(val_pred)))
        print(scores)
        oof_train[valid_index] = val_pred
        oof_test += test_pred/n_splits
    print(np.mean(scores))
    return oof_train,oof_test
# Fit the first-stage model on the "Game" history rows (seed 50); indexes are
# reset so the frames passed in have a default positional index.
print(reduce_train.shape,reduce_test.shape)
oof_train_one,oof_test_one=lgb_model(reduce_train.reset_index(drop=True),reduce_test.copy().reset_index(drop=True),feat,50)

In [None]:
# Attach the first-stage predictions to their (installation, assessment) ids.
train_feature_score = reduce_train[['installation_id', 'game_session']].copy()
train_feature_score['score'] = oof_train_one
test_feature_score = reduce_test[['installation_id', 'game_session']].copy()
test_feature_score['score'] = oof_test_one
feature_score = pd.concat([train_feature_score, test_feature_score])
# One row per assessment session: summary statistics of the scores of all
# history rows attached to it.
feature_agg=feature_score.groupby(["game_session","installation_id"]).agg({'score': ['count','mean', 'sum', 'max','min','var']}).reset_index()
# Flatten the two-level column index produced by agg().
feature_agg.columns=["game_session","installation_id",'score_count','score_mean', 'score_sum', 'score_max', 'score_min', 'score_var']
print(feature_agg.shape)
feature_agg.head()

In [None]:
# Drop the intermediate "Game" frames before the same pipeline is rebuilt for
# another session type, then trigger a garbage-collection pass.
del train_feature_score,test_feature_score,feature_score
del reduce_train,reduce_test,reduce_test_all,compiled_data
gc.collect()

In [None]:
# Repeat the first-stage pipeline with "Activity" sessions as the history.
compiled_data = []
for i, (ins_id, user_sample) in tqdm(enumerate(train.groupby('installation_id', sort=False)), total=train.installation_id.nunique(), desc='Installation_id', position=0):
    compiled_data += get_data123(user_sample,"Activity")
reduce_train = pd.concat(compiled_data)

compiled_data = []
for ins_id, user_sample in tqdm(test.groupby('installation_id', sort=False), total=test.installation_id.nunique(), desc='Installation_id', position=0):
    compiled_data += get_data123(user_sample,"Activity", test_set=True)
reduce_test_all = pd.concat(compiled_data)

# accuracy == 666 marks the test assessments to predict; labelled test rows
# are added to the training rows.
reduce_test=reduce_test_all[reduce_test_all["accuracy"]==666].copy().reset_index(drop=True)
reduce_train=pd.concat([reduce_train,reduce_test_all[reduce_test_all["accuracy"]!=666]]).reset_index(drop=True)
reduce_train.shape

# Model inputs: everything except identifiers and target-derived columns.
feat=[f for f in reduce_train.columns if f not in ['installation_id', 'game_session','true_attempts', 'false_attempts', 'accuracy', 'accuracy_group']]
print(feat)

print(reduce_train.shape,reduce_test.shape)
oof_train_one,oof_test_one=lgb_model(reduce_train.reset_index(drop=True),reduce_test.copy().reset_index(drop=True),feat,50)

# Attach the predictions to their (installation, assessment) ids.
train_feature_score = reduce_train[['installation_id', 'game_session']].copy()
train_feature_score['score'] = oof_train_one
test_feature_score = reduce_test[['installation_id', 'game_session']].copy()
test_feature_score['score'] = oof_test_one
feature_score = pd.concat([train_feature_score, test_feature_score])

# Per-assessment summary statistics; the "Act" suffix keeps these columns
# distinct from the "Game" aggregates built earlier.
feature_agg_Activity=feature_score.groupby(["game_session","installation_id"]).agg({'score': ['count','mean', 'sum', 'max','min','var']}).reset_index()
feature_agg_Activity.columns=["game_session","installation_id",'score_countAct','score_meanAct', 'score_sumAct', 'score_maxAct', 'score_minAct', 'score_varAct']
print(feature_agg_Activity.shape)
feature_agg_Activity.head()

In [None]:
# Drop the intermediate "Activity" frames; reduce_train / reduce_test are
# rebuilt from other frames in a later cell.
del train_feature_score,test_feature_score,feature_score
del reduce_train,reduce_test,reduce_test_all,compiled_data
gc.collect()

In [None]:
# Preview reduce_train_true, which is defined outside this part of the notebook.
# NOTE(review): presumably the main per-assessment feature table - confirm.
reduce_train_true.head()

In [None]:
# Join the first-stage "Game" and "Activity" score aggregates onto the main
# feature tables; assessments without such history keep NaN in those columns.
reduce_train=pd.merge(reduce_train_true,feature_agg,left_on=["session_id","installation_id"],right_on=["game_session","installation_id"],how="left")
reduce_train=pd.merge(reduce_train,feature_agg_Activity,left_on=["session_id","installation_id"],right_on=["game_session","installation_id"],how="left")
reduce_test=pd.merge(reduce_test_true,feature_agg,left_on=["session_id","installation_id"],right_on=["game_session","installation_id"],how="left")
reduce_test=pd.merge(reduce_test,feature_agg_Activity,left_on=["session_id","installation_id"],right_on=["game_session","installation_id"],how="left")
# The two merges leave duplicated join keys behind (game_session_x / _y).
reduce_test=reduce_test.drop(['game_session_y','game_session_x'],axis=1)

reduce_train.to_csv('reduce_train.csv', index=False)
reduce_test.to_csv('reduce_test.csv', index=False)

# Align train to the test columns (this also drops game_session_x / _y from train).
reduce_train=reduce_train[reduce_test.columns]
ajusted_test=reduce_test.copy()
to_exclude=[]
# Identifier, target and categorical-like columns are never compared/rescaled.
# Built once instead of on every loop iteration.
not_adjusted = set(
    ['installation_id','session_id','accuracy_group','session_title','session_world','last_world','last_activity_type','same_world_with_last']
    + ["give_up_"+assess for assess in assess_titles]
    + ['acc_' + title for title in assess_titles]
)
for feature in ajusted_test.columns:
    if feature in not_adjusted:
        continue
    train_mean = reduce_train[feature].mean()
    test_mean = ajusted_test[feature].mean()
    try:
        ajust_factor = train_mean / test_mean
        if pd.isna(ajust_factor):
            # e.g. 0/0: there is no meaningful ratio. Leave the column as is;
            # multiplying by NaN would wipe out every value in it.
            continue
        if ajust_factor > 5 or ajust_factor < 0.2:
            # train and test means differ by more than 5x: distrust the feature
            to_exclude.append(feature)
            print(feature, train_mean, test_mean)
        else:
            ajusted_test[feature] *= ajust_factor
    except (TypeError, ValueError, ZeroDivisionError):
        # the means cannot be compared numerically: exclude the feature
        to_exclude.append(feature)
        print(feature, train_mean, test_mean)


In [None]:
# NOTE: this redefines qwk with the same body as the definition earlier in
# this notebook; from here on this version is the one in effect.
@jit
def qwk(a1, a2):
    """
    Quadratic weighted kappa between two rating sequences with ratings 0..3.

    Source: https://www.kaggle.com/c/data-science-bowl-2019/discussion/114133#latest-660168

    :param a1: first sequence of ratings (cast to int)
    :param a2: second sequence of ratings (cast to int), same length as a1
    :return: 1 - observed / expected squared disagreement
    """
    # highest rating value; histograms below have max_rat + 1 bins
    max_rat = 3
    a1 = np.asarray(a1, dtype=int)
    a2 = np.asarray(a2, dtype=int)

    hist1 = np.zeros((max_rat + 1, ))
    hist2 = np.zeros((max_rat + 1, ))

    # o: observed sum of squared rating differences
    o = 0
    for k in range(a1.shape[0]):
        i, j = a1[k], a2[k]
        hist1[i] += 1
        hist2[j] += 1
        o +=  (i - j) * (i - j)

    # e: squared differences weighted by the product of the two rating histograms
    e = 0
    for i in range(max_rat + 1):
        for j in range(max_rat + 1):
            e += hist1[i] * hist2[j] * (i - j) * (i - j)

    e = e / a1.shape[0]

    # NOTE(review): e is 0 when either input holds a single distinct rating,
    # which makes the division below undefined.
    return 1 - o / e

def eval_qwk_lgb_regr(y_true, y_pred):
    """
    Fast cappa eval function for lgb.

    Bins the raw regression outputs into classes 0..3 using the module-level
    ``regression_thresholds`` (three increasing cut points; a prediction equal
    to a cut point falls into the lower class) and scores them with ``qwk``.

    The binning is done on a fresh array: ``y_pred`` is left untouched, so the
    prediction buffer handed in by the caller is not overwritten, and the
    result does not depend on the order in which bins are assigned.

    :param y_true: true accuracy groups
    :param y_pred: raw regression predictions
    :return: ``('cappa', score, True)``; the trailing ``True`` is the
        "higher is better" flag of LightGBM's custom-metric protocol
    """
    # right=True -> bins are closed on their upper edge ("pred <= threshold")
    y_class = np.digitize(np.asarray(y_pred, dtype=float), regression_thresholds, right=True)

    return 'cappa', qwk(y_true, y_class), True

class LGBWrapper_regr(object):
    """
    A wrapper for lightgbm model so that we will have a single api for various models.

    After ``fit`` the wrapper exposes ``best_score_`` and
    ``feature_importances_`` copied from the underlying ``LGBMRegressor``.
    """

    def __init__(self):
        self.model = lgb.LGBMRegressor()

    def fit(self, X_train, y_train, X_valid=None, y_valid=None, X_holdout=None, y_holdout=None, params=None):
        """
        Fit the underlying LGBMRegressor.

        :param X_train: training features (DataFrame; its columns are used to
            resolve the categorical features)
        :param y_train: training target
        :param X_valid: optional validation features, evaluated as 'valid'
        :param y_valid: optional validation target
        :param X_holdout: optional holdout features, evaluated as 'holdout'
        :param y_holdout: optional holdout target
        :param params: dict of model parameters; must contain 'objective',
            'verbose' and 'early_stopping_rounds', may contain 'cat_cols'
        """
        # QWK on thresholded predictions for regression, AUC otherwise.
        if params['objective'] == 'regression':
            eval_metric = eval_qwk_lgb_regr
        else:
            eval_metric = 'auc'

        eval_set = [(X_train, y_train)]
        eval_names = ['train']
        self.model = self.model.set_params(**params)

        if X_valid is not None:
            eval_set.append((X_valid, y_valid))
            eval_names.append('valid')

        if X_holdout is not None:
            eval_set.append((X_holdout, y_holdout))
            eval_names.append('holdout')

        # Only pass categorical columns that actually exist in X_train: some of
        # the requested ones may have been dropped upstream. Previously the
        # filtered list was computed but the unfiltered one was handed to
        # LightGBM.
        cat_cols = [col for col in (params.get('cat_cols') or []) if col in X_train.columns]
        categorical_columns = cat_cols if len(cat_cols) > 0 else 'auto'

        self.model.fit(X=X_train, y=y_train,
                       eval_set=eval_set, eval_names=eval_names, eval_metric=eval_metric,
                       verbose=params['verbose'], early_stopping_rounds=params['early_stopping_rounds'],
                       categorical_feature=categorical_columns)

        self.best_score_ = self.model.best_score_
        self.feature_importances_ = self.model.feature_importances_

    def predict(self, X_test):
        """Predict with the best iteration found during fitting."""
        return self.model.predict(X_test, num_iteration=self.model.best_iteration_)
    
    
class RegressorModel(object):
    """
    A cross-validation harness around a regression model wrapper.
    It can be used for training and prediction.
    Can plot feature importance and training progress (if relevant for model).

    The wrapper passed as ``model_wrapper`` must provide
    ``fit(X_train, y_train, X_valid, y_valid, X_holdout, y_holdout, params=...)``,
    ``predict(X)`` and, after fitting, ``best_score_`` and
    ``feature_importances_``.
    """

    def __init__(self, columns: list = None, model_wrapper=None,truncate_valid=False,seed=66):
        """
        :param columns: columns of X to use (all columns of X when None)
        :param model_wrapper: unfitted model wrapper; deep-copied for each fold
        :param truncate_valid: when True, each validation fold keeps a single
            randomly chosen row per installation_id
        :param seed: seed for the random row choice used by truncate_valid
        """
        self.columns = columns
        self.model_wrapper = model_wrapper
        self.result_dict = {}
        self.train_one_fold = False
        self.preprocesser = None
        self.truncate_valid=truncate_valid
        self.truncate_seed=seed
        # set by transform_(); defined here so predict() can always read it
        self.cols_to_drop = None

    def fit(self, X: pd.DataFrame, y,
            X_holdout: pd.DataFrame = None, y_holdout=None,
            folds=None,
            params: dict = None,
            eval_metric='rmse',
            cols_to_drop: list = None,
            preprocesser=None,
            transformers: dict = None,
            adversarial: bool = False,
            plot: bool = True):
        """
        Training the model.

        :param X: training data
        :param y: training target
        :param X_holdout: holdout data
        :param y_holdout: holdout target
        :param folds: folds to split the data. If not defined, then a single model is trained on the whole X
        :param params: training parameters (an empty dict is used when None)
        :param eval_metric: metric for validation
        :param cols_to_drop: list of columns to drop (for example ID)
        :param preprocesser: preprocesser class
        :param transformers: transformers to use on folds (none when None)
        :param adversarial: train a train-vs-validation classifier set-up instead of the regression task
        :param plot: draw feature importances, training progress and out-of-fold diagnostics
        :return:
        """

        # Recomputed on every call so an earlier folds=None call cannot leak.
        self.train_one_fold = folds is None
        if folds is None:
            # Only needed to drive the loop once; random_state is omitted
            # because scikit-learn rejects it when shuffle is False.
            folds = KFold(n_splits=3)

        # The defaults are None; fall back to empty containers instead of crashing.
        params = {} if params is None else params
        transformers = {} if transformers is None else transformers

        self.columns = X.columns if self.columns is None else self.columns
        self.feature_importances = pd.DataFrame(columns=['feature', 'importance'])
        self.trained_transformers = {k: [] for k in transformers}
        self.transformers = transformers
        self.models = []
        self.folds_dict = {}
        self.eval_metric = eval_metric
        n_target = 1
        self.oof = []
        self.n_target = n_target
        random.seed(self.truncate_seed)
        X = X[self.columns]
        if X_holdout is not None:
            X_holdout = X_holdout[self.columns]

        if preprocesser is not None:
            self.preprocesser = preprocesser
            self.preprocesser.fit(X, y)
            X = self.preprocesser.transform(X, y)
            self.columns = X.columns.tolist()
            if X_holdout is not None:
                X_holdout = self.preprocesser.transform(X_holdout)

        # Group-aware splitters use installation_id; fall back to no groups if
        # the column is not among the selected ones.
        groups = X['installation_id'] if 'installation_id' in X.columns else None

        fold_importances = []
        for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y, groups)):

            X_hold = X_holdout.copy() if X_holdout is not None else None
            self.folds_dict[fold_n] = {}
            if params.get('verbose'):
                print(f'Fold {fold_n + 1} started at {time.ctime()}')

            X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

            if self.truncate_valid:
                # keep one random validation row per installation
                truncated_index=[]
                for iid in sorted(list(set(X_valid['installation_id']))):
                    list_ = list(X_valid.loc[X_valid['installation_id'] == iid].index)
                    cur = random.choices(list_, k=1)[0]
                    truncated_index.append(cur)
                X_valid=X_valid.loc[truncated_index]
                y_valid=y_valid.loc[truncated_index]

            if self.train_one_fold:
                # no folds supplied: train on everything, without validation
                X_train = X
                y_train = y
                X_valid = None
                y_valid = None

            datasets = {'X_train': X_train, 'X_valid': X_valid, 'X_holdout': X_hold, 'y_train': y_train}
            X_train, X_valid, X_hold = self.transform_(datasets, cols_to_drop)

            self.folds_dict[fold_n]['columns'] = X_train.columns.tolist()

            model = copy.deepcopy(self.model_wrapper)

            if adversarial:
                # Imported here because the notebook's import cell does not
                # bring train_test_split into scope.
                from sklearn.model_selection import train_test_split
                X_new1 = X_train.copy()
                if X_valid is not None:
                    X_new2 = X_valid.copy()
                elif X_hold is not None:
                    # use the transformed holdout so its columns match X_train
                    X_new2 = X_hold.copy()
                else:
                    raise ValueError('adversarial=True needs validation or holdout data')
                X_new = pd.concat([X_new1, X_new2], axis=0)
                y_new = np.hstack((np.zeros((X_new1.shape[0])), np.ones((X_new2.shape[0]))))
                X_train, X_valid, y_train, y_valid = train_test_split(X_new, y_new)

            model.fit(X_train, y_train, X_valid, y_valid, X_hold, y_holdout, params=params)

            self.folds_dict[fold_n]['scores'] = model.best_score_

            # Out-of-fold pairs [predictions, targets]; skipped when there is
            # no validation part (single-model mode).
            if not adversarial and X_valid is not None:
                self.oof.append([model.predict(X_valid).reshape(-1),y_valid.values])

            fold_importance = pd.DataFrame(list(zip(X_train.columns, model.feature_importances_)),
                                           columns=['feature', 'importance'])
            fold_importances.append(fold_importance)
            self.models.append(model)

            if self.train_one_fold:
                # every iteration would train the same full-data model again
                break

        if fold_importances:
            # single concat instead of the removed DataFrame.append
            self.feature_importances = pd.concat(fold_importances)
        self.feature_importances['importance'] = self.feature_importances['importance'].astype(int)

        # if params['verbose']:
        self.calc_scores_()

        if plot:
            # print(classification_report(y, self.oof.argmax(1)))
            plt.figure(figsize=(16, 12))
            plt.subplot(2, 2, 1)
            self.plot_feature_importance(top_n=20)
            plt.subplot(2, 2, 2)
            self.plot_metric()
            if not self.truncate_valid and self.oof:
                # self.oof is a list of per-fold [pred, target] pairs, so
                # flatten it before plotting.
                oof_pred = np.concatenate([fold_pred for fold_pred, _ in self.oof])
                oof_true = np.concatenate([fold_true for _, fold_true in self.oof])
                plt.subplot(2, 2, 3)
                plt.hist(oof_true - oof_pred)
                plt.title('Distribution of errors')
                plt.subplot(2, 2, 4)
                plt.hist(oof_pred)
                plt.title('Distribution of oof predictions')

    def transform_(self, datasets, cols_to_drop):
        """
        Fit the fold transformers on the training part, apply them to all
        parts and drop ``cols_to_drop``.

        :param datasets: dict with 'X_train', 'X_valid', 'X_holdout', 'y_train'
            ('X_valid' / 'X_holdout' may be None)
        :param cols_to_drop: columns to remove, or None to keep everything
        :return: transformed (X_train, X_valid, X_holdout)
        """
        for name, transformer in self.transformers.items():
            # Fit a private copy per fold; otherwise every fold would store a
            # reference to the same object, refitted on the last fold.
            fitted = copy.deepcopy(transformer)
            fitted.fit(datasets['X_train'], datasets['y_train'])
            datasets['X_train'] = fitted.transform(datasets['X_train'])
            if datasets['X_valid'] is not None:
                datasets['X_valid'] = fitted.transform(datasets['X_valid'])
            if datasets['X_holdout'] is not None:
                datasets['X_holdout'] = fitted.transform(datasets['X_holdout'])
            self.trained_transformers[name].append(fitted)
        if cols_to_drop is not None:
            cols_to_drop = [col for col in cols_to_drop if col in datasets['X_train'].columns]

            datasets['X_train'] = datasets['X_train'].drop(cols_to_drop, axis=1)
            if datasets['X_valid'] is not None:
                datasets['X_valid'] = datasets['X_valid'].drop(cols_to_drop, axis=1)
            if datasets['X_holdout'] is not None:
                datasets['X_holdout'] = datasets['X_holdout'].drop(cols_to_drop, axis=1)
            print("Dropping",len(set(cols_to_drop)) ,"columns First 10:",cols_to_drop[:10])
        self.cols_to_drop = cols_to_drop
        return datasets['X_train'], datasets['X_valid'], datasets['X_holdout']

    def calc_scores_(self):
        """Print and store the mean of ``eval_metric`` over the folds for each evaluated dataset."""
        print()
        datasets = [k for k, v in [v['scores'] for k, v in self.folds_dict.items()][0].items() if len(v) > 0]
        self.scores = {}
        for d in datasets:
            scores = [v['scores'][d][self.eval_metric] for k, v in self.folds_dict.items()]
            print(f"CV mean score on {d}: {np.mean(scores):.4f} +/- {np.std(scores):.4f} std.")
            self.scores[d] = np.mean(scores)

    def predict(self, X_test, averaging: str = 'usual'):
        """
        Make prediction

        :param X_test: data to predict
        :param averaging: method of averaging: 'usual' averages the raw fold
            predictions, 'rank' averages their ranks
        :return: array of shape (n_rows, 1)
        """
        full_prediction = np.zeros((X_test.shape[0],1))
        if self.preprocesser is not None:
            X_test = self.preprocesser.transform(X_test)
        for i in range(len(self.models)):
            X_t = X_test.copy()
            for name, transformers in self.trained_transformers.items():
                X_t = transformers[i].transform(X_t)

            if self.cols_to_drop is not None:
                cols_to_drop = [col for col in self.cols_to_drop if col in X_t.columns]
                X_t = X_t.drop(cols_to_drop, axis=1)
            y_pred = self.models[i].predict(X_t[self.folds_dict[i]['columns']]).reshape(-1, full_prediction.shape[1])

            # if case transformation changes the number of the rows
            if full_prediction.shape[0] != len(y_pred):
                full_prediction = np.zeros((y_pred.shape[0], 1))

            if averaging == 'usual':
                full_prediction += y_pred
            elif averaging == 'rank':
                # rank on the flattened vector (a Series must be 1-D), then
                # restore the column shape
                full_prediction += pd.Series(y_pred.reshape(-1)).rank().values.reshape(-1, 1)

        return full_prediction / len(self.models)

    def plot_feature_importance(self, drop_null_importance: bool = True, top_n: int = 10):
        """
        Plot default feature importance.

        :param drop_null_importance: drop columns with null feature importance
        :param top_n: show top n columns
        :return:
        """

        top_feats = self.get_top_features(drop_null_importance, top_n)
        # copy so the dtype change below does not write into a slice of self.feature_importances
        feature_importances = self.feature_importances.loc[self.feature_importances['feature'].isin(top_feats)].copy()
        feature_importances['feature'] = feature_importances['feature'].astype(str)
        top_feats = [str(i) for i in top_feats]
        sns.barplot(data=feature_importances, x='importance', y='feature', orient='h', order=top_feats)
        plt.title('Feature importances')

    def get_top_features(self, drop_null_importance: bool = True, top_n: int = 10):
        """
        Get top features by importance.

        :param drop_null_importance: ignore features whose mean importance is 0
        :param top_n: number of features to return
        :return: feature names ordered by decreasing mean importance over the folds
        """
        grouped_feats = self.feature_importances.groupby(['feature'])['importance'].mean()
        if drop_null_importance:
            grouped_feats = grouped_feats[grouped_feats != 0]
        return list(grouped_feats.sort_values(ascending=False).index)[:top_n]

    def plot_metric(self):
        """
        Plot training progress.
        Inspired by `plot_metric` from https://lightgbm.readthedocs.io/en/latest/_modules/lightgbm/plotting.html

        :return:
        """
        per_model_results = []
        for model in self.models:
            evals_result = pd.DataFrame()
            for k in model.model.evals_result_.keys():
                evals_result[k] = model.model.evals_result_[k][self.eval_metric]
            evals_result = evals_result.reset_index().rename(columns={'index': 'iteration'})
            per_model_results.append(evals_result)

        # single concat instead of the removed DataFrame.append
        if per_model_results:
            full_evals_results = pd.concat(per_model_results)
        else:
            full_evals_results = pd.DataFrame(columns=['iteration'])

        full_evals_results = full_evals_results.melt(id_vars=['iteration']).rename(columns={'value': self.eval_metric,
                                                                                            'variable': 'dataset'})
        sns.lineplot(data=full_evals_results, x='iteration', y=self.eval_metric, hue='dataset')
        plt.title('Training progress')

In [None]:
class OptimizedRounder(object):
    """
    Searches for the three cut points that turn raw regression outputs into
    classes 0..3 with the best Quadratic Weighted Kappa (QWK).
    # https://www.kaggle.com/naveenasaithambi/optimizedrounder-improved
    """
    def __init__(self):
        # replaced by the scipy optimisation result once fit() has run
        self.coef_ = 0

    @staticmethod
    def _bin_edges(coef):
        """Sorted cut points padded with -inf / +inf so every value falls in a bin."""
        return [-np.inf] + list(np.sort(coef)) + [np.inf]

    def _kappa_loss(self, coef, X, y):
        """
        Negative QWK obtained when ``X`` is binned with ``coef``.

        :param coef: A list of coefficients that will be used for rounding
        :param X: The raw predictions
        :param y: The ground truth labels
        """
        binned = pd.cut(X, self._bin_edges(coef), labels = [0, 1, 2, 3])
        return -qwk(y, binned)

    def fit(self, X, y,initial_coef):
        """
        Optimize rounding thresholds with Nelder-Mead, starting from
        ``initial_coef``; the optimisation result is stored in ``coef_``.

        :param X: The raw predictions
        :param y: The ground truth labels
        :param initial_coef: starting values for the three thresholds
        """
        objective = lambda coef: self._kappa_loss(coef, X=X, y=y)
        self.coef_ = sp.optimize.minimize(objective, initial_coef, method='nelder-mead')

    def predict(self, X, coef):
        """
        Bin ``X`` into classes 0..3 using the given thresholds.

        :param X: The raw predictions
        :param coef: A list of coefficients that will be used for rounding
        """
        edges = self._bin_edges(coef)
        return pd.cut(X, edges, labels = [0, 1, 2, 3])


    def coefficients(self):
        """
        Return the optimized coefficients (the ``'x'`` entry of the fit result)
        """
        return self.coef_['x']

In [None]:
# Flag columns that carry (almost) no information: columns without any
# non-null value, and columns whose most frequent value covers at least 90%
# of the rows.
del_cols = []
n_rows = reduce_train.shape[0]
for col in reduce_train.columns.values:
    value_freq = reduce_train[col].value_counts()
    # value_counts() is sorted in descending order: entry 0 is the top value's count
    if len(value_freq) == 0 or (value_freq.iloc[0] / n_rows) >= 0.9:
        del_cols.append(col)
print(str(len(del_cols)) + " features removed!")
del_cols

In [None]:
# Greedy pruning of near-duplicate features: walk over the features in column
# order and, for each one still kept, mark every later-or-earlier kept feature
# whose correlation with it exceeds 0.995.
counter = 0
to_remove = []
neglect_feat=['installation_id','session_id','accuracy_group']

features=[x for x in reduce_train.columns if x not in neglect_feat]
for feat_a in features:
    if feat_a in to_remove:
        # already marked as a duplicate of an earlier feature
        continue
    for feat_b in features:
        if feat_b == feat_a or feat_b in to_remove:
            continue
        c = np.corrcoef(reduce_train[feat_a], reduce_train[feat_b])[0][1]
        if c > 0.995:
            counter += 1
            to_remove.append(feat_b)
            print('{}: FEAT_A: {} FEAT_B: {} - Correlation: {}'.format(counter, feat_a, feat_b, c))

In [None]:
# Turn the event codes / ids into strings in which every non-alphanumeric
# character is replaced by "_".
list_of_event_code = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in list_of_event_code]
list_of_event_id = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in list_of_event_id]

# Columns withheld from the model: identifiers, the target and a hand-picked
# set of features ...
cols_to_drop = ['session_id', 'installation_id','accuracy_group',
                'installation_session_count',
                'installation_duration_mean',
                'installation_title_nunique',
                'installation_event_code_count_mean',
                "4070",

               ]
# ... plus near-constant columns (del_cols), columns whose train/test means
# differ too much (to_exclude) and highly correlated duplicates (to_remove).
cols_to_drop+=del_cols
cols_to_drop+=to_exclude
cols_to_drop+=to_remove
#for cols in same_features.values():
#    cols_to_drop+=cols
# number of distinct columns to drop (the lists above may overlap)
print(len(set(cols_to_drop)))
cols_to_drop

In [None]:
# Columns handed to the model wrapper as categorical features (via 'cat_cols').
categoricals=['session_title','session_world','last_world','last_activity_type']

# Parameters for the final LightGBM regressor. Besides the model settings the
# dict also carries keys read by the wrapper itself: 'cat_cols', 'verbose'
# and 'early_stopping_rounds'.
params = {'n_estimators':2000,
          'boosting_type': 'gbdt',
          'objective': 'regression',
          'metric': 'rmse',
          'subsample': 0.85,
          'subsample_freq': 1,
          'learning_rate': 0.01,
          'feature_fraction': 0.75,
          'max_depth': 10,
          'num_leaves':31,
          'min_data_in_leaf':50,
          'cat_cols':categoricals,
          'lambda_l1': 2,
          'lambda_l2':9,
          'verbose': 100,
          'early_stopping_rounds': 200,
          'eval_metric': 'cappa',
          'seed':888,
          'n_jobs':8
         }
n_fold=5
# Group-wise folds; RegressorModel.fit passes installation_id as the group.
folds = GroupKFold(n_splits=n_fold)

In [None]:
# Cut points between accuracy groups. Alternatives tried previously:
# [1.18400496, 1.65723726, 2.13351805] and [1.1, 1.7, 2.2].
# NOTE(review): not referenced in this cell -- presumably read as a global by
# the evaluation helpers defined earlier; confirm.
regression_thresholds = np.array([0.5, 1.5, 2.5])
y = reduce_train['accuracy_group']

# Cross-validated LightGBM regressor trained on every column not listed in
# `cols_to_drop`.
regressor_model1 = RegressorModel(model_wrapper=LGBWrapper_regr(), truncate_valid=False)
regressor_model1.fit(
    X=reduce_train,
    y=y,
    folds=folds,
    params=params,
    preprocesser=None,
    transformers={},
    eval_metric='cappa',
    cols_to_drop=cols_to_drop,
    plot=False,
)

In [None]:
def _refine_thresholds(predictions, targets, initial_coef, n_rounds=8):
    """Repeatedly re-fit an OptimizedRounder, warm-starting each round from the last.

    Args:
        predictions: 1-D array of continuous out-of-fold predictions.
        targets: true labels aligned with ``predictions``.
        initial_coef: starting cut points handed to the first fit.
        n_rounds: number of fit/score rounds to run.

    Returns:
        Tuple ``(coefficients, qwk_score, rounder, rounded)`` taken from the
        final round. With ``n_rounds == 0`` this is
        ``(initial_coef, None, None, None)``.
    """
    coefs = initial_coef
    rounder = rounded = score = None
    for round_no in range(1, n_rounds + 1):
        rounder = OptimizedRounder()
        rounder.fit(predictions, targets, initial_coef=coefs)
        coefs = rounder.coefficients()
        rounded = rounder.predict(predictions, coefs)
        score = qwk(targets, rounded)
        print("Round", round_no, "    Rounding Coefficients:", coefs, "QWK score:", score)
    return coefs, score, rounder, rounded


# Out-of-fold predictions and labels of the LightGBM regressor, stacked over folds.
oof_predicts = np.concatenate([fold_oof[0] for fold_oof in regressor_model1.oof], axis=0)
oof_y = np.concatenate([fold_oof[1] for fold_oof in regressor_model1.oof], axis=0)
oof_flat = oof_predicts.reshape(-1,)

# Run the same refinement from two different starting points and keep
# whichever ends with the higher QWK on the out-of-fold data.
coef1, qwkscore1, _, _ = _refine_thresholds(oof_flat, oof_y, [0.5, 1.5, 2.5])
# `optR`, `oof_rounded` and `qwk_score` keep the values of the second run,
# exactly as the unrolled version of this cell left them.
coef2, qwkscore2, optR, oof_rounded = _refine_thresholds(oof_flat, oof_y, [1.1, 1.7, 2.2])
qwk_score = qwkscore2

if qwkscore2 > qwkscore1:
    print("use coefficient2", qwkscore2)
    coefficients = coef2
else:
    print("use coefficient1", qwkscore1)
    coefficients = coef1

In [None]:
class Base_Model(object):
    """Template for a K-fold regression model on the 'accuracy_group' target.

    Construction immediately runs the whole cross-validation loop (see
    ``fit``). Subclasses must implement ``train_model``, ``get_params`` and
    ``convert_dataset``, and may override ``convert_x`` and ``get_cv``.

    Attributes set by ``__init__``:
        y_pred: test-set predictions averaged over the folds.
        score: second element of ``eval_qwk_lgb_regr`` on the out-of-fold
            predictions.
        model: the model trained on the last fold (``None`` if there were
            no folds).
    """

    def __init__(self, train_df, test_df, features, categoricals=None, n_splits=5, verbose=True):
        """Store the inputs, build the CV splits and parameters, then train.

        Args:
            train_df: training frame containing ``features`` and the target.
            test_df: frame to predict on; must contain ``features``.
            features: list of column names used as model inputs.
            categoricals: optional list of categorical column names
                (defaults to a fresh empty list per instance).
            n_splits: number of CV folds; also the divisor used when
                averaging the test predictions.
            verbose: whether to print training progress and the final score.
        """
        self.train_df = train_df
        self.test_df = test_df
        self.features = features
        self.n_splits = n_splits
        # A None sentinel avoids sharing one mutable default list between instances.
        self.categoricals = [] if categoricals is None else categoricals
        self.target = 'accuracy_group'
        self.cv = self.get_cv()
        self.verbose = verbose
        self.params = self.get_params()
        self.y_pred, self.score, self.model = self.fit()

    def train_model(self, train_set, val_set):
        """Train and return a model exposing ``predict``; subclass hook."""
        raise NotImplementedError

    def get_cv(self):
        """Return the (train_idx, val_idx) pairs of a shuffled, stratified split."""
        cv = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=42)
        # Materialised so that the splits survive more than one pass over them.
        return list(cv.split(self.train_df, self.train_df[self.target]))

    def get_params(self):
        """Return the model parameters; subclass hook."""
        raise NotImplementedError

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        """Wrap the fold data into the (train_set, val_set) pair ``train_model`` expects."""
        raise NotImplementedError

    def convert_x(self, x):
        """Convert a feature frame into what ``model.predict`` accepts (identity here)."""
        return x

    def fit(self):
        """Run the CV loop and return ``(y_pred, loss_score, model)``.

        Out-of-fold predictions are collected for scoring, and the test-set
        predictions of every fold are averaged using ``n_splits``.
        """
        # Size the buffers from the frames this instance was given, not from
        # notebook-level globals.
        oof_pred = np.zeros((len(self.train_df), ))
        y_pred = np.zeros((len(self.test_df), ))
        train_x = self.train_df[self.features]
        train_y = self.train_df[self.target]
        model = None
        for fold, (train_idx, val_idx) in enumerate(self.cv):
            # The fold indices are positional, so both features and target
            # are sliced with .iloc (label lookup fails on a non-default index).
            x_train, x_val = train_x.iloc[train_idx], train_x.iloc[val_idx]
            y_train, y_val = train_y.iloc[train_idx], train_y.iloc[val_idx]
            train_set, val_set = self.convert_dataset(x_train, y_train, x_val, y_val)
            model = self.train_model(train_set, val_set)
            conv_x_val = self.convert_x(x_val)
            oof_pred[val_idx] = model.predict(conv_x_val).reshape(oof_pred[val_idx].shape)
            x_test = self.convert_x(self.test_df[self.features])
            y_pred += model.predict(x_test).reshape(y_pred.shape) / self.n_splits
            print('Partial score of fold {} is: {}'.format(fold, eval_qwk_lgb_regr(y_val, oof_pred[val_idx])[1]))
        _, loss_score, _ = eval_qwk_lgb_regr(train_y, oof_pred)
        if self.verbose:
            print('Our oof cohen kappa score is: ', loss_score)
        return y_pred, loss_score, model

In [None]:
class Xgb_Model(Base_Model):
    """Base_Model hooks implemented with the native XGBoost training API."""

    def get_params(self):
        """Return the booster parameters passed to ``xgb.train``."""
        return {
            'colsample_bytree': 0.8,
            'learning_rate': 0.01,
            'max_depth': 10,
            'subsample': 1,
            'objective': 'reg:squarederror',
            # 'eval_metric' is not set (an explicit 'rmse' entry was commented out).
            'min_child_weight': 3,
            'gamma': 0.25,
            # NOTE(review): the number of rounds comes from num_boost_round in
            # train_model; this key presumably has no effect there -- confirm.
            'n_estimators': 5000,
        }

    def convert_x(self, x):
        """Wrap a feature frame in a DMatrix for prediction."""
        return xgb.DMatrix(x)

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        """Build the labelled (train, validation) DMatrix pair for one fold."""
        return xgb.DMatrix(x_train, y_train), xgb.DMatrix(x_val, y_val)

    def train_model(self, train_set, val_set):
        """Train a booster for at most 5000 rounds, early stopping after 100."""
        log_every = 100 if self.verbose else 0
        watchlist = [(train_set, 'train'), (val_set, 'val')]
        booster = xgb.train(
            self.params,
            train_set,
            num_boost_round=5000,
            evals=watchlist,
            verbose_eval=log_every,
            early_stopping_rounds=100,
        )
        return booster

In [None]:
# NOTE(review): `xgb_feature` (columns left after `cols_to_drop`) is built here
# but the model below is given `features` from the correlation cell instead --
# confirm which feature list is intended.
_xgb_excluded = set(cols_to_drop)
xgb_feature = [col for col in reduce_train.columns if col not in _xgb_excluded]
# Constructing the model runs its full cross-validation loop.
xgb_model = Xgb_Model(
    train_df=reduce_train,
    test_df=ajusted_test,
    features=features,
    categoricals=categoricals,
)

In [None]:
from catboost import CatBoostRegressor
class Catb_Model(Base_Model):
    """Base_Model hooks implemented with CatBoostRegressor."""

    def get_params(self):
        """Return the keyword arguments used to construct CatBoostRegressor."""
        return {
            'loss_function': 'RMSE',
            'task_type': "CPU",
            'iterations': 5000,
            'od_type': "Iter",
            'depth': 10,
            'colsample_bylevel': 0.5,
            'early_stopping_rounds': 100,
            'l2_leaf_reg': 18,
            'random_seed': 42,
            'use_best_model': True,
        }

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        """Bundle each fold split into a dict with 'X' (features) and 'y' (target)."""
        return {'X': x_train, 'y': y_train}, {'X': x_val, 'y': y_val}

    def train_model(self, train_set, val_set):
        """Fit a regressor on the fold, using the validation split as eval_set."""
        log_every = 100 if self.verbose else 0
        regressor = CatBoostRegressor(**self.params)
        regressor.fit(
            train_set['X'],
            train_set['y'],
            eval_set=(val_set['X'], val_set['y']),
            verbose=log_every,
            cat_features=self.categoricals,
        )
        return regressor

In [None]:
# NOTE(review): `ctb_feature` (columns left after `cols_to_drop`) is built here
# but the model below is given `features` from the correlation cell instead --
# confirm which feature list is intended.
_ctb_excluded = set(cols_to_drop)
ctb_feature = [col for col in reduce_train.columns if col not in _ctb_excluded]
# Constructing the model runs its full cross-validation loop.
ctb_model = Catb_Model(
    train_df=reduce_train,
    test_df=ajusted_test,
    features=features,
    categoricals=categoricals,
)

In [None]:
# Weighted blend of the three regressors' test predictions (LightGBM 0.7,
# XGBoost 0.2, CatBoost 0.1). The XGBoost/CatBoost vectors are reshaped to
# columns before being added.
# NOTE(review): assumes regressor_model1.predict returns an (n, 1) array -- confirm.
final_pred = (0.7 * regressor_model1.predict(ajusted_test)
              + 0.2 * xgb_model.y_pred.reshape(-1, 1)
              + 0.1 * ctb_model.y_pred.reshape(-1, 1))

# Relative frequency of each accuracy group in the training labels.
dist = Counter(reduce_train['accuracy_group'])
#dist = Counter(oof_y)
for k in dist:
    dist[k] /= len(reduce_train['accuracy_group'])

# Place the three cut points at the percentiles of the blended predictions
# that reproduce the cumulative training distribution of groups 0, 1 and 2.
acum = 0
# np.zeros already yields float64; the removed `np.float` alias (NumPy >= 1.24)
# must not be used here.
bound = np.zeros(3, dtype=float)
for i in range(3):
    acum += dist[i]
    # Clamp so floating-point accumulation can never ask for a percentile > 100.
    bound[i] = np.percentile(final_pred, min(acum * 100, 100))

print(bound)

def classify(x):
    """Map a continuous prediction to an accuracy group (0-3) using `bound`.

    Reads the module-level `bound` array at call time; a value equal to a
    cut point falls into the lower group.
    """
    if x <= bound[0]:
        return 0
    elif x <= bound[1]:
        return 1
    elif x <= bound[2]:
        return 2
    else:
        return 3

# From here on `final_pred` holds the discrete class per test row.
final_pred = np.array(list(map(classify, final_pred)))
sample_submission['accuracy_group'] = final_pred.astype(int)
sample_submission.to_csv('./submission.csv', index=False)
sample_submission['accuracy_group'].value_counts(normalize=True)

In [None]:
# Sanity check: apply the cut points used by `classify` to the LightGBM
# out-of-fold predictions and score the result with QWK.
oof_predicts = np.concatenate([fold_oof[0] for fold_oof in regressor_model1.oof], axis=0)
oof_y = np.concatenate([fold_oof[1] for fold_oof in regressor_model1.oof], axis=0)
oof_final_pred = np.array([classify(pred) for pred in oof_predicts])
qwk(oof_final_pred, oof_y)