In [None]:


# ## IMPORT

# In[ ]:


import numpy as np
import pandas as pd
import datetime
from catboost import CatBoostClassifier, CatBoostRegressor
import time
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import confusion_matrix
import pdb
import re
from sklearn.preprocessing import LabelEncoder
import gc
import json, warnings
from lightgbm import LGBMRegressor
from matplotlib import pyplot as plt 
import seaborn as sns
import random as rn
import tensorflow as tf
from keras import backend as K
from collections import Counter
from sklearn.metrics import cohen_kappa_score, mean_squared_error
warnings.filterwarnings('ignore')
def init_seeds(seed):
    """Put every random number generator used by the notebook in a known state.

    The same ``seed`` is handed, in order, to NumPy's global generator,
    Python's built-in ``random`` module (imported as ``rn``) and TensorFlow.
    """
    for seed_fn in (np.random.seed, rn.seed, tf.random.set_seed):
        seed_fn(seed)

# Single seed handed to init_seeds (NumPy, `random`, TensorFlow).
SEED = 666
init_seeds(SEED)
# True: read the competition CSVs from the Kaggle input directory; False: read them from ../DSB.
on_kaggle = True
# When True, a later cell keeps only a slice of installation ids for a quick run.
test_code = False
# When True, the test history (minus each installation's last row) is appended to train.
test_for_training = True
# NOTE(review): presumably the cut-off used by the null-importance feature selection;
# it is not referenced in the visible part of the notebook — confirm.
null_importance_cut = -0.3


# In[ ]:


from sklearn.metrics import confusion_matrix
def qwk(act,pred,n=4,hist_range=(0,3), weights = None):
    """Quadratic weighted kappa between two vectors of class labels.

    :param act: actual labels (array-like of numbers inside ``hist_range``)
    :param pred: predicted labels, same length as ``act``
    :param n: number of classes / histogram bins
    :param hist_range: (lowest label, highest label)
    :param weights: optional per-sample weights
    :return: ``1 - observed_disagreement / expected_disagreement``

    The observed matrix is built with ``np.histogram2d`` on the same ``n``
    bins as the marginal histograms, so it is always ``n x n`` even when
    some class never occurs in ``act`` or ``pred`` (a confusion matrix
    computed only from the labels present would be smaller and could not be
    combined with the ``n x n`` penalty matrix).
    """
    act = np.asarray(act).ravel()
    pred = np.asarray(pred).ravel()

    # Observed joint distribution of (actual, predicted).
    O = np.histogram2d(act, pred, bins=n, range=[hist_range, hist_range], weights=weights)[0]
    O = np.divide(O, np.sum(O))

    # Quadratic penalty: 0 on the diagonal, 1 for the largest possible miss.
    classes = np.arange(n)
    W = np.subtract.outer(classes, classes) ** 2 / ((n - 1) ** 2)

    act_hist = np.histogram(act, bins=n, range=hist_range, weights=weights)[0]
    prd_hist = np.histogram(pred, bins=n, range=hist_range, weights=weights)[0]

    # Joint distribution expected if actual and predicted were independent.
    E = np.outer(act_hist, prd_hist)
    E = np.divide(E, np.sum(E))

    num = np.sum(np.multiply(W, O))  # weighted observed disagreement
    den = np.sum(np.multiply(W, E))  # weighted expected disagreement

    return 1 - np.divide(num, den)

class softkappaObjective(object):
    """Custom objective exposing ``calc_ders_range`` (first/second derivatives per sample)."""

    def calc_ders_range(self, approxes, targets, weights):
        """Return an iterator of ``(-gradient, -hessian)`` pairs, one per sample.

        ``weights`` is only length-checked; it does not enter the derivatives.
        """
        assert len(approxes) == len(targets)
        if weights is not None:
            assert len(weights) == len(approxes)

        # Scalars shared by every sample.
        scale = np.dot(approxes, approxes) + np.dot(targets, targets)
        cross = np.dot(targets, approxes)
        scale_sq = np.power(scale, 2)
        scale_cu = np.power(scale, 3)

        first = -np.multiply(2, targets) / scale + np.multiply(4, approxes) * cross / scale_sq
        second = (
            np.multiply(8, approxes) * targets / scale_sq
            + 4 * cross / scale_sq
            - (np.multiply(16, np.power(approxes, 2)) * cross) / scale_cu
        )
        return zip(-first, -second)
    

def qwk_lgb(y_true, y_pred):

    """
    from https://www.kaggle.com/nikhilpraveen/convert-to-regression
    Fast cappa eval function for lgb.

    Continuous predictions are cut into the classes 0..3 so that the share
    of each predicted class matches the share of that class in
    ``new_train['accuracy_group']``; the quadratic weighted kappa of the
    result against ``y_true`` is returned.

    :param y_true: array of true classes
    :param y_pred: array of raw (continuous) predictions
    :return: ``('cappa', score, True)`` — name, value, higher-is-better flag

    The metric no longer draws a histogram of the target on every call: it
    is evaluated repeatedly and the plot was a debugging leftover.
    """
    counts = Counter(new_train['accuracy_group'])
    total = len(new_train)

    # Percentiles of y_pred at the cumulative class shares of groups 0, 1, 2.
    cumulative = 0
    bounds = []
    for group in range(3):
        cumulative += counts[group] / total
        bounds.append(np.percentile(y_pred, cumulative * 100))

    # side='left': x <= bounds[0] -> 0, x <= bounds[1] -> 1, x <= bounds[2] -> 2, else 3.
    y_pred = np.searchsorted(bounds, np.asarray(y_pred), side='left').reshape(y_true.shape)

    return 'cappa', cohen_kappa_score(y_true, y_pred, weights='quadratic'), True

class qwk_catboost(object):
    """Custom eval metric: quadratic weighted kappa after quantile thresholding.

    Raw predictions are converted to the classes 0..3 using cut points taken
    at fixed positions of the sorted predictions (``CUT_FRACTIONS``), then
    scored against the target.
    """

    # Positions (as fractions of the sample count) of the three cut points in
    # the sorted predictions.
    CUT_FRACTIONS = (0.239062, 0.375353, 0.500000)

    def get_final_error(self, error, weight):
        """Return the accumulated error unchanged."""
        return error

    def is_max_optimal(self):
        """Higher metric values are better."""
        return True

    def evaluate(self, approxes, target, weight):
        """Compute the metric.

        :param approxes: sequence holding exactly one sequence of raw predictions
        :param target: true classes, same length as ``approxes[0]``
        :param weight: ignored (the score is computed unweighted)
        :return: ``(kappa, 1)``
        """
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        raw = np.array(approxes).flatten()
        act = np.array(target).flatten()
        n = 4
        hist_range = (0, 3)

        # Cut points read from the sorted raw predictions. ``int`` replaces
        # the ``np.int`` alias, which no longer exists in current NumPy.
        ordered = np.sort(raw)
        cuts = [ordered[int(np.round(len(act) * frac))] for frac in self.CUT_FRACTIONS]

        # Classify from the untouched raw values in one step. Overwriting the
        # array threshold by threshold would let already-assigned labels
        # (0, 1, 2) be compared against the cut points again and relabelled.
        # side='left': x <= cuts[0] -> 0, x <= cuts[1] -> 1, x <= cuts[2] -> 2, else 3.
        pred = np.searchsorted(cuts, raw, side='left')

        # Observed joint distribution; always n x n, even if a class is absent.
        O = np.histogram2d(act, pred, bins=n, range=[hist_range, hist_range])[0]
        O = np.divide(O, np.sum(O))

        classes = np.arange(n)
        W = np.subtract.outer(classes, classes) ** 2 / ((n - 1) ** 2)

        act_hist = np.histogram(act, bins=n, range=hist_range)[0]
        prd_hist = np.histogram(pred, bins=n, range=hist_range)[0]

        # Joint distribution expected under independence.
        E = np.outer(act_hist, prd_hist)
        E = np.divide(E, np.sum(E))

        num = np.sum(np.multiply(W, O))  # weighted observed disagreement
        den = np.sum(np.multiply(W, E))  # weighted expected disagreement

        return 1 - np.divide(num, den), 1

    



class LabelEncoderExt(object):
    """LabelEncoder wrapper that tolerates values not seen during ``fit``.

    An extra ``'Unknown'`` class is registered while fitting; ``transform``
    sends every unseen value to that class.
    """

    def __init__(self):
        self.label_encoder = LabelEncoder()
        # self.classes_ = self.label_encoder.classes_

    def fit(self, data_list):
        """Fit on the values of ``data_list`` plus the ``'Unknown'`` placeholder.

        :param data_list: iterable of values (strings)
        :return: self
        """
        known_values = list(data_list)
        known_values.append('Unknown')
        self.label_encoder = self.label_encoder.fit(known_values)
        self.classes_ = self.label_encoder.classes_
        return self

    def transform(self, data_list):
        """Encode ``data_list``; values outside the fitted classes become ``'Unknown'``.

        :param data_list: iterable of values
        :return: encoded ids as returned by the wrapped encoder
        """
        fitted = set(self.label_encoder.classes_)
        safe_values = []
        for value in list(data_list):
            safe_values.append(value if value in fitted else 'Unknown')
        return self.label_encoder.transform(safe_values)


# In[ ]:


# Directory holding the competition CSVs: the Kaggle input mount when running
# on Kaggle, a local checkout otherwise. Only the directory differs between
# the two environments, so the five reads are written once.
DATA_DIR = '/kaggle/input/data-science-bowl-2019' if on_kaggle else '../DSB'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
train_labels = pd.read_csv(f'{DATA_DIR}/train_labels.csv')
specs = pd.read_csv(f'{DATA_DIR}/specs.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
# In[ ]:

import psutil
import os
def cpu_stats():
    """Return the current process's memory use as the string ``'memory GB:<value>'``.

    The value is the first field of ``psutil``'s ``memory_info()`` divided by
    2**30 and rounded to two decimals.
    """
    current_process = psutil.Process(os.getpid())
    used_gib = current_process.memory_info()[0] / 2. ** 30
    return 'memory GB:' + str(np.round(used_gib, 2))

def _epoch_seconds(stamps):
    """Convert a column of timestamp strings to seconds since the Unix epoch.

    The result is float64 on purpose: float32 has a 24-bit mantissa, so
    values around 1.5e9 (2019 epoch seconds) would be rounded to multiples
    of 128 seconds, which destroys every duration computed from timestamp
    differences.
    """
    return pd.to_datetime(stamps).map(lambda stamp: stamp.timestamp()).astype(np.float64)


train['timestamp'] = _epoch_seconds(train['timestamp'])
test['timestamp'] = _epoch_seconds(test['timestamp'])


# In[ ]:


### Use the data in test for training

# memorytest=True replicates the extra training rows ten-fold (with suffixed
# ids) to check memory headroom.
memorytest = False
train_cols = list(train.columns)
if test_for_training:
    # The last event of every installation in test is the assessment to be
    # predicted, so it must not be used for training. A stable sort keeps
    # file order among equal timestamps, and the row labels are taken from the
    # sorted frame itself, so the excluded row is really the last event of
    # its installation.
    test_sorted = test.sort_values(['installation_id', 'timestamp'], kind='mergesort')
    assess_index_ = set(test_sorted.groupby('installation_id', sort=False).tail(1).index)
    del test_sorted
    test_for_train = test.loc[~test.index.isin(assess_index_), train_cols].copy()

    if memorytest:
        # Original rows followed by nine copies whose ids get the suffix 0..8;
        # concatenated once instead of growing the frame inside the loop.
        replicas = [test_for_train]
        for i in range(9):
            replica = test_for_train.copy()
            replica['installation_id'] += str(i)
            replica['game_session'] += str(i)
            replicas.append(replica)
        test_for_train = pd.concat(replicas, axis=0)
        del replicas

    print(cpu_stats())
    print(test_for_train.shape)

    train = train[train_cols]

    del assess_index_
    print(train.shape)
    print(cpu_stats())
else:
    # Keep the name defined (empty, same columns) so the later cells that map
    # and concatenate test_for_train also run when the option is switched off.
    test_for_train = test.iloc[0:0][train_cols].copy()


# In[ ]:


## Hard-code the title encoding so the codes do not change between runs (avoids a randomly changing label encoding)
activities_map = {'Dino Drink': 0, 'Ordering Spheres': 1, 'Tree Top City - Level 1': 2, 'Chicken Balancer (Activity)': 3, 'Scrub-A-Dub': 4, "Pirate's Tale": 5, 'Honey Cake': 6, 'Flower Waterer (Activity)': 7, 'All Star Sorting': 8, 'Tree Top City - Level 2': 9, 'Pan Balance': 10, 'Bird Measurer (Assessment)': 11, 'Bottle Filler (Activity)': 12, 'Air Show': 13, 'Fireworks (Activity)': 14, 'Slop Problem': 15, 'Magma Peak - Level 2': 16, 'Dino Dive': 17, 'Leaf Leader': 18, 'Costume Box': 19, 'Mushroom Sorter (Assessment)': 20, 'Chest Sorter (Assessment)': 21, 'Tree Top City - Level 3': 22, '12 Monkeys': 23, 'Crystal Caves - Level 2': 24, 'Treasure Map': 25, 'Rulers': 26, 'Crystals Rule': 27, 'Magma Peak - Level 1': 28, 'Cart Balancer (Assessment)': 29, 'Egg Dropper (Activity)': 30, 'Bug Measurer (Activity)': 31, 'Sandcastle Builder (Activity)': 32, 'Bubble Bath': 33, 'Crystal Caves - Level 1': 34, 'Welcome to Lost Lagoon!': 35, 'Watering Hole (Activity)': 36, 'Crystal Caves - Level 3': 37, 'Heavy, Heavier, Heaviest': 38, 'Lifting Heavy Things': 39, 'Balancing Act': 40, 'Cauldron Filler (Assessment)': 41, 'Chow Time': 42, 'Happy Camel': 43}
# Replace the title strings by their fixed integer codes in every frame that carries a title.
for frame in (train, test, train_labels, test_for_train):
    frame['title'] = frame['title'].map(activities_map)

# In[ ]:


eventid_map = {'3393b68b': '0', '1575e76c': '1', '6c930e6e': '2', '3bfd1a65': '3', 'ac92046e': '4', '01ca3a3c': '5', '2dc29e21': '6', 'acf5c23f': '7', 'd51b1749': '8', '47026d5f': '9', 'f56e0afc': '10', 'ea296733': '11', 'bb3e370b': '12', 'e64e2cfd': '13', 'c51d8688': '14', 'd185d3ea': '15', 'a2df0760': '16', 'daac11b0': '17', '93edfe2e': '18', '4901243f': '19', 'd02b7a8e': '20', '9554a50b': '21', '7423acbc': '22', 'b7dc8128': '23', '04df9b66': '24', 'a1e4395d': '25', '9de5e594': '26', 'ad148f58': '27', '15f99afc': '28', '33505eae': '29', '3d0b9317': '30', '565a3990': '31', '9d4e7b25': '32', 'ab4ec3a4': '33', '736f9581': '34', '56817e2b': '35', 'abc5811c': '36', 'cc5087a3': '37', '90efca10': '38', 'cb6010f8': '39', '44cb4907': '40', 'a76029ee': '41', '6f8106d9': '42', '3a4be871': '43', '2230fab4': '44', '4074bac2': '45', '14de4c5d': '46', '15eb4a7d': '47', '1b54d27f': '48', '30df3273': '49', 'bd701df8': '50', '3ee399c3': '51', '9b23e8ee': '52', '29f54413': '53', 'd45ed6a1': '54', '47f43a44': '55', '5348fd84': '56', '7525289a': '57', '611485c5': '58', 'b80e5e84': '59', '1325467d': '60', '77ead60d': '61', '7fd1ac25': '62', '55115cbd': '63', '3dfd4aa4': '64', '5e109ec3': '65', '89aace00': '66', 'a6d66e51': '67', '3d8c61b0': '68', 'a0faea5d': '69', 'c74f40cd': '70', '4c2ec19f': '71', 'f7e47413': '72', 'b012cd7f': '73', 'ad2fc29c': '74', '8d7e386c': '75', '0330ab6a': '76', 'de26c3a6': '77', '5dc079d8': '78', 'c1cac9a2': '79', '7dfe6d8a': '80', 'e720d930': '81', '77c76bc5': '82', '93b353f2': '83', '804ee27f': '84', '828e68f9': '85', 'f54238ee': '86', 'e9c52111': '87', '53c6e11a': '88', '003cd2ee': '89', '1beb320a': '90', '8f094001': '91', 'e37a2b78': '92', '6cf7d25c': '93', '71e712d8': '94', 'e080a381': '95', '0ce40006': '96', '26fd2d99': '97', 'fcfdffb6': '98', '6f445b57': '99', 'f71c4741': '100', 'f5b8c21a': '101', '2a444e03': '102', 'bdf49a58': '103', '5c2f29ca': '104', '5d042115': '105', 'd2e9262e': '106', '587b5989': '107', 'd3268efa': '108', '7040c096': '109', 
'eb2c19cd': '110', '9d29771f': '111', '363d3849': '112', '5290eab1': '113', '46b50ba8': '114', '9ee1c98c': '115', 'b5053438': '116', 'a8cc6fec': '117', '2fb91ec1': '118', '29bdd9ba': '119', '5154fc30': '120', 'c2baf0bd': '121', '87d743c1': '122', '06372577': '123', '00c73085': '124', 'b88f38da': '125', 'c7f7f0e1': '126', 'a52b92d5': '127', '47efca07': '128', 'dcaede90': '129', '5b49460a': '130', 'beb0a7b9': '131', '85d1b0de': '132', '598f4598': '133', '05ad839b': '134', '5be391b5': '135', 'e694a35b': '136', 'd2278a3b': '137', '1af8be29': '138', 'a44b10dc': '139', 'fbaf3456': '140', '4a4c3d21': '141', '532a2afb': '142', '90d848e0': '143', 'e04fb33d': '144', 'ecaab346': '145', '16dffff1': '146', 'a592d54e': '147', '4d911100': '148', 'd3640339': '149', 'b74258a0': '150', 'a16a373e': '151', '8b757ab8': '152', '65a38bf7': '153', '6aeafed4': '154', '3babcb9b': '155', '25fa8af4': '156', 'e7561dd2': '157', '83c6c409': '158', 'b2e5b0f1': '159', '19967db1': '160', '29a42aea': '161', '9e4c8c7b': '162', '17113b36': '163', 'f50fc6c1': '164', '363c86c9': '165', 'e3ff61fb': '166', 'dcb1663e': '167', '1340b8d7': '168', '51311d7a': '169', 'bc8f2793': '170', '16667cc5': '171', '9e34ea74': '172', '67439901': '173', '7d093bf9': '174', 'e79f3763': '175', '02a42007': '176', '91561152': '177', '2c4e6db0': '178', 'c189aaf2': '179', '3ccd3f02': '180', '795e4a37': '181', '7d5c30a2': '182', '65abac75': '183', '28f975ea': '184', '3bf1cf26': '185', '756e5507': '186', '71fe8f75': '187', '3afb49e6': '188', '0413e89d': '189', '28a4eb9a': '190', 'a8876db3': '191', '69fdac0a': '192', '7ab78247': '193', '0d1da71f': '194', '709b1251': '195', '3bb91ced': '196', '76babcde': '197', 'a29c5338': '198', '30614231': '199', '0a08139c': '200', 'db02c830': '201', 'e4d32835': '202', '9ed8f6da': '203', '56cd3b43': '204', 'f3cd5473': '205', 'd88e8f25': '206', '51102b85': '207', 'a8a78786': '208', 'cf7638f3': '209', '5f5b2617': '210', '070a5291': '211', '4d6737eb': '212', '84b0e0c8': '213', '3323d7e9': '214', 
'0db6d71d': '215', 'c7128948': '216', '84538528': '217', 'f6947f54': '218', 'cf82af56': '219', 'b2dba42b': '220', '1cc7cfca': '221', '08fd73f3': '222', 'df4940d3': '223', '4ef8cdd3': '224', '58a0de5c': '225', 'dcb55a27': '226', 'e57dd7af': '227', 'd2659ab4': '228', 'a1bbe385': '229', 'ca11f653': '230', 'd38c2fd7': '231', '155f62a4': '232', '2b9272f4': '233', '022b4259': '234', 'cdd22e43': '235', '6f4adc4b': '236', 'bbfe0445': '237', 'cfbd47c8': '238', '0086365d': '239', 'f32856e4': '240', '88d4a5be': '241', '37db1c2f': '242', '884228c8': '243', '5e3ea25a': '244', 'f93fc684': '245', '85de926c': '246', 'e4f1efe6': '247', '7cf1bc53': '248', '2b058fe3': '249', '1375ccb7': '250', 'ab3136ba': '251', '763fc34e': '252', 'ecc6157f': '253', 'c54cf6c5': '254', '28ed704e': '255', '2dcad279': '256', '2a512369': '257', 'd9c005dd': '258', '8d748b58': '259', '1996c610': '260', '0d18d96c': '261', '832735e1': '262', '7da34a02': '263', 'd06f75b5': '264', 'f806dc10': '265', '222660ff': '266', '5859dfb6': '267', '77261ab5': '268', '67aa2ada': '269', 'e7e44842': '270', '17ca3959': '271', 'bfc77bd6': '272', 'fd20ea40': '273', 'b7530680': '274', '37c53127': '275', '461eace6': '276', '6088b756': '277', 'c277e121': '278', 'f28c589a': '279', '262136f4': '280', '99ea62f3': '281', '37ee8496': '282', '8fee50e2': '283', '4bb2f698': '284', '119b5b02': '285', '28520915': '286', '5de79a6a': '287', '38074c54': '288', '31973d56': '289', '857f21c0': '290', '923afab1': '291', 'cb1178ad': '292', 'b1d5101d': '293', 'b120f2ac': '294', '36fa3ebe': '295', '6bf9e3e1': '296', 'ec138c1c': '297', '5e812b27': '298', 'a8efe47b': '299', '86c924c4': '300', 'bcceccc6': '301', '562cec5f': '302', '48349b14': '303', '392e14df': '304', 'e5c9df6f': '305', '26a5a3dd': '306', 'ea321fb1': '307', '6d90d394': '308', '7372e1a5': '309', '3edf6747': '310', '6c517a88': '311', '3ddc79c3': '312', '99abe2bb': '313', '9b4001e4': '314', '74e5f8a7': '315', '5c3d2b2f': '316', 'a5be6304': '317', '3d63345e': '318', 'b738d3d3': '319', 
'4b5efe37': '320', '3b2048ee': '321', 'a5e9da97': '322', 'c952eb01': '323', '9b01374f': '324', '7f0836bf': '325', '37937459': '326', '9e6b7fb5': '327', '13f56524': '328', '73757a5e': '329', '15ba1109': '330', '08ff79ad': '331', '1bb5fbdb': '332', '45d01abe': '333', '2ec694de': '334', 'a1192f43': '335', 'ecc36b7f': '336', '250513af': '337', 'e5734469': '338', '3bb91dda': '339', 'c58186bf': '340', '15a43e5b': '341', '6077cc36': '342', '3afde5dd': '343', '8d84fa81': '344', '86ba578b': '345', '6f4bd64e': '346', '1c178d24': '347', '6043a2b4': '348', '499edb7c': '349', 'c7fe2a55': '350', '907a054b': '351', 'd3f1e122': '352', '5f0eb72c': '353', '92687c59': '354', '9565bea6': '355', 'c0415e5c': '356', '7ad3efc6': '357', '4a09ace1': '358', '63f13dd7': '359', '49ed92e9': '360', '1f19558b': '361', '90ea0bac': '362', '8ac7cce4': '363', '160654fd': '364', '895865f3': '365', '7961e599': '366', '4e5fc6f5': '367', '9ce586dd': '368', '731c0cbe': '369', '9c5ef70c': '370', '8af75982': '371', 'd122731b': '372', '7ec0c298': '373', '56bcd38d': '374', '3dcdda7f': '375', 'bd612267': '376', 'd88ca108': '377', '46cd75b4': '378', 'c6971acf': '379', '792530f8': '380', '1cf54632': '381', 'a7640a16': '382', '5a848010': '383', 'df4fe8b6': '384', '27253bdc': '385'}
# Replace the hexadecimal event ids by their fixed short codes in every frame that carries them.
for frame in (train, test, specs, test_for_train):
    frame['event_id'] = frame['event_id'].map(eventid_map)

# In[ ]:


# Event code that records an attempt, per title code: 4100 everywhere,
# except 'Bird Measurer (Assessment)' which uses 4110.
win_code = {
    title_code: attempt_code
    for title_code, attempt_code in zip(
        activities_map.values(),
        np.full(len(activities_map), 4100, dtype='int'),
    )
}
win_code[activities_map['Bird Measurer (Assessment)']] = 4110


# In[ ]:


# Cap applied to the time spent in each clip, keyed by title code
# (the literal is keyed by title name and re-keyed through activities_map).
clip_length_map = {
    activities_map[clip_title]: clip_length
    for clip_title, clip_length in {
        "Costume Box": 61,
        "12 Monkeys": 109,
        "Tree Top City - Level 2": 25,
        "Lifting Heavy Things": 118,
        "Crystal Caves - Level 3": 19,
        "Rulers": 126,
        "Ordering Spheres": 61,
        "Balancing Act": 72,
        "Welcome to Lost Lagoon!": 19,
        "Magma Peak - Level 1": 20,
        "Tree Top City - Level 1": 17,
        "Treasure Map": 156,
        "Heavy, Heavier, Heaviest": 61,
        "Crystal Caves - Level 2": 24,
        "Tree Top City - Level 3": 26,
        "Honey Cake": 142,
        "Magma Peak - Level 2": 22,
        "Pirate's Tale": 80,
        "Crystal Caves - Level 1": 18,
        "Slop Problem": 60,
    }.items()
}


# In[ ]:


# 2019-01-01 00:00:00 UTC expressed as epoch seconds.
start_time = pd.Timestamp('2019-01-01 00:00:00', tz='UTC').timestamp()

# Distinct values observed in train (title codes, session types, worlds,
# event codes) and the event ids listed in specs.
titles = train['title'].unique()
titles_noclip = train.loc[train['type'] != 'Clip', 'title'].unique()
activities = train['type'].unique()
worlds = train['world'].unique()
event_codes = train['event_code'].unique()
types = train['type'].unique()
eventids = specs['event_id'].unique()
# title_events = train['title_events'].unique()

# Event ids whose spec mentions 'drag' and whose arguments include both a
# coordinate and a duration.
drug_duration_events = set(
    specs.loc[
        specs['info'].str.contains('drag')
        & specs['args'].str.contains('coordinate')
        & specs['args'].str.contains('duration'),
        'event_id',
    ]
)


# ## Misclicking heatmap
def _stage_bins(event_data, width, height):
    """Bin the click coordinates stored in raw ``event_data`` JSON onto a grid.

    Each payload's ``coordinates`` entry is scaled by its own stage size, so
    clicks recorded on differently sized stages land on the same
    ``height x width`` grid. Returns ``(rows, cols)`` as integer Series
    clipped to the grid.
    """
    coords = event_data.map(lambda raw: json.loads(raw)['coordinates'])
    cols = (coords.map(lambda d: d['x']) / coords.map(lambda d: d['stage_width']) * width).map(np.floor).map(int)
    rows = (coords.map(lambda d: d['y']) / coords.map(lambda d: d['stage_height']) * height).map(np.floor).map(int)
    return rows.clip(0, height - 1), cols.clip(0, width - 1)


def _build_inverse_click_heatmaps(event_code, width, height):
    """Build one inverse click-density map per non-Clip title for ``event_code``.

    For every title the clicks of train and test are counted per grid cell
    (every cell starts at 1); the stored map is ``1 / count``, so cells that
    are clicked often hold small values and rarely clicked cells hold values
    close to 1. Keys have the form ``'<title code>_<event_code>'``.
    """
    maps = {}
    train_has_code = train['event_code'] == event_code
    test_has_code = test['event_code'] == event_code
    for session_type in types:
        if session_type == 'Clip':
            continue
        train_of_type = train['type'] == session_type
        test_of_type = test['type'] == session_type
        # Row count only; no need to parse the JSON payloads for this.
        print(f'{session_type} has event{event_code} ', (train_of_type & train_has_code).sum())
        for title in train.loc[train_of_type, :].title.unique():
            train_events = train.loc[train_of_type & train_has_code & (train['title'] == title), 'event_data']
            test_events = test.loc[test_of_type & test_has_code & (test['title'] == title), 'event_data']

            print(f'{session_type} title{title} {event_code} has', len(train_events))

            counts = np.ones((height, width))
            for events in (train_events, test_events):
                rows, cols = _stage_bins(events, width, height)
                # add.at accumulates repeated (row, col) pairs, unlike counts[rows, cols] += 1.
                np.add.at(counts, (rows.to_numpy(dtype=int), cols.to_numpy(dtype=int)), 1)
            maps[f'{title}_{event_code}'] = 1 / counts
    return maps


# Grid used for event code 4070.
w = 500
h = 400
heatmaps = _build_inverse_click_heatmaps(4070, w, h)

# Coarser grid used for event code 4035.
w2 = 250
h2 = 200
heatmaps4035 = _build_inverse_click_heatmaps(4035, w2, h2)


        
        
# In[ ]:
# Park the processed test set on disk and release it from memory.
test.to_csv('test.csv', index=False)
del test
gc.collect()


# Append the usable test history to train under a fresh 0..n-1 index.
train = pd.concat([train, test_for_train], axis=0, ignore_index=True)
del test_for_train
gc.collect()



# Rows that record an assessment attempt: event code 4110 for title 11
# ('Bird Measurer (Assessment)' in activities_map) and event code 4100 for
# titles 20, 21, 29 and 41 (the other four assessments in activities_map).
# The flag is computed once with boolean masks; the previous version built it
# twice (the first result was overwritten) and assigned into a slice of train.
is_attempt_event = (
    ((train['title'] == 11) & (train['event_code'] == 4110))
    | (train['title'].isin([20, 21, 29, 41]) & (train['event_code'] == 4100))
)
attempt_payloads = train.loc[is_attempt_event, 'event_data']
# An attempt counts when its payload mentions an outcome ('true' or 'false').
has_outcome = attempt_payloads.str.contains('true|false', na=False)

train['attemps'] = False
train.loc[has_outcome[has_outcome].index, 'attemps'] = True
# Spread the flag to every row of a game session that contains an attempt
# (original note: TRUE for the sessions that are among the 17690).
train['attemps'] = train.groupby('game_session')['attemps'].transform('sum') > 0
del is_attempt_event, attempt_payloads, has_outcome




# In[ ]:



## FE AND TRAIN

In [None]:


# In[ ]:


##title_duration mean min max std
##title_bag,  like word bagging
##event_bagging,  like word bagging

##null importance fe selection
##recursive selection with cv and plot

# Quick-run switch: when enabled, keep only the installations at positions
# 300..599 of the distinct installation ids.
test_code = False
if test_code:
    all_installations = list(train['installation_id'].unique())
    id_ = set(all_installations[300:600])
    train = train.loc[train['installation_id'].isin(id_), :]
    
def session_setup(session, test_set,features):
    """Read the identifying fields of one game session.

    The first row of ``session`` supplies the installation id, type, title
    and world; ``features`` receives ``'kid_id'`` and ``'game_session'``.
    ``second_condition`` is True for the test set, otherwise only for
    sessions with more than one row.

    :return: (kid_id, session_type, session_title, session_world,
              second_condition, features)
    """
    def first_value(column):
        return session[column].iloc[0]

    kid_id = first_value('installation_id')
    session_type = first_value('type')
    session_title = first_value('title')
    session_world = first_value('world')

    features['kid_id'] = kid_id
    features['game_session'] = first_value('game_session')

    second_condition = True if test_set == True else len(session) > 1
    return kid_id, session_type, session_title, session_world, second_condition, features

def append_all_assessments(all_assessments, features,session, test_set, counter):
    """Store a snapshot of ``features`` and advance the assessment counter.

    A copy of ``features`` is appended for every call on the test set; on
    the training set only when the session's first ``'attemps'`` value is
    truthy. The counter grows by one in either case.

    :return: (all_assessments, features, counter + 1)
    """
    keep_snapshot = test_set == True or session['attemps'].iloc[0]
    if keep_snapshot:
        all_assessments.append(features.copy())
    return all_assessments, features, counter + 1

def compile_data(train, function,test_set =  False):
    """Apply ``function`` to every installation's rows and collect the results in a DataFrame.

    ``function`` receives a copy of the installation's rows plus the
    ``test_set`` keyword. On the test set it returns one record per
    installation; otherwise it returns a list of records, which are all
    collected. Installations are visited in order of first appearance.
    """
    records = []
    for _, user_sample in train.groupby('installation_id', sort=False):
        produced = function(user_sample.copy(), test_set=test_set)
        if test_set:
            records.append(produced)
        else:
            records.extend(produced)
    return pd.DataFrame(records)


In [None]:
# ### get_data_XXX (functions)

# In[ ]:

def _click_rarity_features(user_sample, test_set, event_code, rarity_maps, width, height, prefix):
    """Shared implementation of ``get_data_distraction`` and ``get_data_4035``.

    Walks the game sessions of one installation in order of appearance. For
    every non-Clip session the clicks with ``event_code`` are located on a
    ``height x width`` grid and looked up in the session title's map from
    ``rarity_maps`` (key ``'<title>_<event_code>'``). Before the clicks of an
    assessment session are added, a snapshot of the history so far is
    recorded: the overall mean, and the mean and count per title, under
    ``'<prefix>_mean'``, ``'<prefix>_title<t>'`` and
    ``'<prefix>_count_title<t>'``.

    :return: the last snapshot when ``test_set`` is true, else the list of snapshots
    """
    all_assessments = []
    features = {}
    rarity_total = np.array([])
    rarity_by_title = {title: np.array([]) for title in titles}
    counter = 0
    for _, session in user_sample.groupby('game_session', sort=False):
        kid_id, session_type, session_title, session_world, second_condition, features = session_setup(session, test_set, features)
        if (session_type == 'Assessment') & (second_condition):
            # Means of empty histories are NaN, which marks "no clicks so far".
            features[f'{prefix}_mean'] = np.mean(rarity_total)
            features.update({f'{prefix}_title{title_}': np.mean(rarity_by_title[title_]) for title_ in titles})
            features.update({f'{prefix}_count_title{title_}': len(rarity_by_title[title_]) for title_ in titles})
            all_assessments, features, counter = append_all_assessments(all_assessments, features, session, test_set, counter)

        if (session_type != 'Clip') & (second_condition):
            payloads = session.loc[session['event_code'] == event_code, 'event_data']
            coordinates = payloads.map(lambda raw: json.loads(raw)['coordinates'])
            cols = (coordinates.map(lambda d: d['x']) / coordinates.map(lambda d: d['stage_width']) * width).map(np.floor).map(int)
            rows = (coordinates.map(lambda d: d['y']) / coordinates.map(lambda d: d['stage_height']) * height).map(np.floor).map(int)
            cols = cols.clip(0, width - 1)
            rows = rows.clip(0, height - 1)
            if len(rows):
                # The maps are indexed [row (y), column (x)].
                grid = rarity_maps[f'{session_title}_{event_code}']
                session_rarity = grid[rows.to_numpy(dtype=int), cols.to_numpy(dtype=int)]
            else:
                # No matching click: nothing to look up for this session.
                session_rarity = np.array([])
            rarity_total = np.concatenate([rarity_total, session_rarity])
            rarity_by_title[session_title] = np.concatenate([rarity_by_title[session_title], session_rarity])

    if test_set:
        return all_assessments[-1]
    return all_assessments


def get_data_distraction(user_sample, test_set=False):
    """Click-rarity features from event code 4070 (``heatmaps``, ``w`` x ``h`` grid).

    Feature names start with ``'distraction'``.
    """
    return _click_rarity_features(user_sample, test_set, 4070, heatmaps, w, h, 'distraction')


def get_data_4035(user_sample, test_set=False):
    """Click-rarity features from event code 4035 (``heatmaps4035``, ``w2`` x ``h2`` grid).

    Feature names start with ``'4035'``.
    """
    return _click_rarity_features(user_sample, test_set, 4035, heatmaps4035, w2, h2, '4035')


def get_data_basic(user_sample, test_set=False):
    """Build count and accuracy history features for one installation.

    Game sessions are visited in order of first appearance. Running counters
    (sessions per type and per title, events per event code and per event id,
    attempts, accuracy) are updated AFTER each session, so the snapshot taken
    at an assessment session describes only the sessions before it — except
    'accuracy' and 'accuracy_group', which are computed from the assessment
    session itself.

    :param user_sample: rows of a single installation
    :param test_set: when true, return only the last snapshot
    :return: list of feature dicts (one per recorded assessment), or the last
             dict when ``test_set`` is true
    """
    all_assessments = []
    features = {}
    # One counter per session type, title code, event code and event id.
    user_activities_count = {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}
    user_activities_count.update({'title_' + str(title): 0 for title in titles})
    user_activities_count.update({'event_' + str(event): 0 for event in event_codes})
    user_activities_count.update({'eventid_' + str(eventid): 0 for eventid in eventids})
    # Number of previous assessments that ended in each accuracy group.
    accuracy_groups = {0:0, 1:0, 2:0, 3:0}
    accumulated_accuracy_group = 0
    accumulated_accuracy=0
    accumulated_correct_attempts = 0 
    accumulated_uncorrect_attempts = 0 
    accumulated_actions = 0
    counter = 0    
    # NOTE(review): presumably the title codes (activities_map values) that belong to each world — confirm.
    world_title = {'CRYSTALCAVES':set([34, 42, 40, 24, 37,  3, 39, 10, 43,  6, 29, 38, 30, 21, 18]),
    'MAGMAPEAK':set([28, 32,  4, 16,  0, 36, 15, 33, 12, 17, 41]), 
    'TREETOPCITY':set([2,  1,  8, 19, 14, 23,  9,  7,  5, 20, 13, 25, 22, 27, 26, 31, 11])}
    # Titles of the sessions seen so far, in order.
    title_order = []
    for i, session in user_sample.groupby('game_session', sort=False):
        kid_id, session_type, session_title, session_world, second_condition, features= session_setup(session, test_set, features)
        if (session_type == 'Assessment') & (second_condition):
            # Attempts of this assessment: rows with the title's attempt code,
            # split by whether the payload contains 'true' or 'false'.
            all_attempts = session.query(f'event_code == {win_code[session_title]}')
            true_attempts = all_attempts['event_data'].str.contains('true').sum()
            false_attempts = all_attempts['event_data'].str.contains('false').sum()
            features.update(user_activities_count.copy())
            features['session_title'] = session['title'].iloc[0] 
            features['session_world'] = session['world'].iloc[0]
            features['nunique_title'] = len(set(title_order))
            features['nunique_title_in_this_world']  = len(set(title_order) & world_title[session_world])
            # History features are written before the accumulators absorb this session.
            features['accumulated_correct_attempts'] = accumulated_correct_attempts
            features['accumulated_uncorrect_attempts'] = accumulated_uncorrect_attempts
            accumulated_correct_attempts += true_attempts 
            accumulated_uncorrect_attempts += false_attempts
            # -9 marks "no previous assessment".
            features['accumulated_accuracy'] = accumulated_accuracy/counter if counter > 0 else -9
            accuracy = true_attempts/(true_attempts+false_attempts) if (true_attempts+false_attempts) != 0 else 0
            accumulated_accuracy += accuracy
            if accuracy == 0:
                features['accuracy_group'] = 0
            elif accuracy == 1:
                features['accuracy_group'] = 3
            elif accuracy == 0.5:
                features['accuracy_group'] = 2 # the assessment was solved on the second attempt
            else:
                features['accuracy_group'] = 1 # the assessment was solved after 3 or more attempts
            # Adds the integer keys 0..3 (counts of previous assessments per group).
            features.update(accuracy_groups.copy())
            features['accuracy'] = accuracy
            features['accumulated_accuracy_group'] = accumulated_accuracy_group/counter if counter > 0 else -9
            features['accumulated_actions'] = accumulated_actions
            accumulated_accuracy_group += features['accuracy_group']
            accuracy_groups[features['accuracy_group']] += 1
            all_assessments, features, counter = append_all_assessments(all_assessments,  features, session,test_set, counter)
        
        # Fold this session into the running counters (applies to every session type).
        dict_ = session['event_code'].value_counts().to_dict()
        dict_ =  {f'event_{str(e)}': user_activities_count[f'event_{str(e)}'] + dict_[e] for e in dict_}
        user_activities_count.update(dict_.copy())
        dict_ = session['event_id'].value_counts().to_dict()
        dict_ =  {f'eventid_{str(e)}': user_activities_count[f'eventid_{str(e)}'] + dict_[e] for e in dict_}
        user_activities_count.update(dict_.copy())      
        user_activities_count[session_type] += 1
        user_activities_count['title_' + str(session_title)] += 1
        accumulated_actions += len(session)
        
        title_order += [session_title]
    if test_set:
        return all_assessments[-1] 
    return all_assessments

def get_data_duration(user_sample, test_set=False):
    """Build duration features for every qualifying assessment of one user.

    Sessions are visited in log order (``groupby('game_session', sort=False)``).
    For each session whose type is ``'Assessment'`` and for which
    ``session_setup`` reports ``second_condition``, the durations accumulated
    from *earlier* sessions are summarised into ``features`` and the row is
    handed to ``append_all_assessments``.

    Parameters
    ----------
    user_sample : pandas.DataFrame
        Events of a single user. Must expose ``timestamp``, ``game_session``
        and ``event_data``-independent columns used by ``session_setup``.
        The value at column position 2 is used as the event time
        (presumably the same ``timestamp`` column -- TODO confirm).
    test_set : bool, default False
        Forwarded to ``session_setup`` / ``append_all_assessments``. When true
        only the last collected row is returned.

    Returns
    -------
    list or object
        All collected assessment rows, or just the last one when ``test_set``
        is true (raises ``IndexError`` if nothing was collected).
    """
    user_sample_copy = user_sample.copy()
    # Gap to the following event (NaN on the very last row); used to time clips.
    user_sample_copy['time_till_next_action'] = -user_sample_copy['timestamp'].diff(-1)
    all_assessments = []
    features = {}
    clip_durations = []
    activity_durations = []
    assessment_durations = []
    game_durations = []
    title_duration = {title: [] for title in titles}
    # (feature prefix, running list) pairs; order fixes the feature key order.
    duration_pools = (
        ('clip', clip_durations),
        ('activity', activity_durations),
        ('assessment', assessment_durations),
        ('game', game_durations),
    )
    counter = 0
    for i, session in user_sample_copy.groupby('game_session', sort=False):
        kid_id, session_type, session_title, session_world, second_condition, features = session_setup(session, test_set, features)
        # Last minus first value of the column at position 2.
        raw_span = session.iloc[-1, 2] - session.iloc[0, 2]
        sessoion_duration = np.clip(raw_span, 0, 1000)
        if session_type == 'Clip':
            # Clips are timed by the gaps to the next action, capped at the
            # clip's length from clip_length_map.
            sessoion_duration = session['time_till_next_action'].fillna(0).clip(0, clip_length_map[session_title]).sum()
            clip_durations.append(sessoion_duration)
        if (session_type == 'Game') & (second_condition):
            game_durations.append(raw_span)
        if (session_type == 'Activity') & (second_condition):
            activity_durations.append(raw_span)
        if (session_type == 'Assessment') & (second_condition):
            for prefix, durations in duration_pools:
                series = pd.Series(durations)
                # mean/max of an empty history are NaN; the sum is forced to 0.
                features[f'{prefix}_duration_mean'] = series.mean()
                features[f'{prefix}_duration_max'] = series.max()
                features[f'{prefix}_duration_sum'] = series.sum() if durations else 0
            title_duration_mean = {f'title{title}_duration_mean': np.mean(title_duration[title]) if title_duration[title] else -9 for title in titles}
            # Fixed: this used np.std although the feature is named "_max".
            title_duration_max = {f'title{title}_duration_max': np.max(title_duration[title]) if title_duration[title] else -9 for title in titles}
            title_duration_sum = {f'title{title}_duration_sum': np.sum(title_duration[title]) if title_duration[title] else 0 for title in titles}
            features.update(title_duration_mean)
            features.update(title_duration_max)
            features.update(title_duration_sum)
            # Update the history only after the snapshot so the current
            # assessment never contributes to its own features.
            assessment_durations.append(np.clip(raw_span, 0, 1000))
            all_assessments, features, counter = append_all_assessments(all_assessments, features, session, test_set, counter)

        # Record this session's duration under its title for later assessments.
        title_duration[session_title].append(sessoion_duration)
    if test_set:
        return all_assessments[-1]
    return all_assessments

def get_data_correcy(user_sample, test_set=False):
    """Build correctness ("correcy") features for each qualifying assessment.

    For every ``'Game'`` or ``'Assessment'`` session the ``correct`` flags found
    in the ``event_data`` JSON payloads are collected as a list of 0/1 values.
    When an assessment session with ``second_condition`` is reached, the lists
    gathered from *earlier* sessions are summarised (pooled mean, pooled count,
    mean/std/max of per-session means, share of sessions with any attempt) and
    the row is passed to ``append_all_assessments``.

    Parameters
    ----------
    user_sample : pandas.DataFrame
        Events of one user with ``game_session`` and ``event_data`` columns
        (plus whatever ``session_setup`` reads).
    test_set : bool, default False
        Forwarded to the helpers; when true only the last row is returned.

    Returns
    -------
    list or object
        All collected rows, or the last one when ``test_set`` is true
        (raises ``IndexError`` if nothing was collected).
    """
    all_assessments = []
    features = {}
    correcy_assessment = []
    correcy_game = []
    correcy_all = []
    game_assessment = [0, 4, 8, 10, 11, 13, 17, 18, 20, 21, 27, 29, 33, 41, 42, 43]
    assessments = [11, 20, 21, 29, 41]
    correcy_title = {e: [] for e in game_assessment}  # per-title history of flag lists
    counter = 0

    def pooled_mean(history, empty_value):
        # Mean over every flag of every session; sentinel when no session yet.
        return np.mean(np.concatenate(history)) if history else empty_value

    def pooled_count(history):
        # Number of flags over all sessions; 0 when no session yet.
        return len(np.concatenate(history)) if history else 0

    def session_means(history):
        # One mean per session, skipping sessions without any flag.
        return [np.mean(e) for e in history if e]

    def tried_ratio(history):
        # Share of sessions that contain at least one flag.
        return sum(1 for e in history if e) / len(history) if history else -9

    for i, session in user_sample.groupby('game_session', sort=False):
        kid_id, session_type, session_title, session_world, second_condition, features = session_setup(session, test_set, features)
        if (session_type == 'Assessment') & (second_condition):
            features['correcy_all_mean'] = pooled_mean(correcy_all, -1)
            features['correcy_all_count'] = pooled_count(correcy_all)
            # np.mean / np.std of an empty list yield NaN (filled downstream).
            features['correcy_all_mean_mean'] = np.mean(session_means(correcy_all))
            # Fixed: this was a copy of the line above and used np.mean.
            features['correcy_all_mean_std'] = np.std(session_means(correcy_all))

            features['correcy_game_mean'] = pooled_mean(correcy_game, -1)
            features['correcy_game_count'] = pooled_count(correcy_game)
            features['correcy_game_mean_mean'] = np.mean(session_means(correcy_game))

            features['correcy_assessment_mean'] = pooled_mean(correcy_assessment, -1)
            features['correcy_assessment_count'] = pooled_count(correcy_assessment)
            features['correcy_assessment_mean_mean'] = np.mean(session_means(correcy_assessment))

            correcy_title_mean = {f'correcy_mean_title{title}': pooled_mean(correcy_title[title], -9) for title in assessments}
            correcy_title_count = {f'correcy_count_title{title}': pooled_count(correcy_title[title]) for title in assessments}
            correcy_title_mean_mean = {f'correcy_mean_mean_title{title}': np.mean(session_means(correcy_title[title])) for title in assessments}
            correcy_title_max = {f'correcy_max_title{title}': np.max(session_means(correcy_title[title])) if session_means(correcy_title[title]) else -9
                                 for title in assessments}
            features.update(correcy_title_mean)
            features.update(correcy_title_count)
            features.update(correcy_title_mean_mean)
            features.update(correcy_title_max)

            features['assessment_tried_ratio'] = tried_ratio(correcy_assessment)
            features['game_tried_ratio'] = tried_ratio(correcy_game)
            all_assessments, features, counter = append_all_assessments(all_assessments, features, session, test_set, counter)
        if session_type in ['Assessment', 'Game']:
            # Parse each payload once; -1 marks events without a 'correct' key.
            payloads = session['event_data'].map(json.loads)
            correct = payloads.map(lambda d: d['correct'] if 'correct' in d else -1)
            correct = list(correct[correct != -1].replace([True, False], [1, 0]))
            # Histories are updated after the snapshot so an assessment never
            # sees its own attempts.
            correcy_all.append(correct.copy())
            correcy_title[session_title].append(correct.copy())
            if session_type == 'Game':
                correcy_game.append(correct.copy())
            else:
                correcy_assessment.append(correct.copy())
    if test_set:
        return all_assessments[-1]
    return all_assessments

def get_data_lag(user_sample, test_set=False):
    """Build "lag" features describing what preceded each qualifying assessment.

    For every assessment session with ``second_condition`` the row records:
    time since the same title was last seen, accuracy and number of attempts
    of the previous assessment of that title, the trail of the last titles
    (``title_name_lag2/3/4``), the previous world, the user's first
    world/title/type and the hour / weekday of the session start.

    Parameters
    ----------
    user_sample : pandas.DataFrame
        Events of one user with ``world``, ``title``, ``type``, ``timestamp``,
        ``game_session`` and ``event_data`` columns. The value at column
        position 2 is used as the event time (presumably ``timestamp`` --
        TODO confirm).
    test_set : bool, default False
        Forwarded to the helpers; when true only the last row is returned.

    Returns
    -------
    list or object
        All collected rows, or the last one when ``test_set`` is true
        (raises ``IndexError`` if nothing was collected).
    """
    all_assessments = []
    features = {}
    title_lasttime = {title: start_time for title in titles}  # last time each title was seen
    title_acc_lasttime = {title: -1 for title in titles}
    title_attemp_lasttime = {title: -1 for title in titles}
    world_name_lasttime = 'start'
    title_trace = [-1]  # -1 is a sentinel for "no previous title"

    # First world that is not 'NONE' and first title that is not 35, with the
    # excluded value itself as fallback when nothing else exists.
    known_worlds = user_sample.loc[user_sample['world'] != 'NONE', 'world'].values
    first_session_world = known_worlds[0] if len(known_worlds) else 'NONE'
    other_titles = user_sample.loc[user_sample['title'] != 35, 'title'].values
    first_session_title = other_titles[0] if len(other_titles) else 35
    # NOTE(review): the original also computed a "type of the first title != 35"
    # value but immediately overwrote it with the line below; the effective
    # behaviour (type of the very first event) is kept.
    first_session_type = user_sample['type'].values[0]

    counter = 0
    for i, session in user_sample.groupby('game_session', sort=False):
        kid_id, session_type, session_title, session_world, second_condition, features = session_setup(session, test_set, features)
        if (session_type == 'Assessment') & (second_condition):
            features['title_lasttime'] = session.iloc[0, 2] - title_lasttime[session_title]
            features['title_acc_lasttime'] = title_acc_lasttime[session_title]
            features['title_attemp_lasttime'] = title_attemp_lasttime[session_title]
            features['title_name_lag2'] = f'{title_trace[-1]}_{session_title}'
            # The length checks include the -1 sentinel, so lag3 / lag4 need
            # two / three real previous titles.
            features['title_name_lag3'] = f'{title_trace[-2]}_{title_trace[-1]}_{session_title}' if len(title_trace) >= 3 else 'NAN'
            features['title_name_lag4'] = f'{title_trace[-3]}_{title_trace[-2]}_{title_trace[-1]}_{session_title}' if len(title_trace) >= 4 else 'NAN'
            features['title_name_lasttime'] = title_trace[-1]
            features['world_name_lasttime'] = world_name_lasttime
            features['first_session_world'] = first_session_world
            features['first_session_title'] = first_session_title
            features['first_session_type'] = first_session_type
            # NOTE(review): fromtimestamp depends on the machine's local
            # timezone -- confirm that is intended.
            session_start = pd.Timestamp.fromtimestamp(session['timestamp'].iloc[0])
            features['hour'] = session_start.hour
            features['dow'] = session_start.weekday()
            all_assessments, features, counter = append_all_assessments(all_assessments, features, session, test_set, counter)

        if session_type == 'Assessment':
            # Parse each payload once; -1 marks events without a 'correct' key.
            payloads = session['event_data'].map(json.loads)
            correct = payloads.map(lambda d: d['correct'] if 'correct' in d else -1)
            correct = list(correct[correct != -1].replace([True, False], [1, 0]))
            title_acc_lasttime[session_title] = np.mean(correct) if correct else -9
            title_attemp_lasttime[session_title] = len(correct)
        # State is advanced after the snapshot so a row only sees the past.
        title_lasttime[session_title] = session.iloc[-1, 2]
        title_trace.append(session_title)
        world_name_lasttime = session_world
    if test_set:
        return all_assessments[-1]
    return all_assessments



def get_data_MissRoundLevel(user_sample, test_set=False):
    """Build per-title miss / round / level features for each qualifying assessment.

    Every session's ``event_data`` payloads are scanned for the keys
    ``misses`` (with ``duration``), ``correct``, ``round`` and ``level``. The
    running per-title totals gathered from *earlier* sessions are written into
    the feature row when an assessment session with ``second_condition`` is
    reached.

    Parameters
    ----------
    user_sample : pandas.DataFrame
        Events of one user with ``game_session`` and ``event_data`` columns
        (plus whatever ``session_setup`` reads).
    test_set : bool, default False
        Forwarded to the helpers; when true only the last row is returned.

    Returns
    -------
    list or object
        All collected rows, or the last one when ``test_set`` is true
        (raises ``IndexError`` if nothing was collected).
    """
    all_assessments = []
    features = {}
    title_misses_sum = {f'title_misses_sum' + str(title): 0 for title in titles}
    title_misses_order = {f'title_misses_order' + str(title): [] for title in titles}
    title_misses_duration_order = {f'title_misses_duration_order' + str(title): [] for title in titles}
    title_attemps_sum = {f'title_attemps_sum' + str(title): 0 for title in titles}
    title_rounds_sum = {f'title_rounds_sum' + str(title): 0 for title in titles}
    title_level_max = {f'title_level_max' + str(title): 0 for title in titles}
    counter = 0

    # A missing key or an unusable value simply means "this event does not
    # contribute"; anything else (e.g. KeyboardInterrupt) must propagate.
    skippable = (KeyError, TypeError, ValueError)

    def cnt_(df):
        """Aggregate misses, correct flags, max round and max level of one session."""
        cnt_miss = 0
        cnt_miss_order = []
        cnt_miss_duration = []
        cnt_attmpts = 0
        cnt_rounds = 0
        cnt_level = 0
        for raw in df['event_data']:
            # Decode once instead of once per inspected key.
            try:
                payload = json.loads(raw)
            except (TypeError, ValueError):
                continue
            try:
                y = payload['misses']
                cnt_miss_order.append(y)
                # If 'duration' is absent the miss stays in cnt_miss_order but
                # is not added to the duration list or the total (as before).
                cnt_miss_duration.append(np.clip(payload['duration'], 0, 300000))
                cnt_miss += y
            except skippable:
                pass
            try:
                # Adds the 'correct' flag itself, i.e. counts correct answers.
                cnt_attmpts += payload['correct']
            except skippable:
                pass
            try:
                cnt_rounds = np.max([cnt_rounds, payload['round']])
            except skippable:
                pass
            try:
                cnt_level = np.max([cnt_level, payload['level']])
            except skippable:
                pass
        return {'cnt_miss': cnt_miss,
                'cnt_miss_order': cnt_miss_order,
                'cnt_miss_duration': cnt_miss_duration,
                'cnt_attmpts': cnt_attmpts,
                'cnt_rounds': cnt_rounds,
                'cnt_level': cnt_level}

    for i, session in user_sample.groupby('game_session', sort=False):
        kid_id, session_type, session_title, session_world, second_condition, features = session_setup(session, test_set, features)

        if (session_type == 'Assessment') & (second_condition):
            features.update(title_misses_sum.copy())
            # np.mean / np.std of an empty history give NaN (filled downstream).
            features.update({e + '_mean': np.mean(title_misses_order[e]) for e in title_misses_order})
            features.update({e + '_std': np.std(title_misses_order[e]) for e in title_misses_order})
            features.update({e + '_mean': np.mean(title_misses_duration_order[e]) for e in title_misses_duration_order})
            features.update(title_attemps_sum.copy())
            features.update(title_rounds_sum.copy())
            features.update(title_level_max.copy())
            all_assessments, features, counter = append_all_assessments(all_assessments, features, session, test_set, counter)

        # Fold this session into the running totals only after the snapshot.
        counts_ = cnt_(session)
        title_key = str(session_title)
        title_misses_sum[f'title_misses_sum' + title_key] += counts_['cnt_miss']
        title_misses_order[f'title_misses_order' + title_key] += counts_['cnt_miss_order']
        title_misses_duration_order[f'title_misses_duration_order' + title_key] += counts_['cnt_miss_duration']
        title_attemps_sum[f'title_attemps_sum' + title_key] += counts_['cnt_attmpts']
        title_rounds_sum[f'title_rounds_sum' + title_key] += counts_['cnt_rounds']
        title_level_max[f'title_level_max' + title_key] = np.max([title_level_max[f'title_level_max' + title_key], counts_['cnt_level']])

    if test_set:
        return all_assessments[-1]
    return all_assessments

In [None]:
# ### compile train
# In[ ]:


In [None]:
# Build one training feature table per feature family with compile_data,
# write each table to a CSV in the current working directory, and print
# the resulting shapes.
# `if True:` is presumably a manual switch for skipping the recomputation.
if True:
    new_train_distraction = compile_data(train, get_data_distraction)
    new_train_4035 = compile_data(train, get_data_4035)
    new_train_basic = compile_data(train, get_data_basic)
    new_train_duration = compile_data(train, get_data_duration)
    new_train_correcy = compile_data(train, get_data_correcy)
    new_train_lag = compile_data(train, get_data_lag)
    new_train_MissRoundLevel = compile_data(train, get_data_MissRoundLevel)

    # Persist every table so later cells can reload them from disk.
    new_train_distraction.to_csv('new_train_distraction.csv', index = False)
    new_train_4035.to_csv('new_train_4035.csv', index = False)
    new_train_basic.to_csv('new_train_basic.csv', index = False)
    new_train_duration.to_csv('new_train_duration.csv', index = False)
    new_train_correcy.to_csv('new_train_correcy.csv', index = False)
    new_train_lag.to_csv('new_train_lag.csv', index = False)
    new_train_MissRoundLevel.to_csv('new_train_MissRoundLevel.csv', index = False)

    # Shape report (new_train_4035 is not printed here).
    print(new_train_distraction.shape)
    print(new_train_basic.shape)
    print(new_train_duration.shape)
    print(new_train_correcy.shape)
    print(new_train_lag.shape)
    print(new_train_MissRoundLevel.shape)

## add more features (function)


In [None]:
# ### add more features (function)

# In[ ]:


# Title ids treated as assessments / games by the derived features below.
assessments = [11, 20, 21, 29, 41]
games = [0, 4, 8, 10, 13, 17, 18, 27, 33, 42, 43]


def add_more_feature(df, test=False):
    """Add ratio / indicator features derived from already compiled columns.

    All new columns are computed from ``df`` itself. The previous version read
    several inputs from the global ``new_train`` frame, so calling it on the
    test frame filled the test rows with index-aligned *training* values; it
    also divided title 21's accuracy by every assessment's duration inside the
    per-assessment loop.

    Parameters
    ----------
    df : pandas.DataFrame
        Concatenated feature table. It is modified in place and returned.
        Requires the ``event_<code>`` columns for every code in the
        module-level ``event_codes`` plus the correctness / duration / miss
        columns referenced below.
    test : bool, default False
        Unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the extra columns.
    """
    df['event_sum'] = df[['event_' + str(event) for event in event_codes]].sum(axis=1)
    # Only +inf is mapped to -9 here; NaN (0/0) is left for the caller's fillna.
    df['event_4070_ratio'] = (df['event_4070'] / df['event_sum']).replace(float('inf'), -9)
    df['123'] = df['1'] + df['2'] + df['3']
    df['123ratio'] = (df['123'] / (df['123'] + df['0'])).replace(float('inf'), -9)
    df['session_sum'] = df['Clip'] + df['Game'] + df['Activity'] + df['Assessment']

    for e in assessments:
        title = str(e)
        df['max_correcy_bin1_title' + title] = np.where(df['correcy_max_title' + title] == 1, 1, 0)
        df['max_correcy_bin2_title' + title] = np.where(df['correcy_max_title' + title] > 0, 1, 0)
        df['title_count_bin_title' + title] = np.where(df['title_' + title] > 0, 1, 0)
        # Fixed: the numerator was hard-coded to title 21 for every assessment.
        df['assessments_speed_acc' + title] = df['correcy_mean_mean_title' + title] / df['title' + title + '_duration_mean']

    for e in games:
        title = str(e)
        # Fixed: read from df, not from the global new_train.
        df['speed_acc_' + title] = df['title_misses_order' + title + '_mean'] / df['title_misses_duration_order' + title + '_mean']
    df['assessments_speed_acc'] = df['correcy_assessment_mean_mean'] / df['assessment_duration_mean']
    df['game_speed_acc'] = df['correcy_game_mean_mean'] / df['game_duration_mean']
    print(df.shape)
    return df


# #

## concat new_train

In [None]:
# In[ ]:


# Reload the cached per-family training tables: from the current directory
# when running locally, from /kaggle/working/ when on_kaggle is set.
if not on_kaggle:
    new_train_distraction= pd.read_csv('new_train_distraction.csv')
    new_train_4035 =  pd.read_csv('new_train_4035.csv')
    new_train_basic= pd.read_csv('new_train_basic.csv')
    new_train_duration= pd.read_csv('new_train_duration.csv')
    new_train_correcy= pd.read_csv('new_train_correcy.csv')
    new_train_lag= pd.read_csv('new_train_lag.csv')
    new_train_MissRoundLevel= pd.read_csv('new_train_MissRoundLevel.csv')
else:
    new_train_distraction = pd.read_csv('/kaggle/working/new_train_distraction.csv')
    new_train_4035 =  pd.read_csv('/kaggle/working/new_train_4035.csv')
    new_train_basic = pd.read_csv('/kaggle/working/new_train_basic.csv')
    new_train_duration = pd.read_csv('/kaggle/working/new_train_duration.csv')
    new_train_correcy = pd.read_csv('/kaggle/working/new_train_correcy.csv') 
    new_train_lag =  pd.read_csv('/kaggle/working/new_train_lag.csv')
    new_train_MissRoundLevel =  pd.read_csv('/kaggle/working/new_train_MissRoundLevel.csv')
    
# Place the tables side by side (axis=1 aligns on the row index) and keep
# only the first occurrence of every repeated column name.
new_train  =  pd.concat([new_train_distraction,
                   new_train_basic,
                   new_train_duration,
                   new_train_correcy,
                   new_train_lag,
                   new_train_MissRoundLevel,
                   new_train_4035], axis= 1)
print(new_train.shape)
new_train = new_train.loc[:,~new_train.columns.duplicated()]
print(new_train.shape)


# In[ ]:


# Rebuild the combined training table from the per-family tables so this cell
# can be re-run on its own, then derive extra features, fill gaps with the -9
# sentinel, label-encode the string categoricals and define the feature lists.
new_train = pd.concat([new_train_distraction,
                       new_train_basic,
                       new_train_duration,
                       new_train_correcy,
                       new_train_lag,
                       new_train_MissRoundLevel,
                       new_train_4035], axis=1)
print(new_train.shape)
# Keep only the first occurrence of every repeated column name.
new_train = new_train.loc[:, ~new_train.columns.duplicated()]
print(new_train.shape)
new_train = add_more_feature(new_train)

new_train = new_train.fillna(-9)
# The raw event log may already have been deleted by an earlier run.
try:
    del train
except NameError:
    pass
gc.collect()
# Same explicit check the test cell uses, instead of a bare try/except.
if 'Unnamed: 0' in new_train.columns:
    new_train.drop(columns='Unnamed: 0', inplace=True)
cats = ['session_world', 'first_session_world', 'first_session_type', 'title_name_lag2', 'title_name_lag3', 'title_name_lag4', 'world_name_lasttime']
# One fitted encoder per column; `les` is reused to transform the test table.
les = {col: LabelEncoderExt() for col in cats}
for col in cats:
    les[col].fit(new_train[col])
    new_train[col] = les[col].transform(new_train[col])

# Halve the memory footprint of float columns.
for e in list(new_train.columns):
    if new_train[e].dtypes == np.float64:
        new_train[e] = new_train[e].astype(np.float32)

# Everything except the target, the raw accuracy and the identifiers.
all_features = [x for x in new_train.columns if x not in ['accuracy_group', 'kid_id', 'game_session', 'accuracy']]
cat_features = ['first_session_world', 'first_session_title', 'first_session_type',
'session_title', 'session_world', 'title_name_lag2', 'title_name_lag3', 'title_name_lag4', 'hour', 'dow',
'world_name_lasttime', 'title_name_lasttime']
# Independent copies so each model family can prune its own lists.
all_features_lgbm = all_features.copy()
cat_features_lgbm = cat_features.copy()
all_features_catboost = all_features.copy()
cat_features_catboost = cat_features.copy()

## compile test

In [None]:
# #

# In[ ]:
# Read the raw test event log from the current directory.
test = pd.read_csv('test.csv')

# Build one test feature table per feature family (test_set=True is passed
# through to the get_data_* builders) and cache each table as a CSV.
# `if True:` is presumably a manual switch for skipping the recomputation.
if True:
#     print(cpu_stats())
    new_test_distraction = compile_data(test, get_data_distraction, test_set=True)
#     print(cpu_stats())
    new_test_4035 = compile_data(test, get_data_4035, test_set=True)
#     print(cpu_stats())
    new_test_basic = compile_data(test, get_data_basic, test_set=True)
#     print(cpu_stats())
    new_test_duration = compile_data(test, get_data_duration, test_set=True)
    new_test_correcy = compile_data(test, get_data_correcy, test_set=True)
    new_test_lag = compile_data(test, get_data_lag, test_set=True)
    new_test_MissRoundLevel = compile_data(test, get_data_MissRoundLevel, test_set=True)
    
    new_test_distraction.to_csv('new_test_distraction.csv', index = False)
    new_test_4035.to_csv('new_test_4035.csv', index = False)
    new_test_basic.to_csv('new_test_basic.csv', index = False)
    new_test_duration.to_csv('new_test_duration.csv', index = False)
    new_test_correcy.to_csv('new_test_correcy.csv', index = False)
    new_test_lag.to_csv('new_test_lag.csv', index = False)
    new_test_MissRoundLevel.to_csv('new_test_MissRoundLevel.csv', index = False)

## concat X_test

In [None]:
# ### concat X_test

# In[ ]:


# Reload the cached per-family test tables from the current directory
# (unlike the training cell there is no on_kaggle path switch here).
new_test_distraction= pd.read_csv('new_test_distraction.csv')
new_test_4035= pd.read_csv('new_test_4035.csv')
new_test_basic= pd.read_csv('new_test_basic.csv')
new_test_duration= pd.read_csv('new_test_duration.csv')
new_test_correcy= pd.read_csv('new_test_correcy.csv')
new_test_lag= pd.read_csv('new_test_lag.csv')
new_test_MissRoundLevel= pd.read_csv('new_test_MissRoundLevel.csv')
# Same family order as the training table; keep the first of any repeated
# column name.
new_test  =  pd.concat([new_test_distraction,
                   new_test_basic,
                   new_test_duration,
                   new_test_correcy,
                   new_test_lag,
                   new_test_MissRoundLevel,
                   new_test_4035], axis= 1)
new_test = new_test.loc[:,~new_test.columns.duplicated()]
print(new_test.shape)


# In[ ]:


# Rebuild the combined test table so this cell can be re-run on its own, then
# apply the same post-processing as for training: derived features, -9 fill,
# label encoding with the encoders fitted on the training table, float32 cast.
new_test = pd.concat([new_test_distraction,
                      new_test_basic,
                      new_test_duration,
                      new_test_correcy,
                      new_test_lag,
                      new_test_MissRoundLevel,
                      new_test_4035], axis=1)
# Keep only the first occurrence of every repeated column name.
new_test = new_test.loc[:, ~new_test.columns.duplicated()]
print(new_test.shape)

new_test = add_more_feature(new_test, test=True)
new_test = new_test.fillna(-9)
# The raw event log may already have been deleted by an earlier run.
try:
    del test
except NameError:
    pass
if 'Unnamed: 0' in new_test.columns:
    new_test.drop(columns='Unnamed: 0', inplace=True)
# Reuse the encoders in `les`; they are never re-fitted on test data.
for col in cats:
    new_test[col] = les[col].transform(new_test[col])
for e in list(new_test.columns):
    if new_test[e].dtypes == np.float64:
        new_test[e] = new_test[e].astype(np.float32)
        


# In[ ]:


# Frequency-encode title_name_lag2: each encoded value is mapped to its
# number of occurrences in train and test combined.
cols = ['title_name_lag2']
for e in cols:
    dict_ = pd.concat([new_test[e] ,new_train[e]]).value_counts().to_dict()
    new_train[f'{e}_freq'] = new_train[e].map(dict_)
    new_test[f'{e}_freq'] = new_test[e].map(dict_)
# NOTE: `+=` mutates all_features in place, so re-running this cell appends
# the same names again.
all_features += [f'{e}_freq' for e in cols]

# Hand-picked label-encoded title_name_lag2 codes flagged as "skip" / "repeat".
# NOTE(review): these integers depend on the fitted label encoder -- confirm
# they still match after any change to the encoding.
if_skip = set([119, 187, 116, 180 , 47])
if_repeat = set([14, 63, 69, 108, 171])
new_train['if_skip'] = np.where(new_train['title_name_lag2'].isin(if_skip).values,1,0)
new_train['if_repeat'] = np.where(new_train['title_name_lag2'].isin(if_repeat).values,1,0)
new_test['if_skip'] = np.where(new_test['title_name_lag2'].isin(if_skip).values,1,0)
new_test['if_repeat'] = np.where(new_test['title_name_lag2'].isin(if_repeat).values,1,0)

# The two train value_counts() results are computed but neither printed nor
# stored; only the test counts are printed.
new_train['if_skip'].value_counts()
new_train['if_repeat'].value_counts()
print(new_test['if_skip'].value_counts())
print(new_test['if_repeat'].value_counts())


all_features += ['if_skip', 'if_repeat']
# Bare expression: has no effect other than being shown as the cell output.
['title_name_lag2_freq', 'if_skip', 'if_repeat']


# In[ ]:


# Feature matrix / target for training, and an independent copy of the test
# features restricted to the same columns.
X, y = new_train[all_features], new_train['accuracy_group']        
X_test = new_test[all_features].copy()



# In[ ]:

In [None]:
# Separate copy of the training table for the NN preprocessing, which
# rewrites categorical columns in place.
new_train_nn = new_train.copy()
# Encoders / scalers used by the NN preprocessing cells below.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder,MinMaxScaler, StandardScaler

## NN model

In [None]:
all_features =['session_title', 'world_name_lasttime', 'eventid_188', 'title_acc_lasttime', 'correcy_all_mean_mean', 'session_world', 'correcy_all_mean_std', 'eventid_247', 'event_2000', 'Clip', 'accumulated_accuracy_group', 'eventid_385', 'eventid_343', 'eventid_113', 'session_sum', 'correcy_mean_title20', 'eventid_73', 'correcy_max_title20', 'correcy_mean_title11', 'eventid_317', 'accumulated_accuracy', 'title_misses_order21_std', '123ratio', 'correcy_max_title11', '4035_count_title31', 'correcy_mean_mean_title11', 'accumulated_uncorrect_attempts', 'title_name_lag2', 'eventid_134', 'eventid_154', 'correcy_assessment_mean', 'event_4070_ratio', 'distraction_count_title42', 'eventid_351', 'eventid_24', 'title_attemps_sum21', 'title_attemps_sum11', 'correcy_game_mean_mean', 'game_tried_ratio', 'title_misses_duration_order21_mean', 'correcy_mean_title21', 'eventid_54', 'title_14', 'nunique_title', 'correcy_assessment_mean_mean', 'event_2010', 'title34_duration_mean', 'eventid_86', 'correcy_mean_title41', 'correcy_all_mean', 'title42_duration_sum', 'eventid_283', 'correcy_mean_mean_title20', 'eventid_165', 'Game', 'eventid_337', '0', 'correcy_game_mean', 'eventid_101', 'title_misses_order41_mean', 'title34_duration_sum', 'eventid_215', 'event_4010', 'eventid_218', 'distraction_title41', 'clip_duration_sum', 'event_2035', 'eventid_305', 'title_7', 'eventid_296', 'distraction_title4', 'title_4', 'correcy_mean_mean_title29', 'correcy_max_title29', 'correcy_mean_title29', 'event_4070', 'event_3121', 'title1_duration_sum', 'eventid_304', 'title32_duration_max', 'correcy_mean_mean_title21', 'eventid_278', 'title22_duration_sum', 'eventid_185', 'title24_duration_sum', 'eventid_14', 'title_24', '4035_count_title8', 'event_4031', 'eventid_349', 'eventid_326', 'eventid_350', 'eventid_217', 'eventid_15', 'event_3120', 'eventid_309', 'eventid_72', 'nunique_title_in_this_world', 'title_misses_order10_mean', 'title41_duration_sum', 'title_name_lasttime', 'game_duration_max', 
'distraction_mean', 'distraction_title17', 'eventid_212', 'title_rounds_sum4', 'title0_duration_mean', 'title0_duration_sum', 'event_3110', 'title_34', 'distraction_count_title41', 'title_misses_order27_mean', 'eventid_216', 'title_misses_order18_std', 'eventid_242', 'title_misses_duration_order8_mean', 'title21_duration_mean', 'correcy_count_title41', 'title4_duration_max', 'title_37', 'eventid_109', 'title7_duration_sum', 'eventid_380', 'title_misses_sum27', 'eventid_297', 'eventid_299', 'eventid_56', 'event_4020', 'distraction_count_title20', 'title8_duration_sum', 'eventid_175', 'eventid_219', 'eventid_112', 'title26_duration_sum', 'eventid_2', 'title_misses_order8_mean', 'correcy_max_title21', 'title21_duration_sum', 'Activity', 'title_9', 'eventid_194', 'title_1', 'title_misses_duration_order41_mean', 'title_35', 'title8_duration_mean', 'eventid_222', 'event_4090', 'event_3021', 'eventid_237', 'event_4025', 'accumulated_actions', 'title40_duration_mean', 'eventid_174', 'eventid_340', 'distraction_title7', 'eventid_249', 'event_2040', 'title_name_lag3', 'eventid_302', 'event_3020', 'eventid_192', 'correcy_count_title21', 'eventid_311', 'eventid_103', 'distraction_count_title12', 'title42_duration_max', 'eventid_140', 'eventid_120', 'eventid_277', 'event_4030', 'title9_duration_sum', 'title_misses_order42_std', 'distraction_count_title31', 'eventid_375', 'event_3010', 'event_4035', 'title35_duration_sum', 'eventid_177', 'event_2020', 'eventid_9', 'title34_duration_max', 'title_misses_order41_std', 'distraction_title14', 'title_41', 'eventid_282', 'title43_duration_mean', 'title4_duration_sum', 'title8_duration_max', 'distraction_title0', 'distraction_count_title30', 'eventid_220', 'title42_duration_mean', 'title_6', 'title28_duration_max', 'title_misses_order4_mean', 'title9_duration_max', 'title_misses_sum42', 'eventid_125', 'eventid_92', 'title_misses_order43_mean', 'title_lasttime', 'title_misses_sum41', 'eventid_383', 'activity_duration_mean']
all_features += ['title_name_lag2_freq', 'max_correcy_bin1_title11',
 'max_correcy_bin2_title11',
 'title_count_bin_title11',
 'max_correcy_bin1_title20',
 'max_correcy_bin2_title20',
 'title_count_bin_title20',
 'max_correcy_bin1_title21',
 'max_correcy_bin2_title21',
 'title_count_bin_title21',
 'max_correcy_bin1_title29',
 'max_correcy_bin2_title29',
 'title_count_bin_title29',
 'max_correcy_bin1_title41',
 'max_correcy_bin2_title41',
 'title_count_bin_title41','if_skip', 'if_repeat']
# Columns treated as categorical; every one of them is removed from the
# NN's all_features list on the next statement.
cat_features = ['first_session_world', 'first_session_title', 'first_session_type',
'session_title', 'session_world', 'hour', 'dow', 'title_name_lag2', 'title_name_lag3',
'world_name_lasttime', 'title_name_lasttime']
all_features = [e for e in all_features if e not in cat_features]
# Only these two categoricals are one-hot encoded for the NN.
cat_features = ['session_title', 'session_world']
# all_features no longer contains any categorical column at this point, so
# num_features has the same content as all_features.
num_features = [e for e in all_features if e not in cat_features]

print(len(all_features))
print(all_features)
print(cat_features)

In [None]:
# In[ ]:


def num_feature_process(df, num_features):
    """Prepare numeric features for the NN and build missing-value indicators.

    For every column in ``num_features``:

    * if the column contains the sentinel ``-9`` an indicator column
      (1 where the value is ``-9``) is appended to the first indicator matrix
      and the column name is printed;
    * if it contains the sentinel ``-1`` an indicator column is appended to
      the second indicator matrix;
    * all negative values are replaced by 0 and ``log1p`` is applied;
    * columns that end up with at most one distinct value are dropped from
      the numeric matrix (their indicator columns are kept).

    ``df`` itself is not modified.

    NOTE(review): which columns are dropped and which indicators exist depends
    on the data passed in, so two different frames (e.g. train and test) can
    yield matrices with different column counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding at least the columns named in ``num_features``.
    num_features : list of str
        Numeric columns to process, in output order.

    Returns
    -------
    X_num : numpy.ndarray
        Transformed values of the retained columns, shape (n_rows, n_kept).
    negtive9_array : numpy.ndarray
        ``-9`` indicators, shape (n_rows, k); ``np.array([])`` if there are none.
    negtive1_array : numpy.ndarray
        ``-1`` indicators, shape (n_rows, m); ``np.array([])`` if there are none.
    """
    df_copy = df.copy()
    negtive9_columns = []
    negtive1_columns = []
    invalid_features = []
    for e in num_features:
        values = df_copy[e]
        # -9 marks a missing value.
        if (values == -9).any():
            print(e)
            negtive9_columns.append(np.where(values == -9, 1, 0).reshape(-1, 1))

        # -1 marks a missing value as well.
        # Fixed: these columns used to be concatenated onto the -9 matrix.
        if (values == -1).any():
            negtive1_columns.append(np.where(values == -1, 1, 0).reshape(-1, 1))

        # Fill missing / negative entries with 0 (without chained-indexing
        # assignment), then compress the scale.
        df_copy[e] = np.log1p(values.mask(values < 0, 0))
        if df_copy[e].nunique() in [1, 0]:
            invalid_features.append(e)
    X_num = df_copy[[e for e in num_features if e not in invalid_features]].values
    del df_copy
    negtive9_array = np.concatenate(negtive9_columns, axis=1) if negtive9_columns else np.array([])
    negtive1_array = np.concatenate(negtive1_columns, axis=1) if negtive1_columns else np.array([])
    return X_num, negtive9_array, negtive1_array


# cat_features 
def cat_feature_preprocess(df, cat_features, test_set = False, les2= None, onehot_dict2 = None):
    """Label-encode and one-hot encode the categorical columns of ``df``.

    The categorical columns of ``df`` are overwritten IN PLACE: first cast to
    ``str``, then replaced by their label-encoded values.

    :param df: DataFrame containing the columns listed in ``cat_features``
    :param cat_features: names of the categorical columns, in output order
    :param test_set: ``False`` fits new encoders on ``df``; ``True`` reuses
        the encoders passed in ``les2`` / ``onehot_dict2``
    :param les2: dict column -> fitted label encoder (needed when ``test_set``)
    :param onehot_dict2: dict column -> fitted one-hot encoder (needed when
        ``test_set``)
    :return: ``(X_cat, les2, onehot_dict2)`` when fitting, ``X_cat`` alone
        when ``test_set`` is true. ``X_cat`` is the per-column one-hot blocks
        concatenated along axis 1.
    """
    def _stack_onehot(encoders, fit):
        # Encode the columns one by one and glue the blocks side by side.
        # When fitting, each encoder is fitted (and its column name printed)
        # right before it is used to transform.
        stacked = np.array([])
        for name in cat_features:
            if fit:
                print(name)
                encoders[name].fit(np.array(df[name]).reshape(-1, 1))
            block = encoders[name].transform(np.array(df[name]).reshape(-1, 1))
            if len(stacked) == 0:
                stacked = block
            else:
                stacked = np.concatenate([stacked, block], axis=1)
        return stacked

    if test_set:
        # Inference path: reuse the fitted encoders.
        for name in cat_features:
            df[name] = df[name].map(str)
        for name in cat_features:
            df[name] = les2[name].transform(df[name])
        return _stack_onehot(onehot_dict2, fit=False)

    # Training path: fit fresh label encoders, then fresh one-hot encoders.
    les2 = {name: LabelEncoderExt() for name in cat_features}
    for name in cat_features:
        df[name] = df[name].map(str)
    for name in cat_features:
        les2[name].fit(df[name])
        df[name] = les2[name].transform(df[name])
    # One-hot width is the number of distinct label codes seen in training.
    onehot_dict2 = {
        name: OneHotEncoder(n_values= df[name].nunique(), sparse= False,  handle_unknown='ignore')
        for name in cat_features
    }
    X_cat = _stack_onehot(onehot_dict2, fit=True)
    return X_cat, les2, onehot_dict2
        
    
# Assemble the training design matrix for the neural network.
# Numeric block: log1p-transformed features plus the -9 / -1 indicator matrices.
X_num, negtive9_array, negtive1_array  = num_feature_process(new_train, num_features) 
# Categorical block: fits the label / one-hot encoders that are reused for the test set.
# cat_feature_preprocess overwrites the categorical columns of new_train_nn in place.
# NOTE(review): numeric features come from `new_train` but categorical ones from
# `new_train_nn`; presumably both frames are row-aligned -- confirm.
X_cat, les2, onehot_dict2 = cat_feature_preprocess(new_train_nn, cat_features, test_set = False)    

print(negtive9_array.shape)
print(negtive1_array.shape)
print(X_cat.shape)
print(X_num.shape)
# Scale the numeric block by its standard deviation only (with_mean=False: no centering).
sds = StandardScaler(copy=True, with_mean=False, with_std=True)
sds.fit(X_num)
# Column order: scaled numeric | one-hot categorical | -1 indicators | -9 indicators.
X = np.concatenate([sds.transform(X_num), X_cat, negtive1_array, negtive9_array], axis = 1)
# First target: the accuracy_group label as a numpy array.
y = new_train['accuracy_group'].values
# Second target: 3 * sqrt(accuracy).
# NOTE(review): y2 stays a pandas Series (no .values), so `y2[idx]` later on is
# label-based; that equals positional indexing only if new_train has a default
# 0..n-1 index -- confirm.
y2 = np.sqrt(new_train['accuracy']) *3
print(X.shape)
print(y.shape)
print(y2.shape)


# In[ ]:


# Build the test design matrix with the encoders and scaler fitted on the training data.
# NOTE(review): num_feature_process derives the indicator columns (and drops constant
# columns) from the frame it is given, so the test matrix only has the same width and
# column meaning as X if X_test has sentinels / constant columns in the same features
# as the training frame -- verify the printed shape against X.shape.
test_num, test_negtive9_array, test_negtive1_array  = num_feature_process(X_test, num_features) 
# Reuses les2 / onehot_dict2; the categorical columns of X_test are overwritten in place.
test_cat= cat_feature_preprocess(X_test, cat_features, test_set = True, les2= les2, onehot_dict2 = onehot_dict2 )  
# Same column order as X: scaled numeric | one-hot categorical | -1 indicators | -9 indicators.
X_test_array = np.concatenate([sds.transform(test_num), test_cat, test_negtive1_array, test_negtive9_array], axis = 1)
print(X_test_array.shape)


# In[ ]:


from keras.layers import LSTM, Dropout, Input, Dense, concatenate,Flatten, BatchNormalization, Softmax, Add,LeakyReLU
from keras.models import Model,Sequential,load_model
from keras import optimizers
from keras_tqdm import TQDMNotebookCallback
from sklearn.model_selection import train_test_split


def NN_model():
    """Build and compile the two-headed regression MLP.

    Architecture: an input as wide as the global design matrix ``X``, followed
    by three identical stages (Dense(256) -> LeakyReLU(0.3) ->
    BatchNormalization -> Dropout(0.3)) and two independent Dense(1) heads that
    share the last hidden representation. Both heads are trained with mean
    squared error using Adam (learning rate 0.0003).

    :return: the compiled keras ``Model`` with outputs ``[output, output2]``
    """
    Input_main = Input(shape=((X.shape[1],)), name="main")

    hidden = Input_main
    for _ in range(3):
        hidden = Dense(256)(hidden)
        hidden = LeakyReLU(alpha=0.3)(hidden)
        hidden = BatchNormalization()(hidden)
        hidden = Dropout(0.3)(hidden)

    # Two scalar heads on top of the shared trunk, one per regression target.
    heads = [Dense(1)(hidden), Dense(1)(hidden)]

    adam = optimizers.Adam(learning_rate=0.0003, beta_1=0.9, beta_2=0.999, amsgrad=False)
    model = Model(inputs=[Input_main], outputs=heads)
    model.compile(optimizer=adam, loss=['mean_squared_error','mean_squared_error'])
    return model

# Build one instance only to print the layer / parameter summary; the
# cross-validation loop creates a fresh model for every fold.
model = NN_model()
model.summary()


# In[ ]:


from sklearn.model_selection import KFold, GroupKFold
from keras.callbacks import EarlyStopping, ModelCheckpoint
import keras
# Out-of-fold predictions: every row receives the SUM of the two regression
# heads of the model that did not see that row's group during training.
oof = np.zeros(len(X))
NFOLDS = 5
folds = GroupKFold(n_splits=NFOLDS)


# Row weights: inverse of the number of rows sharing the row's kid_id, rescaled
# to sum to len(new_train). They are only used as validation sample weights.
weights = 1/ (new_train.groupby('kid_id')['Clip'].transform('count')).values
weights = weights/ np.sum(weights) *len(new_train)

# y2 is created as a pandas Series; index a plain array instead so that the
# fold indices are always treated as positions, never as index labels.
y2_array = np.asarray(y2)


def lr_decay(index_):
    """Learning rate for epoch ``index_`` (0-based).

    Constant 3e-4 for the first 15 epochs, then alternating 8e-5 (even) /
    2e-4 (odd) up to epoch 29, alternating 8e-5 (even) / 3e-5 (odd) up to
    epoch 39, and 3e-5 afterwards.
    """
    if index_ < 15:
        return 0.0003
    elif  index_ < 30:
        if  index_ % 2 ==0:
            return 0.00008
        else:
            return 0.0002
    elif  index_ < 40:
        if  index_ % 2 ==0:
            return 0.00008
        else:
            return 0.00003
    else:
        return 0.00003


# The schedule depends only on the epoch number, so one callback serves all folds.
lr = keras.callbacks.LearningRateScheduler(lr_decay)

models = []
# NOTE(review): the groups come from new_train_nn while the weights above come
# from new_train; presumably both frames are row-aligned -- confirm.
for fold, (tr_index, val_index) in enumerate(folds.split(list(range(X.shape[0])), groups = new_train_nn['kid_id'])):
    print(f'Training on fold {fold+1}')
    model = NN_model()

    X_tr, X_val = X[tr_index, :], X[val_index, :]
    y_tr, y_val  = y[tr_index], y[val_index]
    y2_tr, y2_val = y2_array[tr_index], y2_array[val_index]

    # The checkpoint file keeps the weights with the best val_loss, but it is
    # not reloaded: the in-memory model after the final epoch is the one kept
    # in `models` and used for the OOF predictions. There is no early stopping,
    # so every fold trains for the full 60 epochs.
    model_checkpoint = ModelCheckpoint("model_" + str(fold) + ".hdf5",
                                       save_best_only=True, verbose=1, monitor='val_loss', mode='min')
    model.fit([X_tr], [y_tr, y2_tr], batch_size = 128,
              validation_data  = ([X_val], [y_val,y2_val],
              [weights[val_index],weights[val_index]]),
              epochs=60,
              verbose=2, callbacks=[model_checkpoint, lr])
    models.append(model)

    # Predict once and add both heads to the OOF vector.
    val_head1, val_head2 = model.predict([X_val])
    oof[val_index] += val_head1.flatten()
    oof[val_index] += val_head2.flatten()
    
from scipy import stats

# Weighted validation: each row weighs the inverse of the number of rows that
# share its kid_id (kept as a pandas Series here).
weights = 1/ (new_train.groupby('kid_id')['Clip'].transform('count'))

# Cumulative class shares of the training labels, in class order 0..3.
# sort_index() is needed because value_counts(sort=False) does not promise
# any particular order of the classes, and cumsum is order dependent.
cum = (train_labels['accuracy_group'].value_counts(sort=False).sort_index() / len(train_labels['accuracy_group'])).cumsum()
print(cum)

# Turn every OOF score into its percentile within the OOF distribution and cut
# the percentiles at the cumulative class shares, so that the predicted class
# distribution mirrors the training label distribution.
# NOTE: percentileofscore is evaluated once per row, i.e. quadratic in len(oof).
pred_percentile = np.array([stats.percentileofscore(oof, a) for a in oof]) /100
bins_percentile = [0] + list(cum)
# The last cumulative share can end up marginally below 1.0 through float
# rounding, which would push the top-ranked rows into a non-existent class 4;
# clipping keeps them in class 3.
preds = np.clip(np.digitize(pred_percentile, bins_percentile, right=True) -1, 0, 3)
print('-' * 30)
print('OOF QWK:', qwk(y, preds, weights = weights.values))
print('OOF QWK:', qwk(y, preds))
print('-' * 30)


# In[ ]:

In [None]:
from scipy import stats

# This cell repeats the percentile-based OOF evaluation: it recomputes the
# row weights, the class-share thresholds and both QWK scores from scratch.
# Row weight = inverse of the number of rows sharing the row's kid_id.
weights = 1/ (new_train.groupby('kid_id')['Clip'].transform('count'))

# Cumulative class shares in class order 0..3; sort_index() is required
# because value_counts(sort=False) does not guarantee the class order.
cum = (train_labels['accuracy_group'].value_counts(sort=False).sort_index() / len(train_labels['accuracy_group'])).cumsum()
print(cum)

# Percentile rank of each OOF score, cut at the cumulative class shares.
pred_percentile = np.array([stats.percentileofscore(oof, a) for a in oof]) /100
bins_percentile = [0] + list(cum)
# Clip so that float rounding in the final cumulative share can never create
# a class above 3.
preds = np.clip(np.digitize(pred_percentile, bins_percentile, right=True) -1, 0, 3)
print('-' * 30)
print('OOF QWK:', qwk(y, preds, weights = weights.values))
print('OOF QWK:', qwk(y, preds))
print('-' * 30)


# In[ ]:

In [None]:
from scipy import stats

# Percentile-based OOF evaluation with a manually shifted class-2/3 boundary.
# Row weight = inverse of the number of rows sharing the row's kid_id.
weights = 1/ (new_train.groupby('kid_id')['Clip'].transform('count'))

# Cumulative class shares in class order 0..3; sort_index() is required
# because value_counts(sort=False) does not guarantee the class order.
cum = (train_labels['accuracy_group'].value_counts(sort=False).sort_index() / len(train_labels['accuracy_group'])).cumsum()
print(cum)

# Raise the upper percentile edge of class 2 by 0.05: more rows are assigned
# class 2 and correspondingly fewer class 3 (the shares printed above are the
# unshifted ones).
cum[2] += 0.05
pred_percentile = np.array([stats.percentileofscore(oof, a) for a in oof]) /100
bins_percentile = [0] + list(cum)
# Clip so that float rounding in the final cumulative share can never create
# a class above 3.
preds = np.clip(np.digitize(pred_percentile, bins_percentile, right=True) -1, 0, 3)
print('-' * 30)
print('OOF QWK:', qwk(y, preds, weights = weights.values))
print('OOF QWK:', qwk(y, preds))
print('-' * 30)

In [None]:
# In[ ]:


# Confusion matrix of the current `preds` against `y`, separately for each of
# the listed session_title codes.
for e in [20,29,41 ,11 ,21 ]:
    print(f'{e}: ##############')
    # Positional row numbers of the matching rows: preds and y are numpy
    # arrays, so they must be indexed by position, not by new_train's labels.
    index_ = np.flatnonzero((new_train.session_title == e).values)
    # preds is passed first, so rows are predicted classes and columns are
    # true classes (sklearn's convention is confusion_matrix(y_true, y_pred)).
    print(pd.DataFrame(confusion_matrix(preds[index_], y[index_])))

In [None]:
from functools import partial
import scipy as sp
class OptimizedRounder(object):
    """
    An optimizer for rounding thresholds
    to maximize Quadratic Weighted Kappa (QWK) score
    # https://www.kaggle.com/naveenasaithambi/optimizedrounder-improved

    Three thresholds split the raw predictions into the classes 0..3 using
    right-closed intervals. ``fit`` searches the thresholds with Nelder-Mead;
    ``coefficients`` returns them afterwards.
    """
    def __init__(self):
        # Replaced by the scipy optimization result in fit().
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        """
        Get loss according to
        using current coefficients

        :param coef: A list of coefficients that will be used for rounding
        :param X: The raw predictions
        :param y: The ground truth labels
        :return: the negated QWK, so that minimizing it maximizes QWK
        """
        # The thresholds are sorted before cutting, so the optimizer may
        # propose them in any order.
        X_p = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])

        return -qwk(y, X_p)

    def fit(self, X, y,  initial_coef = None):
        """
        Optimize rounding thresholds

        :param X: The raw predictions
        :param y: The ground truth labels
        :param initial_coef: starting thresholds (list or array of three
            values); when omitted, the module-level ``init_threshold`` is
            used, which therefore has to exist at call time
        """
        # Imported here so the submodule is loaded regardless of what else
        # has imported scipy.optimize.
        from scipy.optimize import minimize

        loss_partial = partial(self._kappa_loss, X=X, y=y)
        # Identity test: `== None` is evaluated element-wise on numpy arrays
        # and cannot be used as a condition.
        if initial_coef is None:
            initial_coef = init_threshold
        self.coef_ = minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef):
        """
        Make predictions with specified thresholds

        :param X: The raw predictions
        :param coef: A list of coefficients that will be used for rounding
        :return: pandas Categorical with the labels 0..3
        """
        return pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])


    def coefficients(self):
        """
        Return the optimized coefficients

        Only valid after ``fit``. The values are returned in the order the
        optimizer left them, which is not necessarily ascending.
        """
        return self.coef_['x']
    
# Cumulative class shares of the training labels in class order 0..3.
# sort_index() is required because value_counts(sort=False) does not
# guarantee the class order and cumsum is order dependent.
cum = (train_labels['accuracy_group'].value_counts(sort=False).sort_index() / len(train_labels['accuracy_group'])).cumsum()

# Starting thresholds: the OOF scores found at the cumulative class shares of
# the sorted OOF vector (kept as a plain list; other cells concatenate it).
oof_sorted = sorted(oof)
init_threshold = [
    oof_sorted[min(int(round(len(oof) * cum[k])), len(oof) - 1)]
    for k in range(3)
]

# Run the threshold optimizer from 25 randomly jittered starting points and
# record both QWK variants for every run.
cut_df = pd.DataFrame()
qwk_1,qwk_2,coefficients_list = [],[],[]
for i in range(25):
    # Jitter every starting threshold by a uniform offset in [-0.05, 0.05).
    initial_coef_ = [start + (np.random.sample() - 0.5)* 0.1 for start in init_threshold]
    optR = OptimizedRounder()
    optR.fit(oof.reshape(-1,), y, initial_coef = initial_coef_)
    # The optimizer's loss sorts the thresholds before cutting, so store them
    # sorted as well; np.digitize requires monotonic bin edges.
    coefficients = np.sort(optR.coefficients())
    # Same right-closed intervals as OptimizedRounder's pd.cut. Building a new
    # array means no class label is ever compared against a threshold again,
    # and scores equal to a threshold get a class too.
    preds = np.digitize(oof, [-np.inf] + list(coefficients) + [np.inf], right=True) - 1
    coefficients_list.append(coefficients)
    qwk_1.append(qwk(new_train['accuracy_group'], preds, weights = weights.values))
    qwk_2.append(qwk(new_train['accuracy_group'], preds))
# Column order matters: other cells read this frame positionally
# (0 = qwk1, 1 = qwk2, 2 = coefficients_list).
cut_df['qwk1'] = qwk_1
cut_df['qwk2'] =qwk_2
cut_df['coefficients_list'] = coefficients_list

# Ranking score: weighted QWK plus 0.2 times the unweighted QWK.
cut_df['qwk3'] = cut_df['qwk1'] + cut_df['qwk2'] * 0.2
cut_df = cut_df.sort_values('qwk3', ascending= False)
# Thresholds of the best-ranked run.
coefficients = cut_df.iloc[0,2]
cut_df.head(5)


# In[ ]:


# Expected number of rows per class if every row counted with its weight,
# rescaled to the length of the OOF vector.
# NOTE(review): train_labels and weights are combined element-wise by pandas
# index alignment; presumably both share the same index -- confirm.
total_weight = sum(weights)
s1, s2, s3, s4 = [
    np.sum((train_labels['accuracy_group'] == label) * weights) *  (len(oof)) / total_weight
    for label in range(4)
]
# Synthetic label vector reproducing that weighted class distribution.
cum_weighted = [0]*int(round(s1)) +  [1]*int(round(s2))+  [2]*int(round(s3))+  [3]*int(round(s4))

# For the 20 best threshold sets: compare the class histogram they produce on
# the OOF scores with the weighted training distribution and with the
# histogram produced by the initial thresholds.
for e in range(min(20, len(cut_df))):
    fig = None
    try:
        print('###########################best ' +  str(e) )
        print('##########################')
        # np.digitize needs monotonic bin edges, hence the sort.
        opt_classes = np.digitize(oof,   [-99] + sorted(cut_df.iloc[e,2]) + [99], right=True) -1
        init_classes = np.digitize(oof,   [-99] + init_threshold + [99], right=True) -1

        fig, ax = plt.subplots()
        ax.hist(opt_classes, align = 'right')
        ax.hist(cum_weighted,  align='mid')
        ax.hist(init_classes, align='left' )
        print('qwk1 ' + str(round(cut_df.iloc[e,0],3)) +  ', qwk2 '+  str(round(cut_df.iloc[e,1],3)))
        # Cumulative predicted class shares, accumulated in class order.
        print('cum:', (pd.Series(opt_classes).value_counts(sort=False).sort_index() / len(oof)).cumsum())
        ax.legend(['opt', 'weighted_train', 'train'])
        plt.show()
    except Exception as exc:
        # Diagnostics are best effort: report the problem and keep going.
        print(f'skipping plot for candidate {e}: {exc!r}')
    finally:
        if fig is not None:
            plt.close(fig)


# In[ ]:


# Cumulative class shares that the best threshold set produces on the OOF
# scores. The thresholds are sorted because np.digitize needs monotonic bin
# edges, and sort_index() puts the shares in class order before cumsum
# (value_counts(sort=False) does not guarantee that order).
best_coefficients = sorted(cut_df.iloc[0,2])
cum = (pd.Series(np.digitize(oof,   [-99] + best_coefficients + [99], right=True) -1).value_counts(sort=False).sort_index() / len(oof)).cumsum()
# Push the last edge well above 1 so that the top percentile still falls into class 3.
cum[3] += 0.5
print(cum)
# Re-derive the classes from percentile ranks cut at those shares.
pred_percentile = np.array([stats.percentileofscore(oof, a) for a in oof]) /100
bins_percentile = [0] + list(cum)
preds = np.digitize(pred_percentile, bins_percentile, right=True) -1
print('-' * 30)
print('OOF QWK:', qwk(y, preds, weights = weights.values))

## train model on all data once

In [None]:
# train once
def lr_decay(index_):
    """Return the learning rate for the (0-based) epoch number ``index_``.

    Schedule:
      * epochs  0-14: 3e-4
      * epochs 15-29: 8e-5 on even epochs, 2e-4 on odd epochs
      * epochs 30-39: 8e-5 on even epochs, 3e-5 on odd epochs
      * epochs 40+  : 3e-5
    """
    if index_ < 15:
        return 0.0003
    if index_ < 30:
        return 0.00008 if index_ % 2 == 0 else 0.0002
    if index_ < 40:
        return 0.00008 if index_ % 2 == 0 else 0.00003
    return 0.00003
# Number of full-data models trained below.
Nmodel = 9
lr = keras.callbacks.LearningRateScheduler(lr_decay)


def _fit_on_all_data(seed, epochs):
    """Seed the RNGs, build a fresh NN_model and fit it on the whole training set.

    :param seed: value passed to init_seeds before the model is created
    :param epochs: number of training epochs
    :return: the fitted keras model
    """
    init_seeds(seed)
    full_model = NN_model()
    full_model.fit(X, [y, y2], batch_size = 128 , epochs=epochs,verbose=5, callbacks=[lr])
    return full_model


# Three seed families (SEED+1.., SEED+1000.., SEED+2000..), each trained for
# 63, 65 and 68 epochs. No validation data is used here.
model1 = _fit_on_all_data(SEED+1, 63)
model2 = _fit_on_all_data(SEED+2, 65)
model3 = _fit_on_all_data(SEED+3, 68)

model1_seed1 = _fit_on_all_data(SEED+1000, 63)
model2_seed1 = _fit_on_all_data(SEED+1001, 65)
model3_seed1 = _fit_on_all_data(SEED+1002, 68)

model1_seed2 = _fit_on_all_data(SEED+2000, 63)
model2_seed2 = _fit_on_all_data(SEED+2001, 65)
model3_seed2 = _fit_on_all_data(SEED+2002, 68)


# In[ ]:


# The full-data models, in the order their outputs are accumulated.
ensemble_models = [
    model1, model2, model3,
    model1_seed1, model2_seed1, model3_seed1,
    model1_seed2, model2_seed2, model3_seed2,
]

# Sum both regression heads of every model; each model predicts only once.
preds_nn_raw = None
for ensemble_model in ensemble_models:
    for head_output in ensemble_model.predict(X_test_array):
        flat_output = head_output.flatten()
        preds_nn_raw = flat_output if preds_nn_raw is None else preds_nn_raw + flat_output

from scipy import stats
preds = preds_nn_raw.copy()
# Average over the models. The result is still the sum of the two heads,
# which is the scale of `oof` on which the thresholds were optimized.
preds = preds / len(ensemble_models)
# Cut with the best thresholds found on the OOF scores; they are sorted
# because np.digitize needs monotonic bin edges.
preds = np.digitize(preds,   [-99] + sorted(cut_df.iloc[0,2]) + [99], right=True) -1
preds_nn = preds.copy()


# In[ ]:


# Distribution of the raw test predictions: the sum over all models and both
# heads, i.e. before dividing by the number of models and before thresholding.
plt.hist(preds_nn_raw)

## Make submission

In [None]:
# ## Make submission

# In[ ]:


# Write the thresholded test predictions to the submission file (no index column).
submission['accuracy_group'] = np.round(preds_nn).astype('int')
submission.to_csv('submission.csv', index=None)
submission.head()


# In[ ]:


# Predicted class distribution of the submission ...
submission['accuracy_group'].plot(kind='hist')


# In[ ]:


# ... to be compared with the label distribution of the training data.
train_labels['accuracy_group'].plot(kind='hist')


# In[ ]:


# Distribution of the raw out-of-fold scores (sum of both heads).
pd.Series(oof).plot(kind='hist')


# In[ ]: