In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor
from xgboost import plot_importance
from catboost import CatBoostRegressor
from matplotlib import pyplot
import shap
import random
import os
# Print the full path of every file available under the Kaggle input directory.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root_dir, file_name)
        print(full_path)

# Any results you write to the current directory are saved as output.
from time import time
from tqdm import tqdm_notebook as tqdm
from collections import Counter
from scipy import stats
import lightgbm as lgb
from typing import List, Optional, Union, Tuple, Dict
from sklearn.metrics import cohen_kappa_score, mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold
import gc
import json
pd.set_option('display.max_columns', 1000)

# Objective

* In the last notebook we created our baseline model, including a feature-selection step.
* It reached a Cohen's kappa score of 0.456 on the leaderboard (LB), with a local CV score of 0.529.
* In this notebook we add more features, remove others that I think overfit the training set, and then check whether our local CV score improves.
* Next, we will check whether this improvement carries over to the LB.

# Notes
* Compare the distribution of the target variable with the distribution of the out-of-fold predictions. A good model should produce roughly the same distribution.

In [None]:
def RandGroupKfold(groups, n_splits, random_state=None, shuffle_groups=False):
    """Build group-aware K-fold indices: every group lands in exactly one test fold.

    Parameters
    ----------
    groups : array-like
        Group label of every sample (one entry per row).
    n_splits : int
        Number of folds. The unique groups are divided with ``np.array_split``,
        so fold sizes are balanced by number of groups, not number of samples.
    random_state : int or None
        Seed for the group shuffle; only used when ``shuffle_groups`` is True.
    shuffle_groups : bool
        When True the unique groups are shuffled before being split into folds;
        otherwise they are split in sorted order.

    Returns
    -------
    list of (train_indices, test_indices)
        Positional indices into ``groups``, one tuple per fold.
    """
    groups = np.asarray(groups)
    ix = np.arange(len(groups))
    unique_groups = np.unique(groups)
    if shuffle_groups:
        # RandomState lives in np.random; the bare name is not imported in this notebook.
        prng = np.random.RandomState(random_state)
        prng.shuffle(unique_groups)
    splits = np.array_split(unique_groups, n_splits)

    train_test_indices = []
    for split in splits:
        # Vectorised membership test instead of a Python-level scan per sample.
        mask = np.isin(groups, split)
        train_test_indices.append((ix[~mask], ix[mask]))
    return train_test_indices

In [None]:
def eval_qwk_lgb_regr(y_true, y_pred):
    """
    Fast quadratic-weighted kappa eval function for regression outputs.

    The continuous predictions are cut into the four classes 0-3 so that the
    predicted classes follow the same proportions as ``accuracy_group`` in the
    module-level ``reduce_train`` frame: the cut points are the percentiles of
    ``y_pred`` at the cumulative class frequencies of the training target.

    This function is called many times (per fold, and per epoch from the Keras
    callback), so it deliberately has no plotting side effect; plot
    ``reduce_train['accuracy_group'].hist()`` once in its own cell if needed.

    Parameters
    ----------
    y_true : array-like with a ``shape`` attribute
        True accuracy groups.
    y_pred : array-like
        Continuous predictions, one per entry of ``y_true``.

    Returns
    -------
    tuple
        ``('cappa', score, True)``; the last element marks the metric as
        higher-is-better.
    """
    class_counts = Counter(reduce_train['accuracy_group'])
    n_rows = len(reduce_train)

    acum = 0
    bounds = []
    for group in range(3):
        acum += class_counts[group] / n_rows
        # Guard against float accumulation drifting just above 100 %.
        bounds.append(np.percentile(y_pred, min(acum, 1.0) * 100))

    # side='left' reproduces the chain "x <= bound[0] -> 0, x <= bound[1] -> 1, ..."
    labels = np.searchsorted(bounds, np.asarray(y_pred), side='left')
    labels = labels.reshape(y_true.shape)

    return 'cappa', cohen_kappa_score(y_true, labels, weights='quadratic'), True

In [None]:
def cohenkappa(ypred, y):
    """Quadratic-weighted kappa eval function for a 4-class probability output.

    Parameters
    ----------
    ypred : numpy array
        Flat class scores; reshaped to ``(4, n_samples)`` and reduced with
        argmax over the class axis.
        # NOTE(review): this assumes the scores are laid out class-major - confirm
        # against the library that produces ``ypred``.
    y : object exposing ``get_label()``
        Container holding the true labels (e.g. the evaluation dataset).

    Returns
    -------
    tuple
        ``("cappa", score, True)``.
    """
    labels = y.get_label().astype("int")
    predicted = ypred.reshape((4, -1)).argmax(axis=0)
    # Previously called the undefined names `cohenkappascore` / `y_pred`.
    score = cohen_kappa_score(labels, predicted, weights='quadratic')
    return "cappa", score, True

In [None]:
def read_data():
    """Load the five competition CSV files, printing progress and shapes.

    Returns
    -------
    tuple of DataFrame
        ``(train, test, train_labels, specs, sample_submission)``.
    """
    # (announcement, path, shape report template) for each file, in return order.
    sources = (
        ('Reading train.csv file....',
         '/kaggle/input/data-science-bowl-2019/train.csv',
         'Training.csv file have {} rows and {} columns'),
        ('Reading test.csv file....',
         '/kaggle/input/data-science-bowl-2019/test.csv',
         'Test.csv file have {} rows and {} columns'),
        ('Reading train_labels.csv file....',
         '/kaggle/input/data-science-bowl-2019/train_labels.csv',
         'Train_labels.csv file have {} rows and {} columns'),
        ('Reading specs.csv file....',
         '/kaggle/input/data-science-bowl-2019/specs.csv',
         'Specs.csv file have {} rows and {} columns'),
        ('Reading sample_submission.csv file....',
         '/kaggle/input/data-science-bowl-2019/sample_submission.csv',
         'Sample_submission.csv file have {} rows and {} columns'),
    )

    frames = []
    for announcement, path, report in sources:
        print(announcement)
        frame = pd.read_csv(path)
        n_rows, n_cols = frame.shape[0], frame.shape[1]
        print(report.format(n_rows, n_cols))
        frames.append(frame)

    train, test, train_labels, specs, sample_submission = frames
    return train, test, train_labels, specs, sample_submission

In [None]:
def encode_title(train, test, train_labels):
    """Label-encode titles/worlds and derive helper columns and lookup tables.

    The three frames are modified in place (and also returned):

    * ``title_event_code`` (``"<title>_<event_code>"``, built from the text
      title) and ``hour`` are added to ``train`` and ``test``;
    * ``title`` and ``world`` are replaced by integer codes in ``train`` and
      ``test``; ``title`` is also encoded in ``train_labels``;
    * ``timestamp`` is converted to datetime in ``train`` and ``test``.

    All vocabularies are sorted before codes are assigned, so the integer
    encoding is identical on every run (iterating a bare ``set`` of strings is
    not stable across interpreter sessions).

    Returns
    -------
    tuple
        ``(train, test, train_labels, win_code, list_of_user_activities,
        list_of_event_code, activities_labels, assess_titles, list_of_event_id,
        all_title_event_code, activities_map)`` where ``activities_map`` maps
        title text -> code, ``activities_labels`` is its inverse and
        ``win_code`` maps title code -> event code of an assessment attempt.

    Raises
    ------
    KeyError
        If 'Bird Measurer (Assessment)' does not occur in ``train`` or ``test``.
    """
    def _sorted_union(column):
        # key=str keeps the sort well-defined even if a column mixes value types.
        values = set(train[column].unique()).union(test[column].unique())
        return sorted(values, key=str)

    # Combined key, built while `title` still holds the text labels.
    train['title_event_code'] = train['title'].astype(str) + '_' + train['event_code'].astype(str)
    test['title_event_code'] = test['title'].astype(str) + '_' + test['event_code'].astype(str)
    all_title_event_code = _sorted_union('title_event_code')

    # Vocabularies over train and test together.
    list_of_user_activities = _sorted_union('title')
    list_of_event_code = _sorted_union('event_code')
    list_of_event_id = _sorted_union('event_id')
    list_of_worlds = _sorted_union('world')

    # Text <-> integer code lookups.
    activities_map = {title: code for code, title in enumerate(list_of_user_activities)}
    activities_labels = {code: title for title, code in activities_map.items()}
    activities_world = {world: code for code, world in enumerate(list_of_worlds)}

    # Assessment titles must be collected before `title` is replaced by codes.
    train_assess = train.loc[train['type'] == 'Assessment', 'title'].dropna().unique()
    test_assess = test.loc[test['type'] == 'Assessment', 'title'].dropna().unique()
    assess_titles = sorted(set(train_assess).union(test_assess), key=str)

    # Replace the text titles / worlds with their integer codes.
    train['title'] = train['title'].map(activities_map)
    test['title'] = test['title'].map(activities_map)
    train['world'] = train['world'].map(activities_world)
    test['world'] = test['world'].map(activities_world)
    train_labels['title'] = train_labels['title'].map(activities_map)

    # Every title uses event code 4100 for an attempt, except Bird Measurer (4110).
    win_code = {code: 4100 for code in activities_map.values()}
    win_code[activities_map['Bird Measurer (Assessment)']] = 4110

    # Convert text into datetime and derive the hour of day.
    train['timestamp'] = pd.to_datetime(train['timestamp'])
    test['timestamp'] = pd.to_datetime(test['timestamp'])
    train['hour'] = train['timestamp'].dt.hour
    test['hour'] = test['timestamp'].dt.hour

    return train, test, train_labels, win_code, list_of_user_activities, list_of_event_code, activities_labels, assess_titles, list_of_event_id, all_title_event_code, activities_map

In [None]:
clip_time = {'Welcome to Lost Lagoon!':19,'Tree Top City - Level 1':17,'Ordering Spheres':61, 'Costume Box':61,
        '12 Monkeys':109,'Tree Top City - Level 2':25, 'Pirate\'s Tale':80, 'Treasure Map':156,'Tree Top City - Level 3':26,
        'Rulers':126, 'Magma Peak - Level 1':20, 'Slop Problem':60, 'Magma Peak - Level 2':22, 'Crystal Caves - Level 1':18,
        'Balancing Act':72, 'Lifting Heavy Things':118,'Crystal Caves - Level 2':24, 'Honey Cake':142, 'Crystal Caves - Level 3':19,
        'Heavy, Heavier, Heaviest':61}

In [None]:
# this is the function that convert the raw data into processed features

def get_data(user_sample, test_set=False):
    '''
    Turn the event log of one installation_id into one feature dict per assessment.

    The sessions of ``user_sample`` are walked in order of first appearance.
    Running counters are updated after every session; each assessment session
    receives a snapshot of the counters as they stood *before* that session, so
    a row only describes the history that precedes the assessment it labels.

    Relies on module-level names prepared elsewhere in the notebook:
    ``assess_titles``, ``list_of_event_code``, ``list_of_event_id``,
    ``activities_labels``, ``activities_map``, ``all_title_event_code``,
    ``win_code`` and ``clip_time``.

    Parameters
    ----------
    user_sample : DataFrame
        Rows of train or test filtered to a single installation_id. Columns
        read here: game_session, type, title (integer-encoded), timestamp
        (datetime), event_data, event_code, event_id, event_count,
        title_event_code, hour and installation_id.
    test_set : bool
        False: return one row per assessment that has at least one attempt.
        True: every assessment session yields a row (even with a single event
        and no attempt) and only the last one is returned.

    Returns
    -------
    list of dict, or dict
        All assessment rows (train) or the last assessment row (test).

    Raises
    ------
    IndexError
        If ``test_set`` is True and the sample holds no assessment session.
    KeyError
        If a Clip title is missing from ``clip_time``.

    Notes
    -----
    Differences from the previous revision that change feature values:
    the Clip/Activity/Assessment/Game counters now count changes of session
    type (the ``last_activity`` update was mistyped and never took effect);
    ``Activity_duration_min`` is always written under that name (it used to be
    written as ``Acitivity_duration_min`` when activities existed); and
    ``Couldron_Filler_4025`` is now actually computed (the old filter compared
    the integer-encoded title with a string and never matched).
    '''
    # ---- running state -------------------------------------------------
    last_activity = 0

    user_activities_count = {'Clip': 0, 'Activity': 0, 'Assessment': 0, 'Game': 0}
    assess_4020_acc_dict = {
        'Cauldron Filler (Assessment)_4020_accuracy': 0,
        'Mushroom Sorter (Assessment)_4020_accuracy': 0,
        'Bird Measurer (Assessment)_4020_accuracy': 0,
        'Chest Sorter (Assessment)_4020_accuracy': 0}

    accuracy_groups = {0: 0, 1: 0, 2: 0, 3: 0}
    all_assessments = []
    accumulated_accuracy_group = 0
    accumulated_accuracy = 0
    accumulated_correct_attempts = 0
    accumulated_uncorrect_attempts = 0
    accumulated_actions = 0

    Activity_mean_event_count = 0
    accumulated_game_miss = 0
    # mean / sum / max of the 'round', 'duration' and 'level' fields found in the
    # last event of each Game session. 'mean' is a running (old + new) / 2 blend.
    game_stats = {field: {'mean': 0, 'sum': 0, 'max': 0}
                  for field in ('round', 'duration', 'level')}

    Cauldron_Filler_4025 = 0
    chest_assessment_uncorrect_sum = 0
    Assessment_mean_event_count = 0

    counter = 0  # number of assessment sessions processed so far
    durations = []
    clip_durations = []
    Activity_durations = []
    Game_durations = []

    last_accuracy_title = {'acc_' + title: -1 for title in assess_titles}
    event_code_count: Dict[str, int] = {ev: 0 for ev in list_of_event_code}
    event_id_count: Dict[str, int] = {eve: 0 for eve in list_of_event_id}
    title_count: Dict[str, int] = {eve: 0 for eve in activities_labels.values()}
    title_event_code_count: Dict[str, int] = {t_eve: 0 for t_eve in all_title_event_code}

    sessions_count = 0

    # ---- helpers -------------------------------------------------------
    def get_4020_acc(df, counter_dict):
        # Running accuracy of event 4020 per assessment title. Currently not
        # called (see the disabled call at the end of the session loop).
        for e in ['Cauldron Filler (Assessment)', 'Bird Measurer (Assessment)',
                  'Mushroom Sorter (Assessment)', 'Chest Sorter (Assessment)']:
            Assess_4020 = df[(df.event_code == 4020) & (df.title == activities_map[e])]
            true_attempts_ = Assess_4020['event_data'].str.contains('true').sum()
            false_attempts_ = Assess_4020['event_data'].str.contains('false').sum()
            total_ = true_attempts_ + false_attempts_
            measure_assess_accuracy_ = true_attempts_ / total_ if total_ != 0 else 0
            # Same (old + new) / 2 blend as the other running means.
            counter_dict[e + "_4020_accuracy"] = (counter_dict[e + "_4020_accuracy"] + measure_assess_accuracy_) / 2.0
        return counter_dict

    def cnt_miss(df):
        # Sum of the 'misses' field over the given events.
        return sum(json.loads(raw)['misses'] for raw in df['event_data'])

    def session_seconds(session):
        # Time between first and last event of the session. `.seconds` is the
        # seconds component of the Timedelta, so whole days are not included.
        stamps = session['timestamp']
        return (stamps.iloc[-1] - stamps.iloc[0]).seconds

    def add_duration_stats(features, prefix, values):
        # Write <prefix>_mean/_std/_max/_min; all zero when nothing was seen yet.
        if values:
            features[prefix + '_mean'] = np.mean(values)
            features[prefix + '_std'] = np.std(values)
            features[prefix + '_max'] = np.max(values)
            features[prefix + '_min'] = np.min(values)
        else:
            features[prefix + '_mean'] = 0
            features[prefix + '_std'] = 0
            features[prefix + '_max'] = 0
            features[prefix + '_min'] = 0

    def update_counters(counts, session, col):
        # Add the per-value event counts of `col` in this session to `counts`.
        for key, n_events in Counter(session[col]).items():
            if col == 'title':
                key = activities_labels[key]
            counts[key] += n_events
        return counts

    # ---- iterate through each session of one installation_id -----------
    for i, session in user_sample.groupby('game_session', sort=False):
        # i = game_session id; session = DataFrame holding only that game_session
        session_type = session['type'].iloc[0]
        session_title = session['title'].iloc[0]
        session_title_text = activities_labels[session_title]

        if session_type == 'Clip':
            clip_durations.append(clip_time[session_title_text])

        if session_type == 'Activity':
            Activity_durations.append(session_seconds(session))
            Activity_mean_event_count = (Activity_mean_event_count + session['event_count'].iloc[-1]) / 2.0

        if session_type == 'Game':
            Game_durations.append(session_seconds(session))
            accumulated_game_miss += cnt_miss(session[session.event_code == 2030])

            # Parse the last event once; a payload that is not a JSON object
            # simply contributes nothing.
            try:
                last_event = json.loads(session['event_data'].iloc[-1])
            except (TypeError, ValueError):
                last_event = {}
            if not isinstance(last_event, dict):
                last_event = {}

            for field, stats_ in game_stats.items():
                try:
                    value = last_event[field]
                    stats_['mean'] = (stats_['mean'] + value) / 2.0
                    stats_['sum'] = stats_['sum'] + value
                    stats_['max'] = max(stats_['max'], value)
                except (KeyError, TypeError):
                    # Field absent (or not numeric) in this game's last event.
                    pass

        # for each assessment, and only this kind of session, the features below
        # are processed and a row is generated
        if session_type == 'Assessment' and (test_set or len(session) > 1):
            # event_code 4100 (4110 for Bird Measurer) represents an attempt
            all_attempts = session.query(f'event_code == {win_code[session_title]}')
            true_attempts = all_attempts['event_data'].str.contains('true').sum()
            false_attempts = all_attempts['event_data'].str.contains('false').sum()

            # Snapshot of everything counted before this session.
            features = user_activities_count.copy()
            features.update(last_accuracy_title)
            features.update(event_code_count)
            features.update(event_id_count)
            features.update(title_count)
            features.update(title_event_code_count)
            features.update(assess_4020_acc_dict)

            features['installation_session_count'] = sessions_count
            features['hour'] = session['hour'].iloc[-1]

            # How many distinct codes / ids / titles have been seen at least once.
            variety_features = [('var_event_code', event_code_count),
                                ('var_event_id', event_id_count),
                                ('var_title', title_count),
                                ('var_title_event_code', title_event_code_count)]
            for name, dict_counts in variety_features:
                features[name] = np.count_nonzero(np.array(list(dict_counts.values())))

            # installation_id is kept for group-wise splitting / aggregation
            features['installation_id'] = session['installation_id'].iloc[-1]
            # encoded title of this assessment
            features['session_title'] = session['title'].iloc[0]

            # attempt history up to (not including) this assessment
            total_attempts = accumulated_correct_attempts + accumulated_uncorrect_attempts
            features['accumulated_correct_attempts'] = accumulated_correct_attempts
            features['accumulated_uncorrect_attempts'] = accumulated_uncorrect_attempts
            features['accumulated_uncorrect_over_total_attempt'] = accumulated_uncorrect_attempts / total_attempts if total_attempts != 0 else 0
            features['accumulated_correct_over_total_attempt'] = accumulated_correct_attempts / total_attempts if total_attempts != 0 else 0

            accumulated_correct_attempts += true_attempts
            accumulated_uncorrect_attempts += false_attempts

            features['Activity_mean_event_count'] = Activity_mean_event_count
            features['accumulated_game_miss'] = accumulated_game_miss
            for field in ('round', 'duration', 'level'):
                for stat in ('mean', 'sum', 'max'):
                    features[f'{stat}_game_{field}'] = game_stats[field][stat]

            features['chest_assessment_uncorrect_sum'] = chest_assessment_uncorrect_sum
            features['Assessment_mean_event_count'] = Assessment_mean_event_count

            # time spent so far, per kind of session ('duration_*' = assessments)
            add_duration_stats(features, 'duration', durations)
            add_duration_stats(features, 'Clip_duration', clip_durations)
            add_duration_stats(features, 'Activity_duration', Activity_durations)
            add_duration_stats(features, 'Game_duration', Game_durations)

            durations.append(session_seconds(session))

            # mean accuracy over the previous assessments
            features['accumulated_accuracy'] = accumulated_accuracy / counter if counter > 0 else 0
            accuracy = true_attempts / (true_attempts + false_attempts) if (true_attempts + false_attempts) != 0 else 0
            accumulated_accuracy += accuracy
            last_accuracy_title['acc_' + session_title_text] = accuracy

            # mean 4025 accuracy of earlier assessments (key spelling kept as is,
            # it is a column name)
            features['Couldron_Filler_4025'] = Cauldron_Filler_4025 / counter if counter > 0 else 0
            if session_title_text == 'Cauldron Filler (Assessment)':
                Assess_4025 = session[session.event_code == 4025]
                true_attempts_ = Assess_4025['event_data'].str.contains('true').sum()
                false_attempts_ = Assess_4025['event_data'].str.contains('false').sum()
                if (true_attempts_ + false_attempts_) != 0:
                    Cauldron_Filler_4025 += true_attempts_ / (true_attempts_ + false_attempts_)

            chest_assessment_uncorrect_sum += len(session[session.event_id == "df4fe8b6"])
            Assessment_mean_event_count = (Assessment_mean_event_count + session['event_count'].iloc[-1]) / 2.0

            # target: accuracy of this assessment, categorised
            if accuracy == 0:
                features['accuracy_group'] = 0
            elif accuracy == 1:
                features['accuracy_group'] = 3
            elif accuracy == 0.5:
                features['accuracy_group'] = 2
            else:
                features['accuracy_group'] = 1
            # how many times the player landed in each group before this assessment
            features.update(accuracy_groups)
            accuracy_groups[features['accuracy_group']] += 1
            # mean accuracy group over the previous assessments
            features['accumulated_accuracy_group'] = accumulated_accuracy_group / counter if counter > 0 else 0
            accumulated_accuracy_group += features['accuracy_group']
            # number of events before this session (updated below)
            features['accumulated_actions'] = accumulated_actions

            # test: every assessment row is kept; train: only those with an attempt
            if test_set or true_attempts + false_attempts > 0:
                all_assessments.append(features)

            counter += 1

        sessions_count += 1
        # count the events of this session per event_code / event_id / title / combo
        event_code_count = update_counters(event_code_count, session, "event_code")
        event_id_count = update_counters(event_id_count, session, "event_id")
        title_count = update_counters(title_count, session, 'title')
        title_event_code_count = update_counters(title_event_code_count, session, 'title_event_code')
        #assess_4020_acc_dict = get_4020_acc(session, assess_4020_acc_dict)

        accumulated_actions += len(session)
        if last_activity != session_type:
            user_activities_count[session_type] += 1
            last_activity = session_type

    # in the test set only the last assessment is predicted, earlier ones are dropped
    if test_set:
        return all_assessments[-1]
    # in the train set every assessment goes to the dataset
    return all_assessments

In [None]:
def get_train_and_test(train, test):
    """Build the per-assessment feature tables for train and test.

    ``get_data`` is applied to every installation_id (in order of first
    appearance). For train, all rows it returns are kept; for test it returns a
    single row per installation.

    Parameters
    ----------
    train, test : DataFrame
        Event logs containing an ``installation_id`` column (plus whatever
        ``get_data`` reads).

    Returns
    -------
    tuple
        ``(reduce_train, reduce_test, categoricals)`` where ``categoricals`` is
        the list of categorical feature names (``['session_title']``).
    """
    train_groups = train.groupby('installation_id', sort=False)
    test_groups = test.groupby('installation_id', sort=False)

    compiled_train = []
    # The progress-bar total is the real number of installations, not a guess.
    for _ins_id, user_sample in tqdm(train_groups, total=train_groups.ngroups):
        compiled_train += get_data(user_sample)

    compiled_test = []
    for _ins_id, user_sample in tqdm(test_groups, total=test_groups.ngroups):
        compiled_test.append(get_data(user_sample, test_set=True))

    reduce_train = pd.DataFrame(compiled_train)
    reduce_test = pd.DataFrame(compiled_test)
    categoricals = ['session_title']
    return reduce_train, reduce_test, categoricals

In [None]:
def get_random_assessment(reduce_train):
    """Pick one random row per installation_id.

    Rows are drawn with the global ``random`` module, so seed it beforehand for
    reproducible picks. Installations are visited in order of first appearance.

    Parameters
    ----------
    reduce_train : DataFrame
        Must contain an ``installation_id`` column.

    Returns
    -------
    tuple
        ``(sampled_frame, used_idx)``: the selected rows (one per installation)
        and the list of their index labels, in the same order.
    """
    # One pass to collect the index labels of every installation, instead of
    # re-filtering the whole frame once per installation.
    labels_by_installation = reduce_train.groupby('installation_id', sort=False).groups

    used_idx = []
    for iid in tqdm(labels_by_installation):
        candidates = list(labels_by_installation[iid])
        used_idx.append(random.choices(candidates, k=1)[0])

    reduce_train_t = reduce_train.loc[used_idx]
    return reduce_train_t, used_idx
class Base_Model(object):
    """Group-wise cross-validation loop shared by all model wrappers.

    Subclasses provide ``get_params``, ``convert_dataset`` and ``train_model``
    (and optionally ``convert_x``). Constructing an instance immediately runs
    the whole cross-validation via ``fit`` and stores its results on the
    instance: ``y_pred``, ``oof_pred``, ``oof_true``, ``score``, ``model`` and
    ``feature_importances``.

    ``train_df`` is expected to carry a default ``RangeIndex``: fold indices and
    the index labels returned by ``get_random_assessment`` are both used as
    positions into the out-of-fold arrays.

    Parameters
    ----------
    train_df, test_df : DataFrame
        Feature tables; ``train_df`` must also contain ``installation_id`` and
        the target column ``accuracy_group``.
    features : list of str
        Columns fed to the model.
    groups : array-like
        Group label per training row, passed to ``RandGroupKfold``.
    random_state : int or None
        Forwarded to ``RandGroupKfold``. The folds are built with
        ``shuffle_groups=False``, so it currently has no effect on the split.
    categoricals : list of str, optional
        Categorical feature names (defaults to an empty list).
    fi : bool
        Accumulate gain feature importances (requires the model to expose
        ``feature_importance``).
    n_splits : int
        Number of folds.
    verbose : bool
        Print the final out-of-fold scores.
    """

    def __init__(self, train_df, test_df, features, groups, random_state, categoricals=None, fi=False, n_splits=5, verbose=True):
        self.train_df = train_df
        self.test_df = test_df
        self.features = features
        self.n_splits = n_splits
        # A fresh list per instance; a shared mutable default could leak between models.
        self.categoricals = [] if categoricals is None else categoricals
        self.target = 'accuracy_group'
        self.cv = self.get_cv(groups=groups, random_state=random_state)
        self.verbose = verbose
        self.fi = fi
        self.params = self.get_params()
        self.y_pred, self.oof_pred, self.oof_true, self.score, self.model, self.feature_importances = self.fit()

    def train_model(self, train_set, val_set):
        """Train and return a fitted model exposing ``predict``; subclass hook."""
        raise NotImplementedError

    def get_cv(self, groups, random_state):
        """Return the list of ``(train_idx, val_idx)`` folds, split by group."""
        splits = RandGroupKfold(groups, self.n_splits, random_state=random_state, shuffle_groups=False)
        return list(splits)

    def get_params(self):
        """Return the model hyper-parameters; subclass hook."""
        raise NotImplementedError

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        """Wrap the fold data into ``(train_set, val_set)``; subclass hook."""
        raise NotImplementedError

    def convert_x(self, x):
        """Convert a feature frame into the model's prediction input (identity by default)."""
        return x

    def fit(self):
        """Run the cross-validation.

        For each fold the model is trained on the training groups and validated
        on one randomly chosen assessment per validation installation (the
        "truncated" validation set). Predictions are stored for every
        validation row, and test predictions are averaged over the folds.

        Returns
        -------
        tuple
            ``(y_pred, oof_pred, oof_true, loss_score, model,
            feature_importances)``; ``model`` is the model of the last fold and
            ``loss_score`` the kappa of the full out-of-fold predictions.
        """
        # installation_id is needed to sample one assessment per installation,
        # but is never passed to the model.
        fold_columns = sorted(set(self.features).union(['installation_id']))
        model_columns = [col for col in fold_columns if col != 'installation_id']

        # Sized from the frames given to this instance, not from notebook globals.
        n_train = len(self.train_df)
        feature_importances = np.zeros((len(model_columns),))
        oof_pred = np.zeros((n_train,))
        oof_pred_trunc = np.zeros((n_train,))
        y_pred = np.zeros((len(self.test_df),))
        score = 0
        model = None

        fold_frame = self.train_df[fold_columns]
        target = self.train_df[self.target]

        for fold, (train_idx, val_idx) in enumerate(self.cv):
            x_train, x_val = fold_frame.iloc[train_idx], fold_frame.iloc[val_idx]
            y_train, y_val = target.iloc[train_idx], target.iloc[val_idx]

            x_val_trunc, idx_val = get_random_assessment(x_val)
            y_val_trunc = y_val.loc[idx_val]
            print(Counter(y_val_trunc))

            # New frames instead of in-place drops on slices of train_df.
            x_val_trunc = x_val_trunc.drop(columns='installation_id')
            x_val = x_val.drop(columns='installation_id')
            x_train = x_train.drop(columns='installation_id')

            train_set, val_set = self.convert_dataset(x_train, y_train, x_val_trunc, y_val_trunc)
            model = self.train_model(train_set, val_set)

            conv_x_val_trunc = self.convert_x(x_val_trunc)
            conv_x_val = self.convert_x(x_val)
            oof_pred_trunc[idx_val] = model.predict(conv_x_val_trunc).reshape(oof_pred_trunc[idx_val].shape)
            oof_pred[val_idx] = model.predict(conv_x_val).reshape(oof_pred[val_idx].shape)

            # Same column set and order as the training frame.
            x_test = self.convert_x(self.test_df[model_columns])
            y_pred += model.predict(x_test).reshape(y_pred.shape) / self.n_splits

            if self.fi:
                feature_importances += model.feature_importance(importance_type="gain")

            truncated_score = eval_qwk_lgb_regr(y_val_trunc, oof_pred_trunc[idx_val])[1]
            score += truncated_score / self.n_splits
            print('Partial truncated score of fold {} is: {}'.format(fold, truncated_score))
            print('Partial non-truncated score of fold {} is: {}'.format(fold, eval_qwk_lgb_regr(y_val, oof_pred[val_idx])[1]))

        _, loss_score, _ = eval_qwk_lgb_regr(target, oof_pred)
        if self.verbose:
            print('Our oof truncated cohen kappa score is: ', score)
            print('Our oof non truncated cohen kappa score is: ', loss_score)
        return y_pred, oof_pred, target, loss_score, model, feature_importances

In [None]:
class Lgb_Model(Base_Model):
    """LightGBM regression model plugged into the ``Base_Model`` CV loop."""

    def train_model(self, train_set, val_set):
        """Train a booster, evaluating on both the training and validation sets."""
        if self.verbose:
            verbosity = 100
        else:
            verbosity = 0
        booster = lgb.train(
            self.params,
            train_set,
            valid_sets=[train_set, val_set],
            verbose_eval=verbosity,
        )
        return booster

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        """Wrap the fold data into two ``lgb.Dataset`` objects."""
        cat_cols = self.categoricals
        train_set = lgb.Dataset(x_train, y_train, categorical_feature=cat_cols)
        val_set = lgb.Dataset(x_val, y_val, categorical_feature=cat_cols)
        return train_set, val_set

    def get_params(self):
        """Return the LightGBM parameter dict (RMSE regression with early stopping)."""
        # 'lambda_l1' / 'lambda_l2' regularisation is intentionally left unset.
        return dict(
            n_estimators=5000,
            boosting_type='gbdt',
            objective='regression',
            metric='rmse',
            subsample=0.85,
            random_state=27,
            subsample_freq=1,
            learning_rate=0.02,
            feature_fraction=0.9,
            max_depth=30,
            early_stopping_rounds=200,
        )

In [None]:
class Xgb_Model(Base_Model):
    """XGBoost regression model plugged into the ``Base_Model`` CV loop."""

    def train_model(self, train_set, val_set):
        """Train a booster for up to 5000 rounds with early stopping after 300."""
        if self.verbose:
            verbosity = 100
        else:
            verbosity = 0
        watchlist = [(train_set, 'train'), (val_set, 'val')]
        booster = xgb.train(
            self.params,
            train_set,
            num_boost_round=5000,
            evals=watchlist,
            verbose_eval=verbosity,
            early_stopping_rounds=300,
        )
        return booster

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        """Wrap the fold data into two ``xgb.DMatrix`` objects."""
        return xgb.DMatrix(x_train, y_train), xgb.DMatrix(x_val, y_val)

    def convert_x(self, x):
        """Wrap a feature frame into a ``xgb.DMatrix`` for prediction."""
        matrix = xgb.DMatrix(x)
        return matrix

    def get_params(self):
        """Return the XGBoost parameter dict (squared-error regression)."""
        # 'eval_metric', 'min_child_weight' and 'gamma' are intentionally left unset.
        return dict(
            colsample_bytree=0.85,
            learning_rate=0.03,
            max_depth=6,
            subsample=1,
            seed=7,
            objective='reg:squarederror',
            n_estimators=5000,
        )

In [None]:
class Catb_Model(Base_Model):
    """CatBoost regression model plugged into the ``Base_Model`` CV loop."""

    def train_model(self, train_set, val_set):
        """Fit a ``CatBoostRegressor`` using the validation set as eval set."""
        if self.verbose:
            verbosity = 100
        else:
            verbosity = 0
        regressor = CatBoostRegressor(**self.params)
        regressor.fit(
            train_set['X'],
            train_set['y'],
            eval_set=(val_set['X'], val_set['y']),
            verbose=verbosity,
            cat_features=self.categoricals,
        )
        return regressor

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        """Bundle features and target of each split into an ``{'X', 'y'}`` dict."""
        return dict(X=x_train, y=y_train), dict(X=x_val, y=y_val)

    def get_params(self):
        """Return the CatBoost parameter dict (RMSE loss on CPU)."""
        return dict(
            loss_function='RMSE',
            task_type="CPU",
            iterations=5000,
            od_type="Iter",
            depth=7,
            colsample_bylevel=0.5,
            early_stopping_rounds=300,
            l2_leaf_reg=18,
            random_seed=42,
            use_best_model=True,
        )

In [None]:
import tensorflow as tf
import keras.backend as K
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from keras.callbacks import *
from keras.optimizers import *
from keras import Model
class MyCustomCallback(tf.keras.callbacks.Callback):
    """Checkpoint the model weights that give the best validation score.

    After every epoch the model predicts on the held-out data, the score is
    taken from ``eval_qwk_lgb_regr(y, pred)[1]`` (presumably QWK -- defined
    elsewhere in the notebook) and the weights are saved to ``model_name``
    whenever the score improves. At the end of training the best weights are
    loaded back into the model.

    Args:
        validation_data: ``(x, y)`` pair used for scoring.
        model_name: path the weights are saved to / restored from.
    """

    def __init__(self,validation_data,model_name):
        # Let the Keras base class initialise its own bookkeeping attributes.
        super().__init__()
        self.x = validation_data[0]
        self.y = validation_data[1]
        # Start below any reachable score so the first scored epoch always
        # writes a checkpoint. With a start value of 0, a run whose score never
        # exceeded 0 left no checkpoint, and the later ``load_weights`` would
        # pick up a stale file from an earlier run (or fail).
        self.score = float('-inf')
        self.model_name = model_name

    def _validation_score(self):
        """Score the current weights on the held-out data."""
        pred = self.model.predict(self.x)
        return eval_qwk_lgb_regr(self.y, pred)[1]

    def on_train_begin(self,log=None):
        return

    def on_train_end(self,log=None):
        """Keep the final weights if they are the best, otherwise restore the best checkpoint."""
        cur_score = self._validation_score()
        print('\r train score is: {}'.format(cur_score))
        if self.score<cur_score:
            self.model.save_weights(self.model_name)
            self.score = cur_score
        else:
            self.model.load_weights(self.model_name)
        return

    def on_epoch_begin(self,epoch, logs=None):
        return

    def on_epoch_end(self,epoch, logs=None):
        """Save the weights whenever the validation score improves."""
        cur_score = self._validation_score()
        if self.score<cur_score:
            print('\r update best {} epoch score is: {}'.format(epoch,cur_score))
            self.model.save_weights(self.model_name)
            self.score = cur_score
        return
        
def swish(x):
    """Swish activation: ``sigmoid(x) * x``, computed with the Keras backend."""
    gate = K.sigmoid(x)
    return gate * x
def mish(x):
    """Mish activation: ``x * tanh(softplus(x))``, computed with the Keras backend."""
    squashed = K.tanh(K.softplus(x))
    return x * squashed

class Nn_Model(Base_Model):
    """Dense Keras regressor implementing the ``Base_Model`` hook methods.

    Before handing the data to ``Base_Model.__init__`` the constructor
    one-hot encodes every column listed in ``categoricals`` (encoder fitted on
    the training frame) and min-max scales all features (scaler fitted on the
    training frame). The caller's frames are never modified.
    """

    def __init__(self, train_df, test_df, features,groups,random_state, categoricals=[], fi=False,n_splits=5, verbose=True):
        features = features.copy()
        train_parts = [train_df]
        test_parts = [test_df]
        for cat in categoricals:
            enc = OneHotEncoder()
            train_cats = enc.fit_transform(train_df[[cat]])
            test_cats = enc.transform(test_df[[cat]])
            # ``categories_`` replaces the removed ``active_features_`` attribute;
            # one encoder per column, so the first entry holds this column's levels.
            cat_cols = ['{}_{}'.format(cat, str(col)) for col in enc.categories_[0]]
            features += cat_cols
            # Reuse the source index so the column-wise concat below lines the
            # rows up even when the frames do not carry a default RangeIndex.
            train_parts.append(pd.DataFrame(train_cats.toarray(), columns=cat_cols, index=train_df.index))
            test_parts.append(pd.DataFrame(test_cats.toarray(), columns=cat_cols, index=test_df.index))
        # A single concat per frame; it always returns a new object, so the
        # in-place scaling below cannot leak into the caller's data.
        train_df = pd.concat(train_parts, axis=1)
        test_df = pd.concat(test_parts, axis=1)
        scalar = MinMaxScaler()
        train_df[features] = scalar.fit_transform(train_df[features])
        test_df[features] = scalar.transform(test_df[features])
        print(train_df[features].shape)
        super().__init__(train_df, test_df, features,groups,random_state, categoricals,fi, n_splits, verbose)

    def train_model(self, train_set, val_set):
        """Train the network on one fold and return it with its best weights loaded.

        ``train_set`` / ``val_set`` are the ``{'X': ..., 'y': ...}`` dicts built
        by ``convert_dataset``. Training stops early after 15 epochs without
        ``val_loss`` improvement; the weights restored at the end are the ones
        ``MyCustomCallback`` checkpointed.
        """
        checkpoint_path = 'nn_model1.w8'
        # Use the tf.keras callback so it matches the tf.keras model below
        # (the wildcard import provides the standalone-keras class instead).
        early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                      min_delta=0,
                                                      patience=15,
                                                      mode='auto')
        best_score_checkpoint = MyCustomCallback(validation_data=(val_set['X'], val_set['y']),
                                                 model_name=checkpoint_path)

        # Four hidden blocks (Dense -> LayerNormalization -> Dropout) of
        # shrinking width, followed by a single-unit output layer.
        layers = [tf.keras.layers.Input(shape=(train_set['X'].shape[1],))]
        for units in (200, 100, 50, 25):
            layers.append(tf.keras.layers.Dense(units, activation=mish))
            layers.append(tf.keras.layers.LayerNormalization())
            layers.append(tf.keras.layers.Dropout(0.4))
        layers.append(tf.keras.layers.Dense(1, activation=mish))

        model = tf.keras.models.Sequential(layers)
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=4e-4), loss='mse')
        model.fit(train_set['X'],
                  train_set['y'],
                  validation_data=(val_set['X'], val_set['y']),
                  epochs=100,
                  # Honour the model-level verbose flag (previously computed but unused).
                  verbose=1 if self.verbose else 0,
                  callbacks=[best_score_checkpoint, early_stop])
        model.load_weights(checkpoint_path)
        return model

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        """Bundle each split into an ``{'X': features, 'y': labels}`` dict."""
        train_set = {'X': x_train, 'y': y_train}
        val_set = {'X': x_val, 'y': y_val}
        return train_set, val_set

    def get_params(self):
        """The network is configured inside ``train_model``; there is no parameter dict."""
        return None

In [None]:
# Run a full garbage-collection pass; the cell output shows gc.collect()'s return value.
gc.collect()

In [None]:
# Read the data.
train, test, train_labels, specs, sample_submission = read_data()
# Get the useful dicts / lists holding the encoding maps.
(
    train,
    test,
    train_labels,
    win_code,
    list_of_user_activities,
    list_of_event_code,
    activities_labels,
    assess_titles,
    list_of_event_id,
    all_title_event_code,
    activities_map,
) = encode_title(train, test, train_labels)
# Transform function that builds the train and test feature sets.
reduce_train, reduce_test, categoricals = get_train_and_test(train, test)

In [None]:
# Cast every column label of both feature tables to str.
for _frame in (reduce_train, reduce_test):
    _frame.columns = _frame.columns.astype(str)

In [None]:
def stract_hists(feature, train=None, test=None, adjust=False, plot=False):
    """Compare the train and test distributions of one feature.

    Both columns are clipped to ``[0, 95th percentile of train]``, binned into
    10 bins that share the same edges, normalised to frequencies, and the mean
    squared difference between the two histograms is returned.

    Args:
        feature: column name present in both frames.
        train: training frame; defaults to the notebook-level ``reduce_train``.
        test: test frame; defaults to the notebook-level ``reduce_test``.
        adjust: if True, rescale the test values so their mean matches the
            train mean before comparing. The test frame itself is not modified.
        plot: if True, print the error and draw both histograms overlaid.

    Returns:
        Mean squared error between the two normalised histograms.

    Raises:
        ValueError: if the clipped values are not finite (for example when the
            test mean is 0 and ``adjust`` is set), as raised by numpy's binning.
    """
    # Resolve the defaults at call time: defaults bound in the signature would
    # keep pointing at the old frames after reduce_train / reduce_test are
    # reassigned later in the notebook.
    if train is None:
        train = reduce_train
    if test is None:
        test = reduce_test
    n_bins = 10
    train_data = train[feature]
    test_data = test[feature]
    if adjust:
        # Out-of-place multiply: an in-place ``*=`` can write through to the
        # caller's frame.
        test_data = test_data * (train_data.mean() / test_data.mean())
    upper = np.percentile(train_data, 95)
    train_data = np.clip(train_data, 0, upper)
    test_data = np.clip(test_data, 0, upper)
    # Shared bin edges; without them bin i of one histogram does not cover the
    # same value range as bin i of the other and the bin-wise error is meaningless.
    edges = np.histogram_bin_edges(np.concatenate([train_data, test_data]), bins=n_bins)
    train_hist = np.histogram(train_data, bins=edges)[0] / len(train_data)
    test_hist = np.histogram(test_data, bins=edges)[0] / len(test_data)
    # Plain MSE of two equal-length 1-D arrays.
    msre = np.mean((train_hist - test_hist) ** 2)
    if plot:
        print(msre)
        plt.bar(range(n_bins), train_hist, color='blue', alpha=0.5)
        plt.bar(range(n_bins), test_hist, color='red', alpha=0.5)
        plt.show()
    return msre
# Spot-check one feature: prints the histogram error and plots train (blue) vs test (red).
stract_hists('Magma Peak - Level 1_2000', plot=True, adjust=False)

In [None]:
# call feature engineering function
features = reduce_train.loc[(reduce_train.sum(axis=1) != 0), (reduce_train.sum(axis=0) != 0)].columns # delete useless columns
features = [x for x in features if x not in ['accuracy_group', 'installation_id']]

In [None]:
import gc  # re-import; gc is already imported in the notebook's first cell
# Run a full garbage-collection pass; the cell output shows gc.collect()'s return value.
gc.collect()

In [None]:
# Load the feature tables from feather files in the working directory. This
# rebinds reduce_train / reduce_test, replacing the frames returned by
# get_train_and_test() earlier in the notebook.
# NOTE(review): `features` was derived from the earlier frames -- confirm the
# feather files hold the same columns.
reduce_train = pd.read_feather('reduce_train.ftr')
reduce_test = pd.read_feather('reduce_test.ftr')

In [None]:
# Rescale every test feature so that its mean matches the train mean.
# Features whose scale factor is extreme (outside [0.1, 10]) or not finite,
# or whose comparison fails, are collected in `to_exclude` instead.
to_exclude = []
ajusted_test = reduce_test.copy()

for feature in ajusted_test.columns:
    # These columns are never rescaled.
    if feature in ('accuracy_group', 'installation_id', 'session_title'):
        continue
    train_mean = reduce_train[feature].mean()
    test_mean = ajusted_test[feature].mean()
    try:
        error = stract_hists(feature, adjust=True)
        ajust_factor = train_mean / test_mean
        # A NaN factor fails both range comparisons, so test finiteness
        # explicitly rather than multiplying NaN into the column.
        if not np.isfinite(ajust_factor) or ajust_factor > 10 or ajust_factor < 0.1:# or error > 0.01:
            to_exclude.append(feature)
            print(feature, train_mean, test_mean, error)
        else:
            ajusted_test[feature] *= ajust_factor
    except Exception:
        # Best effort: a feature that cannot be compared is excluded. Catching
        # Exception (not a bare except) keeps the cell interruptible.
        to_exclude.append(feature)
        print(feature, train_mean, test_mean)

In [None]:
# Drop the features rejected by the train/test adjustment step, then show the
# shape of the remaining training matrix.
features = [feat for feat in features if feat not in to_exclude]
reduce_train[features].shape

In [None]:
#fi_model = Lgb_Model(reduce_train, ajusted_test, features,groups=reduce_train['installation_id'],random_state=1, categoricals=categoricals)

In [None]:
'''
feature_imp = pd.DataFrame(
    sorted(zip(fi_model.model.feature_importance(importance_type="gain"), features)),
    columns=["value", "feature"])
feature_imp = feature_imp.sort_values(by = 'value',ascending=False)
feature_imp = feature_imp.reset_index()
'''

In [None]:
'''
feature_imp.head(30)
'''

In [None]:
#features11 = sorted(list(set(feature_imp.iloc[5:][feature_imp.iloc[5:].index %  4!= 0].feature).union(set(feature_imp.iloc[:5].feature))))
#features22 = sorted(list(set(feature_imp.iloc[5:][feature_imp.iloc[5:].index %  4!= 1].feature).union(set(feature_imp.iloc[:5].feature))))
#features33 = sorted(list(set(feature_imp.iloc[5:][feature_imp.iloc[5:].index %  4!= 2].feature).union(set(feature_imp.iloc[:5].feature))))
#features44 = sorted(list(set(feature_imp.iloc[5:][feature_imp.iloc[5:].index %  4!= 3].feature).union(set(feature_imp.iloc[:5].feature))))

In [None]:
import gc  # re-import; gc is already imported in the notebook's first cell
# Run a full garbage-collection pass; the cell output shows gc.collect()'s return value.
gc.collect()

In [None]:
def select_uncorrelated_features(reduce_train, features):
    """Greedily pair up features whose Pearson correlation is at least 0.8.

    Features are scanned in the given order. Each still-unpaired feature is
    matched with the first other unpaired feature it correlates with
    (``>= 0.8``); both are then marked as paired and never reused. Every match
    is printed.

    Args:
        reduce_train: mapping / frame indexable by feature name.
        features: iterable of feature names to consider.

    Returns:
        tuple: ``(firsts, seconds)`` -- the sorted first members and the
        sorted second members of the matched pairs.
    """
    pair_count = 0
    first_members = []
    second_members = []
    paired = set()
    for feat_a in features:
        if feat_a in paired:
            continue
        for feat_b in features:
            if feat_b == feat_a or feat_b in paired:
                continue
            c = np.corrcoef(reduce_train[feat_a], reduce_train[feat_b])[0][1]
            if c >= 0.8:
                pair_count += 1
                first_members.append(feat_a)
                second_members.append(feat_b)
                paired.update((feat_a, feat_b))
                print('{}: FEAT_A: {} FEAT_B: {} - Correlation: {}'.format(pair_count, feat_a, feat_b, c))
                break
    return sorted(first_members), sorted(second_members)

In [None]:
#to_seperate1,to_seperate2 = select_uncorrelated_features(reduce_train, features)

In [None]:
#to_seperate11,to_seperate12 = select_uncorrelated_features(reduce_train, to_seperate1)
#to_seperate21,to_seperate22 = select_uncorrelated_features(reduce_train, to_seperate2)

In [None]:
#print(len(features11))
#print(len(features22))
#print(len(features33))
#print(len(features44))
#print(len(features))

In [None]:
#features1 = [feat for feat in features if feat in to_seperate11]
#features2 = [feat for feat in features if feat in to_seperate12]
#features3 = [feat for feat in features if feat in to_seperate21]
#features4 = [feat for feat in features if feat in to_seperate22]
#features11 = sorted(list(set(features1).union(set(['session_title']))))
#features22 = sorted(list(set(features2).union(set(['session_title']))))
#features33 = sorted(list(set(features3).union(set(['session_title']))))
#features44 = sorted(list(set(features4).union(set(['session_title']))))
#features11 = sorted(list(set(features1).union(set(features2)).union(set(features3)).union(set(['session_title']))))
#features22 = sorted(list(set(features2).union(set(features3)).union(set(features4)).union(set(['session_title']))))
#features33 = sorted(list(set(features1).union(set(features2)).union(set(features4)).union(set(['session_title']))))
#features44 = sorted(list(set(features1).union(set(features3)).union(set(features4)).union(set(['session_title']))))

In [None]:
#cat_model = Catb_Model(reduce_train, ajusted_test, features, categoricals=categoricals)
#lgb_model1 = Lgb_Model(reduce_train, ajusted_test, features,groups=reduce_train['installation_id'],random_state=5, categoricals=categoricals)
#lgb_model2 = Lgb_Model(reduce_train, ajusted_test, features,groups=reduce_train['installation_id'],random_state=16, categoricals=categoricals)
#lgb_model3 = Lgb_Model(reduce_train, ajusted_test, features,groups=reduce_train['installation_id'],random_state=27, categoricals=categoricals)
#lgb_model4 = Lgb_Model(reduce_train, ajusted_test, features,groups=reduce_train['installation_id'],random_state=31, categoricals=categoricals)
#gc.collect()
#xgb_model1 = Lgb_Model(reduce_train, ajusted_test, features,groups=reduce_train['installation_id'],random_state=35, categoricals=categoricals)
#xgb_model2 = Lgb_Model(reduce_train, ajusted_test, features,groups=reduce_train['installation_id'],random_state=41, categoricals=categoricals)
#xgb_model3 = Lgb_Model(reduce_train, ajusted_test, features,groups=reduce_train['installation_id'],random_state=46, categoricals=categoricals)
#xgb_model4 = Lgb_Model(reduce_train, ajusted_test, features,groups=reduce_train['installation_id'],random_state=51, categoricals=categoricals)
gc.collect()
# Four NN models built from identical inputs; only random_state differs.
nn_model1, nn_model2, nn_model3, nn_model4 = [
    Nn_Model(reduce_train, ajusted_test, features,
             groups=reduce_train['installation_id'],
             random_state=seed,
             categoricals=categoricals)
    for seed in (9, 10, 11, 12)
]
#gc.collect()

In [None]:
#cnn_model = Cnn_Model(reduce_train, ajusted_test, features, categoricals=categoricals)
#nn_model = Nn_Model(reduce_train, ajusted_test, features, categoricals=categoricals)

In [None]:
# Pairwise correlation between the four NN models' oof_pred vectors.
np.corrcoef([
    nn_model1.oof_pred,
    nn_model2.oof_pred,
    nn_model3.oof_pred,
    nn_model4.oof_pred,
])
            

In [None]:
# Pairwise correlation between the four NN models' y_pred vectors.
np.corrcoef([
    nn_model1.y_pred,
    nn_model2.y_pred,
    nn_model3.y_pred,
    nn_model4.y_pred,
])
            

In [None]:
# Pairwise correlation between the four models' oof_true vectors
# (presumably the out-of-fold targets -- confirm against Base_Model).
np.corrcoef([
    nn_model1.oof_true,
    nn_model2.oof_true,
    nn_model3.oof_true,
    nn_model4.oof_true,
])

In [None]:
# Display names of the blended models; ridgecv_predict() zips them with the
# ridge coefficients, so the order must match the model order used there.
model_lists = ['nn{}'.format(idx) for idx in range(1, 5)]

In [None]:
import seaborn as sns
from sklearn.linear_model import RidgeCV
def ridgecv_predict():
    """Fit a RidgeCV blender on the four NN models and return its coefficients.

    The design matrix has one column per model (``oof_pred``); the target is
    the element-wise mean of the models' ``oof_true`` vectors. Reads the
    notebook-level ``nn_model1`` .. ``nn_model4`` and ``model_lists``.

    NOTE(review): only ``coef_`` is returned; the fitted intercept is not.
    NOTE(review): ``normalize=True`` is rejected by recent scikit-learn
    releases -- confirm the installed version still accepts it.

    Returns:
        The fitted ``coef_`` array, one weight per model.
    """
    PRINT_RIDGE_WEIGHTS = True
    RIDGE_ALPHAS = (0.1, 1.0, 10.0)
    blended_models = (nn_model1, nn_model2, nn_model3, nn_model4)

    X = np.array([model.oof_pred for model in blended_models]).T
    y = np.array([model.oof_true for model in blended_models]).mean(axis=0).T

    reg = RidgeCV(alphas = RIDGE_ALPHAS, normalize=True)
    reg.fit(X, y)
    if PRINT_RIDGE_WEIGHTS:
        print("## Ridge Coefficients")
        print(f'Sum of coefficients: {sum(reg.coef_)}')
        for ww, ss in zip(reg.coef_, model_lists):
            print(f'{ss} has weight {ww:.5f}')
    return reg.coef_

In [None]:
# Ridge blending weights, one per NN model, in the order used inside ridgecv_predict().
coeff = ridgecv_predict()

In [None]:
#weights = {'lbg': 0.80, 'cat': 0, 'xgb': 0.20, 'nn': 0.00}

#final_pred = (lgb_model.y_pred * weights['lbg']) + (xgb_model.y_pred * weights['xgb']) + (nn_model.y_pred * weights['nn'])
#final_pred = cnn_model.y_pred


In [None]:
#pd.DataFrame([(round(a, 2), round(b, 2), round(c, 2), round(d, 2)) for a, b, c, d in zip(lgb_model.y_pred, cat_model.y_pred, xgb_model.y_pred, nn_model.y_pred)], columns=['lgb', 'cat', 'xgb', 'nn']).head(50)

In [None]:
# Blend the out-of-fold predictions with the ridge weights.
oof_final_pred = (
    coeff[0] * nn_model1.oof_pred
    + coeff[1] * nn_model2.oof_pred
    + coeff[2] * nn_model3.oof_pred
    + coeff[3] * nn_model4.oof_pred
)
#print(final_pred.shape)

In [None]:
# Correlation of each model's oof_pred with the others and with the blend (last row / column).
np.corrcoef([
    nn_model1.oof_pred,
    nn_model2.oof_pred,
    nn_model3.oof_pred,
    nn_model4.oof_pred,
    oof_final_pred,
])

In [None]:
# Blend the test-set predictions with the same ridge weights.
final_pred = (
    coeff[0] * nn_model1.y_pred
    + coeff[1] * nn_model2.y_pred
    + coeff[2] * nn_model3.y_pred
    + coeff[3] * nn_model4.y_pred
)
print(final_pred.shape)

In [None]:
# Correlation of each model's y_pred with the others and with the blend (last row / column).
np.corrcoef([
    nn_model1.y_pred,
    nn_model2.y_pred,
    nn_model3.y_pred,
    nn_model4.y_pred,
    final_pred,
])

In [None]:
from numba import jit
class OptimizedRounder(object):
    """
    An optimizer for rounding thresholds
    to maximize Quadratic Weighted Kappa (QWK) score
    # https://www.kaggle.com/naveenasaithambi/optimizedrounder-improved

    Three cut points split the continuous predictions into the classes
    0, 1, 2 and 3; ``fit`` searches for the cut points with Nelder-Mead.
    """

    def __init__(self):
        # Replaced by the scipy optimisation result once ``fit`` has run.
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        """
        Get loss according to
        using current coefficients

        :param coef: A list of coefficients that will be used for rounding
        :param X: The raw predictions
        :param y: The ground truth labels
        :return: Negative ``qwk`` of the rounded predictions (lower is better)
        """
        # Same binning as ``predict`` -- reuse it instead of duplicating pd.cut.
        return -qwk(y, self.predict(X, coef))

    def fit(self, X, y):
        """
        Optimize rounding thresholds

        The starting point is drawn from ``np.random.uniform(0, 3)``, so
        results vary between calls unless the numpy seed is fixed.

        :param X: The raw predictions
        :param y: The ground truth labels
        """
        # Imported here because the only imports of these names visible near
        # this class sit inside a disabled (string-literal) cell.
        from functools import partial
        from scipy.optimize import minimize

        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = sorted(np.random.uniform(0,3,3))
        self.coef_ = minimize(loss_partial, initial_coef, method='nelder-mead', options={
            'maxiter': 5000})

    def predict(self, X, coef):
        """
        Make predictions with specified thresholds

        :param X: The raw predictions
        :param coef: A list of coefficients that will be used for rounding
        :return: ``pd.cut`` result with labels 0-3; a value equal to a
            threshold falls into the lower class
        """
        return pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels=[0, 1, 2, 3])

    def coefficients(self):
        """
        Return the optimized coefficients (the ``'x'`` entry of the optimisation result)
        """
        return self.coef_['x']
'''
@jit
def qwk(y_true: Union[np.ndarray, list],
        y_pred: Union[np.ndarray, list],
        max_rat: int = 3) -> float:
    y_true_ = np.asarray(y_true, dtype=int)
    y_pred_ = np.asarray(y_pred, dtype=int)

    hist1 = np.zeros((max_rat + 1, ))
    hist2 = np.zeros((max_rat + 1, ))

    numerator = 0
    for k in range(y_true_.shape[0]):
        i, j = y_true_[k], y_pred_[k]
        hist1[i] += 1
        hist2[j] += 1
        numerator += (i - j) * (i - j)

    denominator = 0
    for i in range(max_rat + 1):
        for j in range(max_rat + 1):
            denominator += hist1[i] * hist2[j] * (i - j) * (i - j)

    denominator /= y_true_.shape[0]
    return 1 - numerator / denominator


def calc_metric(y_true: Union[np.ndarray, list],
                y_pred: Union[np.ndarray, list]) -> float:
    return qwk(y_true, y_pred)
'''
def qwk(act,pred,n=4,hist_range=(0,3), weights = None):
    """Quadratic weighted kappa between two sets of integer ratings.

    Args:
        act: actual ratings.
        pred: predicted ratings.
        n: number of rating classes.
        hist_range: ``(lowest, highest)`` rating; the classes are the ``n``
            consecutive integers starting at ``hist_range[0]``.
        weights: optional per-sample weights applied to both the observed
            confusion matrix and the marginal histograms.

    Returns:
        ``1 - weighted observed disagreement / weighted expected disagreement``.
    """
    # Imported locally: the only import of confusion_matrix visible near this
    # function sits inside a disabled (string-literal) cell.
    from sklearn.metrics import confusion_matrix

    # Pin the label set so the matrix is always n x n, even when a class is
    # missing from both inputs; otherwise it cannot be multiplied with W below.
    labels = np.arange(n) + hist_range[0]
    O = confusion_matrix(act,pred,labels=labels,sample_weight = weights)
    O = np.divide(O,np.sum(O)) #Agreement Actual

    # Quadratic penalty matrix: W[i][j] = (i - j)^2 / (n - 1)^2
    idx = np.arange(n)
    W = (idx[:, None] - idx[None, :]) ** 2 / ((n - 1) ** 2)

    act_hist = np.histogram(act,bins=n,range=hist_range, weights=weights)[0]
    prd_hist = np.histogram(pred,bins=n,range=hist_range, weights= weights)[0]

    E = np.outer(act_hist,prd_hist)
    E = np.divide(E,np.sum(E)) #Agreement Expectation

    num = np.sum(np.multiply(W,O)) #Weighted Agreement Actual
    den = np.sum(np.multiply(W,E)) #Weighted Agreement Expectation

    return 1-np.divide(num,den)

# Per-row weight: 1 / (number of non-null accuracy_group rows sharing the row's installation_id).
weights = 1 / reduce_train.groupby('installation_id')['accuracy_group'].transform('count')

In [None]:
'''
best_score = 0
from functools import partial
import scipy as sp
from sklearn.metrics import confusion_matrix
#Y = np.array([lgb_model1.oof_true,lgb_model2.oof_true,lgb_model3.oof_true,lgb_model4.oof_true
            #,xgb_model1.oof_true,xgb_model2.oof_true,xgb_model3.oof_true,xgb_model4.oof_true]).mean(axis=0).T
Y = np.array([nn_model1.oof_true,nn_model2.oof_true,nn_model3.oof_true,nn_model4.oof_true]).mean(axis=0).T
for i in tqdm(range(100)):
    optR = OptimizedRounder()
    optR.fit(oof_final_pred, Y)
    coefficients = optR.coefficients()
    opt_preds1 = optR.predict(oof_final_pred, coefficients)
    score = qwk(Y, opt_preds1)
    if score > best_score:
        print(score)
        best_score = score
        best_coefficients = coefficients
print(score)
print(best_coefficients)
final_pred = optR.predict(final_pred, best_coefficients)
'''

In [None]:
# Share of each accuracy_group class in the training set.
dist = Counter(reduce_train['accuracy_group'])
for k in dist:
    dist[k] /= len(reduce_train)
reduce_train['accuracy_group'].hist()

# Cut points on the blended test predictions: bound[i] is the percentile of
# final_pred that matches the cumulative training share of classes 0..i.
bound = {}
acum = 0
for i in range(3):
    acum += dist[i]
    bound[i] = np.percentile(final_pred, acum * 100)
print(bound)

def classify(x):
    """Map a continuous prediction to a class 0-3 using the notebook-level ``bound`` cut points.

    Returns the first class ``k`` in 0..2 with ``x <= bound[k]``, else 3.
    """
    for label in range(3):
        if x <= bound[label]:
            return label
    return 3

# Turn the blended predictions into class labels, write the submission file
# and show the resulting class shares.
# NOTE: final_pred is overwritten with the labels, so this cell is not re-runnable on its own.
final_pred = np.array([classify(value) for value in final_pred])
sample_submission['accuracy_group'] = final_pred.astype(int)
sample_submission.to_csv('submission.csv', index=False)
sample_submission['accuracy_group'].value_counts(normalize=True)