# In [1]:
import os, tqdm, json, pickle, gc, zipfile, itertools, time
import pandas as pd
import numpy as np
from dateutil import parser
import warnings
warnings.filterwarnings('ignore')
from collections import defaultdict
from multiprocessing import Pool
import catboost as cb
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score, ParameterGrid, StratifiedKFold, cross_val_predict
from sklearn.metrics import roc_auc_score
from sklearn.inspection import permutation_importance
import optuna
from optuna.samplers import TPESampler
from tqdm.contrib.concurrent import process_map  
import seaborn as sns
import matplotlib.pyplot as plt
import shap 
from sklearn.model_selection import KFold
from nancorrmp.nancorrmp import NaNCorrMp
# from pathos.multiprocessing import ProcessingPool as Pool
import multiprocessing as mp

class CatBoostOptimizer():
    """Search CatBoost classifier hyper-parameters with an Optuna TPE study.

    Every trial trains a ``CatBoostClassifier`` on 90% of the data given to
    ``fit`` and is scored on the remaining 10% with ``scoring_func``.
    """

    def __init__(self, scoring_func, const_params, seed, direction, n_trials):
        """
        Parameters:
            scoring_func - callable invoked as scoring_func(y_true, y_proba)
            const_params - dict of CatBoost parameters merged over the sampled
                           ones (it wins when a key appears in both)
            seed         - seed for the split, the sampler and the model
            direction    - forwarded to optuna.create_study
            n_trials     - number of Optuna trials to run
        """
        self.scoring_func = scoring_func
        self.const_params = const_params
        self.seed = seed
        self.direction = direction
        self.n_trials = n_trials

    def objective(self, trial):
        """Train one model with sampled parameters and return its hold-out score."""
        # 'iterations' is not sampled here (presumably supplied through
        # const_params - verify against the caller).
        params = {
            'learning_rate': trial.suggest_float('learning_rate', 0.025, 0.25),
            'depth': trial.suggest_int('depth', 3, 12),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 31),
            'boosting_type': trial.suggest_categorical('boosting_type', ['Ordered', 'Plain']),
        }
        params.update(self.const_params)

        model = cb.CatBoostClassifier(**params, random_seed=self.seed)
        model.fit(self.X_tr_c, self.y_tr_c, verbose=0, eval_set=(self.X_ho_c, self.y_ho_c))

        y_proba = model.predict_proba(self.X_ho_c)[:, 1]

        return self.scoring_func(self.y_ho_c, y_proba)

    def fit(self, X_tr, y_tr):
        """Run the study on a 90/10 split of (X_tr, y_tr); store ``best_params``.

        Returns self.
        """
        # Positional indices of object-dtype columns. Only stored on the
        # instance; nothing else in this class reads it.
        self.cat_features = np.argwhere(X_tr.dtypes.values == 'object').flatten()

        self.X_tr_c, self.X_ho_c, self.y_tr_c, self.y_ho_c = \
            train_test_split(X_tr, y_tr,
                             test_size=.1,
                             shuffle=True,
                             random_state=self.seed)
        try:
            sampler = TPESampler(seed=self.seed)
            study = optuna.create_study(direction=self.direction, sampler=sampler)
            study.optimize(self.objective, n_trials=self.n_trials)
            self.best_params = study.best_params
        finally:
            # Release the split copies even when a trial raises, so a failed
            # search does not keep the training data pinned on the instance.
            del self.X_tr_c, self.X_ho_c, self.y_tr_c, self.y_ho_c
            gc.collect()

        return self

    def transform(self):
        """Return the best parameters found by ``fit``."""
        return self.best_params

class CsgoOutcomePredictor():

    def __init__(self):
        pass    
    
    def get_game_collection(self, PATH_TO_DIR):

        """
        Description: collect the parser responses stored as JSON files
        Parameters: PATH_TO_DIR - path to the directory with the responses
        Returns: list of decoded responses ordered by 'id', newest (largest) first
        """

        L_COLLECTION = []
        for fnm in tqdm.tqdm(os.listdir(PATH_TO_DIR)):
            pth = os.path.join(PATH_TO_DIR, fnm)
            try:
                with open(pth, 'r') as f:
                    d_rsp = json.load(f)
            except (OSError, ValueError):
                # Best effort: skip entries that cannot be read or are not valid
                # JSON (JSONDecodeError and UnicodeDecodeError are ValueErrors).
                # Unlike a bare except this still lets Ctrl-C stop the loop.
                continue
            # A response without an 'id' cannot be ordered below; skip it
            # instead of failing the whole collection.
            if isinstance(d_rsp, dict) and 'id' in d_rsp:
                L_COLLECTION.append(d_rsp)

        # Plain sort: also works for an empty directory, where indexing through
        # np.argsort([]) (a float array) raised IndexError.
        return sorted(L_COLLECTION, key=lambda d_game: d_game['id'], reverse=True)

    def add_global_info(self, d_game):
        """
        Description: flatten the game-level fields of one parser response
        Parameters: d_game - parser response (dict) for one game
        Returns: dict with id, match_id, match_type, number_of_games, date
                 (parsed from 'begin_at'), map_id, league_id, serie_id,
                 tournament_id and serie_tier
        """
        # Fields are read in the same order as before so that a malformed
        # response fails on the same missing key.
        d = {'id': d_game['id'], 'match_id': d_game['match_id']}

        d_match = d_game['match']
        d['match_type'] = d_match['match_type']
        d['number_of_games'] = d_match['number_of_games']
        d['date'] = parser.parse(d_game['begin_at'])
        d['map_id'] = d_game['map']['id']
        d['league_id'] = d_match['league']['id']

        d_serie = d_match['serie']
        d['serie_id'] = d_serie['id']
        d['tournament_id'] = d_match['tournament']['id']
        d['serie_tier'] = d_serie['tier']

        return d

    def add_profile(self, d_game):
        """
        Description: build one flat record per player for a single game
        Parameters: d_game - parser response (dict) for one game
        Returns: list of 10 dicts (one per player), or None when the game is
                 skipped: map outside the tracked pool, empty round list,
                 first recorded round is not round 1, or player count != 10
        """

        # identifiers of the maps that are tracked
        l_map2use = [1, 2, 6, 7, 8, 20, 31]
        # keys holding the player statistics
        l_stat_keys = ['adr', 'assists', 'deaths', 'first_kills_diff', 'flash_assists',
                       'headshots', 'k_d_diff', 'kast', 'kills', 'rating']

        # game-level information
        d_info = self.add_global_info(d_game)
        if d_info['map_id'] not in l_map2use:
            return None

        l_rounds = d_game['rounds']
        # An empty round list used to raise IndexError; treat it as "skip".
        if not l_rounds or l_rounds[0]['round'] != 1:
            return None

        # round-level information
        df_rounds = pd.DataFrame.from_records(l_rounds)
        start_ct_id = l_rounds[0]['ct']
        winner_id = df_rounds['winner_team'].value_counts().idxmax()
        maxround = df_rounds['round'].max()
        df_h1 = df_rounds[df_rounds['round'] <= 15]
        df_h2 = df_rounds[df_rounds['round'] > 15]
        d_h1_win_count = df_h1['winner_team'].value_counts().to_dict()
        d_h2_win_count = df_h2['winner_team'].value_counts().to_dict()
        d_h1_outcome_count = df_h1['outcome'].value_counts().to_dict()
        d_h2_outcome_count = df_h2['outcome'].value_counts().to_dict()

        L = []
        # player-level information
        for p in d_game['players']:

            d = dict(d_info)

            # player identifier
            d['player_id'] = p['player']['id']
            # team identifier
            team_id = p['team']['id']
            d['team_id'] = team_id
            # opponent identifier
            d['opponent_id'] = p['opponent']['id']

            # player nationality
            d['player_nationality'] = p['player']['nationality']
            # player date of birth
            d['player_birthday'] = p['player']['birthday']
            # team country
            d['team_location'] = p['team']['location']

            # starting side
            d['start_ct'] = 1 if start_ct_id == team_id else 0
            # win flag
            d['win'] = 1 if winner_id == team_id else 0
            # total number of rounds in the game
            d['maxround'] = maxround

            # rounds won in each half; a team absent from the counts won none
            d['h1_win_count'] = d_h1_win_count.get(team_id, 0)
            d['h2_win_count'] = d_h2_win_count.get(team_id, 0)
            # round outcomes per half (game-wide counts, identical for both teams)
            for k, v in d_h1_outcome_count.items():
                d[f'h1_outcome_{k}_count'] = v
            for k, v in d_h2_outcome_count.items():
                d[f'h2_outcome_{k}_count'] = v

            # player statistics: raw values first, then per-round values;
            # missing statistics are stored as 0.0
            d.update({k: p[k] if pd.notnull(p[k]) else 0.0 for k in l_stat_keys})
            d.update({f'{k}_per_round': p[k] / maxround if pd.notnull(p[k]) else 0.0
                      for k in l_stat_keys})

            L.append(d)

        # only games with exactly ten players are kept
        return L if len(L) == 10 else None

    def get_profiles(self, L_COLLECTION):

        """
        Description: build player-level and team-level profiles of the games
        Parameters: L_COLLECTION - collection of parser responses
        Returns: dict with two DataFrames - 'player' (one row per player per
                 game, sorted by date) and 'team' (one row per team per game,
                 aggregated columns averaged over the team's players)
        """

        # game-level columns
        L_GLOBAL_KEYS = [
            'id', 'match_id', 'match_type', 'number_of_games',
            'date', 'year', 'month', 'day', 'weekday', 'hour',
            'map_id',
            'league_id', 'serie_id', 'tournament_id', 'serie_tier',
            'start_ct'
        ]
        # columns that are aggregated (mean over the players of a team)
        L_AGG_KEYS = [

            'h1_outcome_defused_count', 'h1_outcome_eliminated_count',
            'h1_outcome_exploded_count', 'h1_outcome_timeout_count',
            'h1_win_count', 'h2_outcome_defused_count',
            'h2_outcome_eliminated_count', 'h2_outcome_exploded_count',
            'h2_outcome_timeout_count', 'h2_win_count',

            'win', 'maxround',

            'adr', 'assists', 'deaths', 'first_kills_diff', 'flash_assists', 'headshots',
            'k_d_diff', 'kast', 'kills', 'rating',
            'adr_per_round', 'assists_per_round', 'deaths_per_round', 'first_kills_diff_per_round', 'flash_assists_per_round', 'headshots_per_round',
            'k_d_diff_per_round', 'kast_per_round', 'kills_per_round', 'rating_per_round'
        ]
        # team-identifying columns
        L_GROUP_KEYS = [
            'team_id', 'opponent_id', 'team_location', 'lineup'
        ]

        # player profiling
        L_player_profile = []
        for d_game in tqdm.tqdm(L_COLLECTION):
            try:
                l_rows = self.add_profile(d_game)
            except Exception:
                # Best effort: a malformed response is skipped. Catching
                # Exception (not a bare except) keeps Ctrl-C working in this
                # potentially long loop.
                continue
            # add_profile returns None for games it decides to skip
            if l_rows:
                L_player_profile.extend(l_rows)
        df_player_profile = pd.DataFrame.from_records(L_player_profile)
        del L_player_profile
        gc.collect()

        # keep only teams with exactly five players and attach the lineup key
        # (sorted player ids joined with '-')
        L_dict = []
        for (game_id, team_id), subdf in tqdm.tqdm(df_player_profile.groupby(['id', 'team_id'])):
            if subdf.shape[0] == 5:
                subdf_c = subdf.copy()
                subdf_c['lineup'] = '-'.join(subdf['player_id'].sort_values().astype(str))
                L_dict.extend(subdf_c.to_dict('records'))
        del df_player_profile
        gc.collect()
        df_player_profile = pd.DataFrame.from_records(L_dict).sort_values('date')
        del L_dict
        gc.collect()

        # calendar parts of the game date
        date = df_player_profile['date']
        df_player_profile['year'] = date.dt.year
        df_player_profile['month'] = date.dt.month
        df_player_profile['day'] = date.dt.day
        df_player_profile['weekday'] = date.dt.weekday
        df_player_profile['hour'] = date.dt.hour
        df_player_profile[['serie_tier', 'team_location']] = df_player_profile[['serie_tier', 'team_location']].fillna('default')

        # team profiling
        L_team_profile = []
        for (game_id, team_id), subdf in tqdm.tqdm(df_player_profile.groupby(['id', 'team_id'])):
            # game/team columns are taken from the first row of the group
            d = subdf[L_GLOBAL_KEYS + L_GROUP_KEYS].iloc[0].to_dict()
            d.update(subdf[L_AGG_KEYS].mean().to_dict())
            L_team_profile.append(d)
        df_team_profile = pd.DataFrame.from_records(L_team_profile)
        del L_team_profile
        gc.collect()

        return {'player': df_player_profile, 'team': df_team_profile}

    def add_info4game(self, game_id):  

        L_GAMEINFO_KEYS = [
            'id',
            'number_of_games',
            'year','month', 'day', 'weekday', 'hour',
            'map_id',
            'league_id', 'serie_id', 'tournament_id', 
            'serie_tier'
        ]
        
        df_game = self.df_team_profile.query('id==@game_id')

        d_fs4gm = {}
        d_fs4gm.update(df_game[L_GAMEINFO_KEYS].iloc[0].to_dict())

        d_team_id2start_ct = dict(zip(df_game['team_id'], df_game['start_ct']))
        d_team_id2opponent_id = dict(zip(df_game['team_id'], df_game['opponent_id']))
        d_team_id2lineup = dict(zip(df_game['team_id'], df_game['lineup']))
        d_team_id2loc = dict(zip(df_game['team_id'], df_game['team_location']))

        df_game = self.df_player_profile.query('id==@game_id')

        for team_id, subdf in df_game.groupby('team_id'):

            prefix = 'start_ct' if d_team_id2start_ct[team_id]==1 else 'start_t'

            d_fs4gm[f'{prefix}__team_id'] = team_id    
            d_fs4gm[f'{prefix}__team_lineup'] = d_team_id2lineup[team_id]
            d_fs4gm[f'{prefix}__team_location'] = d_team_id2loc[team_id]
            
            subdf = subdf.sort_values('player_id')    
            L_p_id = subdf['player_id'].values    
            d_player_id2nat = dict(zip(subdf['player_id'], subdf['player_nationality']))
            ser_bd = subdf['player_birthday'].astype('datetime64')
            ser_bd_y = ser_bd.dt.year
            ser_bd_m = ser_bd.dt.month
            ser_bd_d = ser_bd.dt.day

            for i, p_id in enumerate(L_p_id):
                d_fs4gm[f'{prefix}__player{i+1}_id'] = p_id
                d_fs4gm[f'{prefix}__player{i+1}_nationality'] = d_player_id2nat[p_id]
                d_fs4gm[f'{prefix}__player{i+1}_birthday_year'] = ser_bd_y.iloc[i]
                d_fs4gm[f'{prefix}__player{i+1}_birthday_month'] = ser_bd_m.iloc[i]
                d_fs4gm[f'{prefix}__player{i+1}_birthday_day'] = ser_bd_d.iloc[i]  
        return d_fs4gm

    def add_features__gameinfo(self, PATH_TO_GAMEINFO_FEATURES):
        """
        Description: compute the game-info features of every game that is not
                     cached yet and store them as '<game_id>.pickle'
        Parameters: PATH_TO_GAMEINFO_FEATURES - directory holding the cache
        Reads self.df_team_profile; games whose features cannot be built or
        written are skipped.
        """

        # Ids already cached. Names are parsed one by one so that a single
        # stray entry (e.g. '.DS_Store') is ignored instead of discarding the
        # whole set and forcing a full recomputation.
        set_in = set()
        for fnm in os.listdir(PATH_TO_GAMEINFO_FEATURES):
            try:
                set_in.add(int(fnm.split('.')[0]))
            except ValueError:
                continue

        set_all = set(np.unique(self.df_team_profile['id']))
        L_GAME_IDXS = list(set_all - set_in)[::-1]

        for game_id in tqdm.tqdm(L_GAME_IDXS):
            try:
                d_fs4gm = self.add_info4game(game_id)
                pth = os.path.join(PATH_TO_GAMEINFO_FEATURES, '{}.pickle'.format(game_id))
                with open(pth, 'wb') as f:
                    pickle.dump(d_fs4gm, f)
                del d_fs4gm
            except Exception:
                # Best effort per game. Exception (not a bare except) so that
                # KeyboardInterrupt/SystemExit can still stop the loop.
                continue

    def add_features__team4game(self, game_id):  

        L_GROUP_KEYS = [        
            'number_of_games',
            'year','month', 'day', 'weekday', 'hour',
            'serie_tier'
        ]
        L_FILTER_KEYS = [
            'league_id', 'serie_id', 'tournament_id'
        ]

        # ключи для агрегирования
        L_AGG_KEYS = [  

            'maxround', 'win', 
            
            'h1_outcome_defused_count', 'h1_outcome_eliminated_count',
            'h1_outcome_exploded_count', 'h1_outcome_timeout_count',
            'h1_win_count', 'h2_outcome_defused_count',
            'h2_outcome_eliminated_count', 'h2_outcome_exploded_count',
            'h2_outcome_timeout_count', 'h2_win_count',    

            'adr', 'first_kills_diff', 'k_d_diff', 'kast','rating', 
            'assists_per_round', 'deaths_per_round',
            'flash_assists_per_round', 'headshots_per_round',
            'kills_per_round'
            
        ]

        L_BY_KEYS = [
            'number_of_games',
            'year','month', 'day', 'weekday', 'hour',
            'serie_tier'
        ]
        
        df_game = self.df_team_profile.query('id==@game_id')

        date = df_game['date'].iloc[0]
        map_id = df_game['map_id'].iloc[0]
        league_id = df_game['league_id'].iloc[0]
        serie_id = df_game['serie_id'].iloc[0]
        tournament_id = df_game['tournament_id'].iloc[0]
        d_filter = dict(zip(['league_id', 'serie_id', 'tournament_id'], [league_id, serie_id, tournament_id]))

        d_fs4gm = {'id':game_id}    

        d_team_id2start_ct = dict(zip(df_game['team_id'], df_game['start_ct']))
        d_team_id2opponent_id = dict(zip(df_game['team_id'], df_game['opponent_id']))
        d_team_id2lineup = dict(zip(df_game['team_id'], df_game['lineup']))
        d_team_id2loc = dict(zip(df_game['team_id'], df_game['team_location']))
        

        for team_id, start_ct in d_team_id2start_ct.items():

            opponent_id = d_team_id2opponent_id[team_id]
            lineup = d_team_id2lineup[team_id]

            prefix = 'start_ct' if start_ct==1 else 'start_t'

            df_history = self.df_team_profile.query('(date<@date)&(team_id==@team_id)')
            df_history_on_map_with_start = df_history.query('(map_id==@map_id)&(start_ct==@start_ct)')        
            df_history_with_lineup = df_history.query('lineup==@lineup')
            df_history_on_map_with_start_and_lineup = df_history.query('(map_id==@map_id)&(start_ct==@start_ct)&(lineup==@lineup)')
            df_history_pair = df_history.query('opponent_id==@opponent_id')
            df_history_on_map_with_start_and_pair = df_history.query('(map_id==@map_id)&(start_ct==@start_ct)&(opponent_id==@opponent_id)')

            L_DF = [
                df_history, df_history_on_map_with_start, 
                df_history_with_lineup, df_history_on_map_with_start_and_lineup,
                df_history_pair, df_history_on_map_with_start_and_pair
            ]
            L_SUFFIX = [
                'all_map_all_start', 'current_map_current_start', 
                'all_map_all_start__lineup', 'current_map_current_start__lineup',
                'all_map_all_start__pair', 'current_map_current_start__pair',
            ]

            for filter_key, filter_value in d_filter.items():
                for suffix, df in zip(['all_map_all_start', 'current_map_current_start'],
                                    [df_history, df_history_on_map_with_start, ]):
                    L_SUFFIX.append(filter_key)
                    L_DF.append(df[df[filter_key]==filter_value])

            d_dicts4team = dict(zip(L_SUFFIX, L_DF))
            del L_SUFFIX, L_DF

            
            for suffix, subdf in d_dicts4team.items():                       
                for key in L_AGG_KEYS:
                    values = subdf[key].values
                    d_fs4gm[f'{prefix}__team__{suffix}__{key}__mean'] = np.mean(values)
                    d_fs4gm[f'{prefix}__team__{suffix}__{key}__sum'] = np.sum(values)
                    for by_key in L_BY_KEYS:
                        for by_value, subsubdf in subdf.groupby(by_key):
                            values = subsubdf[key].values
                            try:
                                d_fs4gm[f'{prefix}__team__{suffix}__{by_key}_{int(by_value)}__{key}__mean'] = np.mean(values)
                                d_fs4gm[f'{prefix}__team__{suffix}__{by_key}_{int(by_value)}__{key}__sum'] = np.sum(values)
                            except:
                                d_fs4gm[f'{prefix}__team__{suffix}__{by_key}_{by_value}__{key}__mean'] = np.mean(values)
                                d_fs4gm[f'{prefix}__team__{suffix}__{by_key}_{by_value}__{key}__sum'] = np.sum(values)
            del d_dicts4team

        return d_fs4gm   

    def add_features__team(self, PATH_TO_FEATURES_TEAM):
        """
        Description: compute the historical team features of every game that
                     is not cached yet and store them as '<game_id>.pickle'
        Parameters: PATH_TO_FEATURES_TEAM - directory holding the cache
        Reads self.df_team_profile; games whose features cannot be built or
        written are skipped.
        """

        # Ids already cached. Names are parsed one by one so that a single
        # stray entry (e.g. '.DS_Store') is ignored instead of discarding the
        # whole set and forcing a full recomputation.
        set_in = set()
        for fnm in os.listdir(PATH_TO_FEATURES_TEAM):
            try:
                set_in.add(int(fnm.split('.')[0]))
            except ValueError:
                continue

        set_all = set(np.unique(self.df_team_profile['id']))
        L_GAME_IDXS = list(set_all - set_in)[::-1]

        for game_id in tqdm.tqdm(L_GAME_IDXS):
            try:
                d_fs4gm = self.add_features__team4game(game_id)
                pth = os.path.join(PATH_TO_FEATURES_TEAM, '{}.pickle'.format(game_id))
                with open(pth, 'wb') as f:
                    pickle.dump(d_fs4gm, f)
                del d_fs4gm
            except Exception:
                # Best effort per game. Exception (not a bare except) so that
                # KeyboardInterrupt/SystemExit can still stop the loop.
                continue

    def add_features__player4game(self, game_id):  

        L_GROUP_KEYS = [        
            'number_of_games',
            'year','month', 'day', 'weekday', 'hour',
            'serie_tier'
        ]
        L_FILTER_KEYS = [
            'league_id', 'serie_id', 'tournament_id'
        ]

        # ключи для агрегирования
        L_AGG_KEYS = [  
            
            'adr', 'first_kills_diff', 'k_d_diff', 'kast', 'rating', 
            'assists_per_round', 'deaths_per_round',
            'flash_assists_per_round', 'headshots_per_round',
            'kills_per_round'
            
        ]

        L_BY_KEYS = [            
            'year','month', 'day', 'weekday', 'hour'
            
        ]
        
        df_game = self.df_player_profile.query('id==@game_id')

        date = df_game['date'].iloc[0]
        map_id = df_game['map_id'].iloc[0]
        league_id = df_game['league_id'].iloc[0]
        serie_id = df_game['serie_id'].iloc[0]
        tournament_id = df_game['tournament_id'].iloc[0]
        d_filter = dict(zip(['league_id', 'serie_id', 'tournament_id'], [league_id, serie_id, tournament_id]))

        d_fs4gm = {'id':game_id}    

        d_team_id2start_ct = dict(zip(df_game['team_id'], df_game['start_ct']))   

        for team_id, start_ct in d_team_id2start_ct.items():        

            prefix = 'start_ct' if start_ct==1 else 'start_t'

            L_p_id = np.unique(df_game.query('team_id==@team_id')['player_id'])

            for i, p_id in enumerate(L_p_id):

                df_in_team_history = self.df_player_profile.query('(date<@date)&(player_id==@p_id)&(team_id==@team_id)')
                df_in_team_history_on_map_with_start = df_in_team_history.query('(map_id==@map_id)&(start_ct==@start_ct)')  
                df_not_in_team_history = self.df_player_profile.query('(date<@date)&(player_id==@p_id)&(team_id!=@team_id)')
                df_not_in_team_history_on_map_with_start = df_in_team_history.query('(map_id==@map_id)&(start_ct==@start_ct)') 

                L_DF = [
                    df_in_team_history, df_in_team_history_on_map_with_start, 
                    df_not_in_team_history, df_not_in_team_history_on_map_with_start                
                ]
                L_SUFFIX = [
                    f'player{i+1}__in_team__all_map_all_start', f'player{i+1}__in_team__current_map_current_start', 
                    f'player{i+1}__not_in_team__all_map_all_start', f'player{i+1}__not_in_team__current_map_current_start', 
                ]        

                d_dicts4player = dict(zip(L_SUFFIX, L_DF))
                del L_SUFFIX, L_DF

            
                for suffix, subdf in d_dicts4player.items():                       
                    for key in L_AGG_KEYS:
                        values = subdf[key].values
                        d_fs4gm[f'{prefix}__{suffix}__{key}__mean'] = np.mean(values)
                        d_fs4gm[f'{prefix}__{suffix}__{key}__sum'] = np.sum(values)
                        for by_key in L_BY_KEYS:
                            for by_value, subsubdf in subdf.groupby(by_key):
                                values = subsubdf[key].values
                                try:
                                    d_fs4gm[f'{prefix}__{suffix}__{by_key}_{int(by_value)}__{key}__mean'] = np.mean(values)
                                    d_fs4gm[f'{prefix}__{suffix}__{by_key}_{int(by_value)}__{key}__sum'] = np.sum(values)
                                except:
                                    d_fs4gm[f'{prefix}__{suffix}__{by_key}_{by_value}__{key}__mean'] = np.mean(values)
                                    d_fs4gm[f'{prefix}__{suffix}__{by_key}_{by_value}__{key}__sum'] = np.sum(values)
                del d_dicts4player

        return d_fs4gm

    def add_features__player(self, PATH_TO_FEATURES_PLAYER):
        """
        Description: compute the historical player features of every game that
                     is not cached yet and store them as '<game_id>.pickle'
        Parameters: PATH_TO_FEATURES_PLAYER - directory holding the cache
        Reads self.df_player_profile; games whose features cannot be built or
        written are skipped.
        """

        # Ids already cached. Names are parsed one by one so that a single
        # stray entry (e.g. '.DS_Store') is ignored instead of discarding the
        # whole set and forcing a full recomputation.
        set_in = set()
        for fnm in os.listdir(PATH_TO_FEATURES_PLAYER):
            try:
                set_in.add(int(fnm.split('.')[0]))
            except ValueError:
                continue

        set_all = set(np.unique(self.df_player_profile['id']))
        L_GAME_IDXS = list(set_all - set_in)[::-1]

        for game_id in tqdm.tqdm(L_GAME_IDXS):
            try:
                d_fs4gm = self.add_features__player4game(game_id)
                pth = os.path.join(PATH_TO_FEATURES_PLAYER, '{}.pickle'.format(game_id))
                with open(pth, 'wb') as f:
                    pickle.dump(d_fs4gm, f)
                del d_fs4gm
            except Exception:
                # Best effort per game. Exception (not a bare except) so that
                # KeyboardInterrupt/SystemExit can still stop the loop.
                continue

    def reduce_mem_usage(self, series):
        """
        Description: downcast a numeric pandas Series to a narrower dtype
        Parameters: series - pandas Series (e.g. one DataFrame column)
        Returns: the downcast Series; signed-integer columns go to the
                 smallest of int8/int16/int32 that holds [min, max], float
                 columns to float16/float32 when min and max lie strictly
                 inside that type's range (float64 otherwise). Every other
                 dtype (object, bool, unsigned, datetime, extension dtypes)
                 and any column whose conversion fails is returned unchanged.

        NOTE: the float branch checks the value range only, not precision.
        float16 keeps roughly three significant decimal digits, so values are
        rounded by the downcast.
        """
        col_type = series.dtype
        # Only plain numpy signed-int / float columns are downcast. Bool and
        # unsigned columns used to fall into the float branch and were turned
        # into float16.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'f'):
            return series

        try:
            c_min = series.min()
            c_max = series.max()

            if col_type.kind == 'i':
                for candidate in (np.int8, np.int16, np.int32):
                    info = np.iinfo(candidate)
                    # inclusive bounds: a value equal to the type's min/max fits
                    if info.min <= c_min and c_max <= info.max:
                        return series.astype(candidate)
                return series

            for candidate in (np.float16, np.float32):
                info = np.finfo(candidate)
                if c_min > info.min and c_max < info.max:
                    return series.astype(candidate)
            # all-NaN, infinite or very large values stay at full width
            return series.astype(np.float64)
        except Exception:
            # best effort: leave the column as it is when it cannot be converted
            return series

    def build_features(self, PATH_TO_FEATURES_GAMEINFO, PATH_TO_FEATURES_TEAM, PATH_TO_FEATURES_PLAYER):

        """
        Description: assemble the feature table from the three on-disk caches
        Parameters: PATH_TO_FEATURES_GAMEINFO, PATH_TO_FEATURES_TEAM,
                    PATH_TO_FEATURES_PLAYER - directories with one
                    '<game_id>.pickle' per game
        Returns: DataFrame with one row per game that is present in all three
                 directories, ordered by ascending game id (each batch keeps
                 its own 0-based index); empty DataFrame when there is no
                 such game
        """

        l_dirs = [PATH_TO_FEATURES_GAMEINFO, PATH_TO_FEATURES_TEAM, PATH_TO_FEATURES_PLAYER]

        # files present in all three caches; entries whose stem is not a game
        # id (e.g. '.ipynb_checkpoints') are ignored instead of crashing the sort
        set_common = set.intersection(*[set(os.listdir(pth2dir)) for pth2dir in l_dirs])
        l_all_files = sorted(
            (fnm for fnm in set_common if fnm.split('.')[0].isdecimal()),
            key=lambda fnm: int(fnm.split('.')[0]),
        )
        if not l_all_files:
            # np.array_split cannot split into zero sections
            return pd.DataFrame()

        # batch size
        batch_size = 100
        n = int(np.ceil(len(l_all_files) / batch_size))
        l_batches = np.array_split(np.array(l_all_files), n)

        # assembly: frames are collected and concatenated once, because
        # DataFrame.append was removed in pandas 2.0 and re-copied the
        # accumulated frame on every batch
        l_frames = []
        for batch in tqdm.tqdm(l_batches):

            l = []
            for fnm in batch:
                D = {}
                for pth2dir in l_dirs:
                    pth = os.path.join(pth2dir, fnm)
                    # NOTE: pickle.load executes code embedded in the file;
                    # these directories must only contain files written by
                    # this pipeline.
                    with open(pth, 'rb') as f:
                        D.update(pickle.load(f))
                l.append(D)

            l_frames.append(pd.DataFrame.from_records(l).apply(self.reduce_mem_usage))
            del l

        return pd.concat(l_frames)

    def build_targets(self, L_GAME_IDXS):

        """
        Description: build the target variables (win of the side starting as
                     CT, total rounds over/under, rounds won in the 1st half /
                     2nd half / whole game by each side)
        Parameters: L_GAME_IDXS - iterable with the ids of the games to use
        Returns: DataFrame with one row per game and an integer 'id' column
        Reads self.L_COLLECTION; responses that cannot be processed are skipped.
        """
        # membership is tested once per response, so build the set once
        set_game_idxs = set(L_GAME_IDXS)

        l_targets = []
        for d_rsp in tqdm.tqdm(self.L_COLLECTION):

            try:

                game_id = d_rsp['id']
                if game_id not in set_game_idxs:
                    continue

                df_rounds = pd.DataFrame.from_records(d_rsp['rounds'])

                maxround = df_rounds['round'].max()
                df_r1 = df_rounds.query('round==1')
                start_ct_id = df_r1['ct'].iloc[0]
                start_t_id = df_r1['terrorists'].iloc[0]
                # rounds won per team in the first half, second half, whole game
                d_win_counts = {
                    'h1': df_rounds.query('round<=15')['winner_team'].value_counts().to_dict(),
                    'h2': df_rounds.query('round>15')['winner_team'].value_counts().to_dict(),
                    'h1h2': df_rounds['winner_team'].value_counts().to_dict(),
                }
                winner_id = df_rounds['winner_team'].value_counts().idxmax()

                d_targets4game = {'id': game_id}

                d_targets4game['start_ct__win'] = int(winner_id == start_ct_id)

                for i in range(16, 31):
                    d_targets4game[f'total__b__{i}'] = int(maxround >= i)
                    d_targets4game[f'total__m__{i}'] = int(maxround <= i)

                for i in range(1, 16):
                    for half, d_win_count in d_win_counts.items():
                        for side, side_team_id in (('start_ct', start_ct_id),
                                                   ('start_t', start_t_id)):
                            # A team that won no round in this span is absent
                            # from the counts; it used to raise KeyError and
                            # silently drop the whole game. The second-half
                            # 'start_t' targets now read the second-half
                            # counts (they used the first-half ones).
                            n_won = d_win_count.get(side_team_id, 0)
                            d_targets4game[f'{half}__{side}_win__b__{i}'] = int(n_won >= i)
                            d_targets4game[f'{half}__{side}_win__m__{i}'] = int(n_won <= i)

                l_targets.append(d_targets4game)

            except Exception:
                # Best effort: skip malformed responses. Exception (not a bare
                # except) keeps Ctrl-C working.
                continue

        # one DataFrame construction instead of DataFrame.append per game
        # (removed in pandas 2.0)
        if not l_targets:
            return pd.DataFrame({'id': pd.Series(dtype=int)})
        df_targets = pd.DataFrame.from_records(l_targets)
        df_targets['id'] = df_targets['id'].astype(int)

        return df_targets

    def prepare_data(self, df_targets, df_features):
        """
        Description: align features and targets and prepare the model input
        Parameters: df_targets - targets with an 'id' column
                    df_features - features with an 'id' column
        Returns: (X, Y) indexed by the game ids present in both inputs.
                 In X the descriptive columns (L_CAT_FEATURES) are categorical
                 (missing -> -9999 for integer-like columns, 'default'
                 otherwise), the remaining columns have missing values
                 replaced by -9999, and for every pair of categorical columns
                 a categorical column '<a>-<b>' holding '<value a>-<value b>'
                 is appended.
        """

        df_targets = df_targets.set_index('id').astype(int)
        df_features = df_features.set_index('id')
        games2use = np.intersect1d(df_features.index, df_targets.index)

        X = df_features.loc[games2use]
        del df_features
        gc.collect()

        # game-level columns, then for each side the team columns followed by
        # the five players' columns
        L_CAT_FEATURES = [
            'number_of_games', 'year', 'month', 'day', 'weekday', 'hour',
            'map_id', 'league_id', 'serie_id', 'tournament_id', 'serie_tier',
        ]
        for side in ('start_t', 'start_ct'):
            L_CAT_FEATURES += [f'{side}__team_id', f'{side}__team_lineup', f'{side}__team_location']
            for i in range(1, 6):
                L_CAT_FEATURES += [
                    f'{side}__player{i}_id', f'{side}__player{i}_nationality',
                    f'{side}__player{i}_birthday_year', f'{side}__player{i}_birthday_month',
                    f'{side}__player{i}_birthday_day',
                ]

        for key in L_CAT_FEATURES:
            try:
                X[key] = X[key].fillna(-9999).astype(int).astype('category')
            except (ValueError, TypeError, OverflowError):
                # not integer-like (e.g. lineup, nationality): keep the raw values
                X[key] = X[key].fillna('default').astype('category')

        # axis is passed by keyword: the positional form drop(labels, 1) was
        # removed in pandas 2.0
        L_NUM_FEATURES = X.drop(columns=L_CAT_FEATURES).columns
        X[L_NUM_FEATURES] = X[L_NUM_FEATURES].fillna(-9999)

        Y = df_targets.loc[games2use]
        del df_targets
        gc.collect()

        # pairwise combinations of the categorical columns; every column is
        # converted to strings once and the pairs are joined element-wise
        # instead of running a row-wise apply for each of the pairs
        L_obj_keys = X.select_dtypes('category').columns
        d_str_values = {
            key: X[key].astype('object').astype('str').to_numpy(dtype=object)
            for key in L_obj_keys
        }
        d_pairs = {
            f'{key_a}-{key_b}': pd.Categorical(d_str_values[key_a] + '-' + d_str_values[key_b])
            for key_a, key_b in itertools.combinations(L_obj_keys, 2)
        }
        del d_str_values
        # a single concat avoids inserting thousands of columns one by one
        X = pd.concat([X, pd.DataFrame(d_pairs, index=X.index)], axis=1)
        del d_pairs
        gc.collect()

        return X, Y

    def run_ml_pipeline(self, X, y):
        """Select features and tune CatBoost hyper-parameters for one target.

        Stages:
          1. split without shuffling into train / hold-out / test
             (presumably rows are in chronological order - confirm with caller);
          2. screen columns in batches, keeping those with non-zero CatBoost
             feature importance;
          3. refit on the survivors, dropping zero-importance columns until
             every remaining column has non-zero importance;
          4. tune hyper-parameters with ``CatBoostOptimizer``;
          5. prune by permutation importance on the hold-out part for as long
             as the test ROC AUC improves;
          6. tune hyper-parameters again on the final feature set.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature matrix; ``category`` columns are passed to CatBoost as
            categorical features.
        y : pandas.Series
            Target aligned with ``X`` (scored with ROC AUC / Logloss).

        Returns
        -------
        dict
            ``'params'``   - tuned CatBoost parameters; ``cat_features`` holds
                             positional indices into ``'features'``;
            ``'roc_auc'``  - test ROC AUC of the last accepted feature set;
            ``'features'`` - the selected column labels.
        """
        # boosting settings shared by all fits
        CONST_PARAMS = {
            'iterations': 1000,
            'loss_function': 'Logloss',
            'verbose': 1,
        }
        # random seed
        SEED = 13
        # share of the test part
        TEST_SIZE = .05
        # share of the hold-out part
        HOLD_SIZE = .2
        # number of columns screened per CatBoost fit
        FEATURES_PER_BATCH = 5000

        def cat_idx(frame):
            # positional indices of the categorical columns of `frame`
            return np.where(frame.dtypes == 'category')[0]

        def best_tree_count(fitted, fallback):
            # `best_iteration_` is a 0-based index and is None when the model was
            # fitted without an eval set; `iterations` is a tree count, hence +1.
            best = fitted.best_iteration_
            return fallback if best is None else best + 1

        # split
        X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=TEST_SIZE, shuffle=False)
        X_tr, X_ho, y_tr, y_ho = train_test_split(X_tr, y_tr, test_size=HOLD_SIZE, shuffle=False)

        # split the columns into batches so that RAM is not exhausted
        L_all_keys = X_tr.columns
        n_batches = max(1, int(np.ceil(len(L_all_keys) / FEATURES_PER_BATCH)))
        L_batches = np.array_split(L_all_keys, n_batches)

        # keep the features with non-zero importance
        L_feat2use = []
        for i, batch in enumerate(L_batches):
            print('> batch#{}/{}'.format(i+1, len(L_batches)))
            x_tr_batch = X_tr[batch]
            x_ho_batch = X_ho[batch]
            params = CONST_PARAMS.copy()
            params['cat_features'] = cat_idx(x_tr_batch)
            model = cb.CatBoostClassifier(**params)
            model.fit(x_tr_batch, y_tr, eval_set=(x_ho_batch, y_ho), early_stopping_rounds=50)
            mask = model.feature_importances_ > 0
            L_feat2use.extend(batch[mask].tolist())
            del x_tr_batch, x_ho_batch
        # rebinding releases the wide frames once the narrow ones exist
        X_tr, X_ho, X_te = X_tr[L_feat2use], X_ho[L_feat2use], X_te[L_feat2use]
        gc.collect()

        # recursive selection with early stopping
        i = 1
        while True:
            print('> iter#{}'.format(i))
            params = CONST_PARAMS.copy()
            params['cat_features'] = cat_idx(X_tr)
            model = cb.CatBoostClassifier(**params)
            model.fit(X_tr, y_tr, eval_set=(X_ho, y_ho), early_stopping_rounds=50)
            mask = model.feature_importances_ > 0
            if np.all(mask):
                break
            X_tr, X_ho = X_tr.loc[:, mask], X_ho.loc[:, mask]
            i += 1
        X_te = X_te[X_tr.columns]

        # hyper-parameter optimisation
        params = CONST_PARAMS.copy()
        params['cat_features'] = cat_idx(X_tr)
        params['iterations'] = best_tree_count(model, CONST_PARAMS['iterations'])
        params['verbose'] = 0

        cb_opt = CatBoostOptimizer(
            scoring_func=roc_auc_score,
            const_params=params,
            seed=SEED,
            direction='maximize',
            n_trials=15,
        )
        cb_opt.fit(X_tr, y_tr)
        best_params = cb_opt.transform()
        best_params['verbose'] = 0
        best_params['random_state'] = SEED

        # permutation-importance pruning, accepted only while the test score improves
        best_score = None
        i = 1
        while True:

            print('> permutation importance iter#{}. n_features = {}'.format(i, X_tr.shape[1]))

            params = best_params.copy()
            params['cat_features'] = cat_idx(X_tr)
            model = cb.CatBoostClassifier(**params)
            model.fit(X_tr, y_tr)
            # no eval set here, so normally there is no best iteration to adopt;
            # keep the tuned tree count instead of overwriting it with None
            if model.best_iteration_ is not None:
                params['iterations'] = model.best_iteration_ + 1
            te_score_before = roc_auc_score(y_te, model.predict_proba(X_te)[:, 1])
            if best_score is None:
                # score of the current feature set; reported if nothing gets pruned
                best_score = te_score_before

            L_perm_imp = []
            for j in tqdm.tqdm(range(100)):
                d_perm_imp = permutation_importance(
                    model, X_ho, y_ho, scoring='roc_auc',
                    n_repeats=1, random_state=SEED+j, n_jobs=-1,
                )
                L_perm_imp.append(d_perm_imp['importances_mean'].flatten())
            arr_perm_imp_mean = np.mean(L_perm_imp, axis=0)
            idx_selected = np.where(arr_perm_imp_mean > 0)[0]
            if len(idx_selected) == 0:
                # nothing left to train on - keep the current feature set
                break

            params_c = params.copy()
            params_c['cat_features'] = cat_idx(X_tr.iloc[:, idx_selected])
            model = cb.CatBoostClassifier(**params_c)
            model.fit(X_tr.iloc[:, idx_selected], y_tr)
            te_score_after = roc_auc_score(y_te, model.predict_proba(X_te.iloc[:, idx_selected])[:, 1])

            print('\t> score before: {:.2f}, score after: {:.2f}'.format(te_score_before, te_score_after))
            if te_score_after > te_score_before:
                best_score = te_score_after
                X_tr, X_ho, X_te = X_tr.iloc[:, idx_selected], X_ho.iloc[:, idx_selected], X_te.iloc[:, idx_selected]
                i += 1
            else:
                break

        # hyper-parameter optimisation on the final feature set
        params = CONST_PARAMS.copy()
        params['cat_features'] = cat_idx(X_tr)
        params['iterations'] = best_tree_count(model, CONST_PARAMS['iterations'])
        params['verbose'] = 0

        cb_opt = CatBoostOptimizer(
            scoring_func=roc_auc_score,
            const_params=params,
            seed=SEED,
            direction='maximize',
            n_trials=30,
        )
        cb_opt.fit(X_tr, y_tr)
        best_params = cb_opt.transform()
        best_params['verbose'] = 0
        # indices must refer to the returned feature set, not to the last
        # (rejected) permutation-importance candidate
        best_params['cat_features'] = cat_idx(X_tr)
        best_params['random_state'] = SEED

        features = X_tr.columns
        d_res = {'params': best_params, 'roc_auc': best_score, 'features': features}

        del X_tr, X_ho, X_te, y_tr, y_ho, y_te
        gc.collect()

        return d_res

    def fit(self, PATH_TO_RESPONSES, PATH_TO_FEATURES_GAMEINFO, PATH_TO_FEATURES_TEAM, PATH_TO_FEATURES_PLAYER):
        """Run the whole workflow and store one pipeline result per target.

        Steps, in order: collect responses, build player/team profiles,
        collect the three feature groups, load the feature and target tables,
        prepare the ML dataset and call ``run_ml_pipeline`` for every target
        column.

        Parameters
        ----------
        PATH_TO_RESPONSES
            Passed to ``get_game_collection``.
        PATH_TO_FEATURES_GAMEINFO, PATH_TO_FEATURES_TEAM, PATH_TO_FEATURES_PLAYER
            Passed to ``add_features__gameinfo``, ``add_features__team`` and
            ``add_features__player`` respectively.

        Returns
        -------
        self
            With ``L_COLLECTION``, ``df_player_profile``, ``df_team_profile``
            and ``D_RESULTS`` (target name -> pipeline result) set.

        Notes
        -----
        The feature and target tables are currently read from
        ``df_features.pickle`` / ``df_targets.pickle`` (relative paths); the
        calls that would build them are disabled. Only load pickles you trust.
        """
        rule = '----------------------------------------------------------------------------------\n'

        def announce(message):
            # every stage banner is preceded by a one-second pause
            time.sleep(1)
            print(message)

        # collection of responses
        announce('> collecting responses ...')
        self.L_COLLECTION = self.get_game_collection(PATH_TO_RESPONSES)
        print(rule)

        # profiles of players and teams across games
        announce('> preparing team/player profiles ...')
        profiles = self.get_profiles(self.L_COLLECTION)
        player_profile, team_profile = profiles['player'], profiles['team']
        self.df_player_profile = player_profile
        self.df_team_profile = team_profile
        del profiles, player_profile, team_profile
        gc.collect()
        print(rule)

        # the three feature groups are collected one after another
        feature_stages = (
            ('> collecting features: 1. game info ...',
             self.add_features__gameinfo, PATH_TO_FEATURES_GAMEINFO),
            ('> collecting features: 2. team history aggregation ...',
             self.add_features__team, PATH_TO_FEATURES_TEAM),
            ('> collecting features: 3. player history aggregation ...',
             self.add_features__player, PATH_TO_FEATURES_PLAYER),
        )
        for banner, collect, location in feature_stages:
            announce(banner)
            collect(location)
        print(rule)

        announce('> building features ...')
        # disabled: self.build_features(PATH_TO_FEATURES_GAMEINFO, PATH_TO_FEATURES_TEAM, PATH_TO_FEATURES_PLAYER)
        df_features = pd.read_pickle('df_features.pickle')

        announce('> building targets ...')
        # only consumed by the disabled self.build_targets(L_GAME_IDXS) call
        L_GAME_IDXS = np.unique(df_features['id'])
        df_targets = pd.read_pickle('df_targets.pickle')
        print(rule)

        announce('> preparing dataset for ml ...')
        X, Y = self.prepare_data(df_targets, df_features)
        del df_targets, df_features
        gc.collect()
        print(rule)

        # run the pipeline once per target variable
        announce('> running pipelines ...')
        target_keys = Y.columns
        n_targets = len(target_keys)
        self.D_RESULTS = {}
        for position, target_key in enumerate(target_keys, start=1):
            print('\t> iter#{}/{}. target: {} ...'.format(position, n_targets, target_key))
            y = Y[target_key].astype(int)
            self.D_RESULTS[target_key] = self.run_ml_pipeline(X, y)
            del y
            print(rule)

        return self

In [2]:
# directory with the collection of responses
PATH_TO_RESPONSES = 'L_games_collection'
# NOTE(review): the three paths below are raw strings with doubled backslashes,
# so both backslashes are kept literally - confirm this is intended.
# directory with the per-game feature collection (game info)
PATH_TO_FEATURES_GAMEINFO = r'D:\\features_gameinfo'
# directory with the per-game feature collection (team statistics)
PATH_TO_FEATURES_TEAM = r'D:\\features_team'
# directory with the per-game feature collection (player statistics)
PATH_TO_FEATURES_PLAYER = r'D:\\features_player'

# model: build the predictor and run the full fit workflow
csgo_ml = CsgoOutcomePredictor()
csgo_ml.fit(PATH_TO_RESPONSES, PATH_TO_FEATURES_GAMEINFO, PATH_TO_FEATURES_TEAM, PATH_TO_FEATURES_PLAYER)

  0%|          | 165/57761 [00:00<00:35, 1638.50it/s]

> collecting responses ...


100%|██████████| 57761/57761 [00:33<00:00, 1708.50it/s]


----------------------------------------------------------------------------------



  0%|          | 78/57760 [00:00<01:17, 745.03it/s]

> preparing team/player profiles ...


100%|██████████| 57760/57760 [04:38<00:00, 207.68it/s]
100%|██████████| 69846/69846 [03:39<00:00, 317.82it/s]
100%|██████████| 69846/69846 [01:48<00:00, 644.19it/s]


----------------------------------------------------------------------------------



0it [00:00, ?it/s]

> collecting features: 1. game info ...



0it [00:00, ?it/s]

> collecting features: 2. team history aggregation ...



0it [00:00, ?it/s]

> collecting features: 3. player history aggregation ...
----------------------------------------------------------------------------------






> building features ...
> building targets ...
----------------------------------------------------------------------------------

> preparing dataset for ml ...
----------------------------------------------------------------------------------

> running pipeline ...
	> iter#1/211. target: start_ct__win ...
> batch#1/35
Learning rate set to 0.029149
0:	learn: 0.6905720	test: 0.6918694	best: 0.6918694 (0)	total: 349ms	remaining: 5m 49s
1:	learn: 0.6855515	test: 0.6920366	best: 0.6918694 (0)	total: 568ms	remaining: 4m 43s
2:	learn: 0.6832337	test: 0.6920430	best: 0.6918694 (0)	total: 818ms	remaining: 4m 31s
3:	learn: 0.6798154	test: 0.6920855	best: 0.6918694 (0)	total: 1.06s	remaining: 4m 23s
4:	learn: 0.6757836	test: 0.6923134	best: 0.6918694 (0)	total: 1.31s	remaining: 4m 20s
5:	learn: 0.6721005	test: 0.6921238	best: 0.6918694 (0)	total: 1.53s	remaining: 4m 13s
6:	learn: 0.6699597	test: 0.6925614	best: 0.6918694 (0)	total: 1.77s	remaining: 4m 10s
7:	learn: 0.6673831	test: 0.6914858	be