# ПРЕДСКАЗАНИЕ ИСХОДОВ ТУРНИРНОЙ ИГРЫ ПО КС:ГО

In [1]:
import os, tqdm, json, pickle, gc, zipfile, itertools
import pandas as pd
import numpy as np
from dateutil import parser
import warnings
warnings.filterwarnings('ignore')
from collections import defaultdict
from multiprocessing import Pool
import catboost as cb
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score, ParameterGrid, StratifiedKFold, cross_val_predict
from sklearn.metrics import roc_auc_score
from sklearn.inspection import permutation_importance
import optuna
from optuna.samplers import TPESampler
from tqdm.contrib.concurrent import process_map  

def run_ml_pipeline(X, y):

    # сплит
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size = TEST_SIZE, shuffle = False)
    X_tr, X_ho, y_tr, y_ho = train_test_split(X_tr, y_tr, test_size = HOLD_SIZE, shuffle = False)

    # бъем признаки на батчи, чтобы не перегрузить оперативку
    L_all_keys = X_tr.columns
    L_batches = np.array_split(L_all_keys, np.int32(np.ceil(len(L_all_keys)/25000)))
    
    # отбираем признаки с ненулевой важностью
    L_feat2use = []
    for i, batch in enumerate(L_batches):
        print('> batch#{}/{}'.format(i+1, len(L_batches)))
        x_tr_batch = X_tr[batch]
        x_ho_batch = X_ho[batch]
        params = CONST_PARAMS.copy()
        params['cat_features'] = np.where(x_tr_batch.dtypes=='category')[0]
        model = cb.CatBoostClassifier(**params)        
        model.fit(x_tr_batch, y_tr, eval_set=(x_ho_batch, y_ho), early_stopping_rounds=50)
        mask = model.feature_importances_>0
        L_feat2use.extend(batch[mask].tolist())
        del x_tr_batch, x_ho_batch
    X_tr_c, X_ho_c, X_te_c = X_tr[L_feat2use], X_ho[L_feat2use], X_te[L_feat2use]
    del X_tr, X_ho, X_te
    X_tr, X_ho, X_te = X_tr_c, X_ho_c, X_te_c
    del X_tr_c, X_ho_c, X_te_c
    gc.collect()

    # рекурсивный отбор с ранней остановкой
    i = 1
    while True:
        print('> iter#{}'.format(i))
        params = CONST_PARAMS.copy()
        params['cat_features'] = np.where(X_tr.dtypes=='category')[0]
        model = cb.CatBoostClassifier(**params)        
        model.fit(X_tr, y_tr, eval_set=(X_ho, y_ho), early_stopping_rounds=50)
        mask = model.feature_importances_>0
        if np.all(mask):
            break
        else:
            X_tr, X_ho = X_tr.loc[:, mask], X_ho.loc[:, mask]
            i+=1
    X_te = X_te[X_tr.columns]
    # оптимизация гиперпараметров
    params = CONST_PARAMS.copy()
    params['cat_features'] = np.where(X_tr.dtypes=='category')[0]
    params['iterations'] = model.best_iteration_
    params['verbose'] =0
    model = cb.CatBoostClassifier(**params)    

    cb_opt = CatBoostOptimizer(
                    scoring_func= lambda y, y_proba: roc_auc_score(y, y_proba),
                    const_params=params,
                    seed=SEED, 
                    direction='maximize',
                    n_trials=20
        )

    cb_opt.fit(X_tr, y_tr)
    best_params = cb_opt.transform()
    best_params['verbose'] = 0
    best_params['random_state'] = SEED
    

    i = 1
    while True:

        print('> permutation importance iter#{}. n_features = {}'.format(i, X_tr.shape[1]))

        params = best_params.copy()        
        params['cat_features'] = np.where(X_tr.dtypes=='category')[0]
        model = cb.CatBoostClassifier(**params)
        model.fit(X_tr, y_tr)
        te_score_before=roc_auc_score(y_te, model.predict_proba(X_te)[:, 1])

        L_perm_imp = []
        for j in tqdm.tqdm(range(100)):
            d_perm_imp = permutation_importance(model, X_ho, y_ho, scoring='roc_auc', n_repeats=1, random_state = SEED+j, n_jobs=-1)
            L_perm_imp.append(d_perm_imp['importances_mean'].flatten())
        arr_perm_imp_mean = np.mean(np.r_[L_perm_imp], 0)
        idx_selected = np.where(arr_perm_imp_mean>0)[0]

        params_c=params.copy()
        params_c['cat_features'] = np.where(X_tr.iloc[:, idx_selected].dtypes=='category')[0]
        model = cb.CatBoostClassifier(**params_c)
        model.fit(X_tr.iloc[:, idx_selected], y_tr)
        te_score_after = roc_auc_score(y_te, model.predict_proba(X_te.iloc[:, idx_selected])[:, 1])

        print('\t> score before: {:.2f}, score after: {:.2f}'.format(te_score_before, te_score_after))
        if te_score_after > te_score_before:
            best_score = te_score_after
            X_tr, X_ho, X_te = X_tr.iloc[:, idx_selected], X_ho.iloc[:, idx_selected], X_te.iloc[:, idx_selected]
            i+=1
        else:
            break

    # оптимизация гиперпараметров
    params = CONST_PARAMS.copy()
    params['cat_features'] = np.where(X_tr.dtypes=='category')[0]
    params['iterations'] = model.best_iteration_
    params['verbose'] =0
    model = cb.CatBoostClassifier(**params)    

    cb_opt = CatBoostOptimizer(
                    scoring_func= lambda y, y_proba: roc_auc_score(y, y_proba),
                    const_params=params,
                    seed=SEED, 
                    direction='maximize',
                    n_trials=30
        )
    cb_opt.fit(X_tr, y_tr)
    best_params = cb_opt.transform()
    best_params['verbose'] = 0   
    best_params['cat_features'] = np.where(X_tr.iloc[:, idx_selected].dtypes=='category')[0]
    best_params['random_state'] = SEED 
    
    features = X_tr.columns
    d_res = {'params':best_params, 'roc_auc':best_score, 'features':features}

    X_tr, X_ho, X_te, y_tr, y_ho, y_te
    gc.collect()

    return d_res

def get_game_collection(PATH_TO_DIR):
    
    """
    Описание: коллекционирование респонсов парсера
    Параметры: PATH_TO_DIR - путь до директории с респонсами
    """

    L_FILENAMES = os.listdir(PATH_TO_DIR)
    L_COLLECTION = []
    for fnm in tqdm.tqdm(L_FILENAMES):
        try:
            pth = os.path.join(PATH_TO_DIR, fnm)
            with open(pth, 'r') as f:
                d_rsp = json.load(f)
            L_COLLECTION.append(d_rsp)
        except:
            pass
    idx_ordered = np.argsort([d_game['id'] for d_game in L_COLLECTION])[::-1]
    L_COLLECTION = np.array(L_COLLECTION)[idx_ordered].tolist()
    return L_COLLECTION

def get_profiles(L_COLLECTION):

    """
    Описание: профайлинг игроков в играх
    Параметры: 
    """

    def add_profile(d_rsp):

        def add_global_info(d_game):

            d = {}

            d['id'] = d_game['id']
            d['match_id'] = d_game['match_id']
            d['match_type'] = d_game['match']['match_type']
            d['number_of_games'] = d_game['match']['number_of_games']
            d['date'] = parser.parse(d_game['begin_at'])
            d['map_id'] = d_game['map']['id']
            d['league_id'] = d_game['match']['league']['id']
            d['serie_id'] = d_game['match']['serie']['id']
            d['tournament_id'] = d_game['match']['tournament']['id']
            d['serie_tier'] = d_game['match']['serie']['tier']

            return d
        
        # идентификаторы актуальных карт
        l_map2use = [1, 2, 6, 7, 8, 20, 31]
        # ключи со статистикой игрока
        l_stat_keys = ['adr', 'assists', 'deaths', 'first_kills_diff', 'flash_assists', 
                       'headshots', 'k_d_diff', 'kast', 'kills', 'rating']

        # информация об игре
        d_info = add_global_info(d_game)
        
        if d_info['map_id'] in l_map2use:  

            d_r1 = d_rsp['rounds'][0]
            if d_r1['round']==1:
                
                # информация о раундах
                df_rounds = pd.DataFrame.from_records(d_rsp['rounds'])
                start_ct_id =d_r1['ct']   
                winner_id = df_rounds['winner_team'].value_counts().idxmax()
                maxround = df_rounds['round'].max()
                d_h1_win_count = df_rounds.query('round<=15')['winner_team'].value_counts().to_dict()
                d_h2_win_count = df_rounds.query('round>15')['winner_team'].value_counts().to_dict()
                d_h1_outcome_count = df_rounds.query('round<=15')['outcome'].value_counts().to_dict()
                d_h2_outcome_count = df_rounds.query('round>15')['outcome'].value_counts().to_dict()        

                L = []
                counter = 0
                # информация об игроках
                for p in d_rsp['players']:
                    counter+=1

                    d = {}
                    d.update(d_info)

                    # идентификатор игрока
                    d['player_id'] = p['player']['id']
                    # идентификатор команды
                    d['team_id'] = p['team']['id']
                    # идентификатор оппонента
                    d['opponent_id'] = p['opponent']['id']

                    # национальность игрока
                    d['player_nationality']  = p['player']['nationality']
                    # дата рождения игрока
                    d['player_birthday']  = p['player']['birthday']
                    # страна команды
                    d['team_location']  = p['team']['location']

                    # сторона начала
                    d['start_ct']= 1 if start_ct_id==d['team_id'] else 0
                    # победа
                    d['win'] = 1 if winner_id==d['team_id'] else 0
                    # все раундов в игре
                    d['maxround'] = maxround

                    # число выигранных раундов в 1-ой половине игры
                    try:
                        d['h1_win_count'] = d_h1_win_count[d['team_id']]
                    except:
                        d['h1_win_count'] = 0 
                    # число выигранных раундов во 2-ой половине игры
                    try:
                        d['h2_win_count'] = d_h2_win_count[d['team_id']]
                    except:
                        d['h2_win_count'] = 0 
                    # исходы раундов в 1-ой половине игры
                    for k, v in d_h1_outcome_count.items():
                        d[f'h1_outcome_{k}_count'] = v
                    # исходы раундов во 2-ой половине игры
                    for k, v in d_h2_outcome_count.items():
                        d[f'h2_outcome_{k}_count'] = v            

                    # статистика игрока
                    d.update({k:p[k] if pd.notnull(p[k]) else 0.0 for k in l_stat_keys})

                    L.append(d)
                if counter==10:
                    return L
                else:
                    return None
            else:
                return None
    # информация об игре
    L_GLOBAL_KEYS = [
        'id', 'match_id', 'match_type', 'number_of_games',
        'date', 'year', 'month', 'day', 'weekday', 'hour',
        'map_id',
        'league_id', 'serie_id', 'tournament_id', 'serie_tier',
        'start_ct'
    ]

    # ключи для агрегирования
    L_AGG_KEYS = [    
        
        'h1_outcome_defused_count', 'h1_outcome_eliminated_count',
        'h1_outcome_exploded_count', 'h1_outcome_timeout_count',
        'h1_win_count', 'h2_outcome_defused_count',
        'h2_outcome_eliminated_count', 'h2_outcome_exploded_count',
        'h2_outcome_timeout_count', 'h2_win_count',

        'adr', 'assists', 'deaths', 'first_kills_diff', 'flash_assists', 'headshots',
        'k_d_diff', 'kast', 'kills', 'maxround', 'rating', 'win'
    ]

    # ключи для группировки
    L_GROUP_KEYS = [
        'team_id', 'opponent_id', 'team_location', 'lineup'
    ]
    # профайлинг игрока
    L_player_profile = []
    for d_game in tqdm.tqdm(L_COLLECTION):
        try:
            L_player_profile.extend(add_profile(d_game))        
        except:
            pass
    df_player_profile = pd.DataFrame.from_records(L_player_profile)
    del L_player_profile
    gc.collect()

    L_dict = []
    for (game_id, team_id), subdf in tqdm.tqdm(df_player_profile.groupby(['id', 'team_id'])):
        n_players = subdf.shape[0]
        if n_players==5:
            subdf_c = subdf.copy()
            lineup = '-'.join(subdf['player_id'].sort_values().astype(str))
            subdf_c['lineup'] = lineup
            L_dict.extend(subdf_c.to_dict('records'))
    del df_player_profile
    gc.collect()
    df_player_profile = pd.DataFrame.from_records(L_dict).sort_values('date')
    del L_dict
    gc.collect()

    date = df_player_profile['date']
    df_player_profile['year'] = date.dt.year
    df_player_profile['month'] = date.dt.month
    df_player_profile['day'] = date.dt.day
    df_player_profile['weekday'] = date.dt.weekday
    df_player_profile['hour'] = date.dt.hour
    df_player_profile[['serie_tier', 'team_location']] = df_player_profile[['serie_tier', 'team_location']].fillna('default')    

    # профайлинг команды
    L_team_profile = []
    for (game_id, team_id), subdf in tqdm.tqdm(df_player_profile.groupby(['id', 'team_id'])):    
        d = subdf[L_GLOBAL_KEYS+L_GROUP_KEYS].iloc[0].to_dict()    
        d.update(subdf[L_AGG_KEYS].mean().to_dict())
        L_team_profile.append(d)
    df_team_profile = pd.DataFrame.from_records(L_team_profile)
    del L_team_profile
    gc.collect()

    return {'player':df_player_profile, 'team':df_team_profile}

class CatBoostOptimizer():
    
    def __init__(self, scoring_func, const_params, seed, direction, n_trials):
        self.scoring_func = scoring_func        
        self.const_params = const_params
        self.seed = seed
        self.direction = direction
        self.n_trials = n_trials

    def objective(self, trial):
                
        params = {
#         'iterations':trial.suggest_int('iterations', 20, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.0025, 0.25),
        'depth': trial.suggest_int('depth', 3, 12),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 31),
        'boosting_type': trial.suggest_categorical('boosting_type', ['Ordered', 'Plain']),
                }
        
        params.update(self.const_params)
                
        model = cb.CatBoostClassifier(**params, random_seed=self.seed)
        model.fit(self.X_tr_c, self.y_tr_c, verbose=0, eval_set=(self.X_ho_c, self.y_ho_c))
        
        y_proba = model.predict_proba(self.X_ho_c)[:, 1]
        
        return self.scoring_func(self.y_ho_c, y_proba)
    
    def fit(self, X_tr, y_tr):
        
        self.cat_features = np.argwhere(X_tr.dtypes.values=='object').flatten()
        
        self.X_tr_c, self.X_ho_c, self.y_tr_c, self.y_ho_c = \
            train_test_split(X_tr, y_tr,
                             test_size = .1,
                             shuffle = True,
                             random_state =self.seed)
        
        sampler = TPESampler(seed=self.seed)
        study = optuna.create_study(direction=self.direction, sampler=sampler)
        study.optimize(self.objective, n_trials=self.n_trials)
        self.best_params = study.best_params
        
        del self.X_tr_c, self.X_ho_c, self.y_tr_c, self.y_ho_c
        gc.collect()
        
        return self
    
    def transform(self):
        return self.best_params  

def get_targets(L_COLLECTION, L_GAME_IDXS):
    
    """
    Извлечение челевых переменных (победа, тотал м/б, число выигранных раундов в 1/2 половинах за обе стороны)
    """    
    df_targets = pd.DataFrame()
    for d_rsp in tqdm.tqdm_notebook(L_COLLECTION):  

        try:
            
            game_id = d_rsp['id']
            if game_id in L_GAME_IDXS:
                ###########################################################################    
                df_rounds = pd.DataFrame.from_records(d_rsp['rounds'])

                maxround = df_rounds['round'].max()
                start_ct_id = df_rounds.query('round==1')['ct'].iloc[0]
                start_t_id = df_rounds.query('round==1')['terrorists'].iloc[0]
                df_h1 = df_rounds.query('round<=15')
                df_h2 = df_rounds.query('round>15')
                d_h1_win_count = df_h1['winner_team'].value_counts().to_dict()
                d_h2_win_count = df_h2['winner_team'].value_counts().to_dict()
                d_h1h2_win_count = df_rounds['winner_team'].value_counts().to_dict()
                winner_id = df_rounds['winner_team'].value_counts().idxmax()
                

                #############################################################################

                d_targets4game = {'id':game_id}
                
                d_targets4game['start_ct__win'] = int(winner_id==start_ct_id)

                for i in range(16, 31):

                    d_targets4game[f'total__b__{i}'] = int(maxround>=i)
                    d_targets4game[f'total__m__{i}'] = int(maxround<=i)

                for i in range(1, 16):

                    d_targets4game[f'h1__start_ct_win__b__{i}'] = int(d_h1_win_count[start_ct_id]>=i)
                    d_targets4game[f'h1__start_ct_win__m__{i}'] = int(d_h1_win_count[start_ct_id]<=i)    
                    d_targets4game[f'h1__start_t_win__b__{i}'] = int(d_h1_win_count[start_t_id]>=i)
                    d_targets4game[f'h1__start_t_win__m__{i}'] = int(d_h1_win_count[start_t_id]<=i)

                    d_targets4game[f'h2__start_ct_win__b__{i}'] = int(d_h2_win_count[start_ct_id]>=i)
                    d_targets4game[f'h2__start_ct_win__m__{i}'] = int(d_h2_win_count[start_ct_id]<=i)    
                    d_targets4game[f'h2__start_t_win__b__{i}'] = int(d_h1_win_count[start_t_id]>=i)
                    d_targets4game[f'h2__start_t_win__m__{i}'] = int(d_h1_win_count[start_t_id]<=i)

                    d_targets4game[f'h1h2__start_ct_win__b__{i}'] = int(d_h1h2_win_count[start_ct_id]>=i)
                    d_targets4game[f'h1h2__start_ct_win__m__{i}'] = int(d_h1h2_win_count[start_ct_id]<=i)
                    d_targets4game[f'h1h2__start_t_win__b__{i}'] = int(d_h1h2_win_count[start_t_id]>=i)
                    d_targets4game[f'h1h2__start_t_win__m__{i}'] = int(d_h1h2_win_count[start_t_id]<=i) 

                df_targets = df_targets.append(d_targets4game, ignore_index = True)

        except:
            pass 
    df_targets['id'] = df_targets['id'].astype(int)
        
    return df_targets

def get_features(df_player_profile, L_GAME_IDXS):

    """
    Описание: извлекает признаки для игры на основе профайлинга истории игр команд/игроков
    Параметры: game_id - идентификатор игры
    На выходе: словарь с признаками
               ключи признаков формируются в соответствии со схемой:
                    префикс: {сторона начала в текущей игре}__{признак команды/признак игрока}__{фильтр+группировка}___{тип агрегирования} 
                    типы фильтров: 
                        1. вся история игры команды
                        2. итория игр команды в лиге, серии, турнире, карте и тд из текущей игры
                        2. история игр команды на карте за сторону начала
                        3. история игр команды с составом из текущей игры
                        4. история игр пары команд
                        5. история игр игрока в команде
                        6. история игр игрока вне команды
                    типы группировок:
                        1. год, месяц, день, день недели, час
                        2. тир серии
                        3. число игр в матче (1, 3, 5)

    """


    def add_features(df_history, L_by_key, d_filter, L_agg_key, prefix):  
        if d_filter is None:      
            if L_by_key is None:    
                d = {}
                for agg_key in L_agg_key:
                    ser = df_history[agg_key]
                    d.update({f'{prefix}__{agg_key}__mean' : ser.mean(),
                                f'{prefix}__{agg_key}__sum' : ser.sum()})
                return d
            else:
                d = {}
                for by_key in L_by_key:
                    for by_value, subdf in df_history.groupby(by_key):
                        for agg_key in L_agg_key:
                            ser = subdf[agg_key]
                            d.update({f'{prefix}__{by_key}_{by_value}__{agg_key}__mean' : ser.mean(),
                                        f'{prefix}__{by_key}_{by_value}__{agg_key}__sum' : ser.sum()})
                return d
        else:
            if L_by_key is None:    
                d = {}            
                for f_k, f_v in d_filter.items():
                    df_hist=df_history[df_history[f_k]==f_v]
                    for agg_key in L_agg_key:  
                        ser = df_hist[agg_key]
                        d.update({f'{prefix}__filter_{f_k}__{agg_key}__mean' : ser.mean(),
                                    f'{prefix}__filter_{f_k}__{agg_key}__sum' : ser.sum()})
                return d
            else:
                d = {}
                for by_key in L_by_key:
                    for by_value, subdf in df_history.groupby(by_key):
                        for f_k, f_v in d_filter.items():
                            sbdf = subdf[subdf[f_k]==f_v]
                            for agg_key in L_agg_key:
                                ser = sbdf[agg_key]
                                d.update({f'{prefix}___filter_{f_k}__{by_key}_{by_value}__{agg_key}__mean' : ser.mean(),
                                            f'{prefix}___filter_{f_k}__{by_key}_{by_value}__{agg_key}__sum' : ser.sum()})
                return d

    L_GLOBAL_KEYS= [
        'id', 'match_id', 'match_type', 'number_of_games', 'date', 'year',
        'month', 'day', 'weekday', 'hour', 'map_id', 'league_id', 'serie_id',
        'tournament_id', 'serie_tier'
    ]

    L_FILTER_KEYS= [
        'year','month', 'day', 'weekday', 'hour', 
        'league_id', 'serie_id', 'tournament_id', 'serie_tier'
    ]

    L_AGG_KEYS = [
        'win', 'maxround', 'h1_win_count',
        'h2_win_count', 'h1_outcome_eliminated_count',
        'h1_outcome_defused_count', 'h1_outcome_timeout_count',
        'h1_outcome_eliminated_count', 'h2_outcome_timeout_count',
        'h2_outcome_defused_count', 'h2_outcome_exploded_count', 'adr',
        'assists', 'deaths', 'first_kills_diff', 'flash_assists', 'headshots',
        'k_d_diff', 'kast', 'kills', 'rating', 'h1_outcome_exploded_count'
    ]

    L_AGG_KEYS_V2 = [
        'adr',
        'assists', 'deaths', 'first_kills_diff', 'flash_assists', 'headshots',
        'k_d_diff', 'kast', 'kills', 'rating'
    ]

    L_GROUP_KEYS= ['year','month', 'day','weekday','hour', 'serie_tier','number_of_games']

    L_GROUP_KEYS_V2= ['year','month', 'day','weekday','hour']

    L_BATCHES = np.array_split(L_GAME_IDXS, np.int32(np.ceil(len(L_GAME_IDXS)/10)))

    df_features = pd.DataFrame()
    for batch in tqdm.tqdm_notebook(L_BATCHES):

        L_features = []
        for game_id in tqdm.tqdm_notebook(batch):

            df_game = df_team_profile.query('id==@game_id')

            d_fs4gm = df_game[L_GLOBAL_KEYS].iloc[0].to_dict()

            date = df_game['date'].iloc[0]
            map_id = df_game['map_id'].iloc[0]
            d_filter = {
                'league_id':df_game['league_id'].iloc[0],
                'serie_id':df_game['serie_id'].iloc[0],
                'tournament_id':df_game['tournament_id'].iloc[0]
            }
            
            d_team_id2start_ct = dict(zip(df_game['team_id'],df_game['start_ct']))
            d_team_id2opponent_id = dict(zip(df_game['team_id'],df_game['opponent_id']))
            d_team_id2lineup = dict(zip(df_game['team_id'],df_game['lineup']))
            d_team_id2loc = dict(zip(df_game['team_id'], df_game['team_location']))    

            for team_id, start_ct in d_team_id2start_ct.items():

                PREFIX = 'START_CT' if start_ct==1 else 'START_T'
                
                subdf_player_profile = df_player_profile.query('(id==@game_id)&(team_id==@team_id)')
                L_ps = subdf_player_profile['player_id'].sort_values().values
                d_player_id2nat = dict(zip(subdf_player_profile['player_id'], subdf_player_profile['player_nationality']))
                d_player_id2birthday = dict(zip(subdf_player_profile['player_id'], subdf_player_profile['player_birthday']))

                d_team_id2loc = dict(zip(df_game['team_id'], df_game['team_location']))
                opponent_id = d_team_id2opponent_id[team_id]
                lineup = d_team_id2lineup[team_id]

                d_fs4gm[f'{PREFIX}__team_id'] = team_id
                d_fs4gm[f'{PREFIX}__team_location'] = d_team_id2loc[team_id]
                d_fs4gm[f'{PREFIX}__lineup'] = lineup

                
                df_history4team = df_team_profile.query('(date<@date)&(team_id==@team_id)')
                df_history4team_with_map_and_start = df_history4team.query('(map_id==@map_id)&(start_ct==@start_ct)')
                df_history4team_with_lineup = df_history4team.query('(lineup==@lineup)')
                df_history4team_with_map_and_start_and_lineup = df_history4team_with_map_and_start.query('(lineup==@lineup)')
                df_history4team_with_opponent = df_history4team.query('(opponent_id==@opponent_id)')
                df_history4team_with_map_and_start_and_opponent = df_history4team_with_map_and_start.query('(opponent_id==@opponent_id)')
                L_dataframes = [
                    df_history4team, df_history4team_with_map_and_start, df_history4team_with_lineup,
                    df_history4team_with_map_and_start_and_lineup, df_history4team_with_opponent,
                    df_history4team_with_map_and_start_and_opponent
                ]
                del df_history4team, df_history4team_with_map_and_start, df_history4team_with_lineup,\
                    df_history4team_with_map_and_start_and_lineup, df_history4team_with_opponent,\
                    df_history4team_with_map_and_start_and_opponent
                L_prefix = [f'{PREFIX}__team__all_map_all_start', f'{PREFIX}__team__current_map_current_start',
                            f'{PREFIX}__team__all_map_all_start_with_lineup', f'{PREFIX}__team__current_map_current_start_with_lineup',
                            f'{PREFIX}__team__all_map_all_start__with_pair', f'{PREFIX}__team__current_map_current_start_with_pair']        
                for prefix, df_history in zip(L_prefix, L_dataframes):        
                    d_fs4gm.update(add_features(df_history, L_by_key=None, d_filter = None,L_agg_key=L_AGG_KEYS, prefix= prefix))
                    d_fs4gm.update(add_features(df_history, L_by_key=L_GROUP_KEYS, d_filter = None, L_agg_key=L_AGG_KEYS, prefix= prefix))  
                    d_fs4gm.update(add_features(df_history, L_by_key=None, d_filter = d_filter, L_agg_key=L_AGG_KEYS, prefix= prefix))   
                del L_dataframes

                for i, player_id in enumerate(L_ps):
                    
                    d_fs4gm[f'{PREFIX}__player{i+1}_id'] = player_id
                    d_fs4gm[f'{PREFIX}_player{i+1}_nationality'] = d_player_id2nat[player_id]
                    d_fs4gm[f'{PREFIX}_player{i+1}_birthday'] = d_player_id2birthday[player_id]

                    df_history4player = df_player_profile.query('(date<@date)&(player_id==@player_id)')
                    df_history4player_in_team = df_history4player.query('team_id==@team_id')
                    df_history4player_not_in_team =  df_history4player.query('team_id!=@team_id')
                    del df_history4player

                    d_fs4gm.update(add_features(df_history4player_in_team, L_by_key=None, d_filter = None,L_agg_key=L_AGG_KEYS_V2, prefix= f'{PREFIX}__player{i+1}_in_team'))
                    d_fs4gm.update(add_features(df_history4player_not_in_team, L_by_key=None, d_filter = None, L_agg_key=L_AGG_KEYS_V2, prefix= f'{PREFIX}__player{i+1}_not_in_team')) 
                    d_fs4gm.update(add_features(df_history4player_in_team, L_by_key=L_GROUP_KEYS_V2, d_filter = None,L_agg_key=L_AGG_KEYS_V2, prefix= f'{PREFIX}__player{i+1}_in_team'))
                    d_fs4gm.update(add_features(df_history4player_not_in_team, L_by_key=L_GROUP_KEYS_V2, d_filter = None, L_agg_key=L_AGG_KEYS_V2, prefix= f'{PREFIX}__player{i+1}_not_in_team')) 

            L_features.append(d_fs4gm)
            del d_fs4gm, df_history4player_in_team, df_history4player_not_in_team

        df = pd.DataFrame.from_records(L_features).apply(reduce_mem_usage)
        del L_features

        df_features = df_features.append(df)
        del df

        assert df_features['id'].value_counts().nunique()==1
        
    return df_features   

def reduce_mem_usage(series):
    try:
        col_type = series.dtype

        if col_type != object:
            c_min = series.min()
            c_max = series.max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    series = series.astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    series = series.astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    series = series.astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    series = series.astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    series = series.astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    series = series.astype(np.float32)
                else:
                    series = series.astype(np.float64)
        else:
            pass 
    except:
        pass
    
    return series 

def prepare_data(df_targets, df_features):

    df_targets = df_targets.set_index('id').astype(int)
    df_features = df_features.set_index('id').drop(['match_id', 'date', 'match_type'], 1)
    games2use= np.intersect1d(df_features.index, df_targets.index)

    X = df_features.loc[games2use]
    del df_features
    gc.collect()
    Y = df_targets.loc[games2use]
    del df_targets
    gc.collect()


    L_BD_KEYS = [
        'START_CT_player1_birthday', 'START_CT_player2_birthday',
        'START_CT_player3_birthday', 'START_CT_player4_birthday',
        'START_CT_player5_birthday', 'START_T_player1_birthday',
        'START_T_player2_birthday', 'START_T_player3_birthday',
        'START_T_player4_birthday', 'START_T_player5_birthday'
    ]

    X_bd= pd.concat([
        X[L_BD_KEYS].astype('datetime64').apply(lambda x: x.dt.year).fillna(-9999).astype(int).astype('category').add_suffix('_year'),
        X[L_BD_KEYS].astype('datetime64').apply(lambda x: x.dt.month).fillna(-9999).astype(int).astype('category').add_suffix('_month'),
        X[L_BD_KEYS].astype('datetime64').apply(lambda x: x.dt.day).fillna(-9999).astype(int).astype('category').add_suffix('_day')
        ],1)

    X= pd.concat([X.drop(L_BD_KEYS, 1), X_bd], 1)

    L_CAT_FEATURES_V1 = [
        'START_CT__team_id', 'START_T__team_id', 
        'START_CT__player1_id', 'START_CT__player2_id',
        'START_CT__player3_id', 'START_CT__player4_id', 'START_CT__player5_id',
        'START_T__player1_id', 'START_T__player2_id',
        'START_T__player3_id', 'START_T__player4_id', 'START_T__player5_id'
    ]
    X[L_CAT_FEATURES_V1] = X[L_CAT_FEATURES_V1].fillna(-9999).astype(int).astype('category')

    L_CAT_FEATURES_V2 = [
        'number_of_games', 'year', 'month', 'day', 'weekday', 'hour', 'map_id',
        'league_id', 'serie_id', 'tournament_id']
    X[L_CAT_FEATURES_V2] = X[L_CAT_FEATURES_V2].fillna(-9999).astype(int).astype('category')



    L_CAT_FEATURES_V3 = [
        'START_CT__team_location', 'START_T__team_location', 
        'START_CT_player1_nationality', 'START_CT_player2_nationality',
        'START_CT_player3_nationality', 'START_CT_player4_nationality',
        'START_CT_player5_nationality', 'START_T_player1_nationality',
        'START_T_player2_nationality', 'START_T_player3_nationality',
        'START_T_player4_nationality', 'START_T_player5_nationality']
    X[L_CAT_FEATURES_V3] = X[L_CAT_FEATURES_V3].fillna('default').astype('category')
    X['serie_tier'] = X['serie_tier'].fillna('default').astype('category')

    X[['START_CT__lineup', 'START_T__lineup']] = X[['START_CT__lineup', 'START_T__lineup']].fillna('default').astype('category')

    X_obj = X.select_dtypes('category').astype('object')
    L_obj_keys = X_obj.columns
    for cmb in itertools.combinations(L_obj_keys, 2):
        cmb= list(cmb)
        new_key = '-'.join([str(x) for x in cmb])    
        X[new_key] = X_obj[cmb].astype('str').apply(lambda x: '-'.join(x), axis = 1).astype('category')
    del X_obj
    gc.collect()

    return {'features':X, 'targets':Y}

### 1. подготовка данных

In [2]:
# коллекция респонсов
PATH_TO_DIR = 'L_games_collection'
L_COLLECTION = get_game_collection(PATH_TO_DIR)

In [3]:
# профайлинг игроков и команд в играх
d_profile = get_profiles(L_COLLECTION)
df_player_profile, df_team_profile = d_profile['player'], d_profile['team']
with open('d_profile.pickle', 'wb') as f:
    pickle.dump(d_profile, f)

In [4]:
# with open('d_profile.pickle', 'rb') as f:
#     d_profile = pickle.load(f)
# df_player_profile, df_team_profile = d_profile['player'], d_profile['team']

In [5]:
df_player_profile_c = df_player_profile.apply(reduce_mem_usage)
del df_player_profile
df_player_profile = df_player_profile_c
del df_player_profile_c
gc.collect()

df_team_profile_c = df_team_profile.apply(reduce_mem_usage)
del df_team_profile
df_team_profile = df_team_profile_c
del df_team_profile_c
gc.collect()

In [6]:
# игры
L_GAME_IDXS = np.unique(df_player_profile['id'])

# признаки
df_features= get_features(df_player_profile, L_GAME_IDXS[-5000:])
df_features.to_pickle('df_features.pickle')

# целвеыее переменные
df_targets = get_targets(L_COLLECTION, L_GAME_IDXS[-5000:])
df_targets.to_pickle('df_targets.pickle')

In [2]:
df_targets = pd.read_pickle('df_targets.pickle')
df_features = pd.read_pickle('df_features.pickle')

In [3]:
# подготовка датасета 
d_data  = prepare_data(df_targets, df_features)
X, Y = d_data['features'], d_data['targets']
del d_data
gc.collect()
# X.to_pickle('X.pickle'), Y.to_pickle('Y.pickle')

In [2]:
X, Y = pd.read_pickle('X.pickle'), pd.read_pickle('Y.pickle')

### 2. мл пайплайн

In [3]:
# итерации бустинга
CONST_PARAMS= {
    'iterations':1000,
    'loss_function':'Logloss',    
    'verbose':1,
}
SEED = 13
# доля тестовой части
TEST_SIZE= .05
# доля отложенной части
HOLD_SIZE = .2
# ключи целевых переменных
Y = Y.loc[:, ~Y.mean().isin([0, 1])]
L_TARGET_KEYS = Y.columns

In [4]:
D_RUN_RESULTS = {}
for i, target_key in enumerate(L_TARGET_KEYS):
    print('> TARGET#{}/{}. {}'.format(i+1, len(L_TARGET_KEYS),  target_key))    
    y = Y[target_key]
    d_run_result = run_ml_pipeline(X, y)
    D_RUN_RESULTS[target_key] = d_run_result
    del d_run_result, y
    gc.collect()
    with open('D_RUN_RESULTS.pickle', 'wb') as f:
        pickle.dump(D_RUN_RESULTS, f)

> TARGET#1/191. start_ct__win
> batch#1/4
Learning rate set to 0.043453
0:	learn: 0.6911434	test: 0.6926347	best: 0.6926347 (0)	total: 1.61s	remaining: 26m 53s
1:	learn: 0.6889292	test: 0.6912854	best: 0.6912854 (1)	total: 2.79s	remaining: 23m 10s
2:	learn: 0.6861824	test: 0.6901204	best: 0.6901204 (2)	total: 3.94s	remaining: 21m 50s
3:	learn: 0.6841255	test: 0.6887632	best: 0.6887632 (3)	total: 5.09s	remaining: 21m 7s
4:	learn: 0.6814709	test: 0.6874769	best: 0.6874769 (4)	total: 6.43s	remaining: 21m 19s
5:	learn: 0.6797165	test: 0.6866461	best: 0.6866461 (5)	total: 7.63s	remaining: 21m 4s
6:	learn: 0.6773457	test: 0.6853865	best: 0.6853865 (6)	total: 8.79s	remaining: 20m 46s
7:	learn: 0.6750504	test: 0.6847100	best: 0.6847100 (7)	total: 9.82s	remaining: 20m 17s
8:	learn: 0.6727851	test: 0.6832252	best: 0.6832252 (8)	total: 10.9s	remaining: 20m
9:	learn: 0.6709460	test: 0.6823676	best: 0.6823676 (9)	total: 11.9s	remaining: 19m 43s
10:	learn: 0.6695672	test: 0.6817813	best: 0.6817813 (