# In [1]:
import os, tqdm, json, pickle, gc, zipfile, itertools, time
import pandas as pd
import numpy as np
from dateutil import parser
import warnings
warnings.filterwarnings('ignore')
from collections import defaultdict
from multiprocessing import Pool
import catboost as cb
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score, ParameterGrid, StratifiedKFold, cross_val_predict
from sklearn.metrics import roc_auc_score
from sklearn.inspection import permutation_importance
import optuna
from optuna.samplers import TPESampler
from tqdm.contrib.concurrent import process_map  
import seaborn as sns
import matplotlib.pyplot as plt
import shap 
from sklearn.model_selection import KFold
from nancorrmp.nancorrmp import NaNCorrMp

def run_ml_pipeline(X, y):
    """Select features and tune a CatBoost classifier for a single target.

    The pipeline relies on the module-level names ``TEST_SIZE``, ``HOLD_SIZE``,
    ``CONST_PARAMS`` and ``SEED`` and runs the following stages:

    1. Split the rows, without shuffling, into train / hold-out / test parts.
    2. Fit a model per batch of at most ~25000 columns and keep the features
       with non-zero importance.
    3. Refit on the surviving features until every one of them has non-zero
       importance.
    4. Search hyper-parameters with ``CatBoostOptimizer`` (15 trials).
    5. Repeatedly drop the features whose mean permutation importance on the
       hold-out part is not positive, as long as ROC AUC on the test part
       improves.
    6. Search hyper-parameters again on the final feature set (30 trials).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. Columns with dtype ``category`` are passed to CatBoost
        as categorical features.
    y : array-like
        Binary target aligned with the rows of ``X``.

    Returns
    -------
    dict
        ``{'params': ..., 'roc_auc': ..., 'features': ...}`` with the tuned
        parameters, the test ROC AUC of the retained feature set and the
        retained column index.
    """

    # split (no shuffling, so the row order of X decides which rows go where)
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size = TEST_SIZE, shuffle = False)
    X_tr, X_ho, y_tr, y_ho = train_test_split(X_tr, y_tr, test_size = HOLD_SIZE, shuffle = False)

    # split the features into batches so that RAM is not exhausted
    L_all_keys = X_tr.columns
    L_batches = np.array_split(L_all_keys, np.int32(np.ceil(len(L_all_keys)/25000)))

    # keep the features with non-zero importance
    L_feat2use = []
    for i, batch in enumerate(L_batches):
        print('> batch#{}/{}'.format(i+1, len(L_batches)))
        x_tr_batch = X_tr[batch]
        x_ho_batch = X_ho[batch]
        params = CONST_PARAMS.copy()
        params['cat_features'] = np.where(x_tr_batch.dtypes=='category')[0]
        model = cb.CatBoostClassifier(**params)
        model.fit(x_tr_batch, y_tr, eval_set=(x_ho_batch, y_ho), early_stopping_rounds=50)
        mask = model.feature_importances_>0
        L_feat2use.extend(batch[mask].tolist())
        del x_tr_batch, x_ho_batch
    # rebinding drops the references to the wide frames
    X_tr, X_ho, X_te = X_tr[L_feat2use], X_ho[L_feat2use], X_te[L_feat2use]
    gc.collect()

    # recursive selection with early stopping
    i = 1
    while True:
        print('> iter#{}'.format(i))
        params = CONST_PARAMS.copy()
        params['cat_features'] = np.where(X_tr.dtypes=='category')[0]
        model = cb.CatBoostClassifier(**params)
        model.fit(X_tr, y_tr, eval_set=(X_ho, y_ho), early_stopping_rounds=50)
        mask = model.feature_importances_>0
        if np.all(mask):
            break
        X_tr, X_ho = X_tr.loc[:, mask], X_ho.loc[:, mask]
        i+=1
    X_te = X_te[X_tr.columns]

    # hyper-parameter optimisation
    params = CONST_PARAMS.copy()
    params['cat_features'] = np.where(X_tr.dtypes=='category')[0]
    params['iterations'] = model.best_iteration_
    params['verbose'] =0

    cb_opt = CatBoostOptimizer(
                    scoring_func= lambda y, y_proba: roc_auc_score(y, y_proba),
                    const_params=params,
                    seed=SEED,
                    direction='maximize',
                    n_trials=15
        )

    cb_opt.fit(X_tr, y_tr)
    # NOTE(review): transform() returns only the tuned keys, so the settings
    # from CONST_PARAMS are not carried into the fits below - confirm intended.
    best_params = cb_opt.transform()
    best_params['verbose'] = 0
    best_params['random_state'] = SEED

    # permutation-importance selection
    # NOTE(review): the accept/reject decision uses the test part, so the
    # reported 'roc_auc' is not an unbiased estimate.
    best_score = None
    i = 1
    while True:

        print('> permutation importance iter#{}. n_features = {}'.format(i, X_tr.shape[1]))

        # score before the reduction
        params = best_params.copy()
        params['cat_features'] = np.where(X_tr.dtypes=='category')[0]
        model = cb.CatBoostClassifier(**params)
        model.fit(X_tr, y_tr)
        # NOTE(review): the fit above has no eval_set; check that
        # best_iteration_ is meaningful here.
        params['iterations'] = model.best_iteration_
        te_score_before=roc_auc_score(y_te, model.predict_proba(X_te)[:, 1])

        # mean permutation importance over 100 differently seeded shuffles
        L_perm_imp = []
        for j in tqdm.tqdm(range(100)):
            d_perm_imp = permutation_importance(model, X_ho, y_ho, scoring='roc_auc', n_repeats=1, random_state = SEED+j, n_jobs=-1)
            L_perm_imp.append(d_perm_imp['importances_mean'].flatten())
        arr_perm_imp_mean = np.mean(np.r_[L_perm_imp], 0)
        idx_selected = np.where(arr_perm_imp_mean>0)[0]

        # score after the reduction
        params_c=params.copy()
        params_c['cat_features'] = np.where(X_tr.iloc[:, idx_selected].dtypes=='category')[0]
        model = cb.CatBoostClassifier(**params_c)
        model.fit(X_tr.iloc[:, idx_selected], y_tr)
        params_c['iterations'] = model.best_iteration_
        te_score_after = roc_auc_score(y_te, model.predict_proba(X_te.iloc[:, idx_selected])[:, 1])

        print('\t> score before: {:.2f}, score after: {:.2f}'.format(te_score_before, te_score_after))
        if te_score_after > te_score_before:
            best_score = te_score_after
            X_tr, X_ho, X_te = X_tr.iloc[:, idx_selected], X_ho.iloc[:, idx_selected], X_te.iloc[:, idx_selected]
            i+=1
        else:
            # no reduction was ever accepted: report the score of the
            # unreduced feature set instead of failing on an unbound name
            if best_score is None:
                best_score = te_score_before
            break

    # hyper-parameter optimisation on the final feature set
    params = CONST_PARAMS.copy()
    params['cat_features'] = np.where(X_tr.dtypes=='category')[0]
    params['iterations'] = model.best_iteration_
    params['verbose'] =0

    cb_opt = CatBoostOptimizer(
                    scoring_func= lambda y, y_proba: roc_auc_score(y, y_proba),
                    const_params=params,
                    seed=SEED,
                    direction='maximize',
                    n_trials=30
        )
    cb_opt.fit(X_tr, y_tr)
    best_params = cb_opt.transform()
    best_params['verbose'] = 0
    # X_tr holds exactly the retained features (the last idx_selected was
    # rejected), so the categorical indices must be taken from X_tr itself
    best_params['cat_features'] = np.where(X_tr.dtypes=='category')[0]
    best_params['random_state'] = SEED

    features = X_tr.columns
    d_res = {'params':best_params, 'roc_auc':best_score, 'features':features}

    del X_tr, X_ho, X_te, y_tr, y_ho, y_te
    gc.collect()

    return d_res

def get_game_collection(PATH_TO_DIR):

    """
    Description: collects the parser responses stored as JSON files
    Parameters: PATH_TO_DIR - path to the directory with the responses
    Returns: list of decoded responses ordered by their 'id', largest first;
             entries that cannot be opened or decoded are skipped
    """

    L_COLLECTION = []
    for fnm in tqdm.tqdm(os.listdir(PATH_TO_DIR)):
        pth = os.path.join(PATH_TO_DIR, fnm)
        try:
            # NOTE(review): the file is decoded with the platform default
            # encoding; confirm that matches how the responses were written.
            with open(pth, 'r') as f:
                d_rsp = json.load(f)
        except (OSError, ValueError):
            # unreadable entry (e.g. a sub-directory) or malformed JSON /
            # undecodable bytes: best-effort loading, skip it
            continue
        L_COLLECTION.append(d_rsp)

    # ascending sort followed by a reversal, as in the argsort()[::-1] form
    L_COLLECTION.sort(key=lambda d_game: d_game['id'])
    L_COLLECTION.reverse()
    return L_COLLECTION

def get_profiles(L_COLLECTION):

    """
    Description: builds per-player and per-team profiles of the games
    Parameters: L_COLLECTION - collection of parser responses
    Returns: {'player': DataFrame with one row per player per game,
              'team': DataFrame with one row per team per game}
             Games are skipped when the map is not in the allowed list, when
             the first stored round is not round 1, when the response does not
             list exactly ten players or when the response is malformed.
    """

    def add_profile(d_rsp):
        """Return the ten player records of one game, or None to skip it."""

        def add_global_info(d_game):
            """Collect the game-level fields shared by all player records."""

            d = {}

            d['id'] = d_game['id']
            d['match_id'] = d_game['match_id']
            d['match_type'] = d_game['match']['match_type']
            d['number_of_games'] = d_game['match']['number_of_games']
            d['date'] = parser.parse(d_game['begin_at'])
            d['map_id'] = d_game['map']['id']
            d['league_id'] = d_game['match']['league']['id']
            d['serie_id'] = d_game['match']['serie']['id']
            d['tournament_id'] = d_game['match']['tournament']['id']
            d['serie_tier'] = d_game['match']['serie']['tier']

            return d

        # identifiers of the maps that are currently in use
        l_map2use = [1, 2, 6, 7, 8, 20, 31]
        # keys holding the player statistics
        l_stat_keys = ['adr', 'assists', 'deaths', 'first_kills_diff', 'flash_assists',
                       'headshots', 'k_d_diff', 'kast', 'kills', 'rating']

        # game-level information, taken from the response passed in (not from
        # a variable of the enclosing scope)
        d_info = add_global_info(d_rsp)
        if d_info['map_id'] not in l_map2use:
            return None

        d_r1 = d_rsp['rounds'][0]
        if d_r1['round'] != 1:
            return None

        # round-level information
        df_rounds = pd.DataFrame.from_records(d_rsp['rounds'])
        start_ct_id = d_r1['ct']
        winner_id = df_rounds['winner_team'].value_counts().idxmax()
        maxround = df_rounds['round'].max()
        df_h1 = df_rounds.query('round<=15')
        df_h2 = df_rounds.query('round>15')
        d_h1_win_count = df_h1['winner_team'].value_counts().to_dict()
        d_h2_win_count = df_h2['winner_team'].value_counts().to_dict()
        d_h1_outcome_count = df_h1['outcome'].value_counts().to_dict()
        d_h2_outcome_count = df_h2['outcome'].value_counts().to_dict()

        L = []
        # player-level information
        for p in d_rsp['players']:

            d = {}
            d.update(d_info)

            # player identifier
            d['player_id'] = p['player']['id']
            # team identifier
            d['team_id'] = p['team']['id']
            # opponent identifier
            d['opponent_id'] = p['opponent']['id']

            # player nationality
            d['player_nationality']  = p['player']['nationality']
            # player birthday
            d['player_birthday']  = p['player']['birthday']
            # team country
            d['team_location']  = p['team']['location']

            # starting side
            d['start_ct']= 1 if start_ct_id==d['team_id'] else 0
            # win
            d['win'] = 1 if winner_id==d['team_id'] else 0
            # total number of rounds in the game
            d['maxround'] = maxround

            # rounds won in the 1st / 2nd half (a team absent from the
            # counts won nothing in that half)
            d['h1_win_count'] = d_h1_win_count.get(d['team_id'], 0)
            d['h2_win_count'] = d_h2_win_count.get(d['team_id'], 0)
            # round outcomes in the 1st half (counted over the whole game,
            # not per team)
            for k, v in d_h1_outcome_count.items():
                d[f'h1_outcome_{k}_count'] = v
            # round outcomes in the 2nd half
            for k, v in d_h2_outcome_count.items():
                d[f'h2_outcome_{k}_count'] = v

            # player statistics, missing values replaced by 0.0
            d.update({k:p[k] if pd.notnull(p[k]) else 0.0 for k in l_stat_keys})

            L.append(d)

        return L if len(L)==10 else None

    # game-level keys
    L_GLOBAL_KEYS = [
        'id', 'match_id', 'match_type', 'number_of_games',
        'date', 'year', 'month', 'day', 'weekday', 'hour',
        'map_id',
        'league_id', 'serie_id', 'tournament_id', 'serie_tier',
        'start_ct'
    ]

    # keys to aggregate
    L_AGG_KEYS = [

        'h1_outcome_defused_count', 'h1_outcome_eliminated_count',
        'h1_outcome_exploded_count', 'h1_outcome_timeout_count',
        'h1_win_count', 'h2_outcome_defused_count',
        'h2_outcome_eliminated_count', 'h2_outcome_exploded_count',
        'h2_outcome_timeout_count', 'h2_win_count',

        'adr', 'assists', 'deaths', 'first_kills_diff', 'flash_assists', 'headshots',
        'k_d_diff', 'kast', 'kills', 'maxround', 'rating', 'win'
    ]

    # grouping keys
    L_GROUP_KEYS = [
        'team_id', 'opponent_id', 'team_location', 'lineup'
    ]

    # player profiling
    L_player_profile = []
    for d_game in tqdm.tqdm(L_COLLECTION):
        try:
            l_records = add_profile(d_game)
        except Exception:
            # malformed response (missing key, empty round list, bad date...):
            # profiling is best-effort, so the game is skipped
            continue
        if l_records is not None:
            L_player_profile.extend(l_records)
    df_player_profile = pd.DataFrame.from_records(L_player_profile)
    del L_player_profile
    gc.collect()

    # keep only the teams with exactly five players and attach the line-up
    # key (sorted player ids joined by '-')
    L_dict = []
    for (game_id, team_id), subdf in tqdm.tqdm(df_player_profile.groupby(['id', 'team_id'])):
        if subdf.shape[0]==5:
            subdf_c = subdf.copy()
            lineup = '-'.join(subdf['player_id'].sort_values().astype(str))
            subdf_c['lineup'] = lineup
            L_dict.extend(subdf_c.to_dict('records'))
    del df_player_profile
    gc.collect()
    df_player_profile = pd.DataFrame.from_records(L_dict).sort_values('date')
    del L_dict
    gc.collect()

    # calendar features of the game start
    date = df_player_profile['date']
    df_player_profile['year'] = date.dt.year
    df_player_profile['month'] = date.dt.month
    df_player_profile['day'] = date.dt.day
    df_player_profile['weekday'] = date.dt.weekday
    df_player_profile['hour'] = date.dt.hour
    df_player_profile[['serie_tier', 'team_location']] = df_player_profile[['serie_tier', 'team_location']].fillna('default')

    # team profiling: game/team keys from the first row, player statistics
    # averaged over the team
    L_team_profile = []
    for (game_id, team_id), subdf in tqdm.tqdm(df_player_profile.groupby(['id', 'team_id'])):
        d = subdf[L_GLOBAL_KEYS+L_GROUP_KEYS].iloc[0].to_dict()
        d.update(subdf[L_AGG_KEYS].mean().to_dict())
        L_team_profile.append(d)
    df_team_profile = pd.DataFrame.from_records(L_team_profile)
    del L_team_profile
    gc.collect()

    return {'player':df_player_profile, 'team':df_team_profile}

class CatBoostOptimizer():
    """Optuna (TPE) search over a small set of CatBoost hyper-parameters.

    ``fit`` holds out 10% of the passed data, runs ``n_trials`` trials scored
    by ``scoring_func(y_true, y_proba)`` on that part and stores the best
    trial's parameters; ``transform`` returns them.
    """

    def __init__(self, scoring_func, const_params, seed, direction, n_trials):
        # callable(y_true, y_proba) -> score
        self.scoring_func = scoring_func
        # parameters applied to every trial; they override the sampled ones
        self.const_params = const_params
        # seed for the data split, the sampler and the model
        self.seed = seed
        # optuna direction, e.g. 'maximize'
        self.direction = direction
        # number of optuna trials
        self.n_trials = n_trials

    def objective(self, trial):
        """Fit one candidate model and return its score on the held-out part."""
        sampled = dict(
            learning_rate=trial.suggest_float('learning_rate', 0.025, 0.25),
            depth=trial.suggest_int('depth', 3, 12),
            l2_leaf_reg=trial.suggest_float('l2_leaf_reg', 1, 31),
            boosting_type=trial.suggest_categorical('boosting_type', ['Ordered', 'Plain']),
        )
        # constant parameters win over sampled ones on a key clash
        candidate = {**sampled, **self.const_params}

        clf = cb.CatBoostClassifier(**candidate, random_seed=self.seed)
        holdout = (self.X_ho_c, self.y_ho_c)
        clf.fit(self.X_tr_c, self.y_tr_c, verbose=0, eval_set=holdout)

        ho_proba = clf.predict_proba(self.X_ho_c)[:, 1]
        return self.scoring_func(self.y_ho_c, ho_proba)

    def fit(self, X_tr, y_tr):
        """Run the study on ``X_tr`` / ``y_tr`` and remember the best parameters."""
        # positions of the object-dtype columns (stored, not used by the search)
        is_object = X_tr.dtypes.values=='object'
        self.cat_features = np.argwhere(is_object).flatten()

        # shuffled 90/10 split used by every trial
        parts = train_test_split(X_tr, y_tr, test_size=.1, shuffle=True, random_state=self.seed)
        self.X_tr_c, self.X_ho_c, self.y_tr_c, self.y_ho_c = parts

        study = optuna.create_study(direction=self.direction,
                                    sampler=TPESampler(seed=self.seed))
        study.optimize(self.objective, n_trials=self.n_trials)
        self.best_params = study.best_params

        # drop the data copies once the search is over
        for attr in ('X_tr_c', 'X_ho_c', 'y_tr_c', 'y_ho_c'):
            delattr(self, attr)
        gc.collect()

        return self

    def transform(self):
        """Return the parameters of the best trial found by ``fit``."""
        return self.best_params

def get_targets(L_COLLECTION, L_GAME_IDXS):

    """
    Extracts the target variables for the requested games.

    Parameters: L_COLLECTION - collection of parser responses
                L_GAME_IDXS - iterable with the identifiers of the games to process
    Returns: DataFrame with one row per processed game and the columns
        'id'                          - game identifier
        'start_ct__win'               - 1 if the team that started as CT won the game
        'total__b__{i}' / 'total__m__{i}' (i = 16..30)
                                      - 1 if the number of rounds is >= i / <= i
        '{h1|h2|h1h2}__{start_ct|start_t}_win__{b|m}__{i}' (i = 1..15)
                                      - 1 if the rounds won by that side in rounds 1-15 (h1),
                                        in rounds after 15 (h2) or in the whole game (h1h2)
                                        are >= i (b) / <= i (m)
    Responses that are malformed are skipped.
    """

    # built once: constant-time membership tests on the identifier values
    wanted_ids = set(L_GAME_IDXS)

    L_targets = []
    for d_rsp in tqdm.tqdm(L_COLLECTION):

        try:

            game_id = d_rsp['id']
            if game_id not in wanted_ids:
                continue

            df_rounds = pd.DataFrame.from_records(d_rsp['rounds'])

            maxround = df_rounds['round'].max()
            df_r1 = df_rounds.query('round==1')
            start_ct_id = df_r1['ct'].iloc[0]
            start_t_id = df_r1['terrorists'].iloc[0]
            d_all_win_count = df_rounds['winner_team'].value_counts()
            winner_id = d_all_win_count.idxmax()

            # rounds won per team in each part of the game
            d_win_count = {
                'h1': df_rounds.query('round<=15')['winner_team'].value_counts().to_dict(),
                'h2': df_rounds.query('round>15')['winner_team'].value_counts().to_dict(),
                'h1h2': d_all_win_count.to_dict(),
            }
            d_side2team = {'start_ct': start_ct_id, 'start_t': start_t_id}

            d_targets4game = {'id':game_id}

            d_targets4game['start_ct__win'] = int(winner_id==start_ct_id)

            for i in range(16, 31):
                d_targets4game[f'total__b__{i}'] = int(maxround>=i)
                d_targets4game[f'total__m__{i}'] = int(maxround<=i)

            for i in range(1, 16):
                for part, d_counts in d_win_count.items():
                    for side, team_id in d_side2team.items():
                        # a team absent from the counts won no round in that
                        # part; every part uses its own counts
                        n_won = d_counts.get(team_id, 0)
                        d_targets4game[f'{part}__{side}_win__b__{i}'] = int(n_won>=i)
                        d_targets4game[f'{part}__{side}_win__m__{i}'] = int(n_won<=i)

        except (KeyError, IndexError, TypeError, ValueError):
            # malformed response (missing key/column, no rounds, no round 1)
            continue

        L_targets.append(d_targets4game)

    if not L_targets:
        # nothing matched: return an empty frame that still has the 'id' column
        return pd.DataFrame({'id': pd.Series(dtype=int)})

    # build the frame once instead of appending row by row
    df_targets = pd.DataFrame.from_records(L_targets)
    df_targets['id'] = df_targets['id'].astype(int)

    return df_targets

def get_features(df_player_profile, L_GAME_IDXS, df_team_profile=None,
                 path_template=r'D:\L_features_25042022\{}.pickle'):

    """
    Description: builds the features of every requested game from the history
                 of games of its teams/players and pickles one dictionary per game
    Parameters: df_player_profile - player profiles (one row per player per game)
                L_GAME_IDXS - identifiers of the games to process
                df_team_profile - team profiles (one row per team per game); when
                                  omitted, the module-level variable with the same
                                  name is used, as in the original implementation
                path_template - format string of the output file, receives int(game_id)
    Returns: True; the feature dictionaries are written to disk
             feature keys follow the scheme:
                 prefix: {starting side in the current game}__{team feature/player feature}__{filter+grouping}___{aggregation type}
                 filter types:
                     1. the whole history of the team
                     2. history of the team in the league, serie, tournament of the current game
                     3. history of the team on the map for the starting side
                     4. history of the team with the line-up of the current game
                     5. history of the pair of teams
                     6. history of the player in the team
                     7. history of the player outside the team
                 grouping types:
                     1. year, month, day, weekday, hour
                     2. serie tier
                     3. number of games in the match
             only rows dated strictly before the current game are used as history
    """

    if df_team_profile is None:
        # backward compatibility with callers relying on the module-level frame
        try:
            df_team_profile = globals()['df_team_profile']
        except KeyError:
            raise NameError("name 'df_team_profile' is not defined") from None

    def add_features(df_history, L_by_key, d_filter, L_agg_key, prefix):
        """Return {feature key: value} with the mean and the sum of every
        column of L_agg_key, computed over df_history as a whole, per value of
        each grouping key, per filter (rows where column == value), or both."""
        if d_filter is None:
            if L_by_key is None:
                d = {}
                for agg_key in L_agg_key:
                    ser = df_history[agg_key]
                    d.update({f'{prefix}__{agg_key}__mean' : ser.mean(),
                                f'{prefix}__{agg_key}__sum' : ser.sum()})
                return d
            else:
                d = {}
                for by_key in L_by_key:
                    for by_value, subdf in df_history.groupby(by_key):
                        for agg_key in L_agg_key:
                            ser = subdf[agg_key]
                            d.update({f'{prefix}__{by_key}_{by_value}__{agg_key}__mean' : ser.mean(),
                                        f'{prefix}__{by_key}_{by_value}__{agg_key}__sum' : ser.sum()})
                return d
        else:
            if L_by_key is None:
                d = {}
                for f_k, f_v in d_filter.items():
                    df_hist=df_history[df_history[f_k]==f_v]
                    for agg_key in L_agg_key:
                        ser = df_hist[agg_key]
                        d.update({f'{prefix}__filter_{f_k}__{agg_key}__mean' : ser.mean(),
                                    f'{prefix}__filter_{f_k}__{agg_key}__sum' : ser.sum()})
                return d
            else:
                d = {}
                for by_key in L_by_key:
                    for by_value, subdf in df_history.groupby(by_key):
                        for f_k, f_v in d_filter.items():
                            sbdf = subdf[subdf[f_k]==f_v]
                            for agg_key in L_agg_key:
                                ser = sbdf[agg_key]
                                d.update({f'{prefix}___filter_{f_k}__{by_key}_{by_value}__{agg_key}__mean' : ser.mean(),
                                            f'{prefix}___filter_{f_k}__{by_key}_{by_value}__{agg_key}__sum' : ser.sum()})
                return d

    # game-level keys copied into the feature dictionary as they are
    L_GLOBAL_KEYS= [
        'id', 'match_id', 'match_type', 'number_of_games', 'date', 'year',
        'month', 'day', 'weekday', 'hour', 'map_id', 'league_id', 'serie_id',
        'tournament_id', 'serie_tier'
    ]

    # team-level columns to aggregate; 'h2_outcome_eliminated_count' replaces
    # a second, duplicated 'h1_outcome_eliminated_count' entry
    L_AGG_KEYS = [
        'win', 'maxround', 'h1_win_count',
        'h2_win_count', 'h1_outcome_eliminated_count',
        'h1_outcome_defused_count', 'h1_outcome_timeout_count',
        'h2_outcome_eliminated_count', 'h2_outcome_timeout_count',
        'h2_outcome_defused_count', 'h2_outcome_exploded_count', 'adr',
        'assists', 'deaths', 'first_kills_diff', 'flash_assists', 'headshots',
        'k_d_diff', 'kast', 'kills', 'rating', 'h1_outcome_exploded_count'
    ]
    # player-level columns to aggregate
    L_AGG_KEYS_V2 = [
        'adr',
        'assists', 'deaths', 'first_kills_diff', 'flash_assists', 'headshots',
        'k_d_diff', 'kast', 'kills', 'rating'
    ]
    L_GROUP_KEYS= ['year','month', 'day','weekday','hour', 'serie_tier','number_of_games']
    L_GROUP_KEYS_V2= ['year','month', 'day','weekday','hour']

    # NOTE: the names game_id, team_id, date, map_id, start_ct, lineup,
    # opponent_id and player_id are referenced from the query strings via '@'
    for game_id in tqdm.tqdm(L_GAME_IDXS):

        df_game = df_team_profile.query('id==@game_id')

        d_fs4gm = df_game[L_GLOBAL_KEYS].iloc[0].to_dict()

        date = df_game['date'].iloc[0]
        map_id = df_game['map_id'].iloc[0]
        d_filter = {
            'league_id':df_game['league_id'].iloc[0],
            'serie_id':df_game['serie_id'].iloc[0],
            'tournament_id':df_game['tournament_id'].iloc[0]
        }

        d_team_id2start_ct = dict(zip(df_game['team_id'],df_game['start_ct']))
        d_team_id2opponent_id = dict(zip(df_game['team_id'],df_game['opponent_id']))
        d_team_id2lineup = dict(zip(df_game['team_id'],df_game['lineup']))
        d_team_id2loc = dict(zip(df_game['team_id'], df_game['team_location']))

        for team_id, start_ct in d_team_id2start_ct.items():

            PREFIX = 'START_CT' if start_ct==1 else 'START_T'

            subdf_player_profile = df_player_profile.query('(id==@game_id)&(team_id==@team_id)')
            # players are numbered by ascending player id
            L_ps = subdf_player_profile['player_id'].sort_values().values
            d_player_id2nat = dict(zip(subdf_player_profile['player_id'], subdf_player_profile['player_nationality']))
            d_player_id2birthday = dict(zip(subdf_player_profile['player_id'], subdf_player_profile['player_birthday']))

            opponent_id = d_team_id2opponent_id[team_id]
            lineup = d_team_id2lineup[team_id]

            d_fs4gm[f'{PREFIX}__team_id'] = team_id
            d_fs4gm[f'{PREFIX}__team_location'] = d_team_id2loc[team_id]
            d_fs4gm[f'{PREFIX}__lineup'] = lineup

            # team history before the current game and its narrower slices
            df_history4team = df_team_profile.query('(date<@date)&(team_id==@team_id)')
            df_history4team_with_map_and_start = df_history4team.query('(map_id==@map_id)&(start_ct==@start_ct)')
            df_history4team_with_lineup = df_history4team.query('(lineup==@lineup)')
            df_history4team_with_map_and_start_and_lineup = df_history4team_with_map_and_start.query('(lineup==@lineup)')
            df_history4team_with_opponent = df_history4team.query('(opponent_id==@opponent_id)')
            df_history4team_with_map_and_start_and_opponent = df_history4team_with_map_and_start.query('(opponent_id==@opponent_id)')
            L_dataframes = [
                df_history4team, df_history4team_with_map_and_start, df_history4team_with_lineup,
                df_history4team_with_map_and_start_and_lineup, df_history4team_with_opponent,
                df_history4team_with_map_and_start_and_opponent
            ]
            del df_history4team, df_history4team_with_map_and_start, df_history4team_with_lineup,\
                df_history4team_with_map_and_start_and_lineup, df_history4team_with_opponent,\
                df_history4team_with_map_and_start_and_opponent
            L_prefix = [f'{PREFIX}__team__all_map_all_start', f'{PREFIX}__team__current_map_current_start',
                        f'{PREFIX}__team__all_map_all_start_with_lineup', f'{PREFIX}__team__current_map_current_start_with_lineup',
                        f'{PREFIX}__team__all_map_all_start__with_pair', f'{PREFIX}__team__current_map_current_start_with_pair']
            for prefix, df_history in zip(L_prefix, L_dataframes):
                d_fs4gm.update(add_features(df_history, L_by_key=None, d_filter = None,L_agg_key=L_AGG_KEYS, prefix= prefix))
                d_fs4gm.update(add_features(df_history, L_by_key=L_GROUP_KEYS, d_filter = None, L_agg_key=L_AGG_KEYS, prefix= prefix))
                d_fs4gm.update(add_features(df_history, L_by_key=None, d_filter = d_filter, L_agg_key=L_AGG_KEYS, prefix= prefix))
            del L_dataframes

            for i, player_id in enumerate(L_ps):

                # NOTE: the nationality/birthday keys use a single underscore
                # after the prefix; downstream code depends on these exact names
                d_fs4gm[f'{PREFIX}__player{i+1}_id'] = player_id
                d_fs4gm[f'{PREFIX}_player{i+1}_nationality'] = d_player_id2nat[player_id]
                d_fs4gm[f'{PREFIX}_player{i+1}_birthday'] = d_player_id2birthday[player_id]

                # player history before the current game, inside/outside the team
                df_history4player = df_player_profile.query('(date<@date)&(player_id==@player_id)')
                df_history4player_in_team = df_history4player.query('team_id==@team_id')
                df_history4player_not_in_team =  df_history4player.query('team_id!=@team_id')
                del df_history4player

                d_fs4gm.update(add_features(df_history4player_in_team, L_by_key=None, d_filter = None,L_agg_key=L_AGG_KEYS_V2, prefix= f'{PREFIX}__player{i+1}_in_team'))
                d_fs4gm.update(add_features(df_history4player_not_in_team, L_by_key=None, d_filter = None, L_agg_key=L_AGG_KEYS_V2, prefix= f'{PREFIX}__player{i+1}_not_in_team'))
                d_fs4gm.update(add_features(df_history4player_in_team, L_by_key=L_GROUP_KEYS_V2, d_filter = None,L_agg_key=L_AGG_KEYS_V2, prefix= f'{PREFIX}__player{i+1}_in_team'))
                d_fs4gm.update(add_features(df_history4player_not_in_team, L_by_key=L_GROUP_KEYS_V2, d_filter = None, L_agg_key=L_AGG_KEYS_V2, prefix= f'{PREFIX}__player{i+1}_not_in_team'))

        # one pickle per game keeps the memory footprint flat
        path_out = path_template.format(int(game_id))
        with open(path_out, 'wb') as f:
            pickle.dump(d_fs4gm, f)
        del d_fs4gm

    return True

def reduce_mem_usage(series):
    """Downcast a numeric pandas Series to a smaller dtype.

    * Integer series get the smallest integer dtype of the same signedness
      whose range contains the observed minimum and maximum.
    * Other numeric series get ``float16`` or ``float32`` when the observed
      range lies strictly inside that type's range, else ``float64``.
    * Boolean and non-numeric series (object, category, datetime, ...) are
      returned unchanged.

    NOTE(review): the float downcast looks only at the value range, so
    precision can be lost (``float16`` keeps roughly three significant
    decimal digits) - confirm this is acceptable for the columns it is used on.

    Parameters
    ----------
    series : pandas.Series

    Returns
    -------
    pandas.Series
        The downcast series, or ``series`` itself when no conversion applies
        or the conversion fails.
    """
    dtype = series.dtype
    types = pd.api.types

    # bool counts as numeric in pandas but is already one byte wide
    if types.is_bool_dtype(dtype) or not types.is_numeric_dtype(dtype):
        return series

    try:
        c_min = series.min()
        c_max = series.max()

        if types.is_integer_dtype(dtype):
            if types.is_unsigned_integer_dtype(dtype):
                candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
            else:
                candidates = (np.int8, np.int16, np.int32, np.int64)
            for candidate in candidates:
                bounds = np.iinfo(candidate)
                if bounds.min <= c_min and c_max <= bounds.max:
                    return series.astype(candidate)
            # e.g. an empty series: min/max are NaN and nothing matches
            return series

        for candidate in (np.float16, np.float32):
            bounds = np.finfo(candidate)
            if c_min > bounds.min and c_max < bounds.max:
                return series.astype(candidate)
        return series.astype(np.float64)

    except (TypeError, ValueError):
        # values that cannot be compared or cast (e.g. complex numbers,
        # nullable integers holding missing values): leave the series as is
        return series

def prepare_data(df_targets, df_features):
    """Align features with targets and cast the categorical columns.

    Steps:

    * index both frames by ``'id'`` and keep the games present in both
      (sorted by id);
    * drop ``'match_id'``, ``'date'`` and ``'match_type'`` from the features;
    * replace every player birthday column by three categorical columns
      ``<name>_year``, ``<name>_month`` and ``<name>_day`` (missing -> -9999);
    * cast identifier/calendar columns to integer categories (missing ->
      -9999) and location/nationality/tier/line-up columns to string
      categories (missing -> ``'default'``).

    Parameters
    ----------
    df_targets : pandas.DataFrame
        Targets with an ``'id'`` column; all other columns are cast to int.
    df_features : pandas.DataFrame
        Features with an ``'id'`` column and the columns listed above.

    Returns
    -------
    dict
        ``{'features': X, 'targets': Y}`` - two frames sharing the same index.
    """
    sides = ('START_CT', 'START_T')
    players = range(1, 6)

    df_targets = df_targets.set_index('id').astype(int)
    # keyword arguments: the positional ``axis`` form is not accepted by
    # pandas >= 2.0
    df_features = df_features.set_index('id').drop(columns=['match_id', 'date', 'match_type'])
    games2use = np.intersect1d(df_features.index, df_targets.index)

    X = df_features.loc[games2use]
    del df_features
    gc.collect()
    Y = df_targets.loc[games2use]
    del df_targets
    gc.collect()

    # birthdays -> year / month / day categories (parsed once)
    L_BD_KEYS = [f'{side}_player{i}_birthday' for side in sides for i in players]
    df_bd = X[L_BD_KEYS].apply(pd.to_datetime)

    def _bd_part(part):
        """Return one calendar component of every birthday as categories."""
        return (df_bd.apply(lambda col: getattr(col.dt, part))
                     .fillna(-9999).astype(int).astype('category')
                     .add_suffix(f'_{part}'))

    X_bd = pd.concat([_bd_part('year'), _bd_part('month'), _bd_part('day')], axis=1)
    X = pd.concat([X.drop(columns=L_BD_KEYS), X_bd], axis=1)

    # team and player identifiers
    L_CAT_FEATURES_V1 = (
        [f'{side}__team_id' for side in sides]
        + [f'{side}__player{i}_id' for side in sides for i in players]
    )
    X[L_CAT_FEATURES_V1] = X[L_CAT_FEATURES_V1].fillna(-9999).astype(int).astype('category')

    # game-level identifiers and calendar fields
    L_CAT_FEATURES_V2 = [
        'number_of_games', 'year', 'month', 'day', 'weekday', 'hour', 'map_id',
        'league_id', 'serie_id', 'tournament_id']
    X[L_CAT_FEATURES_V2] = X[L_CAT_FEATURES_V2].fillna(-9999).astype(int).astype('category')

    # team locations and player nationalities
    L_CAT_FEATURES_V3 = (
        [f'{side}__team_location' for side in sides]
        + [f'{side}_player{i}_nationality' for side in sides for i in players]
    )
    X[L_CAT_FEATURES_V3] = X[L_CAT_FEATURES_V3].fillna('default').astype('category')
    X['serie_tier'] = X['serie_tier'].fillna('default').astype('category')

    L_LINEUP_KEYS = ['START_CT__lineup', 'START_T__lineup']
    X[L_LINEUP_KEYS] = X[L_LINEUP_KEYS].fillna('default').astype('category')

    return {'features':X, 'targets':Y}

class FeatureSelector():
    """Tune a CatBoost classifier and prune features by permutation importance.

    After ``fit`` the instance exposes:
      * ``best_features`` - columns kept by the selection loop;
      * ``best_params``   - CatBoost constructor parameters matching them;
      * ``X_tr``, ``X_ho``, ``y_tr``, ``y_ho`` - the (pruned) training and
        hold-out data, reused by ``evaluate``.
    """

    def __init__(self):
        pass

    def fit(self, X_tr, X_ho, y_tr, y_ho):
        """Tune hyper-parameters, then iteratively drop unhelpful features.

        Parameters
        ----------
        X_tr, y_tr : training features / target.
        X_ho, y_ho : hold-out features / target, used for early stopping,
            permutation importance and the accept/reject decision.

        Returns
        -------
        self
        """
        # hyper-parameter optimisation: the iteration count comes from an
        # early-stopped fit, the rest from CatBoostOptimizer
        params = CONST_PARAMS.copy()
        params['cat_features'] = np.where(X_tr.dtypes == 'category')[0]
        model = cb.CatBoostClassifier(**params)
        model.fit(X_tr, y_tr, eval_set=(X_ho, y_ho), early_stopping_rounds=50)
        best_i = model.best_iteration_
        params['iterations'] = best_i
        params['verbose'] = 0

        cb_opt = CatBoostOptimizer(
            scoring_func=lambda y, y_proba: roc_auc_score(y, y_proba),
            const_params=params,
            seed=SEED,
            direction='maximize',
            n_trials=30
        )
        cb_opt.fit(X_tr, y_tr)
        params.update(cb_opt.transform())
        params['verbose'] = 0
        params['random_state'] = SEED
        best_params = params.copy()

        # selection by permutation importance
        i = 1
        while True:

            print('> permutation importance iter#{}. n_features = {}'.format(i, X_tr.shape[1]))

            # BEFORE: model on the current feature set
            params = best_params.copy()
            params['cat_features'] = np.where(X_tr.dtypes == 'category')[0]
            params['iterations'] = best_i
            model = cb.CatBoostClassifier(**params)
            model.fit(X_tr, y_tr)

            ho_score_before = roc_auc_score(y_ho, model.predict_proba(X_ho)[:, 1])

            # importance averaged over 10 single-repeat runs with different seeds
            L_perm_imp = []
            for j in tqdm.tqdm(range(10)):
                d_perm_imp = permutation_importance(
                    model, X_ho, y_ho, scoring='roc_auc',
                    n_repeats=1, random_state=SEED + j, n_jobs=-1)
                L_perm_imp.append(d_perm_imp['importances_mean'].flatten())
            arr_perm_imp_mean = np.mean(L_perm_imp, axis=0)
            # features whose shuffling hurts the hold-out score
            idx_selected = np.where(arr_perm_imp_mean > 0)[0]

            # nothing to keep: a model cannot be fitted on zero columns,
            # so stop with the current feature set
            if len(idx_selected) == 0:
                break

            # AFTER: model on the selected features only
            params_c = params.copy()
            params_c['cat_features'] = np.where(X_tr.iloc[:, idx_selected].dtypes == 'category')[0]
            params_c['iterations'] = best_i
            model = cb.CatBoostClassifier(**params_c)
            model.fit(X_tr.iloc[:, idx_selected], y_tr)
            ho_score_after = roc_auc_score(y_ho, model.predict_proba(X_ho.iloc[:, idx_selected])[:, 1])

            print('\t> score before: {:.2f}, score after: {:.2f}'.format(ho_score_before, ho_score_after))
            # keep pruning only while the hold-out score improves
            if ho_score_after > ho_score_before:
                best_params = params_c
                X_tr, X_ho = X_tr.iloc[:, idx_selected], X_ho.iloc[:, idx_selected]
                i += 1
            else:
                break

        self.best_features = X_tr.columns
        self.best_params = best_params
        self.X_tr, self.X_ho = X_tr, X_ho
        self.y_tr, self.y_ho = y_tr, y_ho
        return self

    def evaluate(self, X_te, y_te):
        """Refit on training + hold-out data and return ROC AUC on the test part.

        Uses the state stored by ``fit`` on this instance (previously the
        method read a global named ``fs`` instead of ``self``).
        """
        model = cb.CatBoostClassifier(**self.best_params)
        X_trho = pd.concat([self.X_tr[self.best_features], self.X_ho[self.best_features]])
        y_trho = pd.concat([self.y_tr, self.y_ho])
        assert (X_trho.index == y_trho.index).all()
        X_te_c = X_te[self.best_features]
        model.fit(X_trho, y_trho)
        return roc_auc_score(y_te, model.predict_proba(X_te_c)[:, 1])


In [2]:
# random seed
SEED = 13
# share of rows used as the test part / as the hold-out part
TEST_SIZE, HOLD_SIZE = 0.05, 0.1
# batch rate (multiplied by the row count to size the feature batches
# of the first selection stage)
BATCH_RATE = 0.05
# constant CatBoost parameters: boosting iterations, loss, verbosity
CONST_PARAMS = dict(iterations=1000, loss_function='Logloss', verbose=1)

# directory with the collection of responses
PATH_TO_RESPONSES = 'L_games_collection'
# directory with the per-game feature files
PATH_TO_FEATURES = r'D:\L_features_25042022'
L_FILENAMES = os.listdir(PATH_TO_FEATURES)
# number of feature files loaded per batch
BATCH_SIZE = 500
# feature files split into batches of roughly BATCH_SIZE
L_BATCHES = np.array_split(
    L_FILENAMES,
    np.int32(np.ceil(len(L_FILENAMES) / BATCH_SIZE)),
)

# Map names to produce predictions for, one per line.
# Further down each line is stripped and lower-cased and then looked up in
# the map names collected from the responses; that lookup is a plain dict
# access, so a name missing from the responses raises KeyError.
maps_str= """
Vertigo
Inferno
Nuke
Dust2
Mirage 
Ancient 
Overpass
"""

# Team names to produce predictions for, one per line (stripped and
# lower-cased further down). Names that do not occur in the collected
# responses are skipped silently.
# NOTE(review): 'VP.Pridigy' looks like a misspelling of 'VP.Prodigy' - if so
# the team is silently dropped; confirm against the response data.
teams_str=\
"""
Natus Vincere
Gambit
NIP
Vitality
G2
FaZe
Heroic
Astralis
Virtus.pro
OG
ENCE
BIG
Liquid
Movistar Riders
Copenhagen Flames
FURIA
mousesports
forZe
Spirit
Entropiq
Complexity
Sinners
Fiend
SKADE
GODSENT
fnatic
Lyngby Vikings
DBL PONEY
paiN
Dignitas
Bad News Bears
Evil Geniuses
TeamOne
Sharks
00Nation
Bravos
Havan Liberty
MIBR
FATE
eSuba
ECLOT
Entropiq Prague
OPAA
AaB
MASONIC
Tricked
AGF
Astralis Talent
HAVU
KOVA
hREDS
SJ
LDLC
Sprout
cowana
NLG
TTC
BIG Academy
AGO
Wisla Krakow
Anonymo
Izako Boars
HONORIS
PACT
sAw
SAW Youngsters
FTW
OFFSET
Nexus
ONYX
4glory
Enterprise
GamerLegion
Galaxy Racer
Apeks
AURA
Young Ninjas
Lilmix
Eternal Fire
Sangal
Endpoint
1WIN
K23
INDE IRAE
AVE
Singularity
NAVI Junior
Spirit Academy
VP.Pridigy
Trasko
EC Kyiv
B8
TyLoo
ViCi
Lynn Vision
Invictus
Checkmate
D13
Renegades
"""
#############################################################################################################

# Data preparation: collect responses, resolve the maps/teams of interest,
# build profiles, extract features for games not processed yet, assemble the
# feature table and the targets, and persist the final dataset.

time.sleep(1)
print('> collecting responses ...')
# collection of responses
L_COLLECTION = get_game_collection(PATH_TO_RESPONSES)

time.sleep(1)
print('> preparing team/map to use ...')
# id -> normalised (lower-cased, stripped) name, for maps and teams
d_map_id2name = {}
d_team_id2name = {}
for d_rsp in tqdm.tqdm(L_COLLECTION):
    try:
        d_map_id2name[d_rsp['map']['id']] = str.lower(d_rsp['map']['name']).strip()
        for t in d_rsp['teams']:
            d_team_id2name[t['id']] = str.lower(t['name']).strip()
    except (KeyError, IndexError, TypeError, AttributeError):
        # best effort: responses with missing / malformed map or team
        # records are skipped, anything else is a real error
        continue
d_map_name2id = {v: k for k, v in d_map_id2name.items()}
d_team_name2id = {v: k for k, v in d_team_id2name.items()}
# [1:-1] drops the empty strings produced by the leading/trailing newline
L_maps2use = [x.strip().lower() for x in maps_str.split('\n')][1:-1]
# every requested map must be present in the responses (KeyError otherwise)
L_map_id2use = [d_map_name2id[map_name] for map_name in L_maps2use]
L_teams2use = [x.strip().lower() for x in teams_str.split('\n')][1:-1]
# requested teams that never occur in the responses are skipped
L_team_id2use = [
    d_team_name2id[team_name]
    for team_name in L_teams2use
    if team_name in d_team_name2id
]

time.sleep(1)
print('> preparing team/player profiles ...')
# player and team profiles over the games
d_profile = get_profiles(L_COLLECTION)
df_player_profile, df_team_profile = d_profile['player'], d_profile['team']

time.sleep(1)
print('> extracting features for new games ...')
# game identifiers
L_GAME_IDXS = np.unique(df_player_profile['id'])[::-1]
# games already processed (feature file names start with the game id)
s_in = {int(x.split('.')[0]) for x in L_FILENAMES}
# all games
s_all = set(L_GAME_IDXS)
# new games
s_new = s_all - s_in
# identifiers of the new games
L_GAME_IDXS = list(s_new)
# feature extraction
get_features(df_player_profile, L_GAME_IDXS)

time.sleep(1)
print('> preparing features ...')
# Re-list the directory: the earlier listing was taken before extraction, so
# files for the new games would otherwise be left out of the dataset.
# NOTE(review): assumes get_features writes into PATH_TO_FEATURES - if it
# does not, this is a harmless no-op.
L_FILENAMES = os.listdir(PATH_TO_FEATURES)
# feature batches
L_BATCHES = np.array_split(L_FILENAMES, np.int32(np.ceil(len(L_FILENAMES)/BATCH_SIZE)))
# feature table: one memory-reduced frame per batch, concatenated once
# (DataFrame.append in a loop copies the whole table on every batch and is
# not available in pandas >= 2.0)
L_frames = []
for batch in tqdm.tqdm(L_BATCHES):
    L = []
    for fnm in batch:
        pth = os.path.join(PATH_TO_FEATURES, fnm)
        # NOTE: pickle.load can execute arbitrary code - PATH_TO_FEATURES must
        # only contain files produced by this pipeline
        with open(pth, 'rb') as f:
            L.append(pickle.load(f))
    L_frames.append(pd.DataFrame.from_records(L).apply(reduce_mem_usage))
    del L
df_features = pd.concat(L_frames) if L_frames else pd.DataFrame()
del L_frames
df_features.to_pickle('df_features.pickle')

time.sleep(1)
print('> preparing targets ...')
# games
L_GAMES2USE = np.unique(df_features['id'])
# table with the target variables
df_targets = get_targets(L_COLLECTION, L_GAMES2USE)

time.sleep(1)
print('> preparing dataset ...')
# dataset preparation
d_data = prepare_data(df_targets, df_features)
del df_features, df_targets
gc.collect()
X, Y = d_data['features'], d_data['targets']
del d_data
gc.collect()
X.to_pickle('X.pickle')
Y.to_pickle('Y.pickle')
#############################################################################################################

# dataset: reload from disk and fill gaps in the numeric columns with the
# -9999 sentinel; non-numeric columns are left untouched and placed first
X, Y = pd.read_pickle('X.pickle'), pd.read_pickle('Y.pickle')
X_num = X.select_dtypes('number').fillna(-9999)
X_cat = X.select_dtypes(exclude=['number'])
del X
gc.collect()
# axis passed by keyword: the positional form is rejected by pandas >= 2.0
X = pd.concat([X_cat, X_num], axis=1)
del X_cat, X_num
gc.collect()
#############################################################################################################

def _tune_catboost_params(X_tr, y_tr, X_ho, y_ho, n_trials):
    """Return CatBoost constructor parameters tuned for the given features.

    The iteration count is taken from a CONST_PARAMS model fitted with early
    stopping on the hold-out part; the remaining parameters come from
    CatBoostOptimizer, which is given ROC AUC as its scoring function.
    """
    params = CONST_PARAMS.copy()
    params['cat_features'] = np.where(X_tr.dtypes == 'category')[0]
    model = cb.CatBoostClassifier(**params)
    model.fit(X_tr, y_tr, eval_set=(X_ho, y_ho), early_stopping_rounds=50)
    params['iterations'] = model.best_iteration_
    params['verbose'] = 0

    cb_opt = CatBoostOptimizer(
        scoring_func=lambda y, y_proba: roc_auc_score(y, y_proba),
        const_params=params,
        seed=SEED,
        direction='maximize',
        n_trials=n_trials
    )
    cb_opt.fit(X_tr, y_tr)
    params.update(cb_opt.transform())
    params['verbose'] = 0
    params['random_state'] = SEED
    return params


def _nearest_percentile_bin(ser, bins):
    """Encode every value of ``ser`` as the index of its closest bin edge."""
    values = np.asarray(ser, dtype=np.float64)
    codes = np.abs(values[:, None] - bins).argmin(axis=1)
    return pd.Series(codes, index=ser.index).astype('category')


# run the pipeline for the target variables
for target_i, target_key in enumerate(Y.columns):

    time.sleep(1)
    print('> target#{}/{}. {}'.format(target_i+1, Y.shape[1], target_key))

    try:

        # target variable
        y = Y[target_key]

        # unshuffled split: the tail of the table is the test part, the tail
        # of the remainder is the hold-out part
        X_tr, X_te, y_tr, y_te = train_test_split(X, y, shuffle=False, test_size=TEST_SIZE)
        X_tr, X_ho, y_tr, y_ho = train_test_split(X_tr, y_tr, shuffle=False, test_size=HOLD_SIZE)
        assert (X_tr.index == y_tr.index).all()
        gc.collect()

        # stage 1: batched selection, repeated until all columns fit one batch
        # NOTE(review): nothing forces a pass to drop a column, so this loop
        # has no guaranteed termination.
        while True:

            n_rows, n_cols = X_tr.shape
            # columns per batch = BATCH_RATE * number of ROWS
            col_batch_size = np.int32(np.ceil(n_rows*BATCH_RATE))
            n_batches = np.int32(np.ceil(n_cols/col_batch_size))
            L_FEATURE_BATCHES = np.array_split(X_tr.columns, n_batches)

            if len(L_FEATURE_BATCHES) == 1:
                break

            # features with non-zero importance, collected over the batches
            L_feat2use = []
            for i, batch in enumerate(L_FEATURE_BATCHES[::-1]):

                print('> batch#{}/{}'.format(i+1, len(L_FEATURE_BATCHES)))

                x_tr_batch, x_ho_batch = X_tr[batch], X_ho[batch]

                params = CONST_PARAMS.copy()
                params['cat_features'] = np.where(x_tr_batch.dtypes == 'category')[0]
                params['thread_count'] = -1
                model = cb.CatBoostClassifier(**params)
                model.fit(x_tr_batch, y_tr, eval_set=(x_ho_batch, y_ho), early_stopping_rounds=50)

                mask = model.feature_importances_ > 0
                L_feat2use.extend(batch[mask].tolist())
                del x_tr_batch, x_ho_batch

            X_tr_c, X_ho_c = X_tr[L_feat2use], X_ho[L_feat2use]
            del X_tr, X_ho
            X_tr, X_ho = X_tr_c, X_ho_c
            del X_tr_c, X_ho_c
            gc.collect()

        # stage 2: recursive selection with early stopping - refit until
        # every remaining feature has non-zero importance
        i = 1
        while True:

            print('> iter#{}'.format(i))

            params = CONST_PARAMS.copy()
            params['cat_features'] = np.where(X_tr.dtypes == 'category')[0]
            model = cb.CatBoostClassifier(**params)
            model.fit(X_tr, y_tr, eval_set=(X_ho, y_ho), early_stopping_rounds=50)

            mask = model.feature_importances_ > 0
            if np.all(mask):
                break

            X_tr_c, X_ho_c = X_tr.loc[:, mask], X_ho.loc[:, mask]
            del X_tr, X_ho
            X_tr, X_ho = X_tr_c, X_ho_c
            del X_tr_c, X_ho_c
            i += 1

        # taken after the loop so it is defined (and current) even when the
        # very first pass keeps every feature
        best_features = X_tr.columns
        X_te = X_te[best_features]
        with open('best_features.pickle', 'wb') as f:
            pickle.dump(best_features, f)

        # feature engineering
        # 1. combinations of categorical features
        # 2. percentile binning of the remaining features

        # categories
        X_cat_tr = X_tr.select_dtypes('category').astype('object')
        cat_features = X_cat_tr.columns
        # everything else
        X_num_tr = X_tr.drop(columns=cat_features)
        num_features = X_num_tr.columns

        X_cat_ho = X_ho[cat_features].astype('object')
        X_num_ho = X_ho[num_features]
        X_cat_te = X_te[cat_features].astype('object')
        X_num_te = X_te[num_features]

        # 2-way and 3-way combinations of the original categorical features
        for n_way in tqdm.tqdm([2, 3]):
            for cmb in itertools.combinations(cat_features, n_way):
                cmb = list(cmb)
                new_key = '-'.join([str(x) for x in cmb])
                for X_cat_part in (X_cat_tr, X_cat_ho, X_cat_te):
                    X_cat_part[new_key] = X_cat_part[cmb].astype(str).apply(lambda x: '-'.join(x), axis=1)

        # binning: bin edges are percentiles of the TRAINING part only
        L_percentile = np.around(np.linspace(2.5, 97.5, 10), 1)
        for key in tqdm.tqdm(num_features):
            bins = np.percentile(X_num_tr[key], L_percentile)
            bin_key = f'{key}_bin'
            for X_num_part in (X_num_tr, X_num_ho, X_num_te):
                X_num_part[bin_key] = _nearest_percentile_bin(X_num_part[key], bins)

        X_tr = pd.concat([X_cat_tr.astype('category'), X_num_tr], axis=1)
        X_ho = pd.concat([X_cat_ho.astype('category'), X_num_ho], axis=1)
        X_te = pd.concat([X_cat_te.astype('category'), X_num_te], axis=1)
        del X_cat_tr, X_cat_ho, X_cat_te, X_num_tr, X_num_ho, X_num_te
        gc.collect()

        # stage 3: recursive selection with early stopping on the extended set
        i = 1
        while True:

            print('> iter#{}'.format(i))

            params = CONST_PARAMS.copy()
            params['cat_features'] = np.where(X_tr.dtypes == 'category')[0]
            model = cb.CatBoostClassifier(**params)
            model.fit(X_tr, y_tr, eval_set=(X_ho, y_ho), early_stopping_rounds=50)

            mask = model.feature_importances_ > 0
            if np.all(mask):
                break

            X_tr_c, X_ho_c = X_tr.loc[:, mask], X_ho.loc[:, mask]
            del X_tr, X_ho
            X_tr, X_ho = X_tr_c, X_ho_c
            del X_tr_c, X_ho_c
            i += 1

        # refreshed here: reusing the stage-2 list would strip the engineered
        # columns from the test part when stage 3 drops nothing
        best_features = X_tr.columns
        X_te = X_te[best_features]
        with open('best_features.pickle', 'wb') as f:
            pickle.dump(best_features, f)

        # hyper-parameter optimisation
        best_params = _tune_catboost_params(X_tr, y_tr, X_ho, y_ho, n_trials=30)

        # stage 4: selection by permutation importance
        i = 1
        while True:

            print('> permutation importance iter#{}. n_features = {}'.format(i, X_tr.shape[1]))

            # BEFORE: model on the current feature set
            params = best_params.copy()
            params['cat_features'] = np.where(X_tr.dtypes == 'category')[0]
            model = cb.CatBoostClassifier(**params)
            model.fit(X_tr, y_tr)
            ho_score_before = roc_auc_score(y_ho, model.predict_proba(X_ho)[:, 1])

            # importance averaged over 20 single-repeat runs with different seeds
            L_perm_imp = []
            for j in tqdm.tqdm(range(20)):
                d_perm_imp = permutation_importance(
                    model, X_ho, y_ho, scoring='roc_auc',
                    n_repeats=1, random_state=SEED+j, n_jobs=-1)
                L_perm_imp.append(d_perm_imp['importances_mean'].flatten())
            arr_perm_imp_mean = np.mean(L_perm_imp, axis=0)
            # features whose shuffling hurts the hold-out score
            idx_selected = np.where(arr_perm_imp_mean > 0)[0]

            # nothing to keep: stop with the current feature set instead of
            # failing on a zero-column fit
            if len(idx_selected) == 0:
                break

            # AFTER: model on the selected features only
            params_c = params.copy()
            params_c['cat_features'] = np.where(X_tr.iloc[:, idx_selected].dtypes == 'category')[0]
            model = cb.CatBoostClassifier(**params_c)
            model.fit(X_tr.iloc[:, idx_selected], y_tr)
            ho_score_after = roc_auc_score(y_ho, model.predict_proba(X_ho.iloc[:, idx_selected])[:, 1])

            print('\t> score before: {:.2f}, score after: {:.2f}'.format(ho_score_before, ho_score_after))
            # keep pruning only while the hold-out score improves
            if ho_score_after > ho_score_before:
                X_tr, X_ho, X_te = X_tr.iloc[:, idx_selected], X_ho.iloc[:, idx_selected], X_te.iloc[:, idx_selected]
                i += 1
            else:
                break

        # recorded after the loop so the saved list matches the final columns
        best_features = X_tr.columns
        with open('best_features.pickle', 'wb') as f:
            pickle.dump(best_features, f)

        # hyper-parameter optimisation on the final feature set
        best_params = _tune_catboost_params(X_tr, y_tr, X_ho, y_ho, n_trials=50)

        model = cb.CatBoostClassifier(**best_params)
        X_trho = pd.concat([X_tr, X_ho], axis=0)
        y_trho = y.loc[X_trho.index]

        with open('L_res.pickle', 'wb') as f:
            pickle.dump([X_trho, y_trho, X_te, y_te, model], f)

        # ensemble: from here on "train" means training + hold-out
        X_tr, y_tr = X_trho, y_trho
        del X_trho, y_trho
        gc.collect()
        params = model.get_params()

        # ensemble members differ only by random seed
        L_models = [
            cb.CatBoostClassifier(**dict(params, random_state=SEED+k))
            for k in range(20)
        ]

        # test-set probabilities, one column per model
        L_te_probas = []
        for model in tqdm.tqdm(L_models):
            model.fit(X_tr, y_tr)
            L_te_probas.append(model.predict_proba(X_te)[:, 1].flatten())
        y_te_probas = np.column_stack(L_te_probas)
        for k in range(y_te_probas.shape[1]):
            print('> model#{}. score = {}'.format(k+1, roc_auc_score(y_te, y_te_probas[:, k])))

        # blend every combination of 1..10 models, keep the best one
        # NOTE(review): the blend is chosen on the test part, so best_score
        # is an optimistic estimate.
        n_models = y_te_probas.shape[1]
        best_score, best_cmb = -np.inf, None
        for size in range(1, 11):
            for cmb in itertools.combinations(range(n_models), size):
                cmb = list(cmb)
                score = roc_auc_score(y_te, y_te_probas[:, cmb].mean(1))
                if score > best_score:
                    best_score, best_cmb = score, cmb
        ensemble = [L_models[k] for k in best_cmb]
        print('> ensemble. model count: {}, score = {}'.format(len(ensemble), best_score))

        # refit the chosen models on the whole dataset
        X_full = pd.concat([X_tr, X_te], axis=0)
        y_full = y.loc[X_full.index]
        del X_tr, X_te
        L_fitted_models = []
        for model in tqdm.tqdm(ensemble):
            model.fit(X_full, y_full)
            L_fitted_models.append(model)

        # Features copied from the CT-side row vs. from the T-side row.
        # Combination features whose name contains 'START_CT' (including
        # mixed CT/T ones) are all taken from the CT-side row.
        L_all_features = X_full.columns
        L_fs4ct = [f for f in L_all_features if 'START_CT' in f]
        L_fs4t = [f for f in L_all_features if 'START_CT' not in f]

        #############################################################################################################
        # predictions for every ordered pair of requested teams on every map
        df_data4model = X[['map_id', 'START_CT__team_id', 'START_T__team_id']].sort_index()

        L_answers = []
        for map_id in tqdm.tqdm(L_map_id2use):
            subdf = df_data4model.query('map_id==@map_id')

            # features of the last (by index) row of each team on each
            # starting side, looked up once per team instead of once per pair
            d_fs_ct, d_fs_t = {}, {}
            for team_id in L_team_id2use:
                subdf_ct = subdf.query('START_CT__team_id==@team_id')
                subdf_t = subdf.query('START_T__team_id==@team_id')
                if len(subdf_ct) != 0:
                    d_fs_ct[team_id] = X_full.loc[subdf_ct.index[-1], L_fs4ct].to_dict()
                if len(subdf_t) != 0:
                    d_fs_t[team_id] = X_full.loc[subdf_t.index[-1], L_fs4t].to_dict()

            for i in tqdm.tqdm(range(len(L_team_id2use))):
                for j in range(i+1, len(L_team_id2use)):
                    team1_id, team2_id = L_team_id2use[i], L_team_id2use[j]

                    # both side assignments of the pair
                    for ct_id, t_id in ((team1_id, team2_id), (team2_id, team1_id)):
                        if ct_id not in d_fs_ct or t_id not in d_fs_t:
                            continue
                        d_fs4gm = dict(d_fs_ct[ct_id])
                        d_fs4gm.update(d_fs_t[t_id])
                        x_new = pd.DataFrame.from_records([d_fs4gm])[L_all_features]
                        proba = np.mean([model.predict_proba(x_new)[0][1] for model in L_fitted_models])
                        L_answers.append({
                            'map': d_map_id2name[map_id],
                            'start_ct': d_team_id2name[ct_id],
                            'start_t': d_team_id2name[t_id],
                            'start_ct_win_proba': proba
                        })
        df_answers = pd.DataFrame.from_records(L_answers)
        del L_answers
        gc.collect()
        df_answers.to_csv(r'C:\Users\Sergey\anaconda3\Scripts\answers\df_answers_{}.txt'.format(target_key))

        del df_answers, X_full, y_full, L_fitted_models
        gc.collect()

        # NOTE(review): the loop stops after the first target that completes;
        # remove this break to process every column of Y.
        break
    except Exception as exc:
        # a failed target is reported and the next one is tried; unlike a
        # bare except, KeyboardInterrupt still stops the run
        print('> target {} failed: {!r}'.format(target_key, exc))


  0%|          | 12/57188 [00:00<07:59, 119.13it/s]

> collecting responses ...


100%|██████████| 57188/57188 [00:32<00:00, 1749.06it/s]
100%|██████████| 57187/57187 [00:00<00:00, 316796.14it/s]

> preparing team/map to use ...



  0%|          | 76/57187 [00:00<01:18, 725.75it/s]

> preparing team/player profiles ...


100%|██████████| 57187/57187 [04:11<00:00, 227.29it/s]
100%|██████████| 69256/69256 [02:53<00:00, 399.18it/s]
100%|██████████| 69256/69256 [01:42<00:00, 672.97it/s]
0it [00:00, ?it/s]

> extracting features for new games ...



  0%|          | 0/70 [00:00<?, ?it/s]

> preparing features ...


 24%|██▍       | 17/70 [09:51<32:39, 36.97s/it]