In [1]:
import os, tqdm, json, pickle, gc, zipfile, itertools, time
import pandas as pd
import numpy as np
from dateutil import parser
import warnings
warnings.filterwarnings('ignore')
from collections import defaultdict
from multiprocessing import Pool
import catboost as cb
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score, ParameterGrid, StratifiedKFold, cross_val_predict
from sklearn.metrics import roc_auc_score
from sklearn.inspection import permutation_importance
import optuna
from optuna.samplers import TPESampler
from tqdm.contrib.concurrent import process_map  
import seaborn as sns
import matplotlib.pyplot as plt
import shap 
from sklearn.model_selection import KFold
from nancorrmp.nancorrmp import NaNCorrMp
# from pathos.multiprocessing import ProcessingPool as Pool
import multiprocessing as mp
from datetime import datetime
from optuna.samplers import TPESampler

### 1. респонсы

In [2]:
PATH_TO_RESPONSES = 'L_games_collection'

L_FILENAMES = os.listdir(PATH_TO_RESPONSES)
L_RESPONSES = []
for fnm in tqdm.tqdm(L_FILENAMES):
    try:
        pth = os.path.join(PATH_TO_RESPONSES, fnm)
        with open(pth, 'r') as f:
            d_rsp = json.load(f)
        L_RESPONSES.append(d_rsp)
    except:
        pass
idx_ordered = np.argsort([d_game['id'] for d_game in L_RESPONSES])[::-1]
L_RESPONSES = np.array(L_RESPONSES)[idx_ordered].tolist()

100%|██████████| 57985/57985 [00:54<00:00, 1061.13it/s]


### 2. профайлинг игроков

In [3]:
def add_profile(d_game):

    def add_global_info(d_game):

        d = {}

        d['id'] = d_game['id'] 
        d['date'] = parser.parse(d_game['begin_at'])
        d['year'] = d['date'].year
        d['month'] = d['date'].month
        d['day'] = d['date'].day
        d['weekday'] = d['date'].weekday()
        d['hour'] = d['date'].hour
        d['number_of_games'] = d_game['match']['number_of_games']
        d['map_id'] = d_game['map']['id']
        d['league_id'] = d_game['match']['league']['id']
        d['serie_id'] = d_game['match']['serie']['id']
        d['tournament_id'] = d_game['match']['tournament']['id']
        d['serie_tier'] = d_game['match']['serie']['tier']

        return d
        
    # идентификаторы актуальных карт
    l_map2use = [1, 2, 6, 7, 8, 20, 31]
    # ключи со статистикой игрока
    l_stat_keys = ['adr', 'assists', 'deaths', 'first_kills_diff', 'flash_assists', 
                   'headshots', 'k_d_diff', 'kast', 'kills', 'rating']
    l_stat_keys_v2 = ['assists', 'deaths', 'flash_assists', 'headshots', 'kills']


    # информация об игре
    d_info = add_global_info(d_game)
    
    if d_info['map_id'] in l_map2use:  

        d_r1 = d_game['rounds'][0]
        if d_r1['round']==1:
            
            # информация о раундах
            df_rounds = pd.DataFrame.from_records(d_game['rounds'])
            start_ct_id =d_r1['ct']   
            winner_id = df_rounds['winner_team'].value_counts().idxmax()
            maxround = df_rounds['round'].max()
            d_h1_win_count = df_rounds.query('round<=15')['winner_team'].value_counts().to_dict()
            d_h2_win_count = df_rounds.query('round>15')['winner_team'].value_counts().to_dict()
            d_h1_outcome_count = df_rounds.query('round<=15')['outcome'].value_counts().to_dict()
            d_h2_outcome_count = df_rounds.query('round>15')['outcome'].value_counts().to_dict()        

            L = []
            counter = 0
            # информация об игроках
            for p in d_game['players']:
                counter+=1

                d = {}
                d.update(d_info)

                # идентификатор игрока
                d['player_id'] = p['player']['id']
                # идентификатор команды
                d['team_id'] = p['team']['id']
                # идентификатор оппонента
                d['opponent_id'] = p['opponent']['id']

                # национальность игрока
                d['player_nationality']  = p['player']['nationality']
                # дата рождения игрока
                d['player_birthday']  = p['player']['birthday']
                # страна команды
                d['team_location']  = p['team']['location']

                # сторона начала
                d['start_ct']= 1 if start_ct_id==d['team_id'] else 0
                # победа
                d['win'] = 1 if winner_id==d['team_id'] else 0
                # все раундов в игре
                d['maxround'] = maxround

                # число выигранных раундов в 1-ой половине игры
                try:
                    d['h1_win_count'] = d_h1_win_count[d['team_id']]
                except:
                    d['h1_win_count'] = 0 
                # число выигранных раундов во 2-ой половине игры
                try:
                    d['h2_win_count'] = d_h2_win_count[d['team_id']]
                except:
                    d['h2_win_count'] = 0 
                # исходы раундов в 1-ой половине игры
                for k, v in d_h1_outcome_count.items():
                    d[f'h1_outcome_{k}_count'] = v
                # исходы раундов во 2-ой половине игры
                for k, v in d_h2_outcome_count.items():
                    d[f'h2_outcome_{k}_count'] = v            

                # статистика игрока
                d.update({k:p[k] if pd.notnull(p[k]) else 0.0 for k in l_stat_keys})
                d.update({f'{k}_per_round':p[k]/maxround if pd.notnull(p[k]) else 0.0 for k in l_stat_keys_v2})

                L.append(d)
            if counter==10:
                return L
            else:
                return None
        else:
            return None

L_profile = []
for d_rsp in tqdm.tqdm(L_RESPONSES):
    try:
        L_profile.extend(add_profile(d_rsp))
    except:
        pass
df_profile = pd.DataFrame.from_records(L_profile)
del L_profile
gc.collect()
df_profile.to_pickle('df_profile.pickle')

df_profile.head()

100%|██████████| 57984/57984 [04:24<00:00, 218.93it/s] 


Unnamed: 0,id,date,year,month,day,weekday,hour,number_of_games,map_id,league_id,...,kast,kills,rating,assists_per_round,deaths_per_round,flash_assists_per_round,headshots_per_round,kills_per_round,h1_outcome_timeout_count,h2_outcome_timeout_count
0,72414,2022-05-08 00:10:00+00:00,2022,5,8,6,0,1,20,4243,...,63.0,19,1.0,0.074074,0.62963,0.037037,0.259259,0.703704,,
1,72414,2022-05-08 00:10:00+00:00,2022,5,8,6,0,1,20,4243,...,74.1,16,1.05,0.259259,0.703704,0.037037,0.185185,0.592593,,
2,72414,2022-05-08 00:10:00+00:00,2022,5,8,6,0,1,20,4243,...,74.1,27,1.52,0.148148,0.666667,0.0,0.518519,1.0,,
3,72414,2022-05-08 00:10:00+00:00,2022,5,8,6,0,1,20,4243,...,85.2,23,1.33,0.185185,0.555556,0.074074,0.333333,0.851852,,
4,72414,2022-05-08 00:10:00+00:00,2022,5,8,6,0,1,20,4243,...,63.0,23,1.14,0.148148,0.740741,0.074074,0.148148,0.851852,,


### 3. датасет
* признаки
* таргет (команда, начинающая игру за кт, выиграет)

In [42]:
def reduce_mem_usage(series):
    try:
        col_type = series.dtype

        if col_type != object:
            c_min = series.min()
            c_max = series.max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    series = series.astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    series = series.astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    series = series.astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    series = series.astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    series = series.astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    series = series.astype(np.float32)
                else:
                    series = series.astype(np.float64)
        else:
            pass 
    except:
        pass
    
    return series 

def add_features(df_history, L_AGG_KEYS, prefix):
    d = {}
    d.update(df_history[L_AGG_KEYS].add_prefix(prefix).add_suffix('__mean').mean().to_dict())
    d.update(df_history[L_AGG_KEYS].add_prefix(prefix).add_suffix('__sum').sum().to_dict())
    return d

L_AGG_KEYS = [
    
    'win', 'maxround',
    
    'h1_win_count', 'h2_win_count',
    'h1_outcome_eliminated_count', 'h1_outcome_exploded_count',
    'h1_outcome_defused_count', 'h2_outcome_eliminated_count',
    'h2_outcome_exploded_count', 'h2_outcome_defused_count',
    'h1_outcome_timeout_count','h2_outcome_timeout_count',

    'adr','assists', 'deaths', 'first_kills_diff',
    'flash_assists', 'headshots',
    'k_d_diff', 'kast', 'kills', 'rating',
    
    'assists_per_round',
    'deaths_per_round', 'flash_assists_per_round', 'headshots_per_round',
    'kills_per_round'
]

L_GLOBAL_KEYS = [
    'id', 'year', 'month', 'day', 'weekday', 'hour',
    'number_of_games', 'map_id', 'league_id', 'serie_id', 'tournament_id',
    'serie_tier'
]

L_GAME_IDXS = np.unique(df_profile['id'])

df_data = pd.DataFrame()
for game_id in tqdm.tqdm(L_GAME_IDXS[-1000:]):
    
    df_game = df_profile.query('id==@game_id')

    date =df_game['date'].iloc[0]
    tournament_id = df_game['tournament_id'].iloc[0]
    year, month, day, weekday, hour = df_game['year'].iloc[0], df_game['month'].iloc[0],\
                                      df_game['day'].iloc[0], df_game['weekday'].iloc[0], df_game['hour'].iloc[0]
    map_id = df_game['map_id'].iloc[0]
    d_team_id2start_ct = dict(zip(df_game['team_id'], df_game['start_ct']))
    d_team_id2opponent_id = dict(zip(df_game['team_id'],  df_game['opponent_id']))
    d_team_id2loc = dict(zip(df_game['team_id'],  df_game['team_location']))

    d_glb = {k:df_game[k].iloc[0] for k in L_GLOBAL_KEYS} 
    for team_id, start_ct in d_team_id2start_ct.items():        
        prefix = 'START_CT' if start_ct==1 else 'START_T'
        d_glb[f'{prefix}__team_id'] = team_id
        d_glb[f'{prefix}__team_location'] = d_team_id2loc[team_id]
        L_p_id = np.unique(df_game.query('team_id==@team_id')['player_id'])
        for perm_i, perm_l_p_id in enumerate(list(itertools.permutations(L_p_id))):
            for p_i, p_id in enumerate(perm_l_p_id):
                d_glb[f'{prefix}__player{p_i+1}_v{perm_i+1}_id'] = p_id

    dd = defaultdict(list)

    for team_id, start_ct in d_team_id2start_ct.items():
        
        prefix = 'START_CT' if start_ct==1 else 'START_T'

        opponent_id=d_team_id2opponent_id[team_id]
        df_history = df_profile.query('(date<@date)&(team_id==@team_id)')

        dd['all_map_all_start'].append(
            add_features(
                df_history,
                L_AGG_KEYS,                
                prefix=f'{prefix}__'
            )
        )
        dd['all_map_all_start__tournament'].append(
            add_features(
                df_history.query('tournament_id==@tournament_id'),
                L_AGG_KEYS,                
                prefix=f'{prefix}__'
            )
        )
        dd['all_map_all_start__year'].append(
            add_features(
                df_history.query('year==@year'),
                L_AGG_KEYS,                
                prefix=f'{prefix}__'
            )
        )
        dd['all_map_all_start__month'].append(
            add_features(
                df_history.query('month==@month'),
                L_AGG_KEYS,                
                prefix=f'{prefix}__'
            )
        )
        dd['all_map_all_start__day'].append(
            add_features(
                df_history.query('day==@day'),
                L_AGG_KEYS,                
                prefix=f'{prefix}__'
            )
        )
        dd['all_map_all_start__weekday'].append(
            add_features(
                df_history.query('weekday==@weekday'),
                L_AGG_KEYS,                
                prefix=f'{prefix}__'
            )
        )
        dd['all_map_all_start__hour'].append(
            add_features(
                df_history.query('hour==@hour'),
                L_AGG_KEYS,                
                prefix=f'{prefix}__'
            )
        )

        dd['current_map_current_start'].append(
            add_features(
                df_history.query('(map_id==@map_id)&(start_ct==@start_ct)'),
                L_AGG_KEYS,                
                prefix=f'{prefix}__'
            )
        )
        dd['current_map_current_start__year'].append(
            add_features(
                df_history.query('(map_id==@map_id)&(start_ct==@start_ct)&(year==@year)'),
                L_AGG_KEYS,                
                prefix=f'{prefix}__'
            )
        )
        dd['current_map_current_start__month'].append(
            add_features(
                df_history.query('(map_id==@map_id)&(start_ct==@start_ct)&(month==@month)'),
                L_AGG_KEYS,                
                prefix=f'{prefix}__'
            )
        )
        dd['current_map_current_start__day'].append(
            add_features(
                df_history.query('(map_id==@map_id)&(start_ct==@start_ct)&(day==@day)'),
                L_AGG_KEYS,                
                prefix=f'{prefix}__'
            )
        )
        dd['current_map_current_start__weekday'].append(
            add_features(
                df_history.query('(map_id==@map_id)&(start_ct==@start_ct)&(weekday==@weekday)'),
                L_AGG_KEYS,                
                prefix=f'{prefix}__'
            )
        )
        dd['current_map_current_start__hour'].append(
            add_features(
                df_history.query('(map_id==@map_id)&(start_ct==@start_ct)&(hour==@hour)'),
                L_AGG_KEYS,                
                prefix=f'{prefix}__'
            )
        )


        dd['pair'].append(
            add_features(
                df_history.query('opponent_id==@opponent_id'),
                L_AGG_KEYS,                
                prefix=f'{prefix}__'
            )
        )
        dd['pair__year'].append(
            add_features(
                df_history.query('(opponent_id==@opponent_id)&(year==@year)'),
                L_AGG_KEYS,                
                prefix=f'{prefix}__'
            )
        )
        dd['pair__month'].append(
            add_features(
                df_history.query('(opponent_id==@opponent_id)&(month==@month)'),
                L_AGG_KEYS,                
                prefix=f'{prefix}__'
            )
        )
        dd['pair__day'].append(
            add_features(
                df_history.query('(opponent_id==@opponent_id)&(day==@day)'),
                L_AGG_KEYS,                
                prefix=f'{prefix}__'
            )
        )
        dd['pair__weekday'].append(
            add_features(
                df_history.query('(opponent_id==@opponent_id)&(weekday==@weekday)'),
                L_AGG_KEYS,                
                prefix=f'{prefix}__'
            )
        )
        dd['pair__hour'].append(
            add_features(
                df_history.query('(opponent_id==@opponent_id)&(hour==@hour)'),
                L_AGG_KEYS,                
                prefix=f'{prefix}__'
            )
        )

        df_history = df_profile.query('date<@date')
        L_p_id = np.unique(df_game.query('team_id==@team_id')['player_id'])        
        
        for perm_i, perm_l_p_id in enumerate(list(itertools.permutations(L_p_id))[:10]):
            for p_i, p_id in enumerate(perm_l_p_id):  

                dd[f'player{p_i+1}_v{perm_i+1}__in_team'].append(
                    add_features(
                        df_history.query('(player_id==@p_id)&(team_id==@team_id)'),
                        L_AGG_KEYS,
                        prefix=f'{prefix}__'
                    )
                )
                
                
                dd[f'player{p_i+1}_v{perm_i+1}__not_in_team'].append(
                    add_features(
                        df_history.query('(player_id==@p_id)&(team_id!=@team_id)'),
                        L_AGG_KEYS,
                        prefix=f'{prefix}__'
                    )
                )
    l = []           
    for k, v in dd.items():
        d = d_glb.copy()
        d['feature_group'] = k
        for element in v:
            d.update(element)
        d['start_ct_win'] = (df_game.query('start_ct==1')['win']==1).all()
        l.append(d)
    del dd
    df = pd.DataFrame.from_records(l).apply(reduce_mem_usage)
    df_data = df_data.append(df)
    del df
df_data.to_pickle('df_data.pickle')

100%|██████████| 1000/1000 [48:02<00:00,  2.88s/it]


### 4. мл

In [86]:
TARGET_KEY = 'start_ct_win'
TEST_SIZE, VAL_SIZE = .1, .2

df_data = pd.read_pickle('df_data.pickle')
df_data = df_data.set_index('id').sort_index()
X, y = df_data.drop(TARGET_KEY, 1), df_data[TARGET_KEY].astype(np.int32)
del df_data
gc.collect()

L_CAT_FEATURES = [
    'year', 'month', 'day', 'weekday', 'hour', 'number_of_games', 'map_id',
    'league_id', 'serie_id', 'tournament_id', 'serie_tier',
    'START_T__team_id', 'START_T__team_location', 'START_CT__team_id',
    'START_CT__team_location', 'feature_group'
] + X.columns[X.columns.str.contains('_id')&X.columns.str.contains('player')].tolist()
L_NUM_FEATURES = X.drop(L_CAT_FEATURES, 1).columns

X[L_CAT_FEATURES] = X[L_CAT_FEATURES].fillna('default').astype('category')

In [96]:
unique_idxs = np.unique(X.index)
idx_split_trte = np.int32(np.ceil(len(unique_idxs)*.9))
idx_split_trval = np.int32(np.ceil(len(unique_idxs[:idx_split_trte])*.8))

X_tr, y_tr = X.loc[unique_idxs[:idx_split_trte]], y.loc[unique_idxs[:idx_split_trte]]
X_te, y_te = X.loc[unique_idxs[idx_split_trte:]], y.loc[unique_idxs[idx_split_trte:]]

X_tr, y_tr = X.loc[unique_idxs[:idx_split_trte][:idx_split_trval]], y.loc[unique_idxs[:idx_split_trte][:idx_split_trval]]
X_val, y_val = X.loc[unique_idxs[:idx_split_trte][idx_split_trval:]], y.loc[unique_idxs[:idx_split_trte][idx_split_trval:]]

In [97]:
print('train shape: {}'.format(X_tr.shape, y_tr.shape))
print('val shape: {}'.format(X_val.shape, y_val.shape))

train shape: (85680, 1324)
val shape: (21420, 1324)


In [98]:
for f in tqdm.tqdm(L_NUM_FEATURES):
    
    arr = X_tr[f].dropna().values
    bins = np.percentile(arr, np.linspace(2.5, 97.5, 10))
    
    ser_tr = X_tr[f].apply(lambda x:np.abs(x-bins).argmin())
    ser_tr.loc[X_tr[f].isna()]=-1
    X_tr[f'{f}_bin'] = ser_tr.values
    X_tr[f'{f}_bin'] = X_tr[f'{f}_bin'].astype('category')
    del ser_tr

    ser_val = X_val[f].apply(lambda x:np.abs(x-bins).argmin())
    ser_val.loc[X_val[f].isna()]=-1
    X_val[f'{f}_bin'] = ser_val.values
    X_val[f'{f}_bin'] = X_val[f'{f}_bin'].astype('category')
    del ser_val

    ser_te = X_te[f].apply(lambda x:np.abs(x-bins).argmin())
    ser_te.loc[X_te[f].isna()]=-1
    X_te[f'{f}_bin'] = ser_te.values
    X_te[f'{f}_bin'] = X_te[f'{f}_bin'].astype('category')
    del ser_te

100%|██████████| 108/108 [00:27<00:00,  3.99it/s]


In [99]:
X_tr.head()

Unnamed: 0_level_0,year,month,day,weekday,hour,number_of_games,map_id,league_id,serie_id,tournament_id,...,START_CT__headshots__sum_bin,START_CT__k_d_diff__sum_bin,START_CT__kast__sum_bin,START_CT__kills__sum_bin,START_CT__rating__sum_bin,START_CT__assists_per_round__sum_bin,START_CT__deaths_per_round__sum_bin,START_CT__flash_assists_per_round__sum_bin,START_CT__headshots_per_round__sum_bin,START_CT__kills_per_round__sum_bin
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
69752,2022,3,30,2,19,3,31,4763,4529,7842,...,9,0,9,9,9,9,9,9,9,9
69752,2022,3,30,2,19,3,31,4763,4529,7842,...,2,5,2,2,2,2,2,2,2,2
69752,2022,3,30,2,19,3,31,4763,4529,7842,...,8,0,8,8,8,8,8,8,8,8
69752,2022,3,30,2,19,3,31,4763,4529,7842,...,6,3,5,5,5,5,5,6,6,5
69752,2022,3,30,2,19,3,31,4763,4529,7842,...,5,4,5,5,5,5,5,5,5,5


In [100]:
CONST_PARAMS= {
            'iterations':1000,
            'loss_function':'Logloss',    
            'verbose':1,
            }
SEED=13
N_PERM_ITER = 20

In [101]:
X_tr, X_val = X_tr.select_dtypes('number'), X_val.select_dtypes('number')

In [102]:
while True:

    params = CONST_PARAMS.copy()
    params['cat_features'] = np.where(X_tr.dtypes=='category')[0]
    params['use_best_model'] = True
    params['random_state'] = SEED

    model = cb.CatBoostClassifier(**params)   
    model.fit(X_tr, y_tr,  eval_set=(X_val, y_val), early_stopping_rounds=100)

    mask = model.feature_importances_>0

    if np.all(mask):
        break
    else:
        X_tr, X_val = X_tr.loc[:, mask], X_val.loc[:, mask]

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

Learning rate set to 0.095133
0:	learn: 0.6877098	test: 0.6901487	best: 0.6901487 (0)	total: 37.3ms	remaining: 37.3s
1:	learn: 0.6834128	test: 0.6869668	best: 0.6869668 (1)	total: 69.1ms	remaining: 34.5s
2:	learn: 0.6784525	test: 0.6836396	best: 0.6836396 (2)	total: 103ms	remaining: 34.1s
3:	learn: 0.6752975	test: 0.6817695	best: 0.6817695 (3)	total: 134ms	remaining: 33.4s
4:	learn: 0.6715568	test: 0.6807438	best: 0.6807438 (4)	total: 163ms	remaining: 32.5s
5:	learn: 0.6687852	test: 0.6796889	best: 0.6796889 (5)	total: 193ms	remaining: 31.9s
6:	learn: 0.6663914	test: 0.6787724	best: 0.6787724 (6)	total: 222ms	remaining: 31.5s
7:	learn: 0.6641925	test: 0.6780101	best: 0.6780101 (7)	total: 258ms	remaining: 32s
8:	learn: 0.6619104	test: 0.6779974	best: 0.6779974 (8)	total: 289ms	remaining: 31.8s
9:	learn: 0.6595318	test: 0.6769250	best: 0.6769250 (9)	total: 320ms	remaining: 31.7s
10:	learn: 0.6572685	test: 0.6760552	best: 0.6760552 (10)	total: 353ms	remaining: 31.8s
11:	learn: 0.6545236	t

In [103]:
def objective(trial):

    param = {
        "objective": trial.suggest_categorical("objective", ["Logloss"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", .5, 1),
        "depth": trial.suggest_int("depth", 3, 11),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli"]
        )
    }

    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    param.update(CONST_PARAMS)
    param['cat_features'] = np.where(X_tr.dtypes=='category')[0]
    param['use_best_model'] = True
    params['random_state'] = SEED
    param['verbose']=0

    model = cb.CatBoostClassifier(**param)
    model.fit(X_tr, y_tr,  eval_set=(X_val, y_val), early_stopping_rounds=100)

    score = roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])    
    
    return score

In [104]:
while True:

    print('> n_features:{}'.format(X_tr.shape[1]))

    study = optuna.create_study(
                    sampler=TPESampler(),    
                    direction="maximize"
                    )
    study.optimize(objective, n_trials=100, timeout=60*5)

    best_params_before = {}
    best_params_before.update(CONST_PARAMS)
    best_params_before['cat_features'] = np.where(X_tr.dtypes=='category')[0]
    best_params_before['use_best_model'] = True
    best_params_before['random_state'] = SEED
    best_params_before['verbose'] = 0
    best_params_before.update(study.best_trial.params)

    model = cb.CatBoostClassifier(**best_params_before)
    model.fit(X_tr, y_tr,  eval_set=(X_val, y_val), early_stopping_rounds=100)

    ho_score_before = roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])

    z_perm_imp = np.zeros((X_tr.shape[1], ))
    for _ in tqdm.tqdm(range(N_PERM_ITER)):        
        
        imp = permutation_importance(
                model,
                X_val, y_val,
                scoring = 'roc_auc',
                n_repeats=1,
                random_state = SEED+_,
                n_jobs=-1
                )['importances_mean'].flatten()
                
        z_perm_imp += imp/N_PERM_ITER

    mask = z_perm_imp>0
    best_params_after = best_params_before.copy()
    best_params_after['cat_features'] = np.where(X_tr.loc[:, mask].dtypes=='category')[0]
    model = cb.CatBoostClassifier(**best_params_after)
    model.fit(X_tr.loc[:, mask], y_tr,  eval_set=(X_val.loc[:, mask], y_val), early_stopping_rounds=100)
    ho_score_after = roc_auc_score(y_val, model.predict_proba(X_val.loc[:, mask])[:, 1])

    if ho_score_after>ho_score_before:
        X_tr, X_val = X_tr.loc[:, mask], X_val.loc[:, mask]
        X_te = X_te[X_tr.columns] 
        best_score = roc_auc_score(y_te, model.predict_proba(X_te)[:, 1])  
        best_features = X_tr.columns
        best_params = best_params_after
        ser_feat_imp = pd.Series(dict(zip(X_tr.columns,model.feature_importances_))).sort_values(ascending = False)
        print('-----------------------------------------------------------------------')
    else:               
        break

[32m[I 2022-05-10 09:03:12,346][0m A new study created in memory with name: no-name-48402171-854b-4c8e-aea0-7e3b1477dc92[0m


> n_features:48


[32m[I 2022-05-10 09:03:25,023][0m Trial 0 finished with value: 0.6375738231916332 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.8809009434026049, 'depth': 4, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.4320735458600259}. Best is trial 0 with value: 0.6375738231916332.[0m
[32m[I 2022-05-10 09:03:32,996][0m Trial 1 finished with value: 0.6338086791810149 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.6057476768154604, 'depth': 8, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.653387771924264}. Best is trial 0 with value: 0.6375738231916332.[0m
[32m[I 2022-05-10 09:03:57,032][0m Trial 2 finished with value: 0.6028920007267158 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.7848525081422009, 'depth': 11, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.7117147517873406}. Best is trial 0 with value: 0.6375738231916332.[0m
[32m[I 2022-0

-----------------------------------------------------------------------
> n_features:35


[32m[I 2022-05-10 09:10:14,680][0m Trial 0 finished with value: 0.6544720499746065 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.8984163554521145, 'depth': 6, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 8.39461410929892}. Best is trial 0 with value: 0.6544720499746065.[0m
[32m[I 2022-05-10 09:10:20,058][0m Trial 1 finished with value: 0.6462387016361817 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.6497636660438206, 'depth': 7, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.029196343544931613}. Best is trial 0 with value: 0.6544720499746065.[0m
[32m[I 2022-05-10 09:10:44,979][0m Trial 2 finished with value: 0.6479318268921118 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.7876279234869972, 'depth': 7, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.6944363278006868}. Best is trial 0 with value: 0.6544720499746065.[0m
[32m[I 2022-0

-----------------------------------------------------------------------
> n_features:31


[32m[I 2022-05-10 09:19:22,834][0m Trial 0 finished with value: 0.629006496060453 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.6531884979441459, 'depth': 8, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.824733756946215}. Best is trial 0 with value: 0.629006496060453.[0m
[32m[I 2022-05-10 09:20:00,335][0m Trial 1 finished with value: 0.6466094097445936 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.9583352681991368, 'depth': 7, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 4.82778644231014}. Best is trial 1 with value: 0.6466094097445936.[0m
[32m[I 2022-05-10 09:20:06,423][0m Trial 2 finished with value: 0.6387255320995635 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.7251829219037496, 'depth': 8, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.6952456097732893}. Best is trial 1 with value: 0.6466094097445936.[0m
[32m[I 2022-05-10 

In [106]:
print('> score: {:.2f}'.format(best_score))
print('> features: {}'.format(best_features.tolist()))
print('> params: {}'.format(best_params))
print('---------------------------------------------------')
print('> feature importances:\n')
for i, (k,v) in enumerate(ser_feat_imp.items()):
    print(f'\t i:{i+1}, feature: {k}, importance: {v}')

> score: 0.58
> features: ['START_T__maxround__mean', 'START_T__h1_outcome_eliminated_count__mean', 'START_T__h1_outcome_defused_count__mean', 'START_T__h2_outcome_exploded_count__mean', 'START_T__h2_outcome_defused_count__mean', 'START_T__h1_outcome_timeout_count__mean', 'START_T__first_kills_diff__mean', 'START_T__win__sum', 'START_T__h2_outcome_eliminated_count__sum', 'START_T__h2_outcome_timeout_count__sum', 'START_T__kast__sum', 'START_CT__win__mean', 'START_CT__maxround__mean', 'START_CT__h1_win_count__mean', 'START_CT__h2_win_count__mean', 'START_CT__h1_outcome_exploded_count__mean', 'START_CT__h1_outcome_defused_count__mean', 'START_CT__h2_outcome_defused_count__mean', 'START_CT__h2_outcome_timeout_count__mean', 'START_CT__deaths__mean', 'START_CT__k_d_diff__mean', 'START_CT__kast__mean', 'START_CT__rating__mean', 'START_CT__deaths_per_round__mean', 'START_CT__flash_assists_per_round__mean', 'START_CT__win__sum', 'START_CT__h1_win_count__sum', 'START_CT__h1_outcome_exploded_cou