# ПРЕДСКАЗАНИЕ РЕЗУЛЬТАТОВ ТУРНИРНЫХ ИГР ПО CS:GO

In [8]:
import os, tqdm, json, pickle, gc, zipfile, itertools, time
import pandas as pd
import numpy as np
from dateutil import parser
import warnings
warnings.filterwarnings('ignore')
from collections import defaultdict
from multiprocessing import Pool
import catboost as cb
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score, ParameterGrid, StratifiedKFold, cross_val_predict
from sklearn.metrics import roc_auc_score
from sklearn.inspection import permutation_importance
import optuna
from optuna.samplers import TPESampler
from tqdm.contrib.concurrent import process_map  
import seaborn as sns
import matplotlib.pyplot as plt
import shap 
from sklearn.model_selection import KFold
from nancorrmp.nancorrmp import NaNCorrMp
# from pathos.multiprocessing import ProcessingPool as Pool
import multiprocessing as mp
from datetime import datetime
from optuna.samplers import TPESampler

from keras.models import Model, Sequential
from keras.layers import Input, Dense, Dropout, Activation
from keras.layers import Concatenate, Reshape, SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras import regularizers

In [2]:
def add_profile(d_game):
    """Flatten one raw game record into per-player feature rows.

    Args:
        d_game: Game record as a nested dict. The keys read here are ``id``,
            ``begin_at``, ``match``, ``map``, ``rounds`` and ``players``.

    Returns:
        A list of ten dicts (one per player). Each dict holds the game-level
        fields, the player/team identifiers, round statistics and the
        player's own statistics. ``None`` is returned when the game is
        skipped: its map is not in the list of maps in use, the round list
        is empty or does not start with round 1, or the record does not list
        exactly ten players.

    Raises:
        KeyError, TypeError, ValueError: Propagated unchanged when the record
            lacks an expected key, a nested section has the wrong type, or
            ``begin_at`` cannot be parsed.
    """

    def add_global_info(d_game):
        """Collect the game-level fields shared by every player row."""
        d = {}

        d['id'] = d_game['id']
        d['date'] = parser.parse(d_game['begin_at'])
        d['year'] = d['date'].year
        d['month'] = d['date'].month
        d['day'] = d['date'].day
        d['weekday'] = d['date'].weekday()
        d['hour'] = d['date'].hour
        d['number_of_games'] = d_game['match']['number_of_games']
        d['map_id'] = d_game['map']['id']
        d['league_id'] = d_game['match']['league']['id']
        d['serie_id'] = d_game['match']['serie']['id']
        d['tournament_id'] = d_game['match']['tournament']['id']
        d['serie_tier'] = d_game['match']['serie']['tier']

        return d

    # ids of the maps that are kept
    l_map2use = [1, 2, 6, 7, 8, 20, 31]
    # keys holding the player's statistics
    l_stat_keys = ['adr', 'assists', 'deaths', 'first_kills_diff', 'flash_assists',
                   'headshots', 'k_d_diff', 'kast', 'kills', 'rating']
    # statistics that are additionally normalised by the number of rounds
    l_stat_keys_v2 = ['assists', 'deaths', 'flash_assists', 'headshots', 'kills']
    # last round of the first half, and the number of players a usable game has
    last_h1_round = 15
    n_players_required = 10

    # game-level information
    d_info = add_global_info(d_game)
    if d_info['map_id'] not in l_map2use:
        return None

    # Only games whose round list starts with round 1 are used; an empty
    # round list is skipped as well instead of raising IndexError.
    l_rounds = d_game['rounds']
    if not l_rounds or l_rounds[0]['round'] != 1:
        return None

    # round-level information
    df_rounds = pd.DataFrame.from_records(l_rounds)
    start_ct_id = l_rounds[0]['ct']
    winner_id = df_rounds['winner_team'].value_counts().idxmax()
    maxround = df_rounds['round'].max()
    df_h1 = df_rounds[df_rounds['round'] <= last_h1_round]
    df_h2 = df_rounds[df_rounds['round'] > last_h1_round]
    d_h1_win_count = df_h1['winner_team'].value_counts().to_dict()
    d_h2_win_count = df_h2['winner_team'].value_counts().to_dict()
    # NOTE(review): the outcome counts are computed over all rounds of the
    # game, so both teams receive identical values - confirm this is intended.
    d_h1_outcome_count = df_h1['outcome'].value_counts().to_dict()
    d_h2_outcome_count = df_h2['outcome'].value_counts().to_dict()

    rows = []
    # player-level information
    for p in d_game['players']:
        d = dict(d_info)
        team_id = p['team']['id']

        # player, team and opponent identifiers
        d['player_id'] = p['player']['id']
        d['team_id'] = team_id
        d['opponent_id'] = p['opponent']['id']

        # player nationality, hometown and birthday
        d['player_nationality'] = p['player']['nationality']
        d['player_hometown'] = p['player']['hometown']
        d['player_birthday'] = p['player']['birthday']
        # team country
        d['team_location'] = p['team']['location']

        # 1 when the player's team is the `ct` side of the first round
        d['start_ct'] = 1 if start_ct_id == team_id else 0
        # 1 when the player's team won the most rounds
        d['win'] = 1 if winner_id == team_id else 0
        # highest round number in the game
        d['maxround'] = maxround

        # rounds won per half; a team absent from the counts won none
        d['h1_win_count'] = d_h1_win_count.get(team_id, 0)
        d['h2_win_count'] = d_h2_win_count.get(team_id, 0)
        # round outcomes per half
        for k, v in d_h1_outcome_count.items():
            d[f'h1_outcome_{k}_count'] = v
        for k, v in d_h2_outcome_count.items():
            d[f'h2_outcome_{k}_count'] = v

        # player statistics; missing values are replaced by 0.0
        d.update({k: p[k] if pd.notnull(p[k]) else 0.0 for k in l_stat_keys})
        d.update({f'{k}_per_round': p[k] / maxround if pd.notnull(p[k]) else 0.0
                  for k in l_stat_keys_v2})

        rows.append(d)

    if len(rows) != n_players_required:
        return None
    return rows

In [3]:
# Directory with the saved game responses; every file in it is loaded as JSON.
PATH_TO_RESPONSES = 'L_games_collection'

In [4]:
# Load every saved response. Files that cannot be opened or parsed are
# skipped and counted instead of being dropped silently.
L_FILENAMES = os.listdir(PATH_TO_RESPONSES)
L_RESPONSES = []
n_unreadable = 0
for fnm in tqdm.tqdm(L_FILENAMES):
    pth = os.path.join(PATH_TO_RESPONSES, fnm)
    try:
        with open(pth, 'r', encoding='utf-8') as f:
            d_rsp = json.load(f)
    except (OSError, ValueError):
        # OSError: the path cannot be opened (e.g. it is a directory).
        # ValueError: covers json.JSONDecodeError and UnicodeDecodeError.
        n_unreadable += 1
        continue
    L_RESPONSES.append(d_rsp)
if n_unreadable:
    print(f'skipped {n_unreadable} unreadable files')

# Order the games by descending id.
# NOTE(review): assumes game ids are unique; with duplicate ids the relative
# order of the duplicates may differ from the previous argsort-based ordering.
L_RESPONSES.sort(key=lambda d_game: d_game['id'], reverse=True)

100%|██████████| 57985/57985 [00:20<00:00, 2875.50it/s]


In [6]:
# Columns that describe the game as a whole (copied once per game).
L_GLOBAL_KEYS = [
    'id',
    'date',
    'timestamp',
    'year',
    'month',
    'day',
    'weekday',
    'hour',
    'number_of_games',
    'map_id',
    'league_id',
    'serie_id',
    'tournament_id',
    'serie_tier',
]

# Columns copied for every player (prefixed with `player<N>__` later on).
L_PLAYER_KEYS = [
    'id', 'nationality', 'hometown', 'age',
    'adr', 'assists', 'deaths', 'first_kills_diff', 'flash_assists',
    'headshots', 'k_d_diff', 'kast', 'kills', 'rating',
] + [
    f'{stat}_per_round'
    for stat in ('assists', 'deaths', 'flash_assists', 'headshots', 'kills')
]

# Columns copied for every team (prefixed with `team<N>__` later on).
L_TEAM_KEYS = [
    'id', 'location',
    'start_ct',
    'win', 'maxround',
    'h1_win_count', 'h2_win_count',
    'h1_outcome_eliminated_count',
    'h1_outcome_exploded_count',
    'h1_outcome_defused_count',
    'h2_outcome_eliminated_count',
    'h2_outcome_exploded_count',
    'h2_outcome_defused_count',
    'h1_outcome_timeout_count',
    'h2_outcome_timeout_count',
]

# Turn every response into per-player rows. Records that fail to parse are
# skipped on purpose (best effort), but they are counted, and - unlike with a
# bare `except` - KeyboardInterrupt still stops the loop.
L_profile = []
n_failed = 0
for d_rsp in tqdm.tqdm(L_RESPONSES):
    try:
        l_rows4game = add_profile(d_rsp)
    except Exception:
        n_failed += 1
        continue
    # add_profile returns None for games it filters out
    if l_rows4game is not None:
        L_profile.extend(l_rows4game)
if n_failed:
    print(f'skipped {n_failed} responses that could not be profiled')

df_profile = pd.DataFrame.from_records(L_profile)
df_profile['timestamp'] = df_profile['date'].apply(lambda x: x.timestamp())
# pd.to_datetime replaces astype('datetime64'): the unit-less dtype string is
# rejected by pandas 2.
df_profile['player_birthday_year'] = pd.to_datetime(df_profile['player_birthday']).dt.year
df_profile['player_age'] = df_profile['year'] - df_profile['player_birthday_year']
del L_profile
gc.collect()

df_profile = df_profile.rename(columns={
    'player_nationality': 'nationality',
    'player_hometown': 'hometown',
    'player_age': 'age',
    'team_location': 'location',
})

# Keep only the games that have exactly ten distinct players.
ser_nunique_players4game = df_profile.groupby('id')['player_id'].nunique()
df_profile = df_profile[df_profile['id'].isin(ser_nunique_players4game[ser_nunique_players4game == 10].index)]

assert (df_profile.groupby('id')['player_id'].nunique() == 10).all()

# Pivot to one row per game: global fields, then player1__..player10__ and
# team1__/team2__ fields.
L_rows = []
for game_id, subdf in tqdm.tqdm(df_profile.groupby('id')):
    d = subdf[L_GLOBAL_KEYS].iloc[0].to_dict()
    # Sorting by team id keeps each team's players in adjacent slots.
    # The game `id` column is dropped so the renamed player/team id can take its place.
    subdf = subdf.sort_values('team_id').drop(columns='id')

    # Rename and select once per game instead of once per player.
    df_players = subdf.rename(columns={'player_id': 'id'})[L_PLAYER_KEYS]
    for i in range(len(df_players)):
        d.update(df_players.iloc[i].add_prefix(f'player{i + 1}__').to_dict())

    # groupby iterates the teams in ascending team id order.
    for i, (team_id, subsubdf) in enumerate(subdf.groupby('team_id'), start=1):
        ser_team = subsubdf.rename(columns={'team_id': 'id'})[L_TEAM_KEYS].iloc[0]
        d.update(ser_team.add_prefix(f'team{i}__').to_dict())

    L_rows.append(d)
df_profile = pd.DataFrame.from_records(L_rows)
del L_rows
gc.collect()

df_profile.to_pickle('df_profile.pickle')

100%|██████████| 34975/34975 [08:55<00:00, 65.31it/s]


### описание датасета

|ключ|описание|
|---|---|
|id|идентификатор игры|
|date|дата|
|year|год|
|month|месяц|
|day|день|
|weekday|день недели|
|hour|час|
|number_of_games|число игр в матче|
|map_id|идентификатор карты|
|league_id|идентификатор лиги|
|serie_id|идентификатор серии|
|tournament_id|идентификатор турнира|
|serie_tier|ранг серии|
|player_id|идентификатор игрока|
|team_id|идентификатор команды игрока|
|opponent_id|идентификатор противоположной команды|
|player_nationality|национальность игрока|
|player_birthday|дата рождения игрока|
|team_location|страна команды|
|start_ct|старт игры за контр-террористов|
|win|победа|
|maxround|число раундов в игре|
|h1_win_count|число выигранных раундов в 1-ой половине игры|
|h2_win_count|число выигранных раундов во 2-ой половине игры|
|h1_outcome_eliminated_count|число полных убийств команды соперника в 1-ой половине игры|
|h1_outcome_exploded_count|число взорвавшихся бомб в 1-ой половине игры|
|h1_outcome_defused_count|число обезвреженных бомб в 1-ой половине игры|
|h2_outcome_eliminated_count|...|
|h2_outcome_exploded_count|...|
|h2_outcome_defused_count|...|
|h1_outcome_timeout_count|число раундов с истекшим временем  в 1-ой половине игры|
|h2_outcome_timeout_count|...|
|adr|средний урон в раунде|
|assists|число помощей в убийствах|
|deaths|число смертей|
|first_kills_diff|разница во времени между первыми убийствами команд|
|flash_assists|число световых гранат, после которых были совершены убийства|
|headshots|число убийств в голову|
|k_d_diff|разница между числом убийств и числом смертей|
|kast|% раундов, в которых игрок имел убийство, ассист, выжил или был разменен|
|kills|число убийств|
|rating|рейтинг|
|assists_per_round|число ассистов в расчете на 1 раунд|
|deaths_per_round|...|
|flash_assists_per_round|...|
|headshots_per_round|...|
|kills_per_round|...|

In [4]:
# Reload the game-level table from the pickle file written by the preprocessing cell.
df_profile = pd.read_pickle('df_profile.pickle')

In [9]:
# Preview the first rows of the game-level table.
df_profile.head()

Unnamed: 0,id,date,timestamp,year,month,day,weekday,hour,number_of_games,map_id,...,team2__h1_win_count,team2__h2_win_count,team2__h1_outcome_eliminated_count,team2__h1_outcome_exploded_count,team2__h1_outcome_defused_count,team2__h2_outcome_eliminated_count,team2__h2_outcome_exploded_count,team2__h2_outcome_defused_count,team2__h1_outcome_timeout_count,team2__h2_outcome_timeout_count
0,1,2017-10-29 13:55:00+00:00,1509285000.0,2017,10,29,6,13,5,1,...,10,6,11,1.0,1.0,8.0,2.0,1.0,2.0,
1,2,2017-10-29 14:47:00+00:00,1509288000.0,2017,10,29,6,14,5,2,...,6,0,10,2.0,2.0,5.0,2.0,,1.0,
2,6,2017-10-29 09:30:00+00:00,1509269000.0,2017,10,29,6,9,3,6,...,5,9,11,3.0,1.0,11.0,2.0,1.0,,1.0
3,7,2017-10-29 10:30:00+00:00,1509273000.0,2017,10,29,6,10,3,1,...,10,6,13,,1.0,11.0,2.0,2.0,1.0,
4,8,2017-10-29 11:30:00+00:00,1509277000.0,2017,10,29,6,11,3,2,...,5,11,12,2.0,1.0,8.0,4.0,,,


In [81]:
# Id columns of the ten player slots and of the two teams.
L_p_id_keys = [f'player{slot}__id' for slot in range(1, 11)]
L_t_id_keys = [f'team{slot}__id' for slot in (1, 2)]

# Categorical features: player ids first, then team ids.
cat_features = [*L_p_id_keys, *L_t_id_keys]

In [82]:
# del df_to_embed
# gc.collect()

In [83]:
# Id columns as strings; missing ids become the literal 'default'.
df_to_embed = df_profile[cat_features].fillna('default').astype('str')

# Pairwise and triple combinations of the id columns, joined with '-'.
# The columns are built with the vectorised Series.str.cat and attached in
# one concat: a row-wise apply is far slower, and inserting hundreds of
# columns one at a time fragments the frame.
d_cmb_columns = {}
# combination length
for n in tqdm.tqdm_notebook([2, 3]):
    # combinations of the base id columns
    L_cmb = list(itertools.combinations(cat_features, n))
    for cmb in tqdm.tqdm_notebook(L_cmb, total=len(L_cmb)):
        first_key, *other_keys = cmb
        new_key = '-'.join(cmb)
        d_cmb_columns[new_key] = df_to_embed[first_key].str.cat(
            [df_to_embed[k] for k in other_keys], sep='-'
        )
df_to_embed = pd.concat([df_to_embed, pd.DataFrame(d_cmb_columns)], axis=1)
cat_features = df_to_embed.columns

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

In [84]:
# Preview the first rows of the combined categorical features.
df_to_embed.head()

Unnamed: 0,player1__id,player2__id,player3__id,player4__id,player5__id,player6__id,player7__id,player8__id,player9__id,player10__id,...,player8__id-player9__id-player10__id,player8__id-player9__id-team1__id,player8__id-player9__id-team2__id,player8__id-player10__id-team1__id,player8__id-player10__id-team2__id,player8__id-team1__id-team2__id,player9__id-player10__id-team1__id,player9__id-player10__id-team2__id,player9__id-team1__id-team2__id,player10__id-team1__id-team2__id
0,17497,17498,17499,17500,17501,17502,17503,17504,17505,17506,...,17504-17505-17506,17504-17505-3207,17504-17505-3288,17504-17506-3207,17504-17506-3288,17504-3207-3288,17505-17506-3207,17505-17506-3288,17505-3207-3288,17506-3207-3288
1,17497,17498,17499,17500,17501,17502,17503,17504,17505,17506,...,17504-17505-17506,17504-17505-3207,17504-17505-3288,17504-17506-3207,17504-17506-3288,17504-3207-3288,17505-17506-3207,17505-17506-3288,17505-3207-3288,17506-3207-3288
2,17507,17508,17509,17510,17511,17512,17513,17514,17515,17516,...,17514-17515-17516,17514-17515-3209,17514-17515-3210,17514-17516-3209,17514-17516-3210,17514-3209-3210,17515-17516-3209,17515-17516-3210,17515-3209-3210,17516-3209-3210
3,17507,17508,17509,17510,17511,17512,17513,17514,17515,17516,...,17514-17515-17516,17514-17515-3209,17514-17515-3210,17514-17516-3209,17514-17516-3210,17514-3209-3210,17515-17516-3209,17515-17516-3210,17515-3209-3210,17516-3209-3210
4,17507,17508,17509,17510,17511,17512,17513,17514,17515,17516,...,17514-17515-17516,17514-17515-3209,17514-17515-3210,17514-17516-3209,17514-17516-3210,17514-3209-3210,17515-17516-3209,17515-17516-3210,17515-3209-3210,17516-3209-3210


In [87]:
class KerasEmbeddingTransformer():
    """Create one Keras input plus embedding branch per column of a frame.

    The embedding width of a column is half of its cardinality, limited to
    the range ``[min_n_dim, max_n_dim]``.
    """

    def __init__(self, min_n_dim, max_n_dim):
        """Store the lower and upper bound of the embedding width."""
        self.min_n_dim = min_n_dim
        self.max_n_dim = max_n_dim

    def fit(self, df_to_embed):
        """Build the input and embedding layers for every column.

        Args:
            df_to_embed: Frame whose columns are the features to embed.

        Returns:
            ``self``, with ``inputs`` and ``embeddings`` holding one entry
            per column, in column order.
        """
        # input layers, one per column
        self.inputs = []
        # flattened embedding outputs, one per column
        self.embeddings = []

        for col in tqdm.tqdm_notebook(df_to_embed.columns):
            # number of distinct values plus two spare slots
            n_unique = df_to_embed[col].nunique()
            cardinality = int(np.ceil(n_unique + 2))

            # half of the cardinality, limited to [min_n_dim, max_n_dim]
            embedding_dim = cardinality // 2
            if embedding_dim > self.max_n_dim:
                embedding_dim = self.max_n_dim
            if embedding_dim < self.min_n_dim:
                embedding_dim = self.min_n_dim
            print(f'{col}: cardinality : {cardinality} and embedding dim: {embedding_dim}')

            col_inputs = Input(shape=(1,))
            # embedding lookup for this column
            embed_layer = Embedding(cardinality, embedding_dim,
                                    input_length=1, name=col+"__embed")
            embedded = embed_layer(col_inputs)
            # regularise with a 10% spatial dropout
            dropped = SpatialDropout1D(0.1)(embedded)
            # flatten to a vector of length embedding_dim
            flat = Reshape(target_shape=(embedding_dim,))(dropped)

            self.inputs.append(col_inputs)
            self.embeddings.append(flat)

        return self

    def transform(self):
        """Return the ``(inputs, embeddings)`` lists built by ``fit``."""
        return self.inputs, self.embeddings

# Build the per-column inputs and embeddings (fit returns the transformer itself).
embed_transformer = KerasEmbeddingTransformer(min_n_dim=2, max_n_dim=50).fit(df_to_embed)
emb_inputs, emb_layer = embed_transformer.transform()

  0%|          | 0/298 [00:00<?, ?it/s]

player1__id: cardinality : 1029 and embedding dim: 50
player2__id: cardinality : 1284 and embedding dim: 50
player3__id: cardinality : 1412 and embedding dim: 50
player4__id: cardinality : 1453 and embedding dim: 50
player5__id: cardinality : 1490 and embedding dim: 50
player6__id: cardinality : 1276 and embedding dim: 50
player7__id: cardinality : 1603 and embedding dim: 50
player8__id: cardinality : 1755 and embedding dim: 50
player9__id: cardinality : 1859 and embedding dim: 50
player10__id: cardinality : 1890 and embedding dim: 50
team1__id: cardinality : 796 and embedding dim: 50
team2__id: cardinality : 1259 and embedding dim: 50
player1__id-player2__id: cardinality : 1846 and embedding dim: 50
player1__id-player3__id: cardinality : 2106 and embedding dim: 50
player1__id-player4__id: cardinality : 2232 and embedding dim: 50
player1__id-player5__id: cardinality : 2264 and embedding dim: 50
player1__id-player6__id: cardinality : 11664 and embedding dim: 50
player1__id-player7__id: 