In [1]:
import os, tqdm, json, pickle, gc, zipfile, itertools, time, collections
import pandas as pd
import numpy as np
from dateutil import parser
import warnings
warnings.filterwarnings('ignore')
from collections import defaultdict
from datetime import datetime
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Reshape, Dot, Concatenate, Dense, Dropout, BatchNormalization, LSTM, Flatten, Dot
from tensorflow.keras.optimizers import Adam, Adagrad, SGD
from tensorflow.python.keras import regularizers
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

class ResponsesTransformer():

    def __init__(self,fill_value_str, fill_value_num):
        self.FILL_VALUE_STR = fill_value_str
        self.FILL_VALUE_NUM = fill_value_num        

    def add_game_info(self, d_rsp):  

        d = {}         
        
        date = parser.parse(d_rsp['begin_at'])
        d['timestamp'] = date.timestamp()
        d['year'] = date.year
        d['month'] = date.month
        d['day'] = date.day
        d['weekday'] = date.weekday()
        d['hour'] = date.hour

        d['map_id'] = d_rsp['map']['id']

        d['league_id'] = d_rsp['match']['league_id']
        d['serie_id'] = d_rsp['match']['serie_id']
        d['tournament_id'] = d_rsp['match']['tournament_id']
        d['serie_tier'] = d_rsp['match']['serie']['tier']
        d['tournament_name'] = str.lower(d_rsp['match']['tournament']['name'])
        pp = d_rsp['match']['tournament']['prizepool']
        try:
            pp_size = np.log1p(int(pp.split(' ')[0]))
            pp_curr = str.lower(''.join(pp.split(' ')[1:]))
        except:
            pp_size = self.FILL_VALUE_NUM
            pp_curr = self.FILL_VALUE_STR 
        d['tournament_pp_size'] = pp_size
        d['tournament_pp_curr'] = pp_curr

        return d

    def get_team_round_info(self, d_rsp):

        df_rounds = pd.DataFrame.from_records(d_rsp['rounds'])
        df_rounds['outcome'].fillna(self.FILL_VALUE_STR, inplace= True)

        r_min, r_max=df_rounds['round'].min(), df_rounds['round'].max()

        if (r_min==1)&(r_max>=16):
            
            d= {}

            d['d_team_prefix2id'] = {
                'team1':df_rounds.query('round==1')['ct'].iloc[0],
                'team2':df_rounds.query('round==1')['terrorists'].iloc[0]
                }

            d['d_r_outcome'] = dict(zip(df_rounds['round'],df_rounds['outcome']))
            d['d_r_winner'] = dict(zip(df_rounds['round'],df_rounds['winner_team']))
            d['maxround'] = df_rounds['round'].max()
            d['winner_id'] = df_rounds['winner_team'].value_counts().idxmax()
            d['d_h1_win_count'] = df_rounds.query('round<=15')['winner_team'].value_counts().to_dict()
            d['d_h2_win_count'] = df_rounds.query('round>15')['winner_team'].value_counts().to_dict()
            d['d_h1_outcome_count'] = df_rounds.query('round<=15')['outcome'].value_counts().to_dict()
            d['d_h2_outcome_count'] = df_rounds.query('round>15')['outcome'].value_counts().to_dict()

            return d
    
    def get_player_stat_with_info(self, d_rsp):

        L_STAT_KEYS = [
            'adr', 'assists', 'deaths', 'first_kills_diff', 'flash_assists', 
            'headshots', 'k_d_diff', 'kast', 'kills', 'rating'
        ]

        L = []
        for p in d_rsp['players']:
            
            d = {}

            d['id'] = p['player']['id']
            
            if pd.notnull(p['player']['birth_year']):
                d['age'] = parser.parse(d_rsp['begin_at']).year - int(p['player']['birth_year'])
            else:
                d['age'] = self.FILL_VALUE_NUM

            d['hometown'] = str.lower(p['player']['hometown']).strip() if pd.notnull(p['player']['hometown']) else self.FILL_VALUE_STR
            d['nationality'] = str.lower(p['player']['nationality']).strip() if pd.notnull(p['player']['nationality']) else self.FILL_VALUE_STR

            d['team_id'] = p['team']['id']
            d['team_location'] = str.lower(p['team']['location']).strip() if pd.notnull(p['team']['location']) else self.FILL_VALUE_STR

            for key in L_STAT_KEYS:
                value = p[key]
                d[key]=value if pd.notnull(value) else 0.0

            L.append(d)

        df = pd.DataFrame.from_records(L)
        del L
        

        return df   
    def prepare_round_and_stat(self, d_team_round_info, df_player_info):
                
        d_team_prefix2id = d_team_round_info['d_team_prefix2id']
        d_team_id2prefix = {v:k for k, v in d_team_prefix2id.items()}

        df_player_info['team_prefix'] = df_player_info['team_id'].map(d_team_id2prefix)

        L = []
        for team_prefix, subdf in df_player_info.groupby('team_prefix'):
            subdf = subdf.sort_values('id')
            if len(subdf)==5:
                if team_prefix =='team1':
                    subdf['player_prefix'] = ['player1', 'player2', 'player3', 'player4', 'player5']
                else:
                    subdf['player_prefix'] = ['player6', 'player7', 'player8', 'player9', 'player10']
                L.append(subdf)

        df_stat = pd.concat(L)
        del L
        

        return df_stat 

    def build_data4game(self, d, d_team_round_info, df_round_and_stat):

        d_team_prefix2id = d_team_round_info['d_team_prefix2id']

        for k, v in d_team_prefix2id.items():
            d[f'{k}_id'] = v 

        d['team1_win'] = int(d_team_round_info['winner_id'] == d['team1_id'])
        d['team1_maxround'] = d_team_round_info['maxround']
        d['team1_h1_win_count'] = d_team_round_info['d_h1_win_count'][d['team1_id']]
        d['team1_h2_win_count'] = d_team_round_info['d_h2_win_count'][d['team1_id']]
        for k, v in d_team_round_info['d_h1_outcome_count'].items():
            d[f'team1_h1_{k}_count'] = v
        for k, v in d_team_round_info['d_h2_outcome_count'].items():
            d[f'team1_h2_{k}_count'] = v

        for k, v in d_team_round_info['d_r_outcome'].items():
            d[f'team1_r{k}_outcome'] = v
        for k, v in d_team_round_info['d_r_winner'].items():
            d[f'team1_r{k}_win'] = int(v==d['team1_id'])

        team1_id = d["team1_id"]
        d['team1_location'] = df_round_and_stat.query('team_id==@team1_id')['team_location'].iloc[0]
        d['team2_location'] = df_round_and_stat.query('team_id!=@team1_id')['team_location'].iloc[0]

        for i in range(len(df_round_and_stat)):
            row = df_round_and_stat.iloc[i]
            player_prefix = row['player_prefix']
            row = row.drop(['team_id', 'team_location', 'team_prefix', 'player_prefix'])
            for k, v in row.items():
                d[f'{player_prefix}_{k}'] = v

        return d

    def scale(self, x, out_range=(-1, 1)):
        domain = np.min(x), np.max(x)
        y = (x - (domain[1] + domain[0]) / 2) / (domain[1] - domain[0])
        return y * (out_range[1] - out_range[0]) + (out_range[1] + out_range[0]) / 2

    def get_responses(self, PATH_TO_RESPONSES):
        L_FILENAMES = os.listdir(PATH_TO_RESPONSES)
        L_RESPONSES = []
        for fnm in tqdm.tqdm(L_FILENAMES):
            try:
                pth = os.path.join(PATH_TO_RESPONSES, fnm)
                with open(pth, 'r') as f:
                    d_rsp = json.load(f)
                L_RESPONSES.append(d_rsp)
            except:
                pass
        idx_ordered = np.argsort([d_game['id'] for d_game in L_RESPONSES])[::-1]
        L_RESPONSES = np.array(L_RESPONSES)[idx_ordered].tolist()
        return L_RESPONSES

    def fit(self, PATH_TO_RESPONSES):
        
        # респонсы
        L_RESPONSES = self.get_responses(PATH_TO_RESPONSES)  

        # трансформация респонсов
        self.L_games = []
        for d_rsp in tqdm.tqdm(L_RESPONSES):
            try:
                d = self.add_game_info(d_rsp)
                d_team_round_info = self.get_team_round_info(d_rsp)
                df_player_info = self.get_player_stat_with_info(d_rsp)
                df_round_and_stat = self.prepare_round_and_stat(d_team_round_info, df_player_info)
                d_game = self.build_data4game(d, d_team_round_info, df_round_and_stat)
                self.L_games.append(d_game)
                del d, d_team_round_info, df_player_info, df_round_and_stat. d_game
            except:
                pass

        return self

    def transform(self):

        df_games = pd.DataFrame.from_records(self.L_games)
        del self.L_games
        gc.collect()

        obj_keys = df_games.select_dtypes('object').columns
        num_keys = df_games.drop(obj_keys, 1).columns

        df_games[obj_keys] = df_games[obj_keys].fillna(self.FILL_VALUE_STR)
        df_games[num_keys] = df_games[num_keys].fillna(self.FILL_VALUE_NUM)

        df_games = df_games.convert_dtypes()
        string_keys = df_games.select_dtypes('string').columns
        float_keys = df_games.select_dtypes('float').columns
        int_keys = df_games.select_dtypes('int').columns
        df_games[string_keys] = df_games[string_keys].astype('str')
        df_games[float_keys] = df_games[float_keys].astype('float')
        df_games[int_keys] = df_games[int_keys].astype('int')

        label_encoder = LabelEncoder()

        d_label_encoders = {}
        for key in tqdm.tqdm(['map_id', 'league_id', 'serie_id', 'tournament_id',
                            'serie_tier', 'tournament_name', 'tournament_pp_curr']):
            ser = df_games[key]
            label_encoder.fit(ser)
            d_label_encoders[key] = label_encoder
            df_games[key] = label_encoder.transform(ser)

        L_team_id_keys = ['team1_id', 'team2_id']
        label_encoder.fit(np.unique(df_games[L_team_id_keys].values.flatten()))
        d_label_encoders['team_id'] = label_encoder
        for key in L_team_id_keys:
            ser = df_games[key]
            df_games[key] = label_encoder.transform(ser)

        L_player_id_keys = [
            'player1_id', 'player2_id', 'player3_id', 'player4_id',
            'player5_id', 'player6_id', 'player7_id', 'player8_id', 'player9_id',
            'player10_id'
        ]
        label_encoder.fit(np.unique(df_games[L_player_id_keys].values.flatten()))
        d_label_encoders['player_id'] = label_encoder
        for key in L_player_id_keys:
            ser = df_games[key]
            df_games[key] = label_encoder.transform(ser)

        L_outcome_keys = df_games.columns[df_games.columns.str.contains('outcome')&~df_games.columns.str.contains('outcome_')]
        label_encoder.fit(np.unique(df_games[L_outcome_keys].values.flatten()))
        d_label_encoders['outcome'] = label_encoder
        for key in L_outcome_keys:
            ser = df_games[key]
            df_games[key] = label_encoder.transform(ser)


        L_loc_keys = ['team1_location', 'team2_location']
        label_encoder.fit(np.unique(df_games[L_loc_keys].values.flatten()))
        d_label_encoders['team_location'] = label_encoder
        for key in L_loc_keys:
            ser = df_games[key]
            df_games[key] = label_encoder.transform(ser)

        L_p_hometown_keys = ['player1_hometown', 
            'player2_hometown', 'player3_hometown',
            'player4_hometown',
            'player5_hometown',  'player6_hometown',
            'player7_hometown', 
            'player8_hometown', 'player9_hometown',
            'player10_hometown', ]
        label_encoder.fit(np.unique(df_games[L_p_hometown_keys].values.flatten()))
        d_label_encoders['player_hometown'] = label_encoder
        for key in L_p_hometown_keys:
            ser = df_games[key]
            df_games[key] = label_encoder.transform(ser)

        L_p_nationality_keys = ['player1_nationality', 
            'player2_nationality', 'player3_nationality',
            'player4_nationality',
            'player5_nationality',  'player6_nationality',
            'player7_nationality', 
            'player8_nationality', 'player9_nationality',
            'player10_nationality', ]
        label_encoder.fit(np.unique(df_games[L_p_nationality_keys].values.flatten()))
        d_label_encoders['player_nationality'] = label_encoder
        for key in L_p_nationality_keys:
            ser = df_games[key]
            df_games[key] = label_encoder.transform(ser)

        int_keys = df_games.select_dtypes('int').columns
        float_keys = df_games.select_dtypes('float').columns
        df_games[int_keys] = df_games[int_keys].astype(int)
        df_games[float_keys] = df_games[float_keys].astype(float) 
        df_games['timestamp'] = self.scale(df_games['timestamp'], (.1, .9))
        df_games = df_games.sort_values('timestamp').reset_index(drop = True)        

        self.d_label_encoders = d_label_encoders
        del d_label_encoders
        gc.collect()  

        return df_games

    def getLabelEncoders(self):
        return self.d_label_encoders    

def prepare_data4embeddings(df_games):
#   df_game_profile_v3 =pd.read_pickle('df_game_profile_v3.pickle')
  df = df_games.copy()

  id_list = []
  for i in df.columns:
    if i[-2:] == 'id':
      df[i] = df[i].astype(int)
      id_list.append(i)
  drop_list = []
  for i in df.columns:
    if i[:7] == 'team1_r':
      drop_list.append(i)
  df.drop(drop_list, axis=1, inplace = True)
  df = pd.get_dummies(data=df, columns=['map_id','serie_tier'])
  drop_list = ['year', 'month', 'day', 'weekday', 'hour', 'league_id', 'tournament_id', 'tournament_name', 'tournament_pp_size', 'tournament_pp_curr']
  df.drop(drop_list, axis=1, inplace = True)

  drop_list = ['team1_location', 'team2_location', 'serie_id',]
  df.drop(drop_list, axis=1, inplace = True)
  drop_list = ['player1_nationality', 'player2_nationality', 'player3_nationality', 'player4_nationality', 'player5_nationality']
  df.drop(drop_list, axis=1, inplace = True)
  drop_list = ['player6_nationality', 'player7_nationality', 'player8_nationality', 'player9_nationality', 'player10_nationality']
  df.drop(drop_list, axis=1, inplace = True)
  drop_list = ['player1_hometown', 'player2_hometown', 'player3_hometown', 'player4_hometown', 'player5_hometown']
  df.drop(drop_list, axis=1, inplace = True)
  drop_list = ['player6_hometown', 'player7_hometown', 'player8_hometown', 'player9_hometown', 'player10_hometown']
  df.drop(drop_list, axis=1, inplace = True)

  col_target = ['player1_adr', 'player1_assists', 'player1_deaths', 'player1_first_kills_diff', 'player1_flash_assists', 'player1_headshots', 'player1_kast', 'player1_kills', 'player1_rating']
  col_target += ['player2_adr', 'player2_assists', 'player2_deaths', 'player2_first_kills_diff', 'player2_flash_assists', 'player2_headshots', 'player2_kast', 'player2_kills', 'player2_rating']
  col_target += ['player3_adr', 'player3_assists', 'player3_deaths', 'player3_first_kills_diff', 'player3_flash_assists', 'player3_headshots', 'player3_kast', 'player3_kills', 'player3_rating']
  col_target += ['player4_adr', 'player4_assists', 'player4_deaths', 'player4_first_kills_diff', 'player4_flash_assists', 'player4_headshots', 'player4_kast', 'player4_kills', 'player4_rating']
  col_target += ['player5_adr', 'player5_assists', 'player5_deaths', 'player5_first_kills_diff', 'player5_flash_assists', 'player5_headshots', 'player5_kast', 'player5_kills', 'player5_rating']
  col_target += ['player6_adr', 'player6_assists', 'player6_deaths', 'player6_first_kills_diff', 'player6_flash_assists', 'player6_headshots', 'player6_kast', 'player6_kills', 'player6_rating']
  col_target += ['player7_adr', 'player7_assists', 'player7_deaths', 'player7_first_kills_diff', 'player7_flash_assists', 'player7_headshots', 'player7_kast', 'player7_kills', 'player7_rating']
  col_target += ['player8_adr', 'player8_assists', 'player8_deaths', 'player8_first_kills_diff', 'player8_flash_assists', 'player8_headshots', 'player8_kast', 'player8_kills', 'player8_rating']
  col_target += ['player9_adr', 'player9_assists', 'player9_deaths', 'player9_first_kills_diff', 'player9_flash_assists', 'player9_headshots', 'player9_kast', 'player9_kills', 'player9_rating']
  col_target += ['player10_adr', 'player10_assists', 'player10_deaths', 'player10_first_kills_diff', 'player10_flash_assists', 'player10_headshots', 'player10_kast', 'player10_kills', 'player10_rating']
  col_target += ['team1_h1_win_count', 'team1_h2_win_count']
  col_target += ['team1_h1_eliminated_count', 'team1_h1_defused_count', 'team1_h1_exploded_count', 'team1_h1_timeout_count']
  col_target += ['team1_h2_eliminated_count', 'team1_h2_defused_count', 'team1_h2_exploded_count', 'team1_h2_timeout_count']

  df_Y = df[col_target]
  

  df_X = df.drop(col_target, axis=1)  

  for i in range(len(df_Y['team1_h1_timeout_count'].values)):
    if df_Y['team1_h1_timeout_count'][i] == -100:
      df_Y['team1_h1_timeout_count'][i] = -1

  for i in range(len(df_Y['team1_h1_exploded_count'].values)):
    if df_Y['team1_h1_exploded_count'][i] == -100:
      df_Y['team1_h1_exploded_count'][i] = -1

  for i in range(len(df_Y['team1_h1_defused_count'].values)):
    if df_Y['team1_h1_defused_count'][i] == -100:
      df_Y['team1_h1_defused_count'][i] = -1

  for i in range(len(df_Y['team1_h1_eliminated_count'].values)):
    if df_Y['team1_h1_eliminated_count'][i] == -100:
      df_Y['team1_h1_eliminated_count'][i] = -1

  for i in range(len(df_Y['team1_h2_timeout_count'].values)):
    if df_Y['team1_h2_timeout_count'][i] == -100:
      df_Y['team1_h2_timeout_count'][i] = -1

  for i in range(len(df_Y['team1_h2_exploded_count'].values)):
    if df_Y['team1_h2_exploded_count'][i] == -100:
      df_Y['team1_h2_exploded_count'][i] = -1

  for i in range(len(df_Y['team1_h2_defused_count'].values)):
    if df_Y['team1_h2_defused_count'][i] == -100:
      df_Y['team1_h2_defused_count'][i] = -1

  for i in range(len(df_Y['team1_h2_eliminated_count'].values)):
    if df_Y['team1_h2_eliminated_count'][i] == -100:
      df_Y['team1_h2_eliminated_count'][i] = -1

  scaler = MinMaxScaler()
  cols_scal = ['player1_age', 'player2_age', 'player3_age', 'player4_age', 'player5_age', 'player6_age', 'player7_age', 'player8_age', 'player9_age', 'player10_age'] #, 'team1_maxround'
  df_X[cols_scal] = scaler.fit_transform(df_X[cols_scal])
  df_X[cols_scal] = df_X[cols_scal].astype('int')

  scaler = MinMaxScaler()
  cols_scal = ['team1_h1_exploded_count', 'team1_h1_defused_count', 'team1_h2_exploded_count', 'team1_h2_defused_count']
  df_Y[cols_scal] = scaler.fit_transform(df_Y[cols_scal])
  df_Y[cols_scal] = df_Y[cols_scal].astype('int')
  df_y = df_Y.astype(float)

  print('df_X shape: {}, df_Y shape: {}'.format(df_X.shape, df_Y.shape))

  return df_X, df_Y

### 1. подготовка респонсов 

In [2]:
# директория с респонсами
PATH_TO_RESPONSES = 'responses'

In [3]:
# # трансформация респонсов в таблицу с информацией об играх
# rt = ResponsesTransformer(fill_value_str='default', fill_value_num=-100)
# rt.fit(PATH_TO_RESPONSES)
# df_games = rt.transform()
df_games = pd.read_pickle('df_games_17052022.pickle')

In [4]:
df_games.head()

Unnamed: 0,timestamp,year,month,day,weekday,hour,map_id,league_id,serie_id,tournament_id,...,team1_r76_outcome,team1_r73_win,team1_r74_win,team1_r75_win,team1_r76_win,team1_h1_default_count,team1_r77_outcome,team1_r78_outcome,team1_r77_win,team1_r78_win
0,0.1,2016,1,13,2,12,0,10,38,55,...,0,-100,-100,-100,-100,-100,0,0,-100,-100
1,0.10001,2016,1,13,2,12,2,10,38,55,...,0,-100,-100,-100,-100,-100,0,0,-100,-100
2,0.100037,2016,1,13,2,14,2,10,38,55,...,0,-100,-100,-100,-100,-100,0,0,-100,-100
3,0.100049,2016,1,13,2,15,1,10,38,55,...,0,-100,-100,-100,-100,-100,0,0,-100,-100
4,0.10007,2016,1,13,2,17,2,10,38,55,...,0,-100,-100,-100,-100,-100,0,0,-100,-100


In [5]:
# # лейбл энкодеры
# rt.getLabelEncoders()

### 2. подготовка датасета 

In [6]:
df_X, df_Y = prepare_data4embeddings(df_games.sort_values('timestamp'))

PLAYERS_COUNT = len(set(list(df_games.player1_id.values) +\
                        list(df_games.player2_id.values) +\
                        list(df_games.player3_id.values) +\
                        list(df_games.player4_id.values) +\
                        list(df_games.player5_id.values) +\
                        list(df_games.player6_id.values) +\
                        list(df_games.player7_id.values) +\
                        list(df_games.player8_id.values) +\
                        list(df_games.player9_id.values) +\
                        list(df_games.player10_id.values)))
TEAMS_COUNT = len(set(list(df_games.team1_id.values) + list(df_games.team2_id.values)))

EMB_DIM = 64
PLAYERS_COUNT += 1
TEAMS_COUNT += 1
INPUT_SHAPE = 45

print('PLAYERS_COUNT = {}, TEAMS_COUNT = {}'.format(PLAYERS_COUNT, TEAMS_COUNT))

df_X shape: (37213, 57), df_Y shape: (37213, 100)
PLAYERS_COUNT = 4564, TEAMS_COUNT = 1326


In [7]:
df_X.head()

Unnamed: 0,timestamp,team1_id,team2_id,team1_win,team1_maxround,player1_id,player1_age,player1_k_d_diff,player2_id,player2_age,...,map_id_10,map_id_11,map_id_12,serie_tier_0,serie_tier_1,serie_tier_2,serie_tier_3,serie_tier_4,serie_tier_5,serie_tier_6
0,0.1,56,8,0,21,258,0,-7,259,0,...,0,0,0,0,0,0,0,1,0,0
1,0.10001,8,56,1,23,31,0,9,44,0,...,0,0,0,0,0,0,0,1,0,0
2,0.100037,18,2,1,25,3,0,14,4,0,...,0,0,0,0,0,0,0,1,0,0
3,0.100049,2,18,0,23,26,0,-4,29,0,...,0,0,0,0,0,0,0,1,0,0
4,0.10007,18,8,1,30,3,0,0,4,0,...,0,0,0,0,0,0,0,1,0,0


In [8]:
df_Y.head()

Unnamed: 0,player1_adr,player1_assists,player1_deaths,player1_first_kills_diff,player1_flash_assists,player1_headshots,player1_kast,player1_kills,player1_rating,player2_adr,...,team1_h1_win_count,team1_h2_win_count,team1_h1_eliminated_count,team1_h1_defused_count,team1_h1_exploded_count,team1_h1_timeout_count,team1_h2_eliminated_count,team1_h2_defused_count,team1_h2_exploded_count,team1_h2_timeout_count
0,62.8,0,17,-1,0,4,47.6,10,0.59,57.0,...,4,1,12,0,0,-1,3,0,0,-1
1,91.2,4,12,1,0,4,73.9,21,1.35,108.5,...,10,6,11,0,0,1,8,0,0,-1
2,97.7,1,14,3,0,7,76.0,28,1.59,78.9,...,8,8,12,0,0,-1,8,0,0,-1
3,68.9,4,18,2,0,8,65.2,14,0.85,72.3,...,5,2,11,0,0,-1,6,0,0,-1
4,79.5,1,20,5,0,4,70.0,20,1.11,68.3,...,8,8,11,0,0,1,12,0,0,-1


### 3. обучение эмбеддингов 

In [9]:
TEST_SIZE = .1
FEATURE_SHAPE= df_X.shape[1]

df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(df_X, df_Y, test_size = TEST_SIZE, shuffle = False)

In [10]:
print('X shape: train = {}, test = {}'.format(df_X_train.shape, df_X_test.shape))
print('Y shape: train = {}, test = {}'.format(df_y_train.shape, df_y_test.shape))

X shape: train = (33491, 57), test = (3722, 57)
Y shape: train = (33491, 100), test = (3722, 100)


In [11]:
p1_input = Input(shape = 1, name = 'p1')
p1_emb = Embedding(output_dim = EMB_DIM, input_dim = PLAYERS_COUNT, input_length = 1, name = 'p1_embedding')(p1_input)
p1_vec = Flatten(name='Fp1')(p1_emb)
p1_model = Model(inputs = p1_input, outputs = p1_vec)

p2_input = Input(shape = 1, name = 'p2')
p2_emb = Embedding(output_dim = EMB_DIM, input_dim = PLAYERS_COUNT, input_length = 1, name = 'p2_embedding')(p2_input)
p2_vec = Flatten(name='Fp2')(p2_emb)
p2_model = Model(inputs = p2_input, outputs = p2_vec)

p3_input = Input(shape = 1, name = 'p3')
p3_emb = Embedding(output_dim = EMB_DIM, input_dim = PLAYERS_COUNT, input_length = 1, name = 'p3_embedding')(p3_input)
p3_vec = Flatten(name='Fp3')(p3_emb)
p3_model = Model(inputs = p3_input, outputs = p3_vec)

p4_input = Input(shape = 1, name = 'p4')
p4_emb = Embedding(output_dim = EMB_DIM, input_dim = PLAYERS_COUNT, input_length = 1, name = 'p4_embedding')(p4_input)
p4_vec = Flatten(name='Fp4')(p4_emb)
p4_model = Model(inputs = p4_input, outputs = p4_vec)

p5_input = Input(shape = 1, name = 'p5')
p5_emb = Embedding(output_dim = EMB_DIM, input_dim = PLAYERS_COUNT, input_length = 1, name = 'p5_embedding')(p5_input)
p5_vec = Flatten(name='Fp5')(p5_emb)
p5_model = Model(inputs = p5_input, outputs = p5_vec)

p6_input = Input(shape = 1, name = 'p6')
p6_emb = Embedding(output_dim = EMB_DIM, input_dim = PLAYERS_COUNT, input_length = 1, name = 'p6_embedding')(p6_input)
p6_vec = Flatten(name='Fp6')(p6_emb)
p6_model = Model(inputs = p6_input, outputs = p6_vec)

p7_input = Input(shape = 1, name = 'p7')
p7_emb = Embedding(output_dim = EMB_DIM, input_dim = PLAYERS_COUNT, input_length = 1, name = 'p7_embedding')(p7_input)
p7_vec = Flatten(name='Fp7')(p7_emb)
p7_model = Model(inputs = p7_input, outputs = p7_vec)

p8_input = Input(shape = 1, name = 'p8')
p8_emb = Embedding(output_dim = EMB_DIM, input_dim = PLAYERS_COUNT, input_length = 1, name = 'p8_embedding')(p8_input)
p8_vec = Flatten(name='Fp8')(p8_emb)
p8_model = Model(inputs = p8_input, outputs = p8_vec)

p9_input = Input(shape = 1, name = 'p9')
p9_emb = Embedding(output_dim = EMB_DIM, input_dim = PLAYERS_COUNT, input_length = 1, name = 'p9_embedding')(p9_input)
p9_vec = Flatten(name='Fp9')(p9_emb)
p9_model = Model(inputs = p9_input, outputs = p9_vec)

p10_input = Input(shape = 1, name = 'p10')
p10_emb = Embedding(output_dim = EMB_DIM, input_dim = PLAYERS_COUNT, input_length = 1, name = 'p10_embedding')(p10_input)
p10_vec = Flatten(name='Fp10')(p10_emb)
p10_model = Model(inputs = p10_input, outputs = p10_vec)

t1_input = Input(shape = 1, name = 't1')
t1_emb = Embedding(output_dim = EMB_DIM, input_dim = TEAMS_COUNT, input_length = 1, name = 't1_embedding')(t1_input)
t1_vec = Flatten(name = 'Ft1')(t1_emb)
t1_model = Model(inputs = t1_input, outputs = t1_vec)

t2_input = Input(shape = 1, name = 't2')
t2_emb = Embedding(output_dim = EMB_DIM, input_dim = TEAMS_COUNT, input_length = 1, name = 't2_embedding')(t2_input)
t2_vec = Flatten(name = 'Ft2')(t2_emb)
t2_model = Model(inputs = t2_input, outputs = t2_vec)

feature_input = Input(shape = (INPUT_SHAPE), name = 'feature')

concate = Concatenate()([p1_vec, p2_vec, p3_vec, p4_vec, p5_vec, p6_vec, p7_vec, p8_vec, p9_vec, p10_vec,\
                         t1_vec, t2_vec,\
                         feature_input])

drop_col = ['player1_id', 'player2_id', 'player3_id', 'player4_id', 'player5_id', 'player6_id', 'player7_id', 'player8_id', 'player9_id', 'player10_id', 'team1_id', 'team2_id']
feature_Train = df_X_train.drop(drop_col, axis=1)
feature_Test = df_X_test.drop(drop_col, axis=1).astype(float)

InputTrain = (
    feature_Train,
    df_X_train['player1_id'],
    df_X_train['player2_id'],
    df_X_train['player3_id'],
    df_X_train['player4_id'],
    df_X_train['player5_id'],
    df_X_train['player6_id'],
    df_X_train['player7_id'],
    df_X_train['player8_id'],
    df_X_train['player9_id'],
    df_X_train['player10_id'],
    df_X_train['team1_id'],
    df_X_train['team2_id']
)
InputTest = (
    feature_Test,
    df_X_test['player1_id'],
    df_X_test['player2_id'],
    df_X_test['player3_id'],
    df_X_test['player4_id'],
    df_X_test['player5_id'],
    df_X_test['player6_id'],
    df_X_test['player7_id'],
    df_X_test['player8_id'],
    df_X_test['player9_id'],
    df_X_test['player10_id'],
    df_X_test['team1_id'],
    df_X_test['team2_id']
)

In [12]:
L_model = []
for lr in [1e-1, 1e-2, 1e-3]:
    concate_dropout = Dropout(0.3)(concate)
    dense = Dense(256, activation = 'leaky_relu')(concate_dropout)
    dense = Dense(128, activation = 'leaky_relu')(dense)
    dense = Dropout(0.3)(dense)
    dense = Dense(64, activation = 'leaky_relu')(concate_dropout)
    dense = Dense(32, activation = 'leaky_relu')(dense)
    output = Dense(df_y_train.shape[1], activation = 'leaky_relu', name = 'output')(dense)
    adam = Adam(lr=lr)
    model = Model([feature_input, p1_input, p2_input, p3_input, p4_input, p5_input, p6_input, p7_input, p8_input, p9_input, p10_input, t1_input, t2_input], output)
    model.compile(optimizer = adam, loss= 'mse', metrics = 'mse')
    L_model.append(model)

In [13]:
EPOCHS = 20
BATCH_SIZE = 64
CALLBACK = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

best_loss = np.inf
for model_i, model in enumerate(L_model):

    print('> ITER#{}/{}'.format(model_i+1, len(L_model)))


    model.fit(
        x = InputTrain, 
        y = df_y_train,
        validation_data = ([InputTest, df_y_test]),
        epochs = EPOCHS,
        batch_size = BATCH_SIZE,
        verbose = 1,
        callbacks=[CALLBACK],
    )   

    loss = np.min(model.history.history['val_loss'])
    if loss<best_loss:
        best_loss = loss
        best_model = model    

    del model
    gc.collect()

    print('-----------------------------------------------------------------\n')


> ITER#1/3
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
-----------------------------------------------------------------

> ITER#2/3
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
-----------------------------------------------------------------

> ITER#3/3
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
-----------------------------------------------------------------



In [14]:
pred = best_model.predict(InputTest)

In [16]:
y_test = df_y_test.columns[-10]

prediction = []
tmp = []
for i in pred:
  prediction.append(round(i[-10]))
  tmp.append(i[-10])
y_test = df_y_test.team1_h1_win_count.values
for i in range(0, len(y_test), int(len(y_test)/9)):
  print("PRED:", prediction[i], "  -   TRUE:", round(y_test[i]))

PRED: -26   -   TRUE: 13
PRED: -29   -   TRUE: 10
PRED: -15   -   TRUE: 9
PRED: -20   -   TRUE: 6
PRED: -11   -   TRUE: 13
PRED: 27   -   TRUE: 12
PRED: -5   -   TRUE: 11
PRED: -73   -   TRUE: 12
PRED: -27   -   TRUE: 6
PRED: -17   -   TRUE: 10


In [19]:
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

# Output of metrics results
print('f1 score = ', metrics.f1_score(y_test.astype(int), prediction, average = 'weighted')) # samples
print('balanced accuracy score = ', metrics.balanced_accuracy_score(y_test.astype(int), prediction))
print('accuracy score = ', metrics.accuracy_score(y_test.astype(int), prediction))
# cm = metrics.confusion_matrix(y_test, prediction)

# def plotConfusionMatrix(cm):
#     fig = plt.figure(figsize=(10,10))
#     sns.heatmap(cm, annot=True, fmt="d")
#     plt.title('Confusion Matrix')
#     plt.ylabel('True label')
#     plt.xlabel('Predicted label')
#     return None

# plotConfusionMatrix(cm)

f1 score =  0.002525327985534546
balanced accuracy score =  0.00806121572707635
accuracy score =  0.0013433637829124126
