# Import data
Training data

In [36]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [31]:
from utils import get_train_data_only
df_teams, df_BLUE, df_RED = get_train_data_only()

In [2]:
#How many data points we have
len(df_teams), len(df_BLUE), len(df_RED)

(28254, 141270, 141270)

In [3]:
#Teams DataFrame used for Machine Learning
df_teams_ML = df_teams[['id', 'patch', 'year', 'winner']]
df_teams_ML.head()

Unnamed: 0,id,patch,year,winner
31563,LMS/2016 Season/Spring Season/Scoreboards_3_2,5.24,2015,BLUE
31596,LJL/2016 Season/Spring Season/Scoreboards_1_2,5.24,2015,RED
31595,LJL/2016 Season/Spring Season/Scoreboards_1_1,5.24,2015,RED
31588,CBLOL/2016 Season/Split 1/Scoreboards_2_2,5.24,2015,BLUE
31587,CBLOL/2016 Season/Split 1/Scoreboards_2_1,5.24,2015,RED


In [4]:
import numpy as np 
import pandas as pd
from tqdm import tqdm

# Feature Engineering

## Feature 1: Synergy between champions of the same team (rate of victory for a pair of champions playing together)

In [5]:
df_BLUE_lite_synergy = df_BLUE[['champion_id', 'win', 'game_id']]
df_RED_lite_synergy = df_RED[['champion_id', 'win', 'game_id']]

#Dosen't matter if it was the red or blue team so let's concat the data!
df_result = pd.concat([df_BLUE_lite_synergy, df_RED_lite_synergy])

#include a column for the outcome
df_result['outcome'] = df_result['win']*1

#reset the index to the game_id
df_result.index = df_result.game_id

In [6]:
# get the unique game_id and champion_id played
game_id_unique = np.unique(df_result.game_id)
champions_id_unique = np.unique(df_result.champion_id)

In [7]:
#create two dataframes
##one to populate the number of games played by each champion pair
champions_play_together = pd.DataFrame(np.zeros([len(champions_id_unique), len(champions_id_unique)]), columns=champions_id_unique, index=champions_id_unique)

##one to populate te number of games won by each champion pair
champions_won_together = pd.DataFrame(np.zeros([len(champions_id_unique), len(champions_id_unique)]), columns=champions_id_unique, index=champions_id_unique)

In [8]:
for champion_a in tqdm(champions_id_unique):
    champions_played_together_list = list(df_result.loc[df_result[df_result.champion_id == champion_a].index]['champion_id'])
    for champion_b in champions_played_together_list:
        if champion_a == champion_b:
            continue
        else:
            champions_play_together.loc[champion_a][champion_b] += 1

100%|██████████████████████████████████████████████████████████████| 152/152 [03:31<00:00,  1.39s/it]


In [9]:
for champ_a in tqdm(champions_id_unique):
    champ_played_together_won_list = list(df_result.loc[df_result[(df_result.champion_id == champ_a) & (df_result.outcome == 1)].index]['champion_id'])
    for champ_b in champ_played_together_won_list:
        if champ_a == champ_b:
            continue
        else:
            champions_won_together.loc[champ_a][champ_b] += 1

100%|██████████████████████████████████████████████████████████████| 152/152 [01:47<00:00,  1.42it/s]


In [10]:
champions_won_percentage = champions_won_together.div(champions_play_together)

In [11]:
#Impute the missing values (champions that never played together)
from sklearn.impute import SimpleImputer

impute_nan = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0.5)
champions_won_percentage_imputed = pd.DataFrame(impute_nan.fit(champions_won_percentage).transform(champions_won_percentage), columns=champions_id_unique, index=champions_id_unique)
#champions_won_percentage_imputed

In [12]:
#check that there is no division by zero
np.isinf(champions_won_percentage_imputed).values.sum()

0

In [13]:
#check how many values have been imputed
np.isnan(champions_won_percentage_imputed).values.sum(), np.isnan(champions_won_percentage).values.sum()

(0, 1482)

In [14]:
"""This function is needed to transform the rest of the data"""

def pair_wise_synergy(df, df_synergy_matrix, name):
  #get the mean value of the pairwise combination of champions synergy
  result_Synergy = df[['game_id', 'champion_id']].groupby(['game_id']).aggregate({
      'champion_id': lambda z: [champions_won_percentage_imputed.loc[x][y] for x in z for y in z if x != y]
  })
  result_Synergy['mean_synergy_'+str(name)] = result_Synergy.champion_id.apply(lambda x: sum(x)/len(x))
  return result_Synergy.drop('champion_id', axis=1)

In [15]:
#Get the synergy of the Blue team champions

df_blue = pair_wise_synergy(df_BLUE, champions_won_percentage_imputed, 'blue')
df_blue['id'] = df_blue.index
df_teams_ML =pd.merge(df_teams_ML, df_blue, on='id', how='inner')

In [16]:
#Get the synergy of the Red team champions

df_red = pair_wise_synergy(df_RED, champions_won_percentage_imputed, 'red')
df_red['id'] = df_red.index
df_teams_ML = pd.merge(df_teams_ML, df_red, on='id', how='inner')

In [17]:
df_teams_ML.head()

Unnamed: 0,id,patch,year,winner,mean_synergy_blue,mean_synergy_red
0,LMS/2016 Season/Spring Season/Scoreboards_3_2,5.24,2015,BLUE,0.502761,0.480432
1,LJL/2016 Season/Spring Season/Scoreboards_1_2,5.24,2015,RED,0.498885,0.508034
2,LJL/2016 Season/Spring Season/Scoreboards_1_1,5.24,2015,RED,0.489649,0.517847
3,CBLOL/2016 Season/Split 1/Scoreboards_2_2,5.24,2015,BLUE,0.498627,0.503166
4,CBLOL/2016 Season/Split 1/Scoreboards_2_1,5.24,2015,RED,0.481985,0.486419


## Feature 2: Win rate of champions against the same ROLE of the other team's champion

In [18]:
df_BLUE_RED = pd.merge(left=df_BLUE, right=df_RED, left_on= 'game_id', right_on= 'game_id')
df_role = df_BLUE_RED[['champion_id_x', 'role_x', 'role_y', 'champion_id_y', 'win_x', 'game_id']]

In [19]:
#times that a given champion won or lost against another champion
champion_vs_champion = pd.DataFrame(df_role[['champion_id_x', 'role_x', 'role_y', 'champion_id_y', 'win_x']].value_counts())
champion_vs_champion

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,0
champion_id_x,role_x,role_y,champion_id_y,win_x,Unnamed: 5_level_1
12,SUP,SUP,201,True,372
201,SUP,SUP,12,True,337
412,SUP,SUP,201,True,318
201,SUP,SUP,12,False,317
12,SUP,SUP,201,False,307
...,...,...,...,...,...
84,MID,SUP,1,True,1
84,MID,SUP,9,True,1
84,MID,SUP,16,True,1
84,MID,SUP,22,False,1


In [20]:
#times that a given champion played against another champion by role
total_champion_vs_champion = pd.DataFrame(df_BLUE_RED[['champion_id_x', 'role_x', 'role_y', 'champion_id_y']].value_counts())
total_champion_vs_champion

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,0
champion_id_x,role_x,role_y,champion_id_y,Unnamed: 4_level_1
12,SUP,SUP,201,679
201,SUP,SUP,12,654
201,SUP,SUP,412,574
412,SUP,SUP,201,555
201,SUP,BOT,81,522
...,...,...,...,...
114,TOP,BOT,63,1
58,MID,MID,91,1
58,MID,MID,61,1
163,MID,TOP,38,1


In [21]:
#percentage of times that a champion has lost or won against another champion
rate_champion_vs_champion = champion_vs_champion.div(total_champion_vs_champion)
rate_champion_vs_champion

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,0
champion_id_x,role_x,role_y,champion_id_y,win_x,Unnamed: 5_level_1
1,MID,BOT,21,False,1.0
1,MID,BOT,51,False,1.0
1,MID,BOT,235,True,1.0
1,MID,BOT,429,False,1.0
1,MID,BOT,498,False,0.5
...,...,...,...,...,...
876,TOP,SUP,111,True,1.0
876,TOP,SUP,432,False,1.0
876,TOP,TOP,58,False,1.0
876,TOP,TOP,126,False,0.5


In [22]:
global rate_role
rate_role = rate_champion_vs_champion

In [23]:
def get_vs_rate(id_x, role_x, role_y, id_y):
    try:
      x = rate_champion_vs_champion.loc[id_x, role_x, role_y, id_y, True]
    except KeyError:
        try:
          x = 1-rate_champion_vs_champion.loc[id_x, role_x, role_y, id_y, False]
        except KeyError:
          x = 0.5
    return x

In [24]:
df_same_role = df_role[df_role['role_x'] == df_role['role_y']]
df_same_role['same_role_win_rate'] = df_same_role.apply(lambda z: 
              get_vs_rate(z.champion_id_x, z.role_x, z.role_y, z.champion_id_y),
              axis=1)
df_same_role

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_same_role['same_role_win_rate'] = df_same_role.apply(lambda z:


Unnamed: 0,champion_id_x,role_x,role_y,champion_id_y,win_x,game_id,same_role_win_rate
0,13,TOP,TOP,54,True,LMS/2016 Season/Spring Season/Scoreboards_3_2,1.000000
6,421,JGL,JGL,203,True,LMS/2016 Season/Spring Season/Scoreboards_3_2,0.547170
12,81,MID,MID,38,True,LMS/2016 Season/Spring Season/Scoreboards_3_2,1.000000
18,429,BOT,BOT,18,True,LMS/2016 Season/Spring Season/Scoreboards_3_2,0.507692
24,412,SUP,SUP,12,True,LMS/2016 Season/Spring Season/Scoreboards_3_2,0.510288
...,...,...,...,...,...,...,...
706325,516,TOP,TOP,875,False,LCL/2020 Season/Spring Season/Scoreboards_2_1,0.519481
706331,59,JGL,JGL,64,False,LCL/2020 Season/Spring Season/Scoreboards_2_1,0.474654
706337,142,MID,MID,131,False,LCL/2020 Season/Spring Season/Scoreboards_2_1,0.166667
706343,81,BOT,BOT,523,False,LCL/2020 Season/Spring Season/Scoreboards_2_1,0.528226


In [25]:
roles = ['TOP', 'JGL', 'MID', 'BOT', 'SUP']

for role in roles:
  df_role = df_same_role[df_same_role.role_x == role][['game_id', 'same_role_win_rate']]
  df_role[role] = df_role['same_role_win_rate']
  df_role['id'] = df_role['game_id']
  df_role.drop(['game_id', 'same_role_win_rate'], axis=1, inplace=True)
  df_teams_ML = pd.merge(df_teams_ML, df_role, on='id', how='inner')

df_teams_ML

Unnamed: 0,id,patch,year,winner,mean_synergy_blue,mean_synergy_red,TOP,JGL,MID,BOT,SUP
0,LMS/2016 Season/Spring Season/Scoreboards_3_2,5.24,2015,BLUE,0.502761,0.480432,1.000000,0.547170,1.000000,0.507692,0.510288
1,LJL/2016 Season/Spring Season/Scoreboards_1_2,5.24,2015,RED,0.498885,0.508034,0.444444,0.579832,0.600000,0.250000,0.370370
2,LJL/2016 Season/Spring Season/Scoreboards_1_1,5.24,2015,RED,0.489649,0.517847,0.500000,0.463158,0.500000,0.477551,0.615385
3,CBLOL/2016 Season/Split 1/Scoreboards_2_2,5.24,2015,BLUE,0.498627,0.503166,1.000000,0.444444,0.409091,0.508475,0.400000
4,CBLOL/2016 Season/Split 1/Scoreboards_2_1,5.24,2015,RED,0.481985,0.486419,0.000000,0.527108,0.333333,0.200000,0.515789
...,...,...,...,...,...,...,...,...,...,...,...
28249,VCS/2020 Season/Spring Season/Scoreboards/Week...,10.30,2020,BLUE,0.489887,0.494892,0.437500,0.508772,0.833333,0.553846,0.547253
28250,VCS/2020 Season/Spring Season/Scoreboards/Week...,10.30,2020,RED,0.500072,0.494267,0.531915,0.544000,0.472222,0.583333,0.515789
28251,VCS/2020 Season/Spring Season/Scoreboards/Week...,10.30,2020,RED,0.508416,0.496037,0.333333,0.557769,0.555556,0.600000,0.409091
28252,LCK/2020 Season/Spring Season/Scoreboards/Week...,10.30,2020,BLUE,0.513824,0.482004,0.531915,0.555556,0.596330,0.488889,0.516129


## Feature 3: Simple averages

# Machine Learning

## Pipe with logit regression

In [26]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn import set_config; set_config(display='diagram')


# Impute then Scale for numerical variables: 
num_transformer = make_pipeline(
                    SimpleImputer(strategy = 'mean'),
                    MinMaxScaler())

# Encode categorical variables
cat_transformer = OneHotEncoder(sparse = False)#, handle_unknown='ignore')

# Paralellize "num_transformer" and "One hot encoder"
preproc = make_column_transformer(
    (num_transformer, make_column_selector(dtype_include=['float64'])),
    (cat_transformer, make_column_selector(dtype_include=['object','bool'])),
    remainder='passthrough')

#add model
pipe = make_pipeline(preproc, LogisticRegression(solver='liblinear'))
pipe

In [27]:
from sklearn.preprocessing import LabelEncoder

y_train = LabelEncoder().fit(df_teams_ML.winner).transform(df_teams_ML.winner)
X_train = df_teams_ML.drop(['id', 'winner'], axis=1)

In [28]:
# Train pipeline
pipe.fit(X_train,y_train)

In [29]:
from sklearn.model_selection import cross_val_score

# Cross validate pipeline
cross_val_score(pipe, X_train, y_train, cv=20, scoring='accuracy').mean()

0.7431208887926558

In [37]:
test_teams, test_blue, test_red = get_train_data_only(test_data = True)
len(test_teams), len(test_blue), len(test_red)

TypeError: get_train_data_only() got an unexpected keyword argument 'test_data'