# Import data

In [2]:
import numpy as np
import pandas as pd
import json

In [3]:
# Raw dump of the games, located relative to the notebook's folder.
file = '../raw_data/games_dump.json'
# The dump contains non-ASCII team names, so the encoding is stated explicitly
# instead of relying on the platform default (which is not UTF-8 everywhere).
with open(file, encoding='utf-8') as data_file:
    data = json.load(data_file)

# Normalize the JSON file

Normalizing means flattening the nested keys of the JSON file so that each nested key becomes its own column in the resulting DataFrame.

In [4]:
# Flatten the nested JSON records into one wide frame, then keep an
# independent copy of it to work on for the team-level data.
df_normalized_teams = pd.json_normalize(data)
df_teams = df_normalized_teams.copy(deep=True)

In [5]:
# Team-level frame: the nested list columns (players of each side and the
# picks/bans) are left out.
# A non-mutating drop() on the normalized frame keeps this cell re-runnable;
# dropping in place raises a KeyError the second time the cell is executed.
df_teams = df_normalized_teams.drop(
    columns=['teams.BLUE.players', 'teams.RED.players', 'picks_bans']
)
df_teams.tail(2)

Unnamed: 0,id,start,patch,winner,duration,video,teams.BLUE.name,teams.BLUE.total_turret_kills,teams.BLUE.total_inhibitor_kills,teams.BLUE.total_rift_herald_kills,teams.BLUE.total_dragon_kills,teams.BLUE.total_baron_kills,teams.RED.name,teams.RED.total_turret_kills,teams.RED.total_inhibitor_kills,teams.RED.total_rift_herald_kills,teams.RED.total_dragon_kills,teams.RED.total_baron_kills
18252,19524,2021-11-07T17:51:39+00:00,11.21,RED,2053.0,,Inside Games,7.0,2.0,2.0,1.0,0.0,STOPWATCH eSports (Czech Team),9.0,1.0,0.0,4.0,2.0
18253,19525,2021-11-07T18:54:44+00:00,11.21,BLUE,2048.0,,Inaequalis Academy,8.0,1.0,1.0,4.0,2.0,Dynamo Eclot,6.0,0.0,1.0,1.0,0.0


In [6]:
# Player-level frame for the BLUE side: explode() emits one row per player
# entry and json_normalize() expands every entry into columns.
blue_players = df_normalized_teams['teams.BLUE.players'].explode()
df_normalized_BLUE = pd.json_normalize(blue_players)
# Fixed seed so the preview shows the same rows on every run.
df_normalized_BLUE.sample(2, random_state=0)

Unnamed: 0,name,champion_name,champion_id,gd_15,kills_assists_15,deaths_15,gold_share_post_15,damage_share,jungle_proximity,support_proximity,...,total_monster_kills,total_assists,total_deaths,total_vision_score,total_damage_taken,total_damage_dealt,trueskill_sigma,trueskill_mu,win,side
75527,Nido,Viktor,112,239.0,1.0,0.0,,0.309516,0.153846,0.076923,...,9.0,3.0,5.0,22.0,13066.0,174411.0,6.172628,28.552753,False,BLUE
17676,Karsa,Ekko,245,,,,,,,,...,73.0,0.0,3.0,0.0,24676.0,111209.0,4.793515,27.817822,False,BLUE


In [7]:
# Player-level frame for the RED side: explode() emits one row per player
# entry and json_normalize() expands every entry into columns.
red_players = df_normalized_teams['teams.RED.players'].explode()
df_normalized_RED = pd.json_normalize(red_players)
# Fixed seed so the preview shows the same rows on every run.
df_normalized_RED.sample(2, random_state=0)

Unnamed: 0,name,champion_name,champion_id,gd_15,kills_assists_15,deaths_15,gold_share_post_15,damage_share,jungle_proximity,support_proximity,...,total_monster_kills,total_assists,total_deaths,total_vision_score,total_damage_taken,total_damage_dealt,trueskill_sigma,trueskill_mu,win,side
73591,Haro,Hecarim,120,1515.0,5.0,0.0,,0.189586,1.0,0.083333,...,189.0,12.0,6.0,48.0,43113.0,210011.0,5.826556,16.879253,False,RED
80232,Xaky,Viego,234,106.0,2.0,1.0,,0.184742,0.321201,0.032757,...,8.0,8.0,2.0,39.0,14038.0,182442.0,5.497606,32.4469,True,RED


## Include the game id to match the different DataFrames later

We have two separate DataFrames, one for team Blue and one for team Red. Tagging every player row with its game id lets us merge them later, and it will also be useful when we come back for the target (`y`).

In [8]:
# Tag every player row with the id of the game it came from.
# The ids are repeated by exploding the players column next to the id column,
# so each side gets exactly one id per exploded row; nothing assumes a fixed
# number of players per team or the same row count for both sides.
for players_col, players_df in (
    ('teams.BLUE.players', df_normalized_BLUE),
    ('teams.RED.players', df_normalized_RED),
):
    game_ids = (
        df_normalized_teams[['id', players_col]]
        .explode(players_col)['id']
        .to_numpy()
    )
    # Guard against the player frame having been built from different data.
    assert len(game_ids) == len(players_df), (
        f'{players_col}: {len(game_ids)} ids for {len(players_df)} player rows'
    )
    players_df['game_id'] = game_ids

# Filter years that are going to be analyzed

In [19]:
# Parse the start timestamps, derive the calendar year of every game and
# report how many games fall into each year (years in order of first appearance).
df_normalized_teams['start'] = pd.to_datetime(df_normalized_teams['start'])
df_normalized_teams['year'] = df_normalized_teams['start'].dt.year
games_year = df_normalized_teams['year'].unique()

for year in games_year:
    # Count the rows whose year matches the current one.
    N_games = int((df_normalized_teams['year'] == year).sum())
    print(f'Number of games in {year}: {N_games}')

Number of games in 2011: 28
Number of games in 2012: 428
Number of games in 2013: 1303
Number of games in 2021: 14003
Number of games in 2015: 1544
Number of games in 2016: 204
Number of games in 2014: 744


In [20]:
# Function that returns a Dataframe just with the years of interest
def choose_years_to_work(df, *args):
    """Return the rows of ``df`` whose ``year`` column matches one of ``args``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``year`` column.
    *args
        Years to keep. A year passed more than once is used only once.

    Returns
    -------
    pandas.DataFrame
        The matching rows, grouped in the order the years were given and
        keeping their original index labels. With no years an empty frame
        with the same columns is returned.

    Raises
    ------
    KeyError
        If ``df`` has no ``year`` column.
    """
    # dict.fromkeys() removes repeated years while keeping the call order.
    years = list(dict.fromkeys(args))
    if not years:
        # pd.concat() refuses an empty sequence; an empty selection is the
        # natural answer when no year is requested.
        return df.iloc[0:0].copy()
    return pd.concat([df[df['year'] == year] for year in years])

# Keep only the 2021 games and leave out the 'start' timestamp column.
df_data_by_year = choose_years_to_work(df_normalized_teams, 2021).drop(columns='start')

Unnamed: 0,id,patch,winner,duration,video,picks_bans,teams.BLUE.name,teams.BLUE.total_turret_kills,teams.BLUE.total_inhibitor_kills,teams.BLUE.total_rift_herald_kills,...,teams.BLUE.total_baron_kills,teams.BLUE.players,teams.RED.name,teams.RED.total_turret_kills,teams.RED.total_inhibitor_kills,teams.RED.total_rift_herald_kills,teams.RED.total_dragon_kills,teams.RED.total_baron_kills,teams.RED.players,year
22,19158,11.19,BLUE,,,[],Zephyr Esport Red,,,,...,,"[{'name': None, 'champion_name': 'Jayce', 'cha...",عبعال والصحبة,,,,,,"[{'name': None, 'champion_name': 'Ornn', 'cham...",2021
135,19162,11.19,BLUE,,,[],Zephyr Esport Red,,,,...,,"[{'name': None, 'champion_name': 'Jayce', 'cha...",عبعال والصحبة,,,,,,"[{'name': None, 'champion_name': 'Ornn', 'cham...",2021
161,19166,11.19,BLUE,,,[],Zephyr Esport Red,,,,...,,"[{'name': None, 'champion_name': 'Jayce', 'cha...",عبعال والصحبة,,,,,,"[{'name': None, 'champion_name': 'Ornn', 'cham...",2021
225,19170,11.19,BLUE,,,[],Zephyr Esport Red,,,,...,,"[{'name': None, 'champion_name': 'Jayce', 'cha...",عبعال والصحبة,,,,,,"[{'name': None, 'champion_name': 'Ornn', 'cham...",2021
238,1297,11.1,RED,1666.0,https://youtu.be/5UwuU3wLrfs?t=6,[],Rejects Gaming,2.0,0.0,2.0,...,0.0,"[{'name': 'RJX ZaFiR', 'champion_name': 'Thres...",CowBoySquad Imperials Esports,10.0,2.0,0.0,4.0,1.0,"[{'name': 'CBI SneakyLemon', 'champion_name': ...",2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18249,19521,11.21,BLUE,1576.0,,"[{'ban': True, 'champion_id': 0, 'champion_nam...",Cryptova,10.0,1.0,2.0,...,1.0,"[{'name': 'Tasaa', 'champion_name': 'Mordekais...",Inside Games Challengers,1.0,0.0,0.0,2.0,0.0,"[{'name': 'Dejvos', 'champion_name': 'Nautilus...",2021
18250,19522,11.21,BLUE,2299.0,,"[{'ban': True, 'champion_id': 0, 'champion_nam...",HEET,9.0,1.0,2.0,...,1.0,"[{'name': 'Wapode', 'champion_name': 'Ezreal',...",Dark Tigers,3.0,0.0,0.0,2.0,1.0,"[{'name': 'Pepi (Miro Rauten)', 'champion_name...",2021
18251,19523,11.21,RED,1467.0,,"[{'ban': True, 'champion_id': 0, 'champion_nam...",Inaequalis,5.0,0.0,1.0,...,0.0,"[{'name': 'Hachi (Davy de Graaf)', 'champion_n...",,8.0,1.0,1.0,2.0,1.0,"[{'name': 'Welcom', 'champion_name': 'Gwen', '...",2021
18252,19524,11.21,RED,2053.0,,"[{'ban': True, 'champion_id': 0, 'champion_nam...",Inside Games,7.0,2.0,2.0,...,0.0,"[{'name': 'Trungi', 'champion_name': 'Camille'...",STOPWATCH eSports (Czech Team),9.0,1.0,0.0,4.0,2.0,"[{'name': 'Deffaren', 'champion_name': 'Braum'...",2021


# Retrieve the champions stats given a champion id and merge with the BLUE/RED team

The champion stats come from the Data Dragon file https://ddragon.leagueoflegends.com/cdn/12.3.1/data/en_US/champion.json (version 12.3.1). From it we look up the stats of each champion by its key (id) value.

In [9]:
# Local copy of the champion description file, relative to the notebook's folder.
champion_description = '../raw_data/lol_12_3_1.json'

# The encoding is stated explicitly so the file is decoded the same way on
# every platform instead of depending on the locale default.
with open(champion_description, encoding='utf-8') as data_file:
    data_champions = json.load(data_file)

In [10]:
# Build a table with one row per champion: the stats of every champion in this
# version, identified by the champion's numeric key.

champions_names = list(data_champions['data'].keys())
# Map the integer key of every champion to its stats dictionary.
champions_dict = {
    int(data_champions['data'][champion_name]['key']): data_champions['data'][champion_name]['stats']
    for champion_name in champions_names
}
champions_df = (
    pd.DataFrame(champions_dict)
    .T                                    # champions as rows, stats as columns
    .reset_index(level=0)                 # the champion key becomes the 'index' column
    .assign(champion=lambda frame: frame['index'])  # same key, appended as last column
    .drop(columns='index')
)
champions_df

Unnamed: 0,hp,hpperlevel,mp,mpperlevel,movespeed,armor,armorperlevel,spellblock,spellblockperlevel,attackrange,...,hpregenperlevel,mpregen,mpregenperlevel,crit,critperlevel,attackdamage,attackdamageperlevel,attackspeedperlevel,attackspeed,champion
0,580.0,90.0,0.0,0.0,345.0,38.0,3.25,32.0,1.25,175.0,...,1.00,0.000,0.00,0.0,0.0,60.0,5.00,2.500,0.651,266
1,500.0,82.0,418.0,25.0,330.0,18.0,3.50,30.0,0.50,550.0,...,0.60,8.000,0.80,0.0,0.0,53.0,3.00,2.000,0.668,103
2,500.0,105.0,200.0,0.0,345.0,23.0,3.50,37.0,1.25,125.0,...,0.90,50.000,0.00,0.0,0.0,62.0,3.30,3.200,0.625,84
3,560.0,90.0,350.0,40.0,330.0,26.0,3.00,30.0,0.50,500.0,...,0.65,8.175,0.70,0.0,0.0,52.0,3.50,4.000,0.638,166
4,600.0,106.0,350.0,40.0,330.0,44.0,3.50,32.0,1.25,125.0,...,0.85,8.500,0.80,0.0,0.0,62.0,3.75,2.125,0.625,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,500.0,85.0,250.0,45.0,330.0,23.0,3.50,30.0,0.50,500.0,...,0.55,6.000,0.80,0.0,0.0,58.0,2.00,2.000,0.658,221
154,536.0,92.0,480.0,23.5,325.0,22.0,3.30,30.0,0.50,550.0,...,0.60,8.000,0.80,0.0,0.0,54.0,3.10,2.000,0.656,115
155,504.0,82.0,452.0,50.0,335.0,24.0,3.80,30.0,0.50,550.0,...,0.50,11.340,0.80,0.0,0.0,52.0,3.00,2.130,0.625,26
156,560.0,92.0,425.0,25.0,340.0,21.0,3.50,30.0,0.50,550.0,...,0.60,8.000,0.65,0.0,0.0,58.0,3.30,2.500,0.625,142


In [11]:
# Attach the champion stats to every BLUE player row (left join on the
# champion id), then discard both id columns, which are not needed anymore.
df_BLUE = (
    df_normalized_BLUE
    .merge(champions_df, how='left', left_on='champion_id', right_on='champion')
    .drop(columns=['champion', 'champion_id'])
)

In [12]:
# Attach the champion stats to every RED player row (left join on the
# champion id), then discard both id columns, which are not needed anymore.
df_RED = (
    df_normalized_RED
    .merge(champions_df, how='left', left_on='champion_id', right_on='champion')
    .drop(columns=['champion', 'champion_id'])
)

# Use SimpleImputer for unknown values

In [None]:
# TODO: the SimpleImputer cell still has to be pasted in here

# Feature Engineering (?)

The idea is to sum, for each game and team, the champion stats together with the in-game features of the five players.

In [16]:
# One row per game for the BLUE side: sum the numeric columns over the team's
# player rows. numeric_only=True is stated explicitly because pandas >= 2.0 no
# longer leaves the text columns out of the sum by default.
# NOTE(review): sum() turns a group whose values are all missing into 0.0;
# pass min_count=1 if those games should stay NaN for a later imputation step.
results_BLUE = df_BLUE.groupby('game_id').sum(numeric_only=True)
# Fixed seed so the preview shows the same rows on every run.
results_BLUE.sample(5, random_state=0)

Unnamed: 0_level_0,gd_15,kills_assists_15,deaths_15,gold_share_post_15,damage_share,jungle_proximity,support_proximity,total_gold,total_cs,total_kills,...,hpregen,hpregenperlevel,mpregen,mpregenperlevel,crit,critperlevel,attackdamage,attackdamageperlevel,attackspeedperlevel,attackspeed
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10980,3399.0,12.0,11.0,0.0,1.0,1.307692,1.692308,60902.0,1019.0,17.0,...,39.5,3.65,39.76,3.35,0.0,0.0,310.0,16.9,13.7,3.304
1622,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,32.5,3.3,35.2,3.55,0.0,0.0,305.0,16.7,11.05,3.056
5650,-7648.0,5.0,10.0,0.0,1.0,1.307692,1.769231,33679.0,716.0,2.0,...,31.0,3.15,38.2,3.35,0.0,0.0,288.0,14.8,11.35,3.272
12841,-1249.0,6.0,3.0,0.0,1.0,1.674013,2.073462,59732.0,1035.0,15.0,...,33.75,3.9,69.0,2.4,0.0,0.0,301.0,18.6,10.8,3.233
7538,-987.0,9.0,7.0,0.0,1.0,1.333333,1.75,70018.0,1228.0,14.0,...,33.75,3.6,32.38,2.7,0.0,0.0,312.0,18.55,12.432,3.279


In [17]:
# One row per game for the RED side: sum the numeric columns over the team's
# player rows. numeric_only=True is stated explicitly because pandas >= 2.0 no
# longer leaves the text columns out of the sum by default.
# NOTE(review): sum() turns a group whose values are all missing into 0.0;
# pass min_count=1 if those games should stay NaN for a later imputation step.
results_RED = df_RED.groupby('game_id').sum(numeric_only=True)
# Fixed seed so the preview shows the same rows on every run.
results_RED.sample(5, random_state=0)

Unnamed: 0_level_0,gd_15,kills_assists_15,deaths_15,gold_share_post_15,damage_share,jungle_proximity,support_proximity,total_gold,total_cs,total_kills,...,hpregen,hpregenperlevel,mpregen,mpregenperlevel,crit,critperlevel,attackdamage,attackdamageperlevel,attackspeedperlevel,attackspeed
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
16621,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,27.5,3.15,46.0,2.7,0.0,0.0,293.0,16.1,11.37,3.252
9170,-2680.0,12.0,7.0,0.0,1.0,1.540938,2.040478,39573.0,809.0,5.0,...,29.75,3.15,21.38,1.5,0.0,0.0,294.0,15.3,11.15,3.146
9909,1261.0,24.0,2.0,0.0,1.0,2.153846,2.538462,49858.0,817.0,16.0,...,34.5,3.0,26.75,2.75,0.0,0.0,291.0,15.7,11.15,3.125
462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,36.5,3.85,81.25,2.75,0.0,0.0,299.0,15.75,11.875,3.251
1664,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,31.5,3.1,43.7,3.45,0.0,0.0,254.0,14.2,11.785,3.177


# MinMax Scaler

# LabelEncode

# Features to drop (remove)

In [None]:
# TODO: decide whether these non-feature columns should be removed before modelling.
#df_teams.drop(['video','teams.BLUE.name', 'teams.RED.name'], axis=1, inplace = True)