In [1]:
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None 

## Data

In [88]:
df = pd.read_csv('../output/csv/lol-data-matches-fixed-duration.csv').drop(columns=['Unnamed: 0'])
df1 = pd.read_csv('../output/csv/lol-data-match-frames.csv').drop(columns=['Unnamed: 0'])
blue, red = -1, 1
frame = 15
random_state = 0
df1 = df1[df1['frame'] == frame]
print('Number of duplicate data points in lol-data-matches-fixed-duration.csv: '
     + str(df.duplicated(subset=['match_id']).sum()))
print('Number of duplicate data points in lol-data-match-frames.csv: '
     + str(df1.duplicated(subset=['match_id']).sum()))
df.drop_duplicates(subset=['match_id'], inplace=True)
df1.drop_duplicates(subset=['match_id'], inplace=True)
df = pd.merge(df1, df, on='match_id')
print('Number of data points with missing features: ')
for f in df.columns[df.isna().any()].tolist():
    print(f + ': ' + str(df.loc[df[f].isna()].shape[0]))
df.dropna(subset=['winning_team'], inplace=True)
print('Initial set of features:\n', df.columns.tolist())
print('Initial class distribution:\nBlue:' + '%.2f%%' % ((df.groupby('winning_team').size().tolist()[0]/df.shape[0])*100)
      + '\nRed:' + '%.2f%%' % ((df.groupby('winning_team').size().tolist()[1]/df.shape[0])*100))
print('Initial tier distribution: \nBronze: ', df.groupby('tier').size().tolist()[0],
     '\nGold:', df.groupby('tier').size().tolist()[1], '\nDiamond:', df.groupby('tier').size().tolist()[2],
     '\nGrandmaster:', df.groupby('tier').size().tolist()[3])
df.describe()

Number of duplicate data points in lol-data-matches-fixed-duration.csv: 80
Number of duplicate data points in lol-data-match-frames.csv: 0
Number of data points with missing features: 
winning_team: 2
first_champion: 119
first_tower: 23
first_inhibitor: 4597
first_baron: 11269
first_dragon: 80
first_rift_herald: 2490
Initial set of features:
 ['match_id', 'frame', 'blue_total_kills', 'blue_total_gold', 'blue_total_cs', 'blue_total_damage', 'blue_towers', 'blue_plates', 'blue_inhibitors', 'blue_barons', 'blue_dragons', 'blue_rift_heralds', 'red_total_kills', 'red_total_gold', 'red_total_cs', 'red_total_damage', 'red_towers', 'red_plates', 'red_inhibitors', 'red_barons', 'red_dragons', 'red_rift_heralds', 'tier', 'division', 'patch', 'game_duration', 'region', 'winning_team', 'first_champion', 'first_tower', 'first_inhibitor', 'first_baron', 'first_dragon', 'first_rift_herald']
Initial class distribution:
Blue:50.57%
Red:49.43%
Initial tier distribution: 
Bronze:  10984 
Gold: 9563 
Diam

Unnamed: 0,frame,blue_total_kills,blue_total_gold,blue_total_cs,blue_total_damage,blue_towers,blue_plates,blue_inhibitors,blue_barons,blue_dragons,...,red_rift_heralds,patch,game_duration,winning_team,first_champion,first_tower,first_inhibitor,first_baron,first_dragon,first_rift_herald
count,35560.0,35560.0,35560.0,35560.0,35560.0,35560.0,35560.0,35560.0,35560.0,35560.0,...,35560.0,35560.0,35560.0,35560.0,35441.0,35537.0,30963.0,24292.0,35480.0,33070.0
mean,15.0,11.056412,26295.0027,391.234674,26365.56018,0.479556,6.009308,0.00149,0.0,0.646682,...,0.336474,11.231027,1724.085742,149.431946,149.346802,148.718237,148.796951,151.473736,151.834837,146.36831
std,0.0,4.464712,3143.540154,55.201427,5359.20093,0.777889,3.365281,0.043382,0.0,0.672691,...,0.47251,0.175847,402.616937,49.997476,49.996438,49.984271,49.986332,49.979305,49.967027,49.868688
min,15.0,0.0,14651.0,145.0,7603.0,0.0,0.0,0.0,0.0,0.0,...,0.0,10.25,840.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
25%,15.0,8.0,24125.0,354.0,22666.0,0.0,3.0,0.0,0.0,0.0,...,0.0,11.18,1436.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
50%,15.0,11.0,26158.0,396.0,26114.0,0.0,6.0,0.0,0.0,1.0,...,0.0,11.21,1702.0,100.0,100.0,100.0,100.0,200.0,200.0,100.0
75%,15.0,14.0,28291.0,431.0,29778.0,1.0,8.0,0.0,0.0,1.0,...,1.0,11.21,1979.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0
max,15.0,36.0,40225.0,599.0,57894.0,9.0,15.0,3.0,0.0,2.0,...,1.0,11.9,3649.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0


## Data Preprocessing

### Sampling

In [90]:
bronze = df[df['tier'] == 'BRONZE'].sample(7000, random_state=random_state)
gold = df[df['tier'] == 'GOLD'].sample(7000, random_state=random_state)
diamond = df[df['tier'] == 'DIAMOND'].sample(7000, random_state=random_state)
gm = df[df['tier'] == 'GRANDMASTERS'].sample(7000, random_state=random_state)
df = bronze.append([gold, diamond, gm])
df.shape[0]
print('Sample class distribution:\nBlue:' + '%.2f%%' % ((df.groupby('winning_team').size().tolist()[0]/df.shape[0])*100)
      + '\nRed:' + '%.2f%%' % ((df.groupby('winning_team').size().tolist()[1]/df.shape[0])*100))

Sample class distribution:
Blue:50.25%
Red:49.75%
