# Objectives 
* Determine if logistic regression can be done with a non-gigantic amount of variables, and if so, which variables
* Find patterns in the data and visualize them

First, we import the necessary libraries and get a first look at the data.

In [50]:
import pandas as pd
# probably import matplotlib and seaborn later

# + blue side lead, - red side lead, warding numbers wonky, timer variables in seconds

In [51]:
df = pd.read_csv('hackathon-riot-data/hackathon-riot-data.csv', sep=';')
pd.set_option('display.max_columns', None)
df.rename(columns={'GoldDiff10Bot':'GoldDiffSup10', 
                   'XPDiff10Bot':'XPDiffSup10',
                   'GoldDiff15Bot':'GoldDiffSup15',
                   'XPDiff15Bot':'XPDiffSup15',
                   'GoldDiffEndBot':'GoldDiffEndSup',
                   'XPDiffEndBot':'XPDiffEndSup'}, inplace=True)

In [52]:
# Unnamed is a redundant column same thing as index, ID doesn't influence who wins, ward numbers are unreliable
# several columns like camps secured, number of drakes, scuttles, heralds, are all 0 and redundant anyways
df.drop(columns=['Unnamed: 0', 
                 'esportsPlatformId', 
                 'NbWardsPlacedBlue', 
                 'NbWardsPlacedRed', 
                 'NbControlWardsPlacedBlue', 
                 'NbControlWardsPlacedRed', 
                 'NbWardsKilledBlue', 
                 'NbWardsKilledRed', 
                 'NbControlWardsKilledBlue', 
                 'NbControlWardsKilledRed', 
                 'NbCampsSecuredBlue', 
                 'NbCampsSecuredRed',
                 'NbScuttlesBlue',
                 'NbScuttlesRed',
                 'NbRiftHeraldsBlue',
                 'NbRiftHeraldsRed', 
                 'NbDragonsBlue',
                 'NbDragonsRed',
                 'NbBaronsBlue',
                 'NbBaronsRed',
                 'NbEldersBlue',
                 'NbEldersRed',
                 'DragonSoulTimer',
                 'DragonSoulType',
                 'DragonSoulTaker'], inplace=True)

First, we need to check for missing values and address them appropriately, see if any can be replaced or if the row they belong to must be dropped.

In [53]:
col_list = df.columns.tolist()

for col in col_list:
    if df[col].isnull().sum() != 0:
        print(col.ljust(25), str(df[col].isnull().sum()).rjust(5))

gameDate                      3
gameVersion                   3
gameDuration                  3
OuterTopBlueTimer          3105
OuterMidBlueTimer          4976
OuterBotBlueTimer          2874
InnerTopBlueTimer         11699
InnerMidBlueTimer          8947
InnerBotBlueTimer         10211
BaseTopBlueTimer          15866
BaseMidBlueTimer          11583
BaseBotBlueTimer          14357
Nexus1MidBlueTimer        10208
Nexus2MidBlueTimer        10351
OuterTopRedTimer           2221
OuterMidRedTimer           3524
OuterBotRedTimer           2409
InnerTopRedTimer          10169
InnerMidRedTimer           7454
InnerBotRedTimer           9202
BaseTopRedTimer           15082
BaseMidRedTimer           10425
BaseBotRedTimer           13452
Nexus1MidRedTimer          9035
Nexus2MidRedTimer          9232
GoldTopBlue10                 3
GoldJgBlue10                  3
GoldMidBlue10                 3
GoldADBlue10                  3
GoldSupBlue10                 3
GoldTopRed10                  3
GoldJgRe

In [54]:
# can't use a row without the winner label since that's what is being predicted
missing_label_row = (df['winner'].isna() == True)
df.drop(index=df[missing_label_row].index, inplace=True)

# 3 random rows with a lot of NaNs
nan_rows = (df['gameDate'].isna() == True)
df.drop(index=df[nan_rows].index, inplace=True)
df.reset_index(drop=True, inplace=True)

In [55]:
# NaN represents a turret was never taken, so it can be replaced with the game duration
# doing this with a loop because of some setting copy error

for col in col_list:
    if df[col].isnull().sum() != 0:
        df[col].fillna(df['gameDuration'], inplace=True)

We can reduce the remaining 174 columns by consolidating. For example, we are given various stats like gold, xp, vision score, cc duration, damage dealt, and damage taken for each role. These can be totaled. Consolidation will also make modeling easier. There may be nuances to consider, such as possibly keeping the gold of adc separate since esports games favor funneling the adc, especially if more competitive tournaments tend to have longer matches (adc shines late game), but for now we simply want an mvp.

In [56]:
# columns for total gold already exist
df.drop(columns=['GoldTopBlue10', 'GoldJgBlue10', 'GoldMidBlue10', 'GoldADBlue10', 'GoldSupBlue10',
                 'GoldTopRed10', 'GoldJgRed10', 'GoldMidRed10', 'GoldADRed10', 'GoldSupRed10',
                 'GoldTopBlue15', 'GoldJgBlue15', 'GoldMidBlue15', 'GoldADBlue15', 'GoldSupBlue15',
                 'GoldTopRed15', 'GoldJgRed15', 'GoldMidRed15', 'GoldADRed15', 'GoldSupRed15',
                 'GoldTopBlueEnd', 'GoldJgBlueEnd', 'GoldMidBlueEnd', 'GoldADBlueEnd', 'GoldSupBlueEnd',
                 'GoldTopRedEnd', 'GoldJgRedEnd', 'GoldMidRedEnd', 'GoldADRedEnd', 'GoldSupRedEnd'], inplace=True)

#3 outliers not going to predict anything because these values are 0 for the rest of the rows
df.drop(columns=['BlueInhibKills15', 'RedInhibKills15'], inplace=True)

# calculating totals then dropping the columns for individual stats
df['GoldDiff10'] = df['BlueTotalGold10'] - df['RedTotalGold10']
df.drop(columns=['GoldDiff10Top', 'GoldDiff10Jg', 'GoldDiff10Mid', 'GoldDiff10AD', 'GoldDiffSup10'], inplace=True)

df['XPDiff10'] = df['XPDiff10Top'] + df['XPDiff10Jg'] + df['XPDiff10Mid'] + df['XPDiff10AD'] + df['XPDiffSup10']
df.drop(columns=['XPDiff10Top', 'XPDiff10Jg', 'XPDiff10Mid', 'XPDiff10AD', 'XPDiffSup10'], inplace=True)

df['GoldDiff15'] = df['BlueTotalGold15'] - df['RedTotalGold15']
df.drop(columns=['GoldDiff15Top', 'GoldDiff15Jg', 'GoldDiff15Mid', 'GoldDiff15AD', 'GoldDiffSup15'], inplace=True)

df['XPDiff15'] = df['XPDiff15Top'] + df['XPDiff15Jg'] + df['XPDiff15Mid'] + df['XPDiff15AD'] + df['XPDiffSup15']
df.drop(columns=['XPDiff15Top', 'XPDiff15Jg', 'XPDiff15Mid', 'XPDiff15AD', 'XPDiffSup15'], inplace=True)

df['GoldDiffEnd'] = df['BlueTotalGoldEnd'] - df['RedTotalGoldEnd']
df.drop(columns=['GoldDiffEndTop', 'GoldDiffEndJg', 'GoldDiffEndMid', 'GoldDiffEndAD', 'GoldDiffEndSup'], inplace=True)

df['XPDiffEnd'] = df['XPDiffEndTop'] + df['XPDiffEndJg'] + df['XPDiffEndMid'] + df['XPDiffEndAD'] + df['XPDiffEndSup']
df.drop(columns=['XPDiffEndTop', 'XPDiffEndJg', 'XPDiffEndMid', 'XPDiffEndAD', 'XPDiffEndSup'], inplace=True)

df['VisionScoreBlue'] = df['VisionScoreTopBlue'] + df['VisionScoreJgBlue'] + df['VisionScoreMidBlue'] + df['VisionScoreADBlue'] + df['VisionScoreSupBlue']
df.drop(columns=['VisionScoreTopBlue', 'VisionScoreJgBlue', 'VisionScoreMidBlue', 'VisionScoreADBlue', 'VisionScoreSupBlue'], inplace=True)

df['VisionScoreRed'] = df['VisionScoreTopRed'] + df['VisionScoreJgRed'] + df['VisionScoreMidRed'] + df['VisionScoreADRed'] + df['VisionScoreSupRed']
df.drop(columns=['VisionScoreTopRed', 'VisionScoreJgRed', 'VisionScoreMidRed', 'VisionScoreADRed', 'VisionScoreSupRed'], inplace=True)

df['DmgDealtBlue'] = df['DamageDealtTopBlue'] + df['DamageDealtJgBlue'] + df['DamageDealtMidBlue'] + df['DamageDealtADBlue'] + df['DamageDealtSupBlue']
df.drop(columns=['DamageDealtTopBlue', 'DamageDealtJgBlue', 'DamageDealtMidBlue', 'DamageDealtADBlue', 'DamageDealtSupBlue'], inplace=True)

df['DmgDealtRed'] = df['DamageDealtTopRed'] + df['DamageDealtJgRed'] + df['DamageDealtMidRed'] + df['DamageDealtADRed'] + df['DamageDealtSupRed']
df.drop(columns=['DamageDealtTopRed', 'DamageDealtJgRed', 'DamageDealtMidRed', 'DamageDealtADRed', 'DamageDealtSupRed'], inplace=True)

df['DmgTakenBlue'] = df['DamageTakenTopBlue'] + df['DamageTakenJgBlue'] + df['DamageTakenMidBlue'] + df['DamageTakenADBlue'] + df['DamageTakenSupBlue']
df.drop(columns=['DamageTakenTopBlue', 'DamageTakenJgBlue', 'DamageTakenMidBlue', 'DamageTakenADBlue', 'DamageTakenSupBlue'], inplace=True)

df['DmgTakenRed'] = df['DamageTakenTopRed'] + df['DamageTakenJgRed'] + df['DamageTakenMidRed'] + df['DamageTakenADRed'] + df['DamageTakenSupRed']
df.drop(columns=['DamageTakenTopRed', 'DamageTakenJgRed', 'DamageTakenMidRed', 'DamageTakenADRed', 'DamageTakenSupRed'], inplace=True)

df['TotalCCBlue'] = df['TotalCCDurationTopBlue'] + df['TotalCCDurationJgBlue'] + df['TotalCCDurationMidBlue'] + df['TotalCCDurationADBlue'] + df['TotalCCDurationSupBlue']
df.drop(columns=['TotalCCDurationTopBlue', 'TotalCCDurationJgBlue', 'TotalCCDurationMidBlue', 'TotalCCDurationADBlue', 'TotalCCDurationSupBlue'], inplace=True)

df['TotalCCRed'] = df['TotalCCDurationTopRed'] + df['TotalCCDurationJgRed'] + df['TotalCCDurationMidRed'] + df['TotalCCDurationADRed'] + df['TotalCCDurationSupRed']
df.drop(columns=['TotalCCDurationTopRed', 'TotalCCDurationJgRed', 'TotalCCDurationMidRed', 'TotalCCDurationADRed', 'TotalCCDurationSupRed'], inplace=True)

The data went from 198 columns to 86 columns. Next, the data needs to be standardized and normalized for accurate comparisons of distributions.

In [57]:
df

Unnamed: 0,gameDate,gameVersion,gameDuration,NbCampsStolenBlue,NbCampsStolenRed,NbTowersBlue,NbTowersRed,NbPlatesBlue,NbPlatesRed,OuterTopBlueTimer,OuterMidBlueTimer,OuterBotBlueTimer,InnerTopBlueTimer,InnerMidBlueTimer,InnerBotBlueTimer,BaseTopBlueTimer,BaseMidBlueTimer,BaseBotBlueTimer,Nexus1MidBlueTimer,Nexus2MidBlueTimer,OuterTopRedTimer,OuterMidRedTimer,OuterBotRedTimer,InnerTopRedTimer,InnerMidRedTimer,InnerBotRedTimer,BaseTopRedTimer,BaseMidRedTimer,BaseBotRedTimer,Nexus1MidRedTimer,Nexus2MidRedTimer,BlueKills10,BlueAssists10,BlueDeaths10,BlueTotalGold10,BlueDragonKills10,BlueTowerKills10,RedKills10,RedAssists10,RedDeaths10,RedTotalGold10,RedDragonKills10,RedTowerKills10,BlueKills15,BlueAssists15,BlueDeaths15,BlueTotalGold15,BlueDragonKills15,BlueTowerKills15,RedKills15,RedAssists15,RedDeaths15,RedTotalGold15,RedDragonKills15,RedTowerKills15,BlueKillsEnd,BlueAssistsEnd,BlueDeathsEnd,BlueTotalGoldEnd,BlueDragonKillsEnd,BlueTowerKillsEnd,RedKillsEnd,RedAssistsEnd,RedDeathsEnd,RedTotalGoldEnd,RedDragonKillsEnd,RedTowerKillsEnd,BlueInhibKillsEnd,RedInhibKillsEnd,BlueBaronKillsEnd,RedBaronKillsEnd,winner,GoldDiff10,XPDiff10,GoldDiff15,XPDiff15,GoldDiffEnd,XPDiffEnd,VisionScoreBlue,VisionScoreRed,DmgDealtBlue,DmgDealtRed,DmgTakenBlue,DmgTakenRed,TotalCCBlue,TotalCCRed
0,2023-06-18 18:14:11.781000+00:00,13.11.512.8126,2068.268,0,2,10,3,10,4,1296.206,1235.483,1067.770,1516.094,1677.850,1698.828,2037.578,2068.268,1716.198,2050.712,2054.181,2068.268,1513.456,1425.611,2068.268,2068.268,1836.628,2068.268,2068.268,2068.268,2068.268,2068.268,1.0,0.0,2.0,7708.0,0.0,0.0,2.0,4.0,1.0,9062.0,0.0,0.0,3.0,1.0,9.0,14451.0,0.0,0.0,9.0,10.0,3.0,18312.0,0.0,0.0,17.0,29.0,30.0,52256.0,1.0,3.0,30.0,45.0,17.0,62071.0,2.0,11.0,0.0,1.0,0.0,1.0,red,-1354.0,-818.0,-3861.0,-2022.0,-9815.0,-7128.0,156.724674,203.646418,76784.784668,111060.248535,131734.996094,112710.541992,74.377606,98.058840
1,2023-02-16 01:59:23.961000+00:00,13.3.491.6222,1999.819,0,1,9,3,9,5,874.104,1062.107,850.434,1715.034,1459.844,1543.439,1999.819,1999.819,1558.564,1989.451,1992.002,1078.645,1871.251,1472.922,1999.819,1999.819,1999.819,1999.819,1999.819,1999.819,1999.819,1999.819,0.0,0.0,0.0,13817.0,0.0,0.0,0.0,0.0,0.0,14737.0,0.0,0.0,3.0,6.0,2.0,23647.0,0.0,0.0,2.0,6.0,3.0,24921.0,1.0,2.0,5.0,12.0,15.0,53641.0,0.0,2.0,15.0,41.0,5.0,65643.0,4.0,10.0,0.0,2.0,0.0,2.0,red,-920.0,-434.0,-1274.0,-699.0,-12002.0,-11571.0,238.711369,298.544613,75103.252197,93868.016602,118238.287109,116021.859375,106.651195,158.449711
2,2021-03-13 21:07:56.263000+00:00,11.5.361.5971,1882.763,5,0,0,9,2,7,1882.763,1882.763,1882.763,1882.763,1882.763,1882.763,1882.763,1882.763,1882.763,1882.763,1882.763,678.310,925.578,998.802,1069.430,1344.748,1439.473,1882.763,1852.737,1882.763,1866.462,1873.699,2.0,3.0,1.0,15697.0,0.0,0.0,1.0,2.0,2.0,14975.0,0.0,0.0,3.0,5.0,3.0,25613.0,0.0,1.0,3.0,5.0,3.0,23295.0,1.0,0.0,14.0,34.0,6.0,59369.0,2.0,9.0,6.0,14.0,14.0,48123.0,2.0,2.0,1.0,0.0,1.0,0.0,blue,722.0,1067.0,2318.0,780.0,11246.0,9486.0,264.830082,224.835535,51793.047607,44358.151367,83805.064453,77475.759766,80.227708,77.148347
3,2021-04-10 11:56:11.980000+00:00,11.6.364.6814,1551.344,1,2,1,7,4,4,1064.776,1551.344,1551.344,1551.344,1551.344,1551.344,1551.344,1551.344,1551.344,1551.344,1551.344,1002.608,1207.684,938.541,1551.344,1505.502,1551.344,1551.344,1512.203,1551.344,1532.360,1537.181,5.0,11.0,2.0,16408.0,1.0,0.0,2.0,3.0,5.0,14516.0,0.0,0.0,6.0,14.0,5.0,24612.0,1.0,0.0,5.0,13.0,6.0,23792.0,1.0,0.0,19.0,47.0,9.0,49985.0,3.0,7.0,9.0,20.0,19.0,40234.0,1.0,1.0,1.0,0.0,1.0,0.0,blue,1892.0,893.0,820.0,-1012.0,9751.0,10209.0,163.739124,154.203134,47477.142334,38891.236572,66697.954102,69003.676758,68.832772,72.509094
4,2023-06-08 19:47:14.554000+00:00,13.10.509.8402,2246.940,1,3,10,4,8,4,811.847,1694.161,1289.116,1981.716,2058.889,1330.069,2246.940,2224.408,1799.446,2233.377,2234.679,1296.290,1192.724,952.274,2246.940,1308.559,2246.940,2246.940,2246.940,2246.940,2246.940,2246.940,2.0,1.0,1.0,15673.0,0.0,0.0,1.0,0.0,2.0,14963.0,0.0,0.0,5.0,6.0,3.0,24732.0,0.0,0.0,3.0,2.0,5.0,25171.0,0.0,1.0,10.0,20.0,12.0,58527.0,2.0,4.0,12.0,25.0,10.0,68072.0,2.0,9.0,0.0,1.0,0.0,1.0,red,710.0,-61.0,-439.0,1413.0,-9545.0,-12037.0,269.936127,277.164185,68731.898682,76759.528809,107434.735352,102052.777344,77.959497,103.047701
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19737,2023-07-26 15:59:21.957000+00:00,13.14.522.821,2475.365,4,6,5,7,3,8,1643.983,1668.003,1246.403,2295.821,2475.365,2006.084,2475.365,2475.365,2475.365,2475.365,2475.365,663.534,1325.457,1765.796,2475.365,2475.365,2449.743,2475.365,2475.365,2457.277,2467.604,2468.237,2.0,2.0,4.0,13352.0,0.0,0.0,4.0,4.0,2.0,14350.0,0.0,0.0,2.0,2.0,6.0,22374.0,0.0,1.0,6.0,10.0,2.0,23253.0,1.0,0.0,21.0,44.0,21.0,73324.0,3.0,7.0,21.0,47.0,21.0,71947.0,3.0,7.0,1.0,0.0,2.0,1.0,blue,-998.0,-393.0,-879.0,-1338.0,1377.0,3900.0,363.980244,372.348934,107750.124023,94517.742188,128364.878906,146959.968750,119.299989,98.549594
19738,2022-08-30 16:35:42.548000+00:00,12.15.458.1416,1522.371,1,1,1,10,4,3,1522.371,1522.371,984.322,1522.371,1522.371,1522.371,1522.371,1522.371,1522.371,1522.371,1522.371,1033.842,1340.596,1462.097,1393.394,1351.633,1489.526,1403.443,1522.371,1502.048,1504.691,1505.089,0.0,0.0,0.0,12656.0,0.0,0.0,0.0,0.0,0.0,12813.0,1.0,0.0,4.0,8.0,3.0,22012.0,0.0,0.0,3.0,5.0,4.0,22666.0,1.0,0.0,17.0,39.0,6.0,49051.0,1.0,9.0,6.0,14.0,17.0,38432.0,2.0,1.0,1.0,0.0,1.0,0.0,blue,-157.0,-1127.0,-654.0,-1709.0,10619.0,8948.0,140.733849,139.382868,64050.456055,46774.285156,65328.648438,80371.787109,43.659249,86.770056
19739,2023-04-11 23:15:16+00:00,13.5.495.8836,2103.001,0,3,9,3,7,2,1232.620,1284.520,1048.057,1657.929,1559.003,1545.116,2048.649,2103.001,2103.001,2066.574,2093.923,1273.129,1143.783,1602.549,2103.001,2103.001,2103.001,2103.001,2103.001,2103.001,2103.001,2103.001,1.0,2.0,2.0,10964.0,0.0,0.0,2.0,4.0,1.0,11985.0,1.0,0.0,5.0,13.0,3.0,19828.0,0.0,0.0,3.0,6.0,5.0,19832.0,2.0,0.0,7.0,18.0,15.0,50099.0,1.0,3.0,15.0,43.0,7.0,62108.0,4.0,9.0,0.0,1.0,0.0,2.0,red,-1021.0,-271.0,-4.0,-160.0,-12009.0,-12698.0,236.265320,285.348082,50280.660645,77062.770020,95487.943359,75354.740723,76.751682,93.630685
19740,2023-07-15 19:10:14.307000+00:00,13.13.518.1870,1910.456,0,0,10,4,5,7,1285.018,991.842,1459.379,1581.662,1477.936,1473.472,1778.734,1910.456,1870.214,1880.699,1898.885,781.048,1815.832,1168.052,1910.456,1910.456,1553.468,1910.456,1910.456,1910.456,1910.456,1910.456,1.0,1.0,1.0,14842.0,0.0,0.0,1.0,1.0,1.0,15390.0,1.0,0.0,5.0,5.0,2.0,25336.0,0.0,1.0,2.0,4.0,5.0,23626.0,2.0,0.0,8.0,12.0,21.0,51878.0,0.0,3.0,21.0,72.0,8.0,62455.0,5.0,10.0,0.0,1.0,0.0,2.0,red,-548.0,-535.0,1710.0,747.0,-10577.0,-15111.0,242.919640,285.430437,57810.262451,104446.147461,125335.755859,88453.358398,137.874025,134.679596


In [60]:
df['NbCampsStolenBlue'].groupby

<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000023A877A09D0>

In [59]:
df['NbCampsStolenRed'].describe()

count    19742.000000
mean         1.341050
std          1.365237
min          0.000000
25%          0.000000
50%          1.000000
75%          2.000000
max          9.000000
Name: NbCampsStolenRed, dtype: float64

In [None]:
#df.iloc[19000].to_frame().T to get a row
# nan_rows = df[df[['gameDate']].isna().any(axis=1)] to get rows with NaN in given column

In [46]:
# durbin watson test for autocorrelation, consider time series analysis
# df.columns.tolist()