# Who's the Better Carry: ADC or Midlaner?

**Name(s)**: Katelyn Villamin and Nancy Shen

**Website Link**: https://skvillamin.github.io/league-predictions/

In [37]:
import pandas as pd
import numpy as np
from pathlib import Path

import plotly.express as px
pd.options.plotting.backend = 'plotly'

from dsc80_utils import *

## Step 1: Introduction

Our dataset is on the game statistics of 2022 professional League of Legends games. With a massive dataset of over 150,000 rows, we decided to focus our research on answering the question: **Which role carries more often: Mid or ADC?** 

Although every role plays an integral part in a game's success, the overall deciding factor is the ability of the high-damage output heroes to out-damage the opponents. With differing opinions on the star of the team, fellow League of Legends players may be interested in putting a conclusion to this community debate. 

The columns that help us answer, **"Which role carries more often: Mid or ADC?"** are: `position`, `kills`, `total damage dealt`, `result`, and `first blood kill`. As doing as much damage is important for these roles, these columns showed us: Based on the game result, who ended up doing the most amount of damage. After cleaning the data, we narrowed it to 12 columns and 144804 rows for our analysis' DataFrame `league` and 8 columns and 48268 rows for our predictive model's DataFrame `mid_adc`, a subDataFrame of `league`.


## Step 2: Data Cleaning and Exploratory Data Analysis

In [38]:
#import dataset from data folder
league_fp = Path('data') / '2022_LoL_esports_match_data_from_OraclesElixir.csv'
league = pd.read_csv(league_fp, low_memory=False)

#isolate the columns needed
columns_needed = ['year', 'league', 'position', 'champion', 'kills', 'doublekills', 'damagetochampions', 'damagetakenperminute', 'damagemitigatedperminute','firstbloodkill', 'monsterkills', 'wardsplaced', 'xpat10', 'towers', 'result']
league = league[columns_needed]

#since we're looking at the 2022 dataset make sure all the games are completed in 2022
league = league[league['year'] == 2022]
league= league.drop(columns= 'year')

#change all the position column values that are 'bot' to 'adc' for clarity
league['position'] = league['position'].str.replace('bot', 'adc')

#change column names to more appropriate ones
league.rename(columns={
    "damagetochampions": "total damage dealt", 
    "firstbloodkill": "first blood kill",
    'wardsplaced': 'wards placed', 
    'monsterkills': 'monster kills', 
    'damagetakenperminute': 'dtpm',
    'damagemitigatedperminute': 'dmpm',
    'xpat10' : 'exp at 10'
    }, inplace=True)

#change 'first blood kill' and 'result' to booleans
league["first blood kill"] = league["first blood kill"].astype(bool)
league["result"] = league["result"].astype(bool)
league


Unnamed: 0,league,position,champion,kills,...,wards placed,exp at 10,towers,result
0,LCKC,top,Renekton,2,...,8.0,4909.0,,False
1,LCKC,jng,Xin Zhao,2,...,6.0,3484.0,,False
2,LCKC,mid,LeBlanc,2,...,19.0,4556.0,,False
...,...,...,...,...,...,...,...,...,...
148989,DCup,sup,Ashe,2,...,62.0,,,True
148990,DCup,team,,8,...,108.0,,0.0,False
148991,DCup,team,,21,...,122.0,,10.0,True


In [39]:
#create a df with just mid and adc positions
mid_adc = league[(league['position'].str.lower() == 'adc') | (league['position'].str.lower() == 'mid')]
mid_adc= mid_adc[['league', 'position', 'kills', 'doublekills', 'total damage dealt', 'dtpm', 'dmpm', 'first blood kill', 'towers', 'result']]
mid_adc

Unnamed: 0,league,position,kills,doublekills,...,dmpm,first blood kill,towers,result
2,LCKC,mid,2,0.0,...,227.78,False,,False
3,LCKC,adc,2,0.0,...,218.88,False,,False
7,LCKC,mid,6,2.0,...,426.94,False,,True
...,...,...,...,...,...,...,...,...,...
148983,DCup,adc,2,,...,,False,,False
148987,DCup,mid,6,,...,,False,,True
148988,DCup,adc,7,,...,,False,,True


**Univariate Analysis**

In [40]:
#get the killcounts for mid and adc positions
mid_kills = mid_adc[mid_adc['position'] == 'mid']['kills'].value_counts()
adc_kills = mid_adc[mid_adc['position'] == 'adc']['kills'].value_counts()

#create new df with only mid_kills and adc_kills data
kills_distr = pd.DataFrame()
kills_distr['adc'] = adc_kills
kills_distr['mid'] = mid_kills

#fill NaN values with 0
kills_distr['mid'] = kills_distr['mid'].fillna(0)
kills_distr['adc'] = kills_distr['adc'].fillna(0)


#reset index so that we can see the number of times each role got a certain amount of kills
kills_distr = kills_distr.reset_index()

#rename the index column 
kills_distr.rename(columns={'index': 'num kills'}, inplace=True)

#sort df values so that when plotting, the values will create a curve
kills_distr = kills_distr.sort_values(by='num kills')

kills_distr

Unnamed: 0,num kills,adc,mid
5,0,2272,2617.0
0,1,3151,3831.0
1,2,3141,3949.0
...,...,...,...
21,22,3,0.0
24,24,1,0.0
23,28,1,0.0


In [41]:
#plot the line graph 
fig = px.line(kills_distr, x = 'num kills', y = ['adc', 'mid'],  color_discrete_map={'adc': 'pink', 'mid': 'purple'})
fig.update_layout(title='Distribution of Kill Counts for Mid and ADC Players',
                   xaxis_title='Number of Kills',
                   yaxis_title='Number of Occurrences',
                   showlegend=True,
                   legend_title_text=None)

**Bivariate Analysis**

In [42]:
#separate when mid players got a first kill and when they didnt
mid_first= mid_adc[(mid_adc['position']== 'mid') & (mid_adc['first blood kill']== True)]
mid_nofirst= mid_adc[(mid_adc['position']== 'mid') & (mid_adc['first blood kill']== False)]

#separate when adc players got a first kill and when they didnt
adc_first= mid_adc[(mid_adc['position']== 'adc') & (mid_adc['first blood kill']== True)]
adc_nofirst= mid_adc[(mid_adc['position']== 'adc') & (mid_adc['first blood kill']== False)]

#get the total number of first blood kills of mid and adc players
total=mid_adc['first blood kill'].count()

#add values into a df
kills_prop = pd.DataFrame()
kills_prop['position'] = ['mid', 'adc']
kills_prop['kill'] =  [(mid_first['first blood kill'].count()/total)*100, (adc_first['first blood kill'].count()/total)*100]
kills_prop['no kill'] = [(mid_nofirst['first blood kill'].count()/total)*100, (adc_nofirst['first blood kill'].count()/total)*100]
kills_prop

Unnamed: 0,position,kill,no kill
0,mid,4.43,45.57
1,adc,5.84,44.16


In [43]:
#plot the bar graph
bar = px.bar(kills_prop, x='position', y=['kill', 'no kill'], barmode='group', color_discrete_map={'kill': 'teal', 'no kill': 'lightblue'})
bar.update_layout(title='Distribution of Getting First Blood Kill Given a Position',
                   xaxis_title='Position',
                   yaxis_title='First Blood Kills (%)',
                   showlegend=True,
                   legend_title_text=None)

**Interesting Aggregates**

This table shows the mean statistics for ADC and mid players in dataset based on the result of the game (where False means that they lost, and True means that they won). Finding the means of these columns allows us to see which role carries more often.

In [44]:
#find the mean statistics for adc and mid players
aggregates = mid_adc.groupby(['position', 'result']).mean()

#drop towers because towers only applies to 'position' == 'team'
aggregates = aggregates.drop(columns='towers')
aggregates

Unnamed: 0_level_0,Unnamed: 1_level_0,kills,doublekills,total damage dealt,dtpm,dmpm,first blood kill
position,result,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
adc,False,2.58,0.28,15940.83,451.84,284.47,0.09
adc,True,5.94,1.04,20146.39,390.16,277.94,0.14
mid,False,2.25,0.18,15630.71,552.9,372.75,0.07
mid,True,4.74,0.64,19188.51,491.35,367.04,0.11


## Step 3: Assessment of Missingness

In [45]:
#make a separate df with only data for position == team so we don't get duplicates
teams_only = league[league['position']== 'team']
teams_only

Unnamed: 0,league,position,champion,kills,...,wards placed,exp at 10,towers,result
10,LCKC,team,,9,...,74.0,18213.0,3.0,False
11,LCKC,team,,19,...,93.0,18076.0,6.0,True
22,LCKC,team,,3,...,119.0,17462.0,3.0,False
...,...,...,...,...,...,...,...,...,...
148979,DCup,team,,7,...,136.0,,2.0,False
148990,DCup,team,,8,...,108.0,,0.0,False
148991,DCup,team,,21,...,122.0,,10.0,True


### **Double Kills vs League**


When Double Kills is Missing. We use data of `teams_only`

**Permutation Test of League vs. Doublekills**

In [46]:
#create the df for the permutation test
league_double = teams_only[['league', 'doublekills']]
league_double['is_missing'] = league_double['doublekills'].isna()
league_double

Unnamed: 0,league,doublekills,is_missing
10,LCKC,0.0,False
11,LCKC,6.0,False
22,LCKC,0.0,False
...,...,...,...
148979,DCup,,True
148990,DCup,,True
148991,DCup,,True


In [47]:
#create pivot table for permutation tests
missing_distr = (
    league_double
    .assign(double_missing=league_double['doublekills'].isna())
    .pivot_table(index='league', columns='double_missing', aggfunc='size')
)

# Added just to make the resulting pivot table easier to read.
missing_distr.columns = ['double_missing = False', 'double_missing = True']
missing_distr['double_missing = True'] = missing_distr['double_missing = True'].fillna(0)

missing_distr = missing_distr / missing_distr.sum()
missing_distr

Unnamed: 0_level_0,double_missing = False,double_missing = True
league,Unnamed: 1_level_1,Unnamed: 2_level_1
CBLOL,2.37e-02,0.00e+00
CBLOLA,2.11e-02,0.00e+00
CDF,7.12e-03,0.00e+00
...,...,...
VCS,3.15e-02,0.00e+00
VL,1.52e-02,0.00e+00
WLDs,1.38e-02,7.70e-03


In [48]:
#calculate the observed tvd for this permutation test
observed_tvd_league = missing_distr.diff(axis=1).iloc[:, -1].abs().sum() / 2
observed_tvd_league

0.4961517317207257

In [49]:
#run permutation test
n_repetitions = 500
shuffled_league = league_double.copy()
shuffled_league= shuffled_league.assign(double_missing=league_double['doublekills'].isna())

tvds_league = []
for _ in range(n_repetitions):
    
    
    shuffled_league['league'] = np.random.permutation(shuffled_league['league'])
    
    pivoted_league = (
        shuffled_league
        .pivot_table(index='league', columns='double_missing', aggfunc='size')
    )
    
    pivoted_league = pivoted_league / pivoted_league.sum()
    
    tvd = pivoted_league.diff(axis=1).iloc[:, -1].abs().sum() / 2
    tvds_league.append(tvd)

In [50]:
#calculate the p-value for this permutation test
pval = (pd.Series(tvds_league) >= observed_tvd_league).mean()
pval

0.0

In [51]:
#plot distribution
fig = px.histogram(pd.DataFrame(tvds_league), x=0, nbins=50, histnorm='probability', 
                   title='Empirical Distribution of the TVD')
fig.add_vline(x=observed_tvd_league, line_color='red', line_width=1, opacity=1)
fig.add_annotation(text=f'<span style="color:red">Observed TVD = {round(observed_tvd_league, 2)}</span>', showarrow=False, y=0.05)
fig.update_layout(yaxis_range=[0, 0.1], xaxis_range = [0, 0.5])

# Show the plot
fig.show()

## Step 3: Assessment of Missingness

### **DoubleKills vs Towers: Missing at Random**

- When DoubleKills is Missing

In [52]:
#create df to run permutation test on towers and doublekills
towers_double = teams_only[['towers','doublekills' ]]
towers_double['is_missing'] = league_double['doublekills'].isna()

In [53]:
#create the pivot table for to permuatate towers and doublekills
towers_double = teams_only[['towers', 'doublekills']]
towers_double_dist = (
    towers_double
    .assign(double_missing=towers_double['doublekills'].isna())
    .pivot_table(index='towers', columns='double_missing', aggfunc='size')
)

# Added just to make the resulting pivot table easier to read.
towers_double_dist.columns = ['tower_missing = False', 'tower_missing = True']
towers_double_dist['tower_missing = True'] = towers_double_dist['tower_missing = True'].fillna(0)


towers_double_dist = towers_double_dist / towers_double_dist.sum()
towers_double_dist

Unnamed: 0_level_0,tower_missing = False,tower_missing = True
towers,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.05,0.04
1.0,0.08,0.09
2.0,0.11,0.11
...,...,...
9.0,0.14,0.15
10.0,0.12,0.13
11.0,0.11,0.09


In [54]:
#calculate the observed tvd for this permutation test
observed_tvd_tower = towers_double_dist.diff(axis=1).iloc[:, -1].abs().sum() / 2
observed_tvd_tower

0.03472890994914897

In [55]:
#run the permutation test
n_repetitions = 10_000

#create copy of towers
shuffled_towers = towers_double.copy()

#create new column for shuffled missing values
shuffled_towers= shuffled_towers.assign(double_missing=towers_double['doublekills'].isna())

tvds_towers = []
for _ in range(n_repetitions):
    shuffled_towers['towers'] = np.random.permutation(shuffled_towers['towers'])
    
    #pivot the shuffled df
    pivoted_towers = (
        shuffled_towers
        .pivot_table(index='towers', columns='double_missing', aggfunc='size')
    )
    
    #get the proportion of each tower
    pivoted_towers = pivoted_towers / pivoted_towers.sum()
    
    #calculate the TVD of the pivoted towers df
    tvd = pivoted_towers.diff(axis=1).iloc[:, -1].abs().sum() / 2
    tvds_towers.append(tvd)

In [56]:
#calculate the p-value for this permutation test
pval = (pd.Series(tvds_towers) >= observed_tvd_tower).mean()
pval

0.0235

In [57]:
#plot the graph
fig = px.histogram(pd.DataFrame(tvds_towers), x=0, nbins=50, histnorm='probability', 
                   title='Empirical Distribution of the TVD')
#add line for observed TVD
fig.add_vline(x=observed_tvd_tower, line_color='red', line_width=1, opacity=1)
fig.add_annotation(text=f'<span style="color:red">Observed TVD = {round(observed_tvd_tower, 2)}</span>',
                   x=2.3 * observed_tvd_tower, showarrow=False, y=0.16)
fig.update_layout(yaxis_range=[0, 0.2])
fig.layout.annotations[-1].update(x=fig.layout.annotations[-1].x - 0.038, y=fig.layout.annotations[-1].y + 0.0075)

# Show the plot
fig.show()

## Step 4: Hypothesis Testing

H0: The distribution of games won where ADC players had more damage on average than mid players is equal to the distribution of games won where mid players had more damage than ADC players

H1: The distribution of games won where ADC players had more damage on average than mid players is not equal to the distribution of games won where mid players had more damage than ADC players

In [58]:
#create new df from mid_adc df with result == True
wins_only = mid_adc[mid_adc['result'] == True]
wins_only.dropna(subset=['total damage dealt'], inplace=True)
#isolate columns needed
wins_only = wins_only[['league', 'position', 'total damage dealt']]

#create array where we have two identical values representing each game
arr = np.arange(0, wins_only.shape[0] / 2)
arr = np.repeat(arr, 2)

#assign array to df
wins_only['game no.'] = arr.astype(int)
wins_only


Unnamed: 0,league,position,total damage dealt,game no.
7,LCKC,mid,20690.0,0
8,LCKC,adc,26687.0,0
19,LCKC,mid,11261.0,1
...,...,...,...,...
148971,DCup,adc,30424.0,12062
148987,DCup,mid,19185.0,12063
148988,DCup,adc,28282.0,12063


In [59]:
#pivot table to see total damage dealt for mid players and adc players per game.
pivoted_wins = wins_only.pivot(columns='position', index='game no.', values='total damage dealt')
pivoted_wins

position,adc,mid
game no.,Unnamed: 1_level_1,Unnamed: 2_level_1
0,26687.0,20690.0
1,16970.0,11261.0
2,9618.0,12577.0
...,...,...
12061,11977.0,14994.0
12062,30424.0,15279.0
12063,28282.0,19185.0


In [60]:
#create a df of games where mid has more damage than adc
mid_more_damage = pivoted_wins.loc[pivoted_wins['mid'] > pivoted_wins['adc']]
mid_more_damage= mid_more_damage.drop(columns= 'adc')
mid_more_damage['role']= 'mid'
mid_more_damage

position,mid,role
game no.,Unnamed: 1_level_1,Unnamed: 2_level_1
2,12577.0,mid
3,19759.0,mid
4,17636.0,mid
...,...,...
12055,24403.0,mid
12057,22281.0,mid
12061,14994.0,mid


In [61]:
#create a df of games where adc has more damage than mid
adc_more_damage = pivoted_wins.loc[pivoted_wins['adc'] > pivoted_wins['mid']]
adc_more_damage= adc_more_damage.drop(columns= 'mid')
adc_more_damage['role']= 'adc'
adc_more_damage

position,adc,role
game no.,Unnamed: 1_level_1,Unnamed: 2_level_1
0,26687.0,adc
1,16970.0,adc
5,17519.0,adc
...,...,...
12060,11584.0,adc
12062,30424.0,adc
12063,28282.0,adc


Why we chose to use difference in means for our hypothesis test: both distributions have the same center and similar shapes

In [62]:
pic = px.histogram(pivoted_wins, color= 'position')
pic.update_layout(barmode='overlay')
pic.update_traces(opacity=0.5)
pic.show()

In [63]:
#Join the two pivoted columns based on position
joined= pd.concat([mid_more_damage, adc_more_damage])
#Fill the NaN values
joined= joined.fillna(0)

#Create a new column of the 'max damage dealt' and rename joined as only the role vs max damage dealt
joined['max damage dealt']= joined['adc'] + joined['mid'] 
joined= joined[['role', 'max damage dealt']]
joined

position,role,max damage dealt
game no.,Unnamed: 1_level_1,Unnamed: 2_level_1
2,mid,12577.0
3,mid,19759.0
4,mid,17636.0
...,...,...
12060,adc,11584.0
12062,adc,30424.0
12063,adc,28282.0


In [64]:
#Observed Absolute Difference in Means of Max Damage Dealt between Mid and ADC
observed_diff_dmg = joined.groupby('role')['max damage dealt'].mean().diff().abs().iloc[-1]
observed_diff_dmg

545.821638930971

In [65]:
#Permutation to test our Hypothesis
n_repetitions = 10000

differences_dmg = []
for _ in range(n_repetitions):
    #Shuffle the column 'max damage dealt' and randomly permutate it in another column
    with_shuffled = joined.assign(shuffled_max_damage_dealt=np.random.permutation(joined['max damage dealt']))

    #Get the difference in means between mid and adc for the shuffled cases
    group_means = (
        with_shuffled
        .groupby('role')
        .mean()
        .loc[:, 'shuffled_max_damage_dealt']
    )
    difference = group_means.diff().abs().iloc[-1]
    
    #Store the result
    differences_dmg.append(difference)

In [66]:
#calculate the pval for hypothesis test
pval = (pd.Series(differences_dmg) >= observed_diff_dmg).mean()
pval

0.0004

In [67]:
#Plot the distribution 
fig = px.histogram(
    pd.DataFrame(differences_dmg), x=0, nbins=100, histnorm='probability', 
    title='Empirical Distribution of the TVDs between Mid and ADC Players')
fig.add_vline(x=observed_diff_dmg, line_color='red', line_width=1, opacity=1)

## Step 5: Framing a Prediction Problem

Clearly state your prediction problem and type (classification or regression). If you are building a classifier, make sure to state whether you are performing binary classification or multiclass classification. Report the response variable (i.e. the variable you are predicting) and why you chose it, the metric you are using to evaluate your model and why you chose it over other suitable metrics (e.g. accuracy vs. F1-score).

- Our prediction problem is: Given a dataset of post-game stats, we want to predict which role did they play. To predict the response variable (position), we will be using a multiclass classification. We chose the position as the response variable because there are clear defined criterias for post game statistics for us to draw our model from. 

## Step 6: Baseline Model

Columns for Baseline Model: Position, Champion, Total Damage Dealt (Champions)

In [68]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import make_column_transformer, make_column_selector, ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, Binarizer, KBinsDiscretizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

In [102]:
#Create a dataframe from the original dataset pulling the columns 'position', 'champion, and 'damage to champion'
baseline = league[['position', 'champion', 'kills']]

#Filtering out the rows with position 'team'
baseline = baseline[baseline['position']!= 'team']

#get the two columns we want
baseline_pred = baseline[['champion', 'kills']]
baseline.head(20)


Unnamed: 0,position,champion,kills
0,top,Renekton,2
1,jng,Xin Zhao,2
2,mid,LeBlanc,2
...,...,...,...
19,mid,Renekton,5
20,adc,Syndra,3
21,sup,Leona,0


In [104]:
#Train-test split
X_base = baseline_pred
y_base = baseline['position']
X_base_train, X_base_test, y_base_train, y_base_test = train_test_split(X_base, y_base, test_size=0.2)

In [130]:
#create pipeline for baseline model
preproc = ColumnTransformer(
    transformers=[
        ('categorical_cols', OneHotEncoder(handle_unknown='ignore'), ['champion']), 
    ],
    remainder='passthrough'
)

# create pipeline
pl_base = Pipeline([
    ('preproc', preproc),
    ('multinomial', LogisticRegression(multi_class='multinomial', max_iter=100000))
])

# fit our data
pl_base.fit(X_base_train, y_base_train)

# predict our data
y_base_pred = pl_base.predict(X_base_test)

In [131]:
#Training Accuracy Score of Baseline Model
pl_base.score(X_base_train, y_base_train)

0.9361688903621447

In [132]:
#Testing Accuracy Score of Baseline Model
pl_base.score(X_base_test, y_base_test)

0.9365625258970747

In [133]:
#Precision Score of Baseline Model
precision_score(y_base_test, y_base_pred, average='weighted')


0.9368251141619616

## Step 7: Final Model

In [110]:
#createa df to base our final model on
final= league[['position', 'champion', 'kills', 'total damage dealt', 'wards placed', 'monster kills', 'dtpm', 'exp at 10']]
final = final[final['position']!= 'team']
final['position'] = final['position'].str.replace('bot', 'adc')

#Rename 'damagetochampions' as 'total damage dealt'
final.dropna(subset=['total damage dealt', 'wards placed', 'monster kills', 'dtpm', 'exp at 10'], inplace=True)

#drop positions because we want to predict this column
final_pred = final.drop(columns= ['position'])

In [111]:
#train test split for final model
X = final_pred
y = final['position']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [112]:
#a helper function that finds the highest amout of wards placed for a team (five because there are 5 roles in a team)
def max_of_five(df):
    # Find the indices of the maximum values for every 5 rows
    max_wards = df['wards placed'].rolling(window=5).apply(lambda x: x.max(), raw=True)
    
    # Create a mask where the indices of maximum values are 1 and everything else is 0
    mask_wards = df.index.isin(max_wards)

    # Assign the mask to a new column
    df['wards placed'] = mask_wards.astype(int)

    return df

In [113]:
#Find Threshold for Binarizer
threshold= final[final['position']== 'jng']['monster kills'].mean()
threshold

144.7034543325527

* testing different modeling alogrithms:

In [114]:
# We tried Logistic Regression at first, but it was really slow and inefficient. 

# preproc_final = ColumnTransformer(
#     transformers=[
#         ('categorical_cols', OneHotEncoder(), ['champion']),
#         ('avg_monster', Binarizer(threshold=threshold), ['monster kills']),
#         ('max_five', FunctionTransformer(max_of_five), ['wards placed']),
#         ('xpat10_bins', KBinsDiscretizer(n_bins=5), ['exp at 10'])
        
#     ],
#     remainder='passthrough'
# )

# pl_final = Pipeline([
#     ('preproc', preproc_final),
#     ('multinomial', (LogisticRegression(solver='sag', multi_class='multinomial', max_iter=100000)))
# ])

# pl_final.fit(final_pred, final['position'])
# pl_final.predict(final_pred.head(20))

## Final Model

- using the better modeling algorithm: `RandomForestClassifier()`

In [115]:
# much faster! :)
preproc_final = ColumnTransformer(
    transformers=[
        ('categorical_cols', OneHotEncoder(), ['champion']),
        ('avg_monster', Binarizer(threshold=threshold), ['monster kills']),
        ('max_five', FunctionTransformer(max_of_five), ['wards placed']),
        ('xpat10_bins', KBinsDiscretizer(n_bins=5), ['exp at 10'])
        
    ],
    remainder='passthrough'
)
#create pipline for our gridsearch
pl_final = Pipeline([
    ('preproc', preproc_final),
    ('rand-forest', RandomForestClassifier(max_depth= 2, criterion= 'gini'))
])

pl_final.fit(X_train, y_train)
y_pred= pl_final.predict(X_test)

In [116]:
#choose our hyperparameters to perform GridSerach
hyperparameters = {
    'rand-forest__max_depth': [2, 3, 4, 5, 7, 10, 13, 15, 18, None], 
    'rand-forest__criterion': ['gini', 'entropy']
}

#Performing a GridSearchCV to find the optimal combination for our hyperparameters of RandomForest Classifier
grids = GridSearchCV(
    pl_final,
    n_jobs=-1, 
    param_grid=hyperparameters,
    return_train_score=True,
)

#fit our data
grids.fit(X_train, y_train)



One or more of the test scores are non-finite: [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan]



GridSearchCV(estimator=Pipeline(steps=[('preproc',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('categorical_cols',
                                                                         OneHotEncoder(),
                                                                         ['champion']),
                                                                        ('avg_monster',
                                                                         Binarizer(threshold=144.7034543325527),
                                                                         ['monster '
                                                                          'kills']),
                                                                        ('max_five',
                                                                         FunctionTransformer(func=<function max_of_five at 0x00000292BF8DA8

In [117]:
#Best hyperparameters to use
grids.best_params_

{'rand-forest__criterion': 'gini', 'rand-forest__max_depth': 2}

In [118]:
#Training Accuracy of Final Model
grids.score(X_train, y_train)

0.8493852459016393

In [119]:
#Testing Accuracy of Final Model
grids.score(X_test, y_test)

0.8446526151444185

In [120]:
#Precision Score of Final Model
precision_score(y_test, y_pred, average='weighted')

0.817844217430801

## Step 8: Fairness Analysis

Here our two groups are:
- carries: positions with more than 3 kills
- non-carries: positions with less than 3 kills

Note: we chose 3 as the threshold because that is the mean kills of all positions. 

- Null Hypothesis: Our model is fair. Its precision for carries and non-carries are roughly the same, and any differences are due to random chance.
- Alternative Hypothesis: Our model is unfair. Its precision for carries is lower than its precision for non-carries.

#### Permutation Test using Precision 

- In our data, having a non-carry labeled as a carry (False Positive) is worse than having a carry labeled as a non-carry (False Negative)

In [121]:
#create a df based on X_test for our permutation test
results = X_test

#append out y_pred and y_test to our results dataframe
results['prediction'] = y_pred
results['tag'] = y_test
thresh_carry= results['kills'].mean()
binar = Binarizer(threshold=3)
results['is_carry'] = binar.transform([results['kills']]).reshape(-1,1)
results

Unnamed: 0,champion,kills,total damage dealt,wards placed,...,exp at 10,prediction,tag,is_carry
129942,Viego,2,5890.0,4.0,...,3362.0,jng,jng,0
89769,Bard,0,7471.0,35.0,...,2274.0,sup,sup,0
61317,Leona,2,3210.0,40.0,...,2474.0,sup,sup,0
...,...,...,...,...,...,...,...,...,...
95983,Twisted Fate,1,4677.0,6.0,...,4455.0,mid,mid,0
64305,Leona,0,2030.0,45.0,...,2709.0,sup,sup,0
105749,Camille,3,7701.0,11.0,...,4704.0,top,top,0


In [122]:
#function that calculates precision
compute_precision = lambda x: precision_score(x['tag'], x['prediction'], average='weighted')


In [123]:
#observed precision
obs_prec = results.groupby('is_carry').apply(compute_precision).diff().iloc[-1]
obs_prec

0.0029886350624001023

In [124]:
#perform permutations test

diff_in_prec = []
for _ in range(500):
    s = (
        results[['is_carry', 'prediction', 'tag']]
        .assign(is_carry=np.random.permutation(results['is_carry']))
        .groupby('is_carry')
        .apply(compute_precision)
        .diff()
        .iloc[-1]
    )
    
    diff_in_prec.append(s)

#### Conclusion

- Based on the graph below, our model does not achieve precision parity :(

In [125]:
#plot the distribution 
fig = pd.Series(diff_in_prec).plot(kind='hist', histnorm='probability', nbins=50,
                            title='Difference in Precision (Carry vs. Non-Carry)')
fig.add_vline(x=obs_prec, line_color='red', line_width=1, opacity=1)
fig.add_annotation(text=f'<span style="color:red">Observed Precision = {round(obs_prec, 4)}</span>', showarrow= False, y=0.095)
fig.update_layout(xaxis_range=[-0.02, 0.02])
fig.update_layout(yaxis_range=[0, 0.1])
fig.update_layout(legend=None)
fig.show()

In [126]:
#gets our p-value
pval_prec = (pd.Series(diff_in_prec) >= obs_prec).mean()
pval_prec

0.32