# League of Legends Vision Score Statistical Analysis

**Name(s)**: Adrian Kong and Borngreat Omoma-Edosa

**Website Link**: https://realmabg.github.io/League-of-Legends-data-analysis/

In [7]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, Binarizer, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error

from tqdm import tqdm

import plotly.express as px
pd.options.plotting.backend = 'plotly'

import plotly.express as px
import plotly.figure_factory as ff

pd.set_option("display.max_columns", None)

from dsc80_utils import * # Feel free to uncomment and use this.

## Step 1: Introduction

In [8]:
# How effeective is having a higher vision score than having the other team in getting kills


## Step 2: Data Cleaning and Exploratory Data Analysis

In [9]:
#Gets data from each year

data = pd.DataFrame()

for x in np.arange(2014, 2026):

    csv_name = f"data/{x}_LoL_esports_match_data_from_OraclesElixir.csv"

    df = pd.read_csv(csv_name)

    data = pd.concat([data, df])

    







Columns (2) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (2) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (2) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (2) have mixed types. Specify dtype option on import or set low_memory=False.



In [None]:
# CHooses which columns we want

vision_columns = ["gameid","side","assists","result",'wardsplaced', 'wpm', 'wardskilled', 'wcpm', "kills",
       'controlwardsbought', 'visionscore', 'vspm',"position","gamelength","year","url","league","datacompleteness"]

In [11]:
vision_data = data.copy()
vision_data = vision_data[vision_columns]

KeyboardInterrupt: 

In [10]:
vision_data

Unnamed: 0,gameid,side,assists,result,wardsplaced,wpm,wardskilled,wcpm,kills,controlwardsbought,visionscore,vspm,position,gamelength,year,url,league,datacompleteness
0,TRLH3/33,Blue,13,1,13.0,0.41,0.0,0.00,3,0.0,0.0,0.00,top,1924,2014,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,complete
1,TRLH3/33,Blue,14,1,12.0,0.37,0.0,0.00,0,1.0,0.0,0.00,jng,1924,2014,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,complete
2,TRLH3/33,Blue,7,1,12.0,0.37,3.0,0.09,10,0.0,0.0,0.00,mid,1924,2014,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,complete
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17529,LOLTMNT03_201606,Red,10,1,52.0,1.62,11.0,0.34,3,13.0,114.0,3.54,sup,1931,2025,,LPLOL,complete
17530,LOLTMNT03_201606,Blue,41,0,100.0,3.11,34.0,1.06,13,23.0,213.0,6.62,team,1931,2025,,LPLOL,complete
17531,LOLTMNT03_201606,Red,40,1,93.0,2.89,39.0,1.21,18,26.0,252.0,7.83,team,1931,2025,,LPLOL,complete


In [None]:
# Only gets team data

team_vision_data = vision_data.copy()
team_vision_data = team_vision_data[team_vision_data["position"]=="team"]
team_vision_data = team_vision_data

In [None]:
# Removes irrelevant/NA data

team_vision_data = team_vision_data[team_vision_data["visionscore"].isna() == False]
team_vision_data = team_vision_data[team_vision_data["visionscore"] > 0]

In [None]:
# Makes more_vision and more_kills columns (explained in website)

max_vision = team_vision_data.groupby('gameid')['visionscore'].transform('max')


team_vision_data['more_vision'] = (team_vision_data['visionscore'] == max_vision).astype(int)

for gameid, group in team_vision_data.groupby('gameid'):
        max_kills = group['visionscore'].max()
        if (group['visionscore'] == max_kills).sum() > 1:
            team_vision_data.loc[group.index, 'more_vision'] = 0  



max_vision = team_vision_data.groupby('gameid')['kills'].transform('max')


team_vision_data['more_kills'] = (team_vision_data['kills'] == max_vision).astype(int)

for gameid, group in team_vision_data.groupby('gameid'):
        max_kills = group['kills'].max()
        if (group['kills'] == max_kills).sum() > 1:
            team_vision_data.loc[group.index, 'more_kills'] = 0


In [None]:
team_vision_data.columns

In [None]:
# Changes certain columns to boolean type

team_vision_data["result"] = team_vision_data["result"].astype("bool")
team_vision_data["more_vision"] = team_vision_data["more_vision"].astype("bool")
team_vision_data["more_kills"] = team_vision_data["more_kills"].astype("bool")

In [None]:
# Checks column types

for column in team_vision_data.columns:
    print(type(team_vision_data[column].iloc[0]))

In [None]:
team_vision_data.head()

## Step 2: Univariate Analysis

In [None]:
# Makes histogram for distribution of team's kills

fig = px.histogram(
    team_vision_data, 
    x="kills", 
    nbins=100,  
    title="Distribution of Team's Total Kills",
    labels={"kills": "Number of Kills"},
    opacity=0.75,  
    color_discrete_sequence=["steelblue"]  
)

fig.update_layout(
    bargap=0.1, 
    xaxis_title="Kills",
    yaxis_title="Frequency",
    template="plotly_white"  
)

fig.show()

fig.write_html('univariate_graph_team_kills.html', include_plotlyjs='cdn')

In [None]:
# Makes histogram for distribution of team's visionscore

fig = px.histogram(
    team_vision_data, 
    x="visionscore", 
    nbins=100,  
    title="Distribution of Team's Vision Score",
    labels={"visionscore": "Number of Kills"},
    opacity=0.75,  
    color_discrete_sequence=["steelblue"]  
)

fig.update_layout(
    bargap=0.1, 
    xaxis_title="Vision score",
    yaxis_title="Frequency",
    template="plotly_white"  
)

fig.show()

fig.write_html('univariate_graph_visionscore.html', include_plotlyjs='cdn')

## Step 2: Bivariate Analysis

In [None]:
wins_df = team_vision_data[team_vision_data["more_vision"] == 1]


counts = wins_df["more_kills"].value_counts()

labelling = {True: "Team gets more kills", False: "Team gets less kills"}  


new_index = []
for i in counts.index:
    if isinstance(i, bool):
        new_index.append(labelling[i])
    else:
        new_index.append(i)

counts.index = new_index



biv1 = px.pie(values=counts.values, names=counts.index, title="Does the team get more kills when they have more vision?")

biv1.show()

In [None]:
# Makes a graph about teams winning when they have more vision

wins_df = team_vision_data[team_vision_data["more_vision"] == 1]


counts = wins_df["result"].value_counts()

labelling = {True: "Win", False: "Loss"}  


new_index = []
for i in counts.index:
    if isinstance(i, bool):
        new_index.append(labelling[i])
    else:
        new_index.append(i)

counts.index = new_index


color_map = {"Win": "green", "Loss": "red"}

biv1 = px.pie(
    values=counts.values,
    names=counts.index,
    title="Does a team win when they have more vision?",
    color=counts.index,  # Color by the labels (Win, Loss)
    color_discrete_map=color_map  # Map Win to green and Loss to red
)


# biv1 = px.pie(values=counts.values, names=counts.index, title="Does a team win when they have more vision?")

biv1.show()

fig.write_html('bivariate_result_vision.html', include_plotlyjs='cdn')

## Step 2: Interesting Aggregates

In [None]:
# sum of stats based on team with more kills


agg = team_vision_data.groupby("more_kills").sum()

agg= agg.drop(columns=["gameid","side","position","url","league","datacompleteness","year","gamelength"])

agg

In [None]:
# sum of stats based on team with more vision

agg = team_vision_data.groupby("more_vision").sum()

agg= agg.drop(columns=["gameid","side","position","url","league","datacompleteness","year","gamelength"])

agg



## Step 3: Assessment of Missingness

In [None]:
pd.reset_option('display.max_rows')

In [None]:
team_vision_data[team_vision_data["url"].isna()]

In [None]:
# make a new column that says if there is a url

team_vision_data["url_missing"] = team_vision_data["url"].isna()

In [None]:
team_vision_data

In [None]:
# counts based on year and url missing

url_pivot1 = team_vision_data.pivot_table(index='url_missing', columns='year', aggfunc='size',fill_value=0)
url_pivot1

In [None]:
#total counts

url_pivot2 = pd.pivot_table(team_vision_data,index="url_missing",values="league",aggfunc=len, fill_value=0)
url_pivot2

In [None]:

url_pivot = url_pivot1.div(url_pivot2['league'], axis=0).T

In [None]:
tvd_observed = url_pivot.diff(axis=1).iloc[:, -1].abs().sum() / 2
tvd_observed

In [None]:
# proportion table, along with making the format better



pd.options.display.float_format = '{:.5f}'.format
def format_value(x):
    if abs(x) < 1e-6:  # Adjust the threshold as needed
        return '0'
    else:
        return '{:.5f}'.format(x)

url_pivot = url_pivot.applymap(format_value)
url_pivot








In [None]:
#format for markdown

markdown_table = url_pivot.copy()

markdown_table = markdown_table.rename(columns={False: f'url_missing = False', True: f'url_missing = True'})

markdown_table = markdown_table.to_markdown()

pd.set_option('display.max_rows', None)

print(markdown_table)



In [None]:
pd.reset_option('display.max_rows')

In [None]:
# Null Hypothesis: Distribution of year when url is missing is the same as the distribution of year when url is not missing.

# Alternative Hypothesis: Distribution of year when url is missing is NOT same as the distribution of year when url is not missing.

# sample stat: 0.855


In [None]:
#permutation

In [None]:
smaller_df = team_vision_data.copy()
smaller_df = smaller_df[["year","url_missing"]]
smaller_df

In [None]:
tvd_stats = []

for _ in np.arange(100):
    smaller_df["url_shuffled"] = np.random.permutation(smaller_df["url_missing"])
    pivoted = (
        smaller_df
        .pivot_table(index='url_shuffled', columns='year', aggfunc='size',fill_value=0)
    )
    
    
    permutated_table = pivoted.div(url_pivot2['league'], axis=0).T
    
    tvd = permutated_table.diff(axis=1).iloc[:, -1].abs().sum() / 2
    tvd_stats.append(tvd)

tvd_stats

In [None]:
fig = px.histogram(pd.DataFrame(tvd_stats), x=0, nbins=50, histnorm='probability', 
                   title='Empirical Distribution of the TVD')
fig.add_vline(x=tvd_observed, line_color='red', line_width=2, opacity=1)
fig.add_annotation(text=f'<span style="color:red">Observed TVD = {round(tvd_observed, 2)}</span>',
                   x=2.5 * tvd_observed, showarrow=False, y=0.16)
fig.update_layout(yaxis_range=[0, 0.2])

In [None]:
(np.array(tvd_stats) >= tvd_observed).mean()

In [None]:
url_pivot

In [None]:
url_pivot.plot(kind='barh', title='Gender by Missingness of Child Height (MCAR Example)', barmode='group')

In [None]:
#reject the null 

In [None]:
team_vision_data

In [None]:
def helper(column):
    print(column)
    pivot1 = team_vision_data.pivot_table(index='url_missing', columns=f'{column}', aggfunc='size',fill_value=0)
    pivot2 = pd.pivot_table(team_vision_data,index="url_missing",values=f'{column}',aggfunc=len, fill_value=0)
    pivot = pivot1.div(pivot2[column], axis=0).T
    observed_tvd = pivot.diff(axis=1).iloc[:, -1].abs().sum() / 2

    df_smaller = team_vision_data.copy()
    df_smaller = df_smaller[["url_missing",column]]

    tvd_stats2 = []

    for _ in np.arange(1000):
        df_smaller["url_shuffled"] = np.random.permutation(df_smaller["url_missing"])
        pivoted = (
        df_smaller
        .pivot_table(index='url_shuffled', columns=f'{column}', aggfunc='size',fill_value=0)
    )
    
    
        permutated_table = pivoted.div(pivot2[column], axis=0).T
    
        tvd = permutated_table.diff(axis=1).iloc[:, -1].abs().sum() / 2
        tvd_stats2.append(tvd)

    return (column, (np.array(tvd_stats2) >= observed_tvd).mean())


In [None]:
pivot1 = team_vision_data.pivot_table(index='url_missing', columns= "more_vision", aggfunc='size',fill_value=0)
pivot2 = pd.pivot_table(team_vision_data,index="url_missing",values= "more_vision",aggfunc=len, fill_value=0)
pivot = pivot1.div(pivot2["more_vision"], axis=0).T
observed_tvd = pivot.diff(axis=1).iloc[:, -1].abs().sum() / 2

df_smaller = team_vision_data.copy()
df_smaller = df_smaller[["url_missing","more_vision"]]


tvd_stats2 = []

for _ in np.arange(1000):
        df_smaller["url_shuffled"] = np.random.permutation(df_smaller["url_missing"])
        pivoted = (
        df_smaller
        .pivot_table(index='url_shuffled', columns="more_vision", aggfunc='size',fill_value=0)
)
    
    
        permutated_table = pivoted.div(pivot2["more_vision"], axis=0).T
    
        tvd = permutated_table.diff(axis=1).iloc[:, -1].abs().sum() / 2
        tvd_stats2.append(tvd)

(np.array(tvd_stats2) >= observed_tvd).mean()


In [None]:
pivot.diff(axis=1).iloc[:, -1].abs().sum() / 2

In [None]:
helper("more_vision")

In [None]:
# Null Hypothesis: Distribution of vision score when url is missing is the same as the distribution of vision score when url is not missing.

# Alternative Hypothesis: Distribution of vision score when url is missing is NOT same as the distribution of vision score when url is not missing.

# sample stat: 0.0015

# Fail to reject

In [None]:
fig = px.histogram(pd.DataFrame(tvd_stats2), x=0, nbins=50, histnorm='probability', 
                   title='Empirical Distribution of the TVD')
fig.add_vline(x=observed_tvd, line_color='red', line_width=2, opacity=1)
fig.add_annotation(text=f'<span style="color:red">Observed TVD = {round(observed_tvd, 4)}</span>',
                   x=2.5 * observed_tvd, showarrow=False, y=0.16)
fig.update_layout(yaxis_range=[0, 0.2])

In [None]:
pivot

## Step 4: Hypothesis Testing

Null hyphotesis: The distribution of kills for a team with the higher vision score in a game is the same as the team that has the lower vision score.

Alternate Hyphotesis: The distribution of kills for the team with the higher vision score is NOT the same as the team that has the lower vision score.

In [None]:

# 

# Null: The distribution of kills for a team with the higher vision score in a game is the same as the team that has the lower vision score.


# Alternate: The distribution of kills for the team with the higher vision score is NOT the same as the team that has the lower vision score.


# Absolute mean difference between kills in teams with higher vision and kills in teams with lower vision,

#test statistic: 0.4150174636183205

In [None]:
pivot1 = team_vision_data.pivot_table(index='kills', columns= "more_vision", aggfunc='size',fill_value=0)
pivot1 = pivot1 / pivot1.sum()
observed_tvd = pivot1.diff(axis=1).iloc[:, -1].abs().sum() / 2
observed_tvd
pivot1 = team_vision_data.pivot_table(index='kills', columns= "more_vision", aggfunc='size',fill_value=0)
pivot1 = pivot1 / pivot1.sum()
observed_tvd = pivot1.diff(axis=1).iloc[:, -1].abs().sum() / 2
observed_tvd

In [None]:




df_smaller = team_vision_data.copy()
df_smaller = df_smaller[["more_vision","kills"]]


tvd_stats3 = []

for _ in np.arange(1000):
        df_smaller["vision_shuffled"] = np.random.permutation(df_smaller["more_vision"])
        pivoted = (
        df_smaller
        .pivot_table(index='kills', columns="vision_shuffled", aggfunc='size',fill_value=0)
)
        pivoted = pivoted / pivoted.sum()

    
        tvd = pivoted.diff(axis=1).iloc[:, -1].abs().sum() / 2
        tvd_stats3.append(tvd)



print(f"P-value: {(np.array(tvd_stats) >= tvd_observed).mean():.10f}")


In [None]:
fig = px.histogram(pd.DataFrame(tvd_stats3), x=0, nbins=50, histnorm='probability', 
                   title='Empirical Distribution of the TVD')
fig.add_vline(x=observed_tvd, line_color='red', line_width=2, opacity=1)
fig.add_annotation(text=f'<span style="color:red">Observed TVD = {round(observed_tvd, 4)}</span>',
                   x=2.5 * observed_tvd, showarrow=False, y=0.16)
fig.update_layout(yaxis_range=[0, 0.2])

## Step 5: Framing a Prediction Problem

Can we accurately predict a team's vision score based solely on their in-game performance statistics?

For our prediction model, we will perform necessary preprocessing steps such as dropping non-informative or metadata columns like gameid and url. This ensures that our model leverages only the relevant in-game statistics.

To address this question, we will frame the problem as a regression task where the vision score is treated as a continuous variable. Our dataset includes the following columns:
assists, result, wardsplaced, wpm, wardskilled, wcpm, kills, controlwardsbought, visionscore, gamelength, more_kills, and more_vision

To mitigate overfitting, the data will be split into 75% training and 25% test sets. Our model’s performance will be evaluated using regression metrics such as Mean Absolute Error (MAE), Root Mean Squared Error (RMSE), and the R² score. These metrics will help us understand the predictive accuracy and the variance explained by our model.

At the time of prediction, the only available information will be the in-game performance statistics (e.g., assists, wards placed, ward kills, kills, control wards bought, etc.), allowing the model to generate an estimated vision score. This predictive insight can then be used to further understand a player’s contribution to vision control and overall team strategy.

By addressing this prediction problem, we aim to quantify the impact of in-game performance on vision score, providing a valuable tool for game analysis and strategic planning in League of Legends.

In [None]:

predict_df = team_vision_data.drop(columns=['side', 'year', 'league', 'url','datacompleteness','position','vspm','gameid','url_missing'])


## Step 6: Baseline Model

For the baseline model, we used a linear regression, with the following features wardsplaced, wpm, wardskilled, wcpm, controlwardsbought. My assumption is that the most direct inputs to a vision score are the actions related to placing and managing wards. The features are quantitative. We utilized StandardScaler Transformer to transform them into standard scale, becasue each match has different time length, and therefore the statistics could seem really different without being standardized. 

We also used Polynomial Features to fine a hyperameter that best fit the model. After fitting the model, our R squared score on the training data was 0.9102. Though our accuracy is high the RSME on the training data was 18.3220, which is not very good. Our R squared score on the test data was 0.9078, which means our model has low variance. The RSME on the test data was 22.4480. Our model still has large improvement space, and we will improve it by adding more features and using a random forest regressor, and tuning hyperparameters in the next section because it will capture complex, non-linear interactions without needing to manually generate polynomial features.

In [None]:
predict_df.head()

In [None]:
X = predict_df.drop(columns=['assists','result','kills','visionscore','gamelength','more_vision','more_kills'], axis=1)
y = predict_df['visionscore']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [None]:
# Determining the hyperparameter

errs_df = pd.DataFrame()

for d in tqdm(range(1, 6)):
    pl= make_pipeline(
        StandardScaler(),
        PolynomialFeatures(d),
        LinearRegression(),
    )
    
    errs = cross_val_score(pl, X_train, y_train, 
                           cv=KFold(5, shuffle=True, random_state=1), scoring='neg_root_mean_squared_error')
    errs_df[f'Deg {d}'] = -errs # Negate to turn positive (sklearn computed negative RMSE).
    
errs_df.index = [f'Fold {i}' for i in range(1, 6)]
errs_df.index.name = 'Validation Fold'

In [None]:
errs_df

In [None]:
errs_df.mean().idxmin()

In [None]:
basline_model = make_pipeline(
        StandardScaler(),
        PolynomialFeatures(4),
        LinearRegression(),
    )


In [None]:
basline_model.fit(X_train, y_train)

In [None]:
basline_model.score(X_train, y_train)

In [None]:
basline_model.score(X_test, y_test)

In [None]:
root_mean_squared_error(y_train, basline_model.predict(X_train))

In [None]:
root_mean_squared_error(y_test, basline_model.predict(X_test))

## Step 7: Final Model

In our Final model, we shifted from a polynomial linear regression approach to a Random Forest regressor to better capture complex non-linear interactions among our features. Our dataset includes both categorical variables (such as result, more_vision, and more_kills) and quantitative features (like assists, wardsplaced, wpm, wardskilled, wcpm, kills, controlwardsbought, gamelength, among others). We used a preprocessing pipeline where the categorical features were transformed using OneHotEncoder (with the first category dropped) and the numerical features were standardized using StandardScaler. This ensures that differences in match duration and varying scales among features do not skew the model's performance.

To improve model performance, we implemented hyperparameter tuning using GridSearchCV. We set up a grid that explored combinations of three key hyperparameters for the Random Forest regressor: the number of trees (n_estimators), the maximum depth of the trees (max_depth), and the minimum number of samples required to split a node (min_samples_split). The grid search was conducted with 5-fold cross-validation (with shuffling enabled for more robust sampling) and used negative root mean squared error (RMSE) as the scoring metric.

Given the size of our dataset (approximately 100,000 rows), we opted to perform the initial hyperparameter tuning on a smaller subset (50,000 rows). This subset allowed us to efficiently search for the best hyperparameters without the extensive computational time required for the full dataset. On this subset, the grid search identified the best hyperparameters as follows: max_depth of 10, min_samples_split of 5, and n_estimators of 200, resulting in a cross-validated RMSE of around 19.0761 and a test RMSE of approximately 18.9855.

With these promising results from the subset, we applied the tuned hyperparameters to a pipeline re-fitted on the full training data. This approach should leverage the model's ability to capture non-linearities and complex feature interactions, ultimately enhancing the prediction of the team's vision score compared to our baseline model. Using the best parameter the test dataset RMSE was 18.8683, the train dataset R^2 was 0.9438, and the test R^2 was 0.9348

## Random forest 

## using a subset of the data

In [None]:
# A dictionary that maps names to Pipeline objects.
n=3
select = FunctionTransformer(lambda x: x)
pipes = {
    'wardsplaced + wardskilled': make_pipeline(
        make_column_transformer( (select, ['wardsplaced', 'wardskilled']) ),
        StandardScaler(),
        PolynomialFeatures(n),
        LinearRegression(),
    ),
    'wardsplaced + wardskilled + controlwardsbought + gamelength': make_pipeline(
        make_column_transformer( (select, ['wardsplaced', 'wardskilled','controlwardsbought','gamelength']) ),
        StandardScaler(),
        PolynomialFeatures(n),
        LinearRegression(),
    ),
    'all ward + controlwardsbought + gamelength': make_pipeline(
        make_column_transformer( (select, ['wardsplaced', 'wardskilled','controlwardsbought','gamelength','wcpm','wpm']) ),
        StandardScaler(),
        PolynomialFeatures(n),
        LinearRegression(),
    ),
    'All columns': make_pipeline(
       make_column_transformer(
           (OneHotEncoder(drop='first'), ['result', 'more_vision', 'more_kills']),
           remainder='passthrough',
           force_int_remainder_cols=False,
           ),
        StandardScaler(),
        PolynomialFeatures(n),
        LinearRegression(),
    ),
}

In [None]:
pipe_df = pd.DataFrame()

for pipe in pipes:
    errs = cross_val_score(pipes[pipe], X_train, y_train,
                           cv=KFold(5, shuffle=True, random_state=1), scoring='neg_root_mean_squared_error')
    pipe_df[pipe] = -errs
    
pipe_df.index = [f'Fold {i}' for i in range(1, 6)]
pipe_df.index.name = 'Validation Fold'

In [None]:
pipe_df

In [None]:
pipe_df.mean()

In [None]:
# Sample a smaller subset (e.g., 10,000 rows) from your dataset
subset_df = predict_df.sample(n=50000, random_state=1)

# Define features and target variable for the subset
X_sub = subset_df.drop('visionscore', axis=1)
y_sub = subset_df['visionscore']

# Split the subset into training and testing sets
X_train_sub, X_test_sub, y_train_sub, y_test_sub = train_test_split(X_sub, y_sub, test_size=0.2, random_state=1)

# Define categorical and numerical columns
categorical_cols = ['result', 'more_vision', 'more_kills']
numerical_cols = [col for col in X_sub.columns if col not in categorical_cols]


In [None]:
# Create a preprocessor for both categorical and numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first'), categorical_cols),
        ('num', StandardScaler(), numerical_cols)
    ],
    remainder='passthrough'
)

# Create a pipeline that applies preprocessing and then fits a Random Forest regressor
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=1))
])

# Define a grid of hyperparameters for tuning
param_grid = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__max_depth': [None, 5, 10],
    'regressor__min_samples_split': [2, 5, 10]
}



In [None]:
# Set up GridSearchCV with 5-fold cross-validation and negative RMSE scoring
grid_search_sub = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='neg_root_mean_squared_error',
    n_jobs=1
)

# Fit GridSearchCV on the subset's training data
grid_search_sub.fit(X_train_sub, y_train_sub)



In [None]:
# Output the best hyperparameters and corresponding cross-validated RMSE for the subset
print("Best parameters on subset:", grid_search_sub.best_params_)
print("Best CV RMSE on subset:", -grid_search_sub.best_score_)

# Evaluate the best model from the subset on its test set
y_pred_sub = grid_search_sub.predict(X_test_sub)
test_rmse_sub = np.sqrt(mean_squared_error(y_test_sub, y_pred_sub))
print("Test RMSE on subset:", test_rmse_sub)

# After tuning on the subset, you can apply these best parameters to a new pipeline 
# and re-fit on the full training data if desired.

## Using the full dataset

In [None]:
# Assuming 'predict_df' is your DataFrame
# Define features and target variable
X = predict_df.drop('visionscore', axis=1)
y = predict_df['visionscore']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Identify categorical and numerical columns
categorical_cols = ['result', 'more_vision', 'more_kills']
numerical_cols = [col for col in X.columns if col not in categorical_cols]


In [None]:
# Preprocessing pipeline: apply OneHotEncoder for categorical features and StandardScaler for numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first'), categorical_cols),
        ('num', StandardScaler(), numerical_cols)
    ],
    remainder='passthrough'
)

# Create a pipeline that first preprocesses the data then applies a Random Forest regressor
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=1))
])



In [None]:
# Define a grid of hyperparameters for the Random Forest
param_grid = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__max_depth': [None, 5, 10],
    'regressor__min_samples_split': [2, 5, 10]
}

# Set up GridSearchCV with 5-fold cross-validation and negative RMSE scoring


cv_strategy = KFold(n_splits=5, shuffle=True, random_state=1)
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=cv_strategy,
    scoring='neg_root_mean_squared_error',
    n_jobs=1
)

# Fit GridSearchCV on the training data
grid_search.fit(X_train, y_train)



In [None]:
# Output the best hyperparameters and corresponding cross-validated RMSE
print("Best parameters found:", grid_search.best_params_)
print("Train RMSE:", -grid_search.score(X_train, y_train))

# Evaluate the best model on the test set
y_pred = grid_search.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Test RMSE:", test_rmse)

print("Train R^2:", r2_score(y_train, grid_search.predict(X_train)))

print("Test R^2:",r2_score(y_test, grid_search.predict(X_test)) )


## Step 8: Fairness Analysis

In this section, we are going to assess if our model is fair among different groups. The question we are trying to answer here is: “does my model perform worse on teams that lost than it does for teams that won ?” To answer this question, we performed a permutation test and examined the result of the difference in accuracy between the two groups.

The group X represents the teams that lost, and group Y represents the teams that won. Our evaluation metric is the difference in the R^2 between each group, and the significance level is 0.05.

The followings are our hypothesis:

Null hypothesis: Our model is fair. Its accuracy for teams that lost is same as the accuracy for teams that won.

Alternative hypothesis: Our model is unfair. Its accuracy for teams that lost is NOT the same as the accuracy for teams that won.

Test statistics: The difference in R^2 between the groups.

After performing the permutation test, the result p-value we got is 1.0, which is larger than the 0.05 significance level. Consequently, we fail to reject the null hypothesis. This outcome implies that our model predicts players from both groups with statistically similar accuracy levels. Consequently, our model appears to be fair, exhibiting no discernible bias towards one group over the other based on the specified criteria.

In [None]:
# Preprocessing pipeline: apply OneHotEncoder for categorical features and StandardScaler for numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first'), categorical_cols),
        ('num', StandardScaler(), numerical_cols)
    ],
    remainder='passthrough'
)

# Create a pipeline that first preprocesses the data then applies a Random Forest regressor
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(max_depth=10, min_samples_split=5, n_estimators=100, random_state=1))
])


In [None]:
pipeline.fit(predict_df.drop(columns=['visionscore']), predict_df['visionscore'])

In [None]:
y_pred = pipeline.predict(predict_df.drop(columns=['visionscore']))

In [None]:
fair_analy = predict_df.copy()

fair_analy['vision_pred'] =  y_pred

fair_analy = fair_analy[['result','vision_pred','visionscore']]

In [None]:
fair_analy

In [None]:
n_repetitions = 500

# Step 0: Compute the observed difference
# Compute R² for each group using the original predictions.
group_r2_obs = fair_analy.groupby('result').apply(
    lambda group: r2_score(group['visionscore'], group['vision_pred']),
    include_groups=False
)
observed_diff = abs(group_r2_obs.loc[True] - group_r2_obs.loc[False])
print("Observed difference:", observed_diff)

# Step 1: Permutation test
differences = []
for _ in range(n_repetitions):
    # Shuffle the predictions and assign to a new column.
    with_shuffled = fair_analy.assign(
        Shuffled_vision=np.random.permutation(fair_analy['vision_pred'])
    )
    
    # Compute the R² for each group using the shuffled predictions.
    group_r2 = with_shuffled.groupby('result').apply(
        lambda group: r2_score(group['visionscore'], group['Shuffled_vision']),
        include_groups=False
    )
    
    # Calculate the absolute difference between the R² scores.
    diff = abs(group_r2.loc[True] - group_r2.loc[False])
    differences.append(diff)

# Convert list to numpy array for easier comparison.
differences = np.array(differences)

# Step 2: Compute the p-value
# p-value is the fraction of permutations where the permuted difference
# is as large or larger than the observed difference.
p_value = np.mean(differences >= observed_diff)
print("p-value:", p_value)
