# League of Legends Vision Score Statistical Analysis

**Name(s)**: Adrian Kong and Borngreat Omoma-Edosa

**Website Link**: https://realmabg.github.io/League-of-Legends-data-analysis/

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, Binarizer, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error

from tqdm import tqdm

import plotly.express as px
pd.options.plotting.backend = 'plotly'

import plotly.express as px
import plotly.figure_factory as ff

pd.set_option("display.max_columns", None)

# from dsc80_utils import * # Feel free to uncomment and use this.

## Step 1: Introduction

In [2]:
# How effeective is having a higher vision score than having the other team in getting kills


## Step 2: Data Cleaning and Exploratory Data Analysis

In [3]:
data = pd.DataFrame()

for x in np.arange(2014, 2026):

    csv_name = f"data/{x}_LoL_esports_match_data_from_OraclesElixir.csv"

    df = pd.read_csv(csv_name)

    data = pd.concat([data, df])

    




#2025 = pd.read_csv("drive-download-20250227T021614Z-001/2025_LoL_esports_match_data_from_OraclesElixir.csv")

  df = pd.read_csv(csv_name)
  df = pd.read_csv(csv_name)
  df = pd.read_csv(csv_name)
  df = pd.read_csv(csv_name)


In [4]:
vision_columns = ["gameid","side","assists","result",'wardsplaced', 'wpm', 'wardskilled', 'wcpm', "kills",
       'controlwardsbought', 'visionscore', 'vspm',"position","gamelength","year","url","league","datacompleteness"]

In [5]:
vision_data = data.copy()
vision_data = vision_data[vision_columns]

In [6]:
vision_data

Unnamed: 0,gameid,side,assists,result,wardsplaced,wpm,wardskilled,wcpm,kills,controlwardsbought,visionscore,vspm,position,gamelength,year,url,league,datacompleteness
0,TRLH3/33,Blue,13,1,13.0,0.4054,0.0,0.0000,3,0.0,0.0,0.0000,top,1924,2014,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,complete
1,TRLH3/33,Blue,14,1,12.0,0.3742,0.0,0.0000,0,1.0,0.0,0.0000,jng,1924,2014,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,complete
2,TRLH3/33,Blue,7,1,12.0,0.3742,3.0,0.0936,10,0.0,0.0,0.0000,mid,1924,2014,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,complete
3,TRLH3/33,Blue,9,1,6.0,0.1871,1.0,0.0312,8,0.0,0.0,0.0000,bot,1924,2014,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,complete
4,TRLH3/33,Blue,15,1,38.0,1.1850,2.0,0.0624,0,3.0,0.0,0.0000,sup,1924,2014,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,complete
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18703,LOLTMNT05_115052,Red,9,1,7.0,0.2785,3.0,0.1194,6,4.0,19.0,0.7560,mid,1508,2025,,LEC,complete
18704,LOLTMNT05_115052,Red,8,1,9.0,0.3581,4.0,0.1592,6,1.0,16.0,0.6366,bot,1508,2025,,LEC,complete
18705,LOLTMNT05_115052,Red,14,1,32.0,1.2732,10.0,0.3979,0,10.0,72.0,2.8647,sup,1508,2025,,LEC,complete
18706,LOLTMNT05_115052,Blue,8,0,70.0,2.7851,25.0,0.9947,5,21.0,163.0,6.4854,team,1508,2025,,LEC,complete


In [7]:
team_vision_data = vision_data.copy()
team_vision_data = team_vision_data[team_vision_data["position"]=="team"]
team_vision_data = team_vision_data

In [8]:
team_vision_data = team_vision_data[team_vision_data["visionscore"].isna() == False]
team_vision_data = team_vision_data[team_vision_data["visionscore"] > 0]

In [9]:
max_vision = team_vision_data.groupby('gameid')['visionscore'].transform('max')


team_vision_data['more_vision'] = (team_vision_data['visionscore'] == max_vision).astype(int)

for gameid, group in team_vision_data.groupby('gameid'):
        max_kills = group['visionscore'].max()
        if (group['visionscore'] == max_kills).sum() > 1:
            team_vision_data.loc[group.index, 'more_vision'] = 0  # Set more_kills to 0 for the entire group



max_vision = team_vision_data.groupby('gameid')['kills'].transform('max')


team_vision_data['more_kills'] = (team_vision_data['kills'] == max_vision).astype(int)

for gameid, group in team_vision_data.groupby('gameid'):
        max_kills = group['kills'].max()
        if (group['kills'] == max_kills).sum() > 1:
            team_vision_data.loc[group.index, 'more_kills'] = 0


In [10]:
team_vision_data.columns

Index(['gameid', 'side', 'assists', 'result', 'wardsplaced', 'wpm',
       'wardskilled', 'wcpm', 'kills', 'controlwardsbought', 'visionscore',
       'vspm', 'position', 'gamelength', 'year', 'url', 'league',
       'datacompleteness', 'more_vision', 'more_kills'],
      dtype='object')

In [11]:
team_vision_data["result"] = team_vision_data["result"].astype("bool")
team_vision_data["more_vision"] = team_vision_data["more_vision"].astype("bool")
team_vision_data["more_kills"] = team_vision_data["more_kills"].astype("bool")

In [12]:
for column in team_vision_data.columns:
    print(type(team_vision_data[column].iloc[0]))

<class 'str'>
<class 'str'>
<class 'numpy.int64'>
<class 'numpy.bool'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.int64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'str'>
<class 'numpy.int64'>
<class 'numpy.int64'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'numpy.bool'>
<class 'numpy.bool'>


In [13]:
team_vision_data.head()

Unnamed: 0,gameid,side,assists,result,wardsplaced,wpm,wardskilled,wcpm,kills,controlwardsbought,visionscore,vspm,position,gamelength,year,url,league,datacompleteness,more_vision,more_kills
32650,ESPORTSTMNT02/180233,Blue,60,True,151.0,3.7956,47.0,1.1814,21,31.0,304.0,7.6414,team,2387,2017,http://matchhistory.na.leagueoflegends.com/en/...,EU CS,complete,False,True
32651,ESPORTSTMNT02/180233,Red,23,False,139.0,3.4939,57.0,1.4328,13,35.0,359.0,9.0239,team,2387,2017,http://matchhistory.na.leagueoflegends.com/en/...,EU CS,complete,True,False
32662,ESPORTSTMNT02/180242,Blue,11,False,99.0,3.3712,25.0,0.8513,9,23.0,188.0,6.4018,team,1762,2017,http://matchhistory.na.leagueoflegends.com/en/...,EU CS,complete,False,False
32663,ESPORTSTMNT02/180242,Red,53,True,117.0,3.9841,31.0,1.0556,24,29.0,254.0,8.6493,team,1762,2017,http://matchhistory.na.leagueoflegends.com/en/...,EU CS,complete,True,True
32674,ESPORTSTMNT02/190231,Blue,53,False,160.0,3.6309,45.0,1.0212,24,50.0,313.0,7.1029,team,2644,2017,http://matchhistory.na.leagueoflegends.com/en/...,EU CS,complete,False,True


## Step 2: Univariate Analysis

In [14]:
team_vision_data["assists"].mean()

np.float64(32.00633818974418)

In [15]:
univ1= team_vision_data["kills"].plot(kind="hist")

univ1

In [16]:
univ2 = team_vision_data["visionscore"].plot(kind="hist")
univ2

## Step 2: Bivariate Analysis

In [17]:
wins_df = team_vision_data[team_vision_data["more_vision"] == 1]


counts = wins_df["more_kills"].value_counts()

labelling = {True: "Team gets more kills", False: "Team gets less kills"}  


new_index = []
for i in counts.index:
    if isinstance(i, bool):
        new_index.append(labelling[i])
    else:
        new_index.append(i)

counts.index = new_index



biv1 = px.pie(values=counts.values, names=counts.index, title="Does the team get more kills when they have more vision?")

biv1.show()

In [18]:
wins_df = team_vision_data[team_vision_data["more_vision"] == 1]


counts = wins_df["result"].value_counts()

labelling = {True: "Win", False: "Loss"}  


new_index = []
for i in counts.index:
    if isinstance(i, bool):
        new_index.append(labelling[i])
    else:
        new_index.append(i)

counts.index = new_index



biv1 = px.pie(values=counts.values, names=counts.index, title="Does a team win when they have more vision?")

biv1.show()

## Step 2: Interesting Aggregates

In [19]:
agg = team_vision_data.groupby("more_kills").sum()

agg= agg.drop(columns=["gameid","side","position","url","league","datacompleteness","year","gamelength"])

agg

Unnamed: 0_level_0,assists,result,wardsplaced,wpm,wardskilled,wcpm,kills,controlwardsbought,visionscore,vspm,more_vision
more_kills,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
False,1640619,8650,7688377.0,235736.4008,3275115.0,99351.3983,745694,2861813.0,17332521.0,529337.5094,18809
True,2823369,61080,6591050.0,203992.1847,2972468.0,91234.6862,1224014,2437620.0,15971211.0,494300.6281,47424


In [20]:
agg = team_vision_data.groupby("more_vision").sum()

agg= agg.drop(columns=["gameid","side","position","url","league","datacompleteness","year","gamelength"])

agg

Unnamed: 0_level_0,assists,result,wardsplaced,wpm,wardskilled,wcpm,kills,controlwardsbought,visionscore,vspm,more_kills
more_vision,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
False,1828726,17876,7292085.0,224213.4473,3123721.0,94981.8224,813533,2737145.0,16217025.0,497115.5061,16115
True,2635262,51854,6987342.0,215515.1382,3123862.0,95604.2621,1156175,2562288.0,17086707.0,526522.6314,47424


## Step 3: Assessment of Missingness

In [21]:
team_vision_data[team_vision_data["url"].isna()]

Unnamed: 0,gameid,side,assists,result,wardsplaced,wpm,wardskilled,wcpm,kills,controlwardsbought,visionscore,vspm,position,gamelength,year,url,league,datacompleteness,more_vision,more_kills
56362,42891-51227,Blue,81,True,182.0,3.5862,66.0,1.3005,32,76.0,412.0,8.1182,team,3045,2018,,LPL,complete,True,True
56363,42891-51227,Red,55,False,169.0,3.3300,66.0,1.3005,23,51.0,383.0,7.5468,team,3045,2018,,LPL,complete,False,False
70354,42929-52581,Blue,3,False,64.0,2.5430,19.0,0.7550,2,29.0,136.0,5.4040,team,1510,2018,,LPL,complete,False,False
70355,42929-52581,Red,33,True,76.0,3.0199,24.0,0.9536,13,36.0,169.0,6.7152,team,1510,2018,,LPL,complete,True,True
71686,42738-52904,Blue,24,True,149.0,4.6684,49.0,1.5352,12,46.0,280.0,8.7728,team,1915,2018,,LPL,complete,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18683,LOLTMNT05_115046,Red,53,True,121.0,3.4261,66.0,1.8688,23,37.0,315.0,8.9193,team,2119,2025,,LEC,complete,True,True
18694,LOLTMNT05_116037,Blue,19,False,129.0,3.6892,61.0,1.7445,9,43.0,315.0,9.0086,team,2098,2025,,LEC,complete,False,False
18695,LOLTMNT05_116037,Red,47,True,127.0,3.6320,67.0,1.9161,23,47.0,329.0,9.4090,team,2098,2025,,LEC,complete,True,True
18706,LOLTMNT05_115052,Blue,8,False,70.0,2.7851,25.0,0.9947,5,21.0,163.0,6.4854,team,1508,2025,,LEC,complete,False,False


In [22]:
team_vision_data["url_missing"] = team_vision_data["url"].isna()

In [23]:
team_vision_data

Unnamed: 0,gameid,side,assists,result,wardsplaced,wpm,wardskilled,wcpm,kills,controlwardsbought,visionscore,vspm,position,gamelength,year,url,league,datacompleteness,more_vision,more_kills,url_missing
32650,ESPORTSTMNT02/180233,Blue,60,True,151.0,3.7956,47.0,1.1814,21,31.0,304.0,7.6414,team,2387,2017,http://matchhistory.na.leagueoflegends.com/en/...,EU CS,complete,False,True,False
32651,ESPORTSTMNT02/180233,Red,23,False,139.0,3.4939,57.0,1.4328,13,35.0,359.0,9.0239,team,2387,2017,http://matchhistory.na.leagueoflegends.com/en/...,EU CS,complete,True,False,False
32662,ESPORTSTMNT02/180242,Blue,11,False,99.0,3.3712,25.0,0.8513,9,23.0,188.0,6.4018,team,1762,2017,http://matchhistory.na.leagueoflegends.com/en/...,EU CS,complete,False,False,False
32663,ESPORTSTMNT02/180242,Red,53,True,117.0,3.9841,31.0,1.0556,24,29.0,254.0,8.6493,team,1762,2017,http://matchhistory.na.leagueoflegends.com/en/...,EU CS,complete,True,True,False
32674,ESPORTSTMNT02/190231,Blue,53,False,160.0,3.6309,45.0,1.0212,24,50.0,313.0,7.1029,team,2644,2017,http://matchhistory.na.leagueoflegends.com/en/...,EU CS,complete,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18683,LOLTMNT05_115046,Red,53,True,121.0,3.4261,66.0,1.8688,23,37.0,315.0,8.9193,team,2119,2025,,LEC,complete,True,True,True
18694,LOLTMNT05_116037,Blue,19,False,129.0,3.6892,61.0,1.7445,9,43.0,315.0,9.0086,team,2098,2025,,LEC,complete,False,False,True
18695,LOLTMNT05_116037,Red,47,True,127.0,3.6320,67.0,1.9161,23,47.0,329.0,9.4090,team,2098,2025,,LEC,complete,True,True,True
18706,LOLTMNT05_115052,Blue,8,False,70.0,2.7851,25.0,0.9947,5,21.0,163.0,6.4854,team,1508,2025,,LEC,complete,False,False,True


In [24]:
url_pivot1 = pd.pivot_table(team_vision_data,index="url_missing",columns="year",values="gameid",aggfunc=len, fill_value=0)
url_pivot1

year,2017,2018,2019,2020,2021,2022,2023,2024,2025
url_missing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
False,4256,10488,13334,18736,21816,4334,3518,2746,260
True,0,12,190,0,286,20592,18480,16838,3586


In [25]:
team_vision_data.pivot_table(index='url_missing', columns='year', aggfunc='size',fill_value=0)

year,2017,2018,2019,2020,2021,2022,2023,2024,2025
url_missing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
False,4256,10488,13334,18736,21816,4334,3518,2746,260
True,0,12,190,0,286,20592,18480,16838,3586


In [26]:
url_pivot2 = pd.pivot_table(team_vision_data,index="url_missing",values="league",aggfunc=len, fill_value=0)
url_pivot2

Unnamed: 0_level_0,league
url_missing,Unnamed: 1_level_1
False,79488
True,59984


In [27]:
url_pivot = url_pivot1.div(url_pivot2['league'], axis=0).T
url_pivot

url_missing,False,True
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2017,0.053543,0.0
2018,0.131944,0.0002
2019,0.167749,0.003168
2020,0.235709,0.0
2021,0.274457,0.004768
2022,0.054524,0.343292
2023,0.044258,0.308082
2024,0.034546,0.280708
2025,0.003271,0.059783


In [28]:
# Null Hypothesis: Distribution of year when url is missing is the same as the distribution of year when url is not missing.

# Alternative Hypothesis: Distribution of year when url is missing is NOT same as the distribution of year when url is not missing.

# sample stat: 0.855


In [29]:
tvd_observed = url_pivot.diff(axis=1).iloc[:, -1].abs().sum() / 2
tvd_observed

np.float64(0.8552652620945832)

In [30]:
#permutation

In [31]:
smaller_df = team_vision_data.copy()
smaller_df = smaller_df[["year","url_missing"]]
smaller_df

Unnamed: 0,year,url_missing
32650,2017,False
32651,2017,False
32662,2017,False
32663,2017,False
32674,2017,False
...,...,...
18683,2025,True
18694,2025,True
18695,2025,True
18706,2025,True


In [32]:
tvd_stats = []

for _ in np.arange(100):
    smaller_df["url_shuffled"] = np.random.permutation(smaller_df["url_missing"])
    pivoted = (
        smaller_df
        .pivot_table(index='url_shuffled', columns='year', aggfunc='size',fill_value=0)
    )
    
    
    permutated_table = pivoted.div(url_pivot2['league'], axis=0).T
    
    tvd = permutated_table.diff(axis=1).iloc[:, -1].abs().sum() / 2
    tvd_stats.append(tvd)

tvd_stats

[np.float64(0.007474060984163698),
 np.float64(0.004101633892494792),
 np.float64(0.004503747295575096),
 np.float64(0.00763830231271548),
 np.float64(0.00893347793979628),
 np.float64(0.005710241866966975),
 np.float64(0.005602728628868922),
 np.float64(0.004890964751094101),
 np.float64(0.007349799452693562),
 np.float64(0.004512700300326982),
 np.float64(0.006075153991681714),
 np.float64(0.0074979099364768775),
 np.float64(0.006781669556326128),
 np.float64(0.005607127950169443),
 np.float64(0.006795793693132983),
 np.float64(0.0065715054878831836),
 np.float64(0.004618978641217894),
 np.float64(0.006260774478132426),
 np.float64(0.006986894035940428),
 np.float64(0.006204972560584034),
 np.float64(0.007088927417681757),
 np.float64(0.00431048588265511),
 np.float64(0.00769958408662063),
 np.float64(0.005786573950584337),
 np.float64(0.004392452184780125),
 np.float64(0.00678660914515475),
 np.float64(0.0056794466178635185),
 np.float64(0.0034567088260573156),
 np.float64(0.0058892

In [33]:
fig = px.histogram(pd.DataFrame(tvd_stats), x=0, nbins=50, histnorm='probability', 
                   title='Empirical Distribution of the TVD')
fig.add_vline(x=tvd_observed, line_color='red', line_width=2, opacity=1)
fig.add_annotation(text=f'<span style="color:red">Observed TVD = {round(tvd_observed, 2)}</span>',
                   x=2.5 * tvd_observed, showarrow=False, y=0.16)
fig.update_layout(yaxis_range=[0, 0.2])

In [34]:
(np.array(tvd_stats) >= tvd_observed).mean()

np.float64(0.0)

In [35]:
url_pivot

url_missing,False,True
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2017,0.053543,0.0
2018,0.131944,0.0002
2019,0.167749,0.003168
2020,0.235709,0.0
2021,0.274457,0.004768
2022,0.054524,0.343292
2023,0.044258,0.308082
2024,0.034546,0.280708
2025,0.003271,0.059783


In [36]:
url_pivot.plot(kind='barh', title='Gender by Missingness of Child Height (MCAR Example)', barmode='group')

In [37]:
#reject the null 

In [38]:
team_vision_data

Unnamed: 0,gameid,side,assists,result,wardsplaced,wpm,wardskilled,wcpm,kills,controlwardsbought,visionscore,vspm,position,gamelength,year,url,league,datacompleteness,more_vision,more_kills,url_missing
32650,ESPORTSTMNT02/180233,Blue,60,True,151.0,3.7956,47.0,1.1814,21,31.0,304.0,7.6414,team,2387,2017,http://matchhistory.na.leagueoflegends.com/en/...,EU CS,complete,False,True,False
32651,ESPORTSTMNT02/180233,Red,23,False,139.0,3.4939,57.0,1.4328,13,35.0,359.0,9.0239,team,2387,2017,http://matchhistory.na.leagueoflegends.com/en/...,EU CS,complete,True,False,False
32662,ESPORTSTMNT02/180242,Blue,11,False,99.0,3.3712,25.0,0.8513,9,23.0,188.0,6.4018,team,1762,2017,http://matchhistory.na.leagueoflegends.com/en/...,EU CS,complete,False,False,False
32663,ESPORTSTMNT02/180242,Red,53,True,117.0,3.9841,31.0,1.0556,24,29.0,254.0,8.6493,team,1762,2017,http://matchhistory.na.leagueoflegends.com/en/...,EU CS,complete,True,True,False
32674,ESPORTSTMNT02/190231,Blue,53,False,160.0,3.6309,45.0,1.0212,24,50.0,313.0,7.1029,team,2644,2017,http://matchhistory.na.leagueoflegends.com/en/...,EU CS,complete,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18683,LOLTMNT05_115046,Red,53,True,121.0,3.4261,66.0,1.8688,23,37.0,315.0,8.9193,team,2119,2025,,LEC,complete,True,True,True
18694,LOLTMNT05_116037,Blue,19,False,129.0,3.6892,61.0,1.7445,9,43.0,315.0,9.0086,team,2098,2025,,LEC,complete,False,False,True
18695,LOLTMNT05_116037,Red,47,True,127.0,3.6320,67.0,1.9161,23,47.0,329.0,9.4090,team,2098,2025,,LEC,complete,True,True,True
18706,LOLTMNT05_115052,Blue,8,False,70.0,2.7851,25.0,0.9947,5,21.0,163.0,6.4854,team,1508,2025,,LEC,complete,False,False,True


In [39]:
def helper(column):
    print(column)
    pivot1 = team_vision_data.pivot_table(index='url_missing', columns=f'{column}', aggfunc='size',fill_value=0)
    pivot2 = pd.pivot_table(team_vision_data,index="url_missing",values=f'{column}',aggfunc=len, fill_value=0)
    pivot = pivot1.div(pivot2[column], axis=0).T
    observed_tvd = pivot.diff(axis=1).iloc[:, -1].abs().sum() / 2

    df_smaller = team_vision_data.copy()
    df_smaller = df_smaller[["url_missing",column]]

    tvd_stats2 = []

    for _ in np.arange(1000):
        df_smaller["url_shuffled"] = np.random.permutation(df_smaller["url_missing"])
        pivoted = (
        df_smaller
        .pivot_table(index='url_shuffled', columns=f'{column}', aggfunc='size',fill_value=0)
    )
    
    
        permutated_table = pivoted.div(pivot2[column], axis=0).T
    
        tvd = permutated_table.diff(axis=1).iloc[:, -1].abs().sum() / 2
        tvd_stats2.append(tvd)

    return (column, (np.array(tvd_stats2) >= observed_tvd).mean())


In [40]:
pivot1 = team_vision_data.pivot_table(index='url_missing', columns= "more_vision", aggfunc='size',fill_value=0)
pivot2 = pd.pivot_table(team_vision_data,index="url_missing",values= "more_vision",aggfunc=len, fill_value=0)
pivot = pivot1.div(pivot2["more_vision"], axis=0).T
observed_tvd = pivot.diff(axis=1).iloc[:, -1].abs().sum() / 2

df_smaller = team_vision_data.copy()
df_smaller = df_smaller[["url_missing","more_vision"]]


tvd_stats2 = []

for _ in np.arange(1000):
        df_smaller["url_shuffled"] = np.random.permutation(df_smaller["url_missing"])
        pivoted = (
        df_smaller
        .pivot_table(index='url_shuffled', columns="more_vision", aggfunc='size',fill_value=0)
)
    
    
        permutated_table = pivoted.div(pivot2["more_vision"], axis=0).T
    
        tvd = permutated_table.diff(axis=1).iloc[:, -1].abs().sum() / 2
        tvd_stats2.append(tvd)

(np.array(tvd_stats2) >= observed_tvd).mean()


np.float64(0.553)

In [41]:
pivot.diff(axis=1).iloc[:, -1].abs().sum() / 2

np.float64(0.0015961818213252044)

In [42]:
helper("more_vision")

more_vision


('more_vision', np.float64(0.529))

In [43]:
# Null Hypothesis: Distribution of vision score when url is missing is the same as the distribution of vision score when url is not missing.

# Alternative Hypothesis: Distribution of vision score when url is missing is NOT same as the distribution of vision score when url is not missing.

# sample stat: 0.0015

# Fail to reject

In [44]:
fig = px.histogram(pd.DataFrame(tvd_stats2), x=0, nbins=50, histnorm='probability', 
                   title='Empirical Distribution of the TVD')
fig.add_vline(x=observed_tvd, line_color='red', line_width=2, opacity=1)
fig.add_annotation(text=f'<span style="color:red">Observed TVD = {round(observed_tvd, 4)}</span>',
                   x=2.5 * observed_tvd, showarrow=False, y=0.16)
fig.update_layout(yaxis_range=[0, 0.2])

In [45]:
pivot

url_missing,False,True
more_vision,Unnamed: 1_level_1,Unnamed: 2_level_1
False,0.525803,0.524206
True,0.474197,0.475794


## Step 4: Hypothesis Testing

Null hyphotesis: The distribution of kills for a team with the higher vision score in a game is the same as the team that has the lower vision score.

Alternate Hyphotesis: The distribution of kills for the team with the higher vision score is NOT the same as the team that has the lower vision score.

In [46]:

# 

# Null: The distribution of kills for a team with the higher vision score in a game is the same as the team that has the lower vision score.


# Alternate: The distribution of kills for the team with the higher vision score is NOT the same as the team that has the lower vision score.


# Absolute mean difference between kills in teams with higher vision and kills in teams with lower vision,

#test statistic: 0.4150174636183205

In [47]:
pivot1 = team_vision_data.pivot_table(index='kills', columns= "more_vision", aggfunc='size',fill_value=0)
pivot1 = pivot1 / pivot1.sum()
observed_tvd = pivot1.diff(axis=1).iloc[:, -1].abs().sum() / 2
observed_tvd
pivot1 = team_vision_data.pivot_table(index='kills', columns= "more_vision", aggfunc='size',fill_value=0)
pivot1 = pivot1 / pivot1.sum()
observed_tvd = pivot1.diff(axis=1).iloc[:, -1].abs().sum() / 2
observed_tvd

np.float64(0.4150604266424662)

In [48]:




df_smaller = team_vision_data.copy()
df_smaller = df_smaller[["more_vision","kills"]]


tvd_stats3 = []

for _ in np.arange(1000):
        df_smaller["vision_shuffled"] = np.random.permutation(df_smaller["more_vision"])
        pivoted = (
        df_smaller
        .pivot_table(index='kills', columns="vision_shuffled", aggfunc='size',fill_value=0)
)
        pivoted = pivoted / pivoted.sum()

    
        tvd = pivoted.diff(axis=1).iloc[:, -1].abs().sum() / 2
        tvd_stats3.append(tvd)



print(f"P-value: {(np.array(tvd_stats) >= tvd_observed).mean():.10f}")


P-value: 0.0000000000


In [49]:
fig = px.histogram(pd.DataFrame(tvd_stats3), x=0, nbins=50, histnorm='probability', 
                   title='Empirical Distribution of the TVD')
fig.add_vline(x=observed_tvd, line_color='red', line_width=2, opacity=1)
fig.add_annotation(text=f'<span style="color:red">Observed TVD = {round(observed_tvd, 4)}</span>',
                   x=2.5 * observed_tvd, showarrow=False, y=0.16)
fig.update_layout(yaxis_range=[0, 0.2])

## Step 5: Framing a Prediction Problem

Can we accurately predict a team's vision score based solely on their in-game performance statistics?

For our prediction model, we will perform necessary preprocessing steps such as dropping non-informative or metadata columns like gameid and url. This ensures that our model leverages only the relevant in-game statistics.

To address this question, we will frame the problem as a regression task where the vision score is treated as a continuous variable. Our dataset includes the following columns:
assists, result, wardsplaced, wpm, wardskilled, wcpm, kills, controlwardsbought, visionscore, gamelength, more_kills, and more_vision

To mitigate overfitting, the data will be split into 75% training and 25% test sets. Our model’s performance will be evaluated using regression metrics such as Mean Absolute Error (MAE), Root Mean Squared Error (RMSE), and the R² score. These metrics will help us understand the predictive accuracy and the variance explained by our model.

At the time of prediction, the only available information will be the in-game performance statistics (e.g., assists, wards placed, ward kills, kills, control wards bought, etc.), allowing the model to generate an estimated vision score. This predictive insight can then be used to further understand a player’s contribution to vision control and overall team strategy.

By addressing this prediction problem, we aim to quantify the impact of in-game performance on vision score, providing a valuable tool for game analysis and strategic planning in League of Legends.

In [50]:

predict_df = team_vision_data.drop(columns=['side', 'year', 'league', 'url','datacompleteness','position','vspm','gameid','url_missing'])


In [51]:
# TODO

## Step 6: Baseline Model

For the baseline model, we used a linear regression, with the following features wardsplaced, wpm, wardskilled, wcpm, controlwardsbought. The features are quantitative. We utilized StandardScaler Transformer to transform them into standard scale, becasue each match has different time length, and therefore the statistics could seem really different without being standardized. 

We also used Polynomial Features to fine a hyperameter that best fit the model. After fitting the model, our R squared score on the training data was 0.9102. Though our accuracy is high the RSME on the training data was 18.3220, which is not very good. Our R squared score on the test data was 0.9078, which means our model has low variance. The RSME on the test data was 22.4480. Our model still has large improvement space, and we will improve it by adding more features and using a random forest regressor, and tuning hyperparameters in the next section because it will capture complex, non-linear interactions without needing to manually generate polynomial features.

In [132]:
predict_df.head()

Unnamed: 0,assists,result,wardsplaced,wpm,wardskilled,wcpm,kills,controlwardsbought,visionscore,gamelength,more_vision,more_kills
32650,60,True,151.0,3.7956,47.0,1.1814,21,31.0,304.0,2387,False,True
32651,23,False,139.0,3.4939,57.0,1.4328,13,35.0,359.0,2387,True,False
32662,11,False,99.0,3.3712,25.0,0.8513,9,23.0,188.0,1762,False,False
32663,53,True,117.0,3.9841,31.0,1.0556,24,29.0,254.0,1762,True,True
32674,53,False,160.0,3.6309,45.0,1.0212,24,50.0,313.0,2644,False,True


In [134]:
X = predict_df.drop(columns=['assists','result','kills','visionscore','gamelength','more_vision','more_kills'], axis=1)
y = predict_df['visionscore']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [136]:
# Determining the hyperparameter

errs_df = pd.DataFrame()

for d in tqdm(range(1, 6)):
    pl= make_pipeline(
        StandardScaler(),
        PolynomialFeatures(d),
        LinearRegression(),
    )
    
    errs = cross_val_score(pl, X_train, y_train, 
                           cv=KFold(5, shuffle=True, random_state=1), scoring='neg_root_mean_squared_error')
    errs_df[f'Deg {d}'] = -errs # Negate to turn positive (sklearn computed negative RMSE).
    
errs_df.index = [f'Fold {i}' for i in range(1, 6)]
errs_df.index.name = 'Validation Fold'

100%|██████████| 5/5 [00:06<00:00,  1.32s/it]


In [137]:
errs_df

Unnamed: 0_level_0,Deg 1,Deg 2,Deg 3,Deg 4,Deg 5
Validation Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fold 1,25.53156,22.697743,22.351847,22.307914,22.353565
Fold 2,26.062158,22.911334,22.74387,22.510859,29.204442
Fold 3,25.484909,22.56247,22.208148,22.150119,22.223564
Fold 4,25.871469,22.901414,22.543306,22.492548,29.488179
Fold 5,26.344988,23.030381,22.820117,22.736594,23.299791


In [138]:
errs_df.mean().idxmin()

'Deg 4'

In [139]:
basline_model = make_pipeline(
        StandardScaler(),
        PolynomialFeatures(4),
        LinearRegression(),
    )


In [140]:
basline_model.fit(X_train, y_train)

In [141]:
basline_model.score(X_train, y_train)

0.910282839765318

In [142]:
basline_model.score(X_test, y_test)

0.9077513702464067

In [None]:
root_mean_squared_error(y_train, basline_model.predict(X_train))

22.357646563721072

In [None]:
root_mean_squared_error(y_test, basline_model.predict(X_test))

22.447998976399123

## Step 7: Final Model

In our Final model, we shifted from a polynomial linear regression approach to a Random Forest regressor to better capture complex non-linear interactions among our features. Our dataset includes both categorical variables (such as result, more_vision, and more_kills) and quantitative features (like assists, wardsplaced, wpm, wardskilled, wcpm, kills, controlwardsbought, gamelength, among others). We used a preprocessing pipeline where the categorical features were transformed using OneHotEncoder (with the first category dropped) and the numerical features were standardized using StandardScaler. This ensures that differences in match duration and varying scales among features do not skew the model's performance.

To improve model performance, we implemented hyperparameter tuning using GridSearchCV. We set up a grid that explored combinations of three key hyperparameters for the Random Forest regressor: the number of trees (n_estimators), the maximum depth of the trees (max_depth), and the minimum number of samples required to split a node (min_samples_split). The grid search was conducted with 5-fold cross-validation (with shuffling enabled for more robust sampling) and used negative root mean squared error (RMSE) as the scoring metric.

Given the size of our dataset (approximately 100,000 rows), we opted to perform the initial hyperparameter tuning on a smaller subset (50,000 rows). This subset allowed us to efficiently search for the best hyperparameters without the extensive computational time required for the full dataset. On this subset, the grid search identified the best hyperparameters as follows: max_depth of 10, min_samples_split of 5, and n_estimators of 200, resulting in a cross-validated RMSE of around 19.0761 and a test RMSE of approximately 18.9855.

With these promising results from the subset, we applied the tuned hyperparameters to a pipeline re-fitted on the full training data. This approach should leverage the model's ability to capture non-linearities and complex feature interactions, ultimately enhancing the prediction of the team's vision score compared to our baseline model. Using the best parameter the test dataset RMSE was 18.8683, the train dataset R^2 was 0.9438, and the test R^2 was 0.9348

## Random forest 

## using a subset of the data

In [None]:
# A dictionary that maps names to Pipeline objects.
n=3
select = FunctionTransformer(lambda x: x)
pipes = {
    'wardsplaced + wardskilled': make_pipeline(
        make_column_transformer( (select, ['wardsplaced', 'wardskilled']) ),
        StandardScaler(),
        PolynomialFeatures(n),
        LinearRegression(),
    ),
    'wardsplaced + wardskilled + controlwardsbought + gamelength': make_pipeline(
        make_column_transformer( (select, ['wardsplaced', 'wardskilled','controlwardsbought','gamelength']) ),
        StandardScaler(),
        PolynomialFeatures(n),
        LinearRegression(),
    ),
    'all ward + controlwardsbought + gamelength': make_pipeline(
        make_column_transformer( (select, ['wardsplaced', 'wardskilled','controlwardsbought','gamelength','wcpm','wpm']) ),
        StandardScaler(),
        PolynomialFeatures(n),
        LinearRegression(),
    ),
    'All columns': make_pipeline(
       make_column_transformer(
           (OneHotEncoder(drop='first'), ['result', 'more_vision', 'more_kills']),
           remainder='passthrough',
           force_int_remainder_cols=False,
           ),
        StandardScaler(),
        PolynomialFeatures(n),
        LinearRegression(),
    ),
}

In [None]:
pipe_df = pd.DataFrame()

for pipe in pipes:
    errs = cross_val_score(pipes[pipe], X_train, y_train,
                           cv=KFold(5, shuffle=True, random_state=1), scoring='neg_root_mean_squared_error')
    pipe_df[pipe] = -errs
    
pipe_df.index = [f'Fold {i}' for i in range(1, 6)]
pipe_df.index.name = 'Validation Fold'

In [None]:
pipe_df

Unnamed: 0_level_0,wardsplaced + wardskilled,wardsplaced + wardskilled + controlwardsbought + gamelength,all ward + controlwardsbought + gamelength,All columns
Validation Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fold 1,25.521366,22.487644,22.331426,18.323781
Fold 2,26.328485,22.939697,22.652021,18.778464
Fold 3,25.36842,22.342301,22.184071,18.288474
Fold 4,26.030754,22.71755,22.557773,18.582842
Fold 5,26.119618,22.922634,22.788181,18.754932


In [None]:
pipe_df.mean()

wardsplaced + wardskilled                                      25.873729
wardsplaced + wardskilled + controlwardsbought + gamelength    22.681965
all ward + controlwardsbought + gamelength                     22.502694
All columns                                                    18.545699
dtype: float64

In [None]:
# Sample a smaller subset (e.g., 10,000 rows) from your dataset
subset_df = predict_df.sample(n=50000, random_state=1)

# Define features and target variable for the subset
X_sub = subset_df.drop('visionscore', axis=1)
y_sub = subset_df['visionscore']

# Split the subset into training and testing sets
X_train_sub, X_test_sub, y_train_sub, y_test_sub = train_test_split(X_sub, y_sub, test_size=0.2, random_state=1)

# Define categorical and numerical columns
categorical_cols = ['result', 'more_vision', 'more_kills']
numerical_cols = [col for col in X_sub.columns if col not in categorical_cols]


In [None]:
# Create a preprocessor for both categorical and numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first'), categorical_cols),
        ('num', StandardScaler(), numerical_cols)
    ],
    remainder='passthrough'
)

# Create a pipeline that applies preprocessing and then fits a Random Forest regressor
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=1))
])

# Define a grid of hyperparameters for tuning
param_grid = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__max_depth': [None, 5, 10],
    'regressor__min_samples_split': [2, 5, 10]
}



In [None]:
# Set up GridSearchCV with 5-fold cross-validation and negative RMSE scoring
grid_search_sub = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='neg_root_mean_squared_error',
    n_jobs=1
)

# Fit GridSearchCV on the subset's training data
grid_search_sub.fit(X_train_sub, y_train_sub)



In [None]:
# Output the best hyperparameters and corresponding cross-validated RMSE for the subset
print("Best parameters on subset:", grid_search_sub.best_params_)
print("Best CV RMSE on subset:", -grid_search_sub.best_score_)

# Evaluate the best model from the subset on its test set
y_pred_sub = grid_search_sub.predict(X_test_sub)
test_rmse_sub = np.sqrt(mean_squared_error(y_test_sub, y_pred_sub))
print("Test RMSE on subset:", test_rmse_sub)

# After tuning on the subset, you can apply these best parameters to a new pipeline 
# and re-fit on the full training data if desired.

Best parameters on subset: {'regressor__max_depth': 10, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 200}
Best CV RMSE on subset: 19.07612500045631
Test RMSE on subset: 18.985596255988252


## Using the full dataset

In [115]:
# Assuming 'predict_df' is your DataFrame
# Define features and target variable
X = predict_df.drop('visionscore', axis=1)
y = predict_df['visionscore']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Identify categorical and numerical columns
categorical_cols = ['result', 'more_vision', 'more_kills']
numerical_cols = [col for col in X.columns if col not in categorical_cols]


In [116]:
# Preprocessing pipeline: apply OneHotEncoder for categorical features and StandardScaler for numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first'), categorical_cols),
        ('num', StandardScaler(), numerical_cols)
    ],
    remainder='passthrough'
)

# Create a pipeline that first preprocesses the data then applies a Random Forest regressor
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=1))
])



In [119]:
# Define a grid of hyperparameters for the Random Forest
param_grid = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__max_depth': [None, 5, 10],
    'regressor__min_samples_split': [2, 5, 10]
}

# Set up GridSearchCV with 5-fold cross-validation and negative RMSE scoring


cv_strategy = KFold(n_splits=5, shuffle=True, random_state=1)
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=cv_strategy,
    scoring='neg_root_mean_squared_error',
    n_jobs=1
)

# Fit GridSearchCV on the training data
grid_search.fit(X_train, y_train)



In [131]:
# Output the best hyperparameters and corresponding cross-validated RMSE
print("Best parameters found:", grid_search.best_params_)
print("Train RMSE:", -grid_search.score(X_train, y_train))

# Evaluate the best model on the test set
y_pred = grid_search.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Test RMSE:", test_rmse)

print("Train R^2:", r2_score(y_train, grid_search.predict(X_train)))

print("Test R^2:",r2_score(y_test, grid_search.predict(X_test)) )


Best parameters found: {'regressor__max_depth': 10, 'regressor__min_samples_split': 5, 'regressor__n_estimators': 200}
Train RMSE: 17.68767436714722
Test RMSE: 18.868267591608635
Train R^2: 0.9438480536765359
Test R^2: 0.9348268503513951


## Step 8: Fairness Analysis

In [None]:
# TODO