In [None]:
# Load the packages

import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the data:
# EPL match results for the 2020/21 season.

DATA_PATH = '/kaggle/input/epl2021-games/EPL_20212020_DATOS.csv'
PLeague2021 = pd.read_csv(DATA_PATH)
# Print the available column names so the fields used further down can be checked.
column_names = PLeague2021.columns.tolist()
print(column_names)

In [None]:
# Score every match from both sides using the full-time result (FTR):
# a win is worth 1, a draw 0.5 and anything else 0.
full_time_result = PLeague2021['FTR']
home_win = full_time_result == 'H'
away_win = full_time_result == 'A'
# Value of a match that was not won: 0.5 for a draw, 0 otherwise (shared by both sides).
draw_or_nothing = np.where(full_time_result == 'D', .5, 0)
PLeague2021['hwinvalue'] = np.where(home_win, 1, draw_or_nothing)
PLeague2021['awinvalue'] = np.where(away_win, 1, draw_or_nothing)
# Constant 1 per match; summing it per team later gives the number of games played.
PLeague2021['count'] = 1

In [None]:
# To make the prediction, first split the data set into two halves: games played in 2020 and in 2021.
# NOTE(review): comparing against 20210000 presumes Date is stored as a yyyymmdd integer - confirm against the CSV.
played_in_2020 = PLeague2021['Date'] < 20210000
EPL2020 = PLeague2021.loc[played_in_2020]
EPL2020.describe()

In [None]:
# Second half of the split: rows whose Date is above 20210000.
played_in_2021 = PLeague2021['Date'] > 20210000
EPL2021 = PLeague2021.loc[played_in_2021]
EPL2021.describe()

In [None]:
# Group the 2020 matches by home team and sum, per team: games played ('count'), win value,
# goals scored at home (FTHG) and goals conceded at home (FTAG).
# The columns are selected with a list (double brackets). Selecting several columns from a
# groupby with a bare tuple was deprecated in pandas 1.0 and raises ValueError from pandas 2.0.
EPL2020home = EPL2020.groupby('HomeTeam')[['count', 'hwinvalue', 'FTHG', 'FTAG']].sum().reset_index()
# The 'h' suffix marks home totals so they stay distinguishable after merging with the away totals.
EPL2020home = EPL2020home.rename(columns={'HomeTeam':'team','count':'Ph','FTHG':'FTHGh','FTAG':'FTAGh'})
EPL2020home

In [None]:
# Same aggregation for the 2020 matches seen from the away side: games played, win value,
# goals conceded away (FTHG, scored by the home side) and goals scored away (FTAG).
# The columns are selected with a list (double brackets). Selecting several columns from a
# groupby with a bare tuple was deprecated in pandas 1.0 and raises ValueError from pandas 2.0.
EPL2020away = EPL2020.groupby('AwayTeam')[['count', 'awinvalue', 'FTHG', 'FTAG']].sum().reset_index()
# The 'a' suffix marks away totals so they stay distinguishable after merging with the home totals.
EPL2020away = EPL2020away.rename(columns={'AwayTeam':'team','count':'Pa','FTHG':'FTHGa','FTAG':'FTAGa'})
EPL2020away

In [None]:
# Join the home and away aggregates into a single row per team.
# (merge defaults to an inner join, so only teams present in both frames are kept.)
EnglishPL2020 = EPL2020home.merge(EPL2020away, on='team')
EnglishPL2020

In [None]:
# Build the per-team totals by adding a home column and an away column:
# W = win value, G = games played, GF = goals in favor, GA = goals against.
home_away_pairs = {
    'W': ('hwinvalue', 'awinvalue'),
    'G': ('Ph', 'Pa'),
    'GF': ('FTHGh', 'FTAGa'),
    'GA': ('FTAGh', 'FTHGa'),
}
for total_col, (first_col, second_col) in home_away_pairs.items():
    EnglishPL2020[total_col] = EnglishPL2020[first_col] + EnglishPL2020[second_col]
EnglishPL2020

In [None]:
# Bill James' Pythagorean expectation applied to football:
# wpc2020 is the actual win ratio (W / G), pyth2020 the expected one, GF^2 / (GF^2 + GA^2).
goals_for_sq = EnglishPL2020['GF'] ** 2
goals_against_sq = EnglishPL2020['GA'] ** 2
EnglishPL2020['wpc2020'] = EnglishPL2020['W'] / EnglishPL2020['G']
EnglishPL2020['pyth2020'] = goals_for_sq / (goals_for_sq + goals_against_sq)
# Scatter the expected ratio (x) against the actual ratio (y).
sns.relplot(data=EnglishPL2020, x="pyth2020", y="wpc2020")

In [None]:
# Regress the actual win ratio on the Pythagorean expectation to inspect the R-squared.
pyth_model = smf.ols('wpc2020 ~ pyth2020', data=EnglishPL2020)
pyth_lm = pyth_model.fit()
pyth_lm.summary()

In [None]:
# Repeat the same process with the second half of the data set (games in 2021).
# Per home team: games played ('count'), win value, goals scored at home (FTHG) and goals
# conceded at home (FTAG).
# The columns are selected with a list (double brackets). Selecting several columns from a
# groupby with a bare tuple was deprecated in pandas 1.0 and raises ValueError from pandas 2.0.
EPL2021home = EPL2021.groupby('HomeTeam')[['count', 'hwinvalue', 'FTHG', 'FTAG']].sum().reset_index()
# The 'h' suffix marks home totals so they stay distinguishable after merging with the away totals.
EPL2021home = EPL2021home.rename(columns={'HomeTeam':'team','count':'Ph','FTHG':'FTHGh','FTAG':'FTAGh'})
EPL2021home

In [None]:
# Same aggregation for the 2021 matches seen from the away side: games played, win value,
# goals conceded away (FTHG, scored by the home side) and goals scored away (FTAG).
# The columns are selected with a list (double brackets). Selecting several columns from a
# groupby with a bare tuple was deprecated in pandas 1.0 and raises ValueError from pandas 2.0.
EPL2021away = EPL2021.groupby('AwayTeam')[['count', 'awinvalue', 'FTHG', 'FTAG']].sum().reset_index()
# The 'a' suffix marks away totals so they stay distinguishable after merging with the home totals.
EPL2021away = EPL2021away.rename(columns={'AwayTeam':'team','count':'Pa','FTHG':'FTHGa','FTAG':'FTAGa'})
EPL2021away

In [None]:
# Join the 2021 home and away aggregates into a single row per team.
# (merge defaults to an inner join, so only teams present in both frames are kept.)
EnglishPL2021 = EPL2021home.merge(EPL2021away, on='team')
EnglishPL2021

In [None]:
# Build the 2021 per-team totals by adding a home column and an away column:
# W = win value, G = games played, GF = goals in favor, GA = goals against.
home_away_pairs = {
    'W': ('hwinvalue', 'awinvalue'),
    'G': ('Ph', 'Pa'),
    'GF': ('FTHGh', 'FTAGa'),
    'GA': ('FTAGh', 'FTHGa'),
}
for total_col, (first_col, second_col) in home_away_pairs.items():
    EnglishPL2021[total_col] = EnglishPL2021[first_col] + EnglishPL2021[second_col]
EnglishPL2021

In [None]:
# Actual win ratio (W / G) and Pythagorean expectation, GF^2 / (GF^2 + GA^2), for the 2021 games.
goals_for_sq = EnglishPL2021['GF'] ** 2
goals_against_sq = EnglishPL2021['GA'] ** 2
EnglishPL2021['wpc2021'] = EnglishPL2021['W'] / EnglishPL2021['G']
EnglishPL2021['pyth2021'] = goals_for_sq / (goals_for_sq + goals_against_sq)
# Scatter the expected ratio (x) against the actual ratio (y).
sns.relplot(data=EnglishPL2021, x="pyth2021", y="wpc2021")

In [None]:
# Same regression for the 2021 games. Note that this rebinds pyth_lm, replacing the 2020 fit.
pyth_model = smf.ols('wpc2021 ~ pyth2021', data=EnglishPL2021)
pyth_lm = pyth_model.fit()
pyth_lm.summary()

In [None]:
# Finally, attach the 2021 win percentage and Pythagorean expectation to the 2020 table
# so the two halves can be compared team by team.
second_half_ratios = EnglishPL2021[['team', 'wpc2021', 'pyth2021']]
Half2predictor = EnglishPL2020.merge(second_half_ratios, on='team')
Half2predictor

In [None]:
# Correlate the expectation with the actual win percentage. You may find that 'pyth2020' has a
# stronger correlation with the 2021 win percentage (wpc2021) than 'wpc2020' does; that would mean
# 'pyth' acts as a better predictor than the win percentage itself.
keyvars = Half2predictor[['team','wpc2020','wpc2021','pyth2020','pyth2021']]
# 'team' is a label column, so it is left out of the correlation matrix. pandas >= 2.0 no longer
# drops non-numeric columns silently in corr() and raises ValueError instead.
# keyvars itself keeps the 'team' column for the cells that follow.
keyvars.drop(columns=['team']).corr()

In [None]:
# First-half Pythagorean expectation (x) against second-half win percentage (y).
sns.relplot(data=Half2predictor, x="pyth2020", y="wpc2021")

In [None]:
# First-half win percentage (x) against second-half win percentage (y).
sns.relplot(data=Half2predictor, x="wpc2020", y="wpc2021")

In [None]:
# Complete table of ratios for every team, highest 2021 win percentage first. Comparing a team's
# win percentage with its Pythagorean expectation shows whether it overperformed or underperformed.
keyvars = keyvars.sort_values('wpc2021', ascending=False)
keyvars

In [None]:
# Save the table to the working directory; the row index is written as the first column.
keyvars.to_csv('Predicted2021PL.csv', index=True)