In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Lasso, Ridge, RidgeCV, SGDRegressor
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score, cross_validate
from sklearn.neural_network import MLPRegressor
import scoring_utils
import warnings
# warnings.filterwarnings("ignore")

In [2]:
feature_coef = pd.read_csv('https://raw.githubusercontent.com/sambuddharay/FinalYearProject/main/model1/feature_coefficients.csv')
for col in feature_coef.columns:
    print((col, feature_coef[col]))

('Unnamed: 0', 0    0
Name: Unnamed: 0, dtype: int64)
('Sweeper AvgDist', 0   -0.012394
Name: Sweeper AvgDist, dtype: float64)
('Passes AvgLen', 0    0.005435
Name: Passes AvgLen, dtype: float64)
('Tackles Tkl', 0    0.000444
Name: Tackles Tkl, dtype: float64)
('Tackles TklW', 0   -0.000284
Name: Tackles TklW, dtype: float64)
('Tackles Def 3rd', 0   -0.000317
Name: Tackles Def 3rd, dtype: float64)
('Tackles Mid 3rd', 0    0.000581
Name: Tackles Mid 3rd, dtype: float64)
('Tackles Att 3rd', 0    0.00018
Name: Tackles Att 3rd, dtype: float64)
('Challenges Tkl', 0    0.00469
Name: Challenges Tkl, dtype: float64)
('Challenges Att', 0    0.000083
Name: Challenges Att, dtype: float64)
('Challenges Tkl%', 0   -0.074273
Name: Challenges Tkl%, dtype: float64)
('Challenges Lost', 0   -0.004607
Name: Challenges Lost, dtype: float64)
('Blocks Blocks', 0   -0.000977
Name: Blocks Blocks, dtype: float64)
('Blocks Sh', 0   -0.000335
Name: Blocks Sh, dtype: float64)
('Blocks Pass', 0   -0.000641
Name: B

In [3]:
coefficient_cols = feature_coef.columns.to_list()

In [4]:
url = 'https://raw.githubusercontent.com/sambuddharay/FinalYearProject/main/scrapping/data/Premier-League-Stats/2021-2022/player/Premier-League-Stats-2021-2022-defense-player.csv'
df1 = pd.read_csv(url)
url = 'https://raw.githubusercontent.com/sambuddharay/FinalYearProject/main/scrapping/data/Premier-League-Stats/2021-2022/player/Premier-League-Stats-2021-2022-gca-player.csv'
df2 = pd.read_csv(url)
url = 'https://raw.githubusercontent.com/sambuddharay/FinalYearProject/main/scrapping/data/Premier-League-Stats/2021-2022/player/Premier-League-Stats-2021-2022-misc-player.csv'
df3 = pd.read_csv(url)
url = 'https://raw.githubusercontent.com/sambuddharay/FinalYearProject/main/scrapping/data/Premier-League-Stats/2021-2022/player/Premier-League-Stats-2021-2022-passing-player.csv'
df4 = pd.read_csv(url)
url = 'https://raw.githubusercontent.com/sambuddharay/FinalYearProject/main/scrapping/data/Premier-League-Stats/2021-2022/player/Premier-League-Stats-2021-2022-playingtime-player.csv'
df5 = pd.read_csv(url)
url = 'https://raw.githubusercontent.com/sambuddharay/FinalYearProject/main/scrapping/data/Premier-League-Stats/2021-2022/player/Premier-League-Stats-2021-2022-possession-player.csv'
df6 = pd.read_csv(url)
url = 'https://raw.githubusercontent.com/sambuddharay/FinalYearProject/main/scrapping/data/Premier-League-Stats/2021-2022/player/Premier-League-Stats-2021-2022-shooting-player.csv'
df7 = pd.read_csv(url)

In [5]:
print(df1.columns)
print(df2.columns)
print(df3.columns)
print(df4.columns)
print(df5.columns)
print(df6.columns)
print(df7.columns)

Index(['Rk', 'Player', 'Nation', 'Pos', 'Squad', 'Age', 'Born', '90s',
       'Tackles Tkl', 'Tackles TklW', 'Tackles Def 3rd', 'Tackles Mid 3rd',
       'Tackles Att 3rd', 'Challenges Tkl', 'Challenges Att',
       'Challenges Tkl%', 'Challenges Lost', 'Blocks Blocks', 'Blocks Sh',
       'Blocks Pass', 'Int', 'Tkl+Int', 'Clr', 'Err'],
      dtype='object')
Index(['Rk', 'Player', 'Nation', 'Pos', 'Squad', 'Age', 'Born', '90s',
       'SCA SCA', 'SCA SCA90', 'SCA Types PassLive', 'SCA Types PassDead',
       'SCA Types TO', 'SCA Types Sh', 'SCA Types Fld', 'SCA Types Def',
       'GCA GCA', 'GCA GCA90', 'GCA Types PassLive', 'GCA Types PassDead',
       'GCA Types TO', 'GCA Types Sh', 'GCA Types Fld', 'GCA Types Def'],
      dtype='object')
Index(['Rk', 'Player', 'Nation', 'Pos', 'Squad', 'Age', 'Born', '90s',
       'Performance CrdY', 'Performance CrdR', 'Performance 2CrdY',
       'Performance Fls', 'Performance Fld', 'Performance Off',
       'Performance Crs', 'Performance Int', '

In [6]:
url = 'https://raw.githubusercontent.com/sambuddharay/FinalYearProject/main/scrapping/data/Premier-League-Stats/2021-2022/player/Premier-League-Stats-2021-2022-keepers-player.csv'
df8 = pd.read_csv(url)
url = 'https://raw.githubusercontent.com/sambuddharay/FinalYearProject/main/scrapping/data/Premier-League-Stats/2021-2022/player/Premier-League-Stats-2021-2022-keepersadv-player.csv'
df9 = pd.read_csv(url)

In [7]:
print(df8.columns)
print(df9.columns)

Index(['Rk', 'Player', 'Nation', 'Pos', 'Squad', 'Age', 'Born',
       'Playing Time MP', 'Playing Time Starts', 'Playing Time Min',
       'Playing Time 90s', 'Performance GA', 'Performance GA90',
       'Performance SoTA', 'Performance Saves', 'Performance Save%',
       'Performance W', 'Performance D', 'Performance L', 'Performance CS',
       'Performance CS%', 'Penalty Kicks PKatt', 'Penalty Kicks PKA',
       'Penalty Kicks PKsv', 'Penalty Kicks PKm', 'Penalty Kicks Save%'],
      dtype='object')
Index(['Rk', 'Player', 'Nation', 'Pos', 'Squad', 'Age', 'Born', '90s',
       'Goals GA', 'Goals PKA', 'Goals FK', 'Goals CK', 'Goals OG',
       'Expected PSxG', 'Expected PSxG/SoT', 'Expected PSxG+/-',
       'Expected /90', 'Launched Cmp', 'Launched Att', 'Launched Cmp%',
       'Passes Att (GK)', 'Passes Thr', 'Passes Launch%', 'Passes AvgLen',
       'Goal Kicks Att', 'Goal Kicks Launch%', 'Goal Kicks AvgLen',
       'Crosses Opp', 'Crosses Stp', 'Crosses Stp%', 'Sweeper #OPA',
   

In [8]:
print(df1.shape)
print(df2.shape)
print(df3.shape)
print(df4.shape)
print(df5.shape)
print(df6.shape)
print(df7.shape)
print(df8.shape)
print(df9.shape)

(546, 24)
(546, 24)
(546, 24)
(546, 31)
(691, 29)
(546, 30)
(546, 25)
(42, 26)
(42, 33)


In [9]:
print(df1.shape[0]-df1[df1['90s']==0].shape[0])
print(df6.shape[0]-df6[df6['90s']==0].shape[0])

536
536


In [10]:
# df1=df1[df1['90s']>0]
# df2=df2[df2['90s']>0]
# df3=df3[df3['90s']>0]
# df4=df4[df4['90s']>0]
# df5=df5[df5['Playing Time 90s']>0]
# df6=df6[df6['90s']>0]
# df7=df7[df7['90s']>0]
# df8=df8[df8['Playing Time 90s']>0]
# df9=df9[df9['90s']>0]

In [11]:
df1 = df1.drop(df1[df1['90s']==0].index)
df2 = df2.drop(df2[df2['90s']==0].index)
df3 = df3.drop(df3[df3['90s']==0].index)
df4 = df4.drop(df4[df4['90s']==0].index)
df5 = df5.drop(df5[df5['Playing Time 90s']==0].index)
df6 = df6.drop(df6[df6['90s']==0].index)
df7 = df7.drop(df7[df7['90s']==0].index)
df8 = df8.drop(df8[df8['Playing Time 90s']==0].index)
df9 = df9.drop(df9[df9['90s']==0].index)

In [12]:
print(df1.shape)
print(df2.shape)
print(df3.shape)
print(df4.shape)
print(df5.shape)
print(df6.shape)
print(df7.shape)
print(df8.shape)
print(df9.shape)

(536, 24)
(536, 24)
(536, 24)
(536, 31)
(680, 29)
(536, 30)
(536, 25)
(42, 26)
(42, 33)


In [13]:
df1.head(20)

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Age,Born,90s,Tackles Tkl,Tackles TklW,...,Challenges Att,Challenges Tkl%,Challenges Lost,Blocks Blocks,Blocks Sh,Blocks Pass,Int,Tkl+Int,Clr,Err
0,1,Max Aarons,ENG,DF,Norwich City,21,2000,32.0,64,44,...,55,67.3,18,39,19,20,28,92,96,1
1,2,Che Adams,SCO,FW,Southampton,25,1996,22.7,13,6,...,16,12.5,14,18,2,16,8,21,18,0
2,3,Rayan Aït Nouri,ALG,DF|MF,Wolves,20,2001,20.3,64,36,...,44,50.0,22,24,2,22,22,86,39,1
3,4,Kristoffer Ajer,NOR,DF,Brentford,23,1998,22.2,29,18,...,21,61.9,8,19,11,8,26,55,74,0
4,5,Nathan Aké,NED,DF,Manchester City,26,1995,10.3,16,9,...,8,75.0,2,7,5,2,8,24,23,0
5,6,Marc Albrighton,ENG,FW|DF,Leicester City,31,1989,12.6,31,19,...,47,21.3,37,15,3,12,14,45,20,0
6,7,Thiago Alcántara,ESP,MF,Liverpool,30,1991,17.0,48,29,...,42,23.8,32,22,2,20,26,74,13,2
7,8,Trent Alexander-Arnold,ENG,DF,Liverpool,22,1998,31.7,40,23,...,60,45.0,33,22,4,18,43,83,40,0
8,9,Alisson,BRA,GK,Liverpool,28,1992,36.0,3,1,...,5,60.0,2,1,0,1,0,3,28,1
9,10,Allan,BRA,MF,Everton,30,1991,24.3,74,44,...,105,32.4,71,40,9,31,23,97,39,1


In [14]:
df2.head(20)

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Age,Born,90s,SCA SCA,SCA SCA90,...,SCA Types Fld,SCA Types Def,GCA GCA,GCA GCA90,GCA Types PassLive,GCA Types PassDead,GCA Types TO,GCA Types Sh,GCA Types Fld,GCA Types Def
0,1,Max Aarons,ENG,DF,Norwich City,21,2000,32.0,46,1.44,...,3,1,3,0.09,3,0,0,0,0,0
1,2,Che Adams,SCO,FW,Southampton,25,1996,22.7,51,2.25,...,11,2,5,0.22,3,0,1,0,0,1
2,3,Rayan Aït Nouri,ALG,DF|MF,Wolves,20,2001,20.3,45,2.22,...,2,1,6,0.3,4,1,0,0,1,0
3,4,Kristoffer Ajer,NOR,DF,Brentford,23,1998,22.2,33,1.5,...,1,1,7,0.32,3,1,2,0,1,0
4,5,Nathan Aké,NED,DF,Manchester City,26,1995,10.3,5,0.49,...,0,0,0,0.0,0,0,0,0,0,0
5,6,Marc Albrighton,ENG,FW|DF,Leicester City,31,1989,12.6,19,1.51,...,3,0,2,0.16,2,0,0,0,0,0
6,7,Thiago Alcántara,ESP,MF,Liverpool,30,1991,17.0,67,3.93,...,1,1,6,0.35,6,0,0,0,0,0
7,8,Trent Alexander-Arnold,ENG,DF,Liverpool,22,1998,31.7,160,5.05,...,1,0,18,0.57,13,4,0,1,0,0
8,9,Alisson,BRA,GK,Liverpool,28,1992,36.0,2,0.06,...,0,0,1,0.03,1,0,0,0,0,0
9,10,Allan,BRA,MF,Everton,30,1991,24.3,38,1.57,...,1,1,7,0.29,5,0,1,0,1,0


In [15]:
df8

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Age,Born,Playing Time MP,Playing Time Starts,Playing Time Min,...,Performance W,Performance D,Performance L,Performance CS,Performance CS%,Penalty Kicks PKatt,Penalty Kicks PKA,Penalty Kicks PKsv,Penalty Kicks PKm,Penalty Kicks Save%
0,1,Alisson,BRA,GK,Liverpool,28,1992,36,36,3240,...,27,7,2,20,55.6,0,0,0,0,
1,2,Alphonse Areola,FRA,GK,West Ham,28,1993,1,1,90,...,0,0,1,0,0.0,0,0,0,0,
2,3,Kepa Arrizabalaga,ESP,GK,Chelsea,26,1994,4,4,360,...,2,1,1,2,50.0,0,0,0,0,
3,4,Daniel Bachmann,AUT,GK,Watford,27,1994,12,12,1080,...,1,0,11,0,0.0,3,3,0,0,0.0
4,5,Asmir Begović,BIH,GK,Everton,34,1987,3,3,270,...,1,0,2,1,33.3,1,1,0,0,0.0
5,6,Jack Butland,ENG,GK,Crystal Palace,28,1993,9,8,765,...,2,4,2,1,12.5,1,0,1,0,100.0
6,7,Willy Caballero,ARG,GK,Southampton,39,1981,2,2,180,...,0,1,1,0,0.0,0,0,0,0,
7,8,Karl Darlow,ENG,GK,Newcastle Utd,30,1990,8,8,720,...,0,5,3,0,0.0,2,2,0,0,0.0
8,9,Martin Dúbravka,SVK,GK,Newcastle Utd,32,1989,26,26,2340,...,13,4,9,8,30.8,1,1,0,0,0.0
9,10,Ederson,BRA,GK,Manchester City,27,1993,37,37,3330,...,28,6,3,20,54.1,1,1,0,0,0.0


In [16]:
df9

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Age,Born,90s,Goals GA,Goals PKA,...,Passes AvgLen,Goal Kicks Att,Goal Kicks Launch%,Goal Kicks AvgLen,Crosses Opp,Crosses Stp,Crosses Stp%,Sweeper #OPA,Sweeper #OPA/90,Sweeper AvgDist
0,1,Alisson,BRA,GK,Liverpool,28,1992,36.0,24,0,...,26.3,172,33.1,33.4,340,22,6.5,89,2.47,18.7
1,2,Alphonse Areola,FRA,GK,West Ham,28,1993,1.0,1,0,...,46.3,14,71.4,48.5,15,1,6.7,0,0.0,5.5
2,3,Kepa Arrizabalaga,ESP,GK,Chelsea,26,1994,4.0,2,0,...,26.4,21,28.6,27.6,44,2,4.5,8,2.0,18.8
3,4,Daniel Bachmann,AUT,GK,Watford,27,1994,12.0,28,3,...,40.9,87,67.8,47.8,187,7,3.7,12,1.0,16.1
4,5,Asmir Begović,BIH,GK,Everton,34,1987,3.0,8,1,...,48.9,31,71.0,51.3,50,3,6.0,2,0.67,10.7
5,6,Jack Butland,ENG,GK,Crystal Palace,28,1993,8.5,13,0,...,29.8,66,45.5,36.3,139,11,7.9,7,0.82,11.2
6,7,Willy Caballero,ARG,GK,Southampton,39,1981,2.0,5,0,...,38.4,13,92.3,56.5,33,1,3.0,4,2.0,14.0
7,8,Karl Darlow,ENG,GK,Newcastle Utd,30,1990,8.0,15,2,...,44.5,62,77.4,56.7,106,5,4.7,4,0.5,10.5
8,9,Martin Dúbravka,SVK,GK,Newcastle Utd,32,1989,26.0,35,1,...,37.0,236,91.9,60.5,360,22,6.1,14,0.54,11.9
9,10,Ederson,BRA,GK,Manchester City,27,1993,37.0,26,1,...,25.7,138,19.6,26.8,300,21,7.0,55,1.49,19.0


In [17]:
frames = [df1[['Player', 'Nation', 'Pos', 'Squad', 'Born', 'Tackles Tkl', 'Tackles TklW', 'Tackles Def 3rd', 'Tackles Mid 3rd', 'Tackles Att 3rd', 'Challenges Tkl', 'Challenges Att', 'Challenges Tkl%', 'Challenges Lost', 'Blocks Blocks', 'Blocks Sh', 'Blocks Pass', 'Int', 'Tkl+Int', 'Clr', 'Err']],
          df2[['SCA SCA', 'SCA SCA90', 'SCA Types PassLive', 'SCA Types PassDead', 'SCA Types TO', 'SCA Types Sh', 'SCA Types Fld', 'SCA Types Def', 'GCA GCA', 'GCA GCA90', 'GCA Types PassLive', 'GCA Types PassDead', 'GCA Types TO', 'GCA Types Sh', 'GCA Types Fld', 'GCA Types Def']],
          df3[['Performance CrdY', 'Performance CrdR', 'Performance 2CrdY', 'Performance Fls', 'Performance Fld', 'Performance Off', 'Performance Crs', 'Performance Int', 'Performance TklW', 'Performance PKwon', 'Performance PKcon', 'Performance OG', 'Performance Recov', 'Aerial Duels Won', 'Aerial Duels Lost', 'Aerial Duels Won%']],
          df4[['Total Cmp', 'Total Att', 'Total Cmp%', 'Total TotDist', 'Total PrgDist', 'Short Cmp', 'Short Att', 'Short Cmp%', 'Medium Cmp', 'Medium Att', 'Medium Cmp%', 'Long Cmp', 'Long Att', 'Long Cmp%', 'Ast', 'xAG', 'Expected xA', 'Expected A-xAG', 'KP', '1/3', 'PPA', 'CrsPA', 'PrgP']],
          df5[['Playing Time MP', 'Playing Time Min', 'Playing Time Mn/MP', 'Playing Time Min%', 'Playing Time 90s', 'Starts Starts', 'Starts Mn/Start', 'Starts Compl', 'Subs Subs', 'Subs Mn/Sub', 'Subs unSub', 'Team Success PPM', 'Team Success onG', 'Team Success onGA', 'Team Success +/-', 'Team Success +/-90', 'Team Success On-Off', 'Team Success (xG) onxG', 'Team Success (xG) onxGA', 'Team Success (xG) xG+/-', 'Team Success (xG) xG+/-90', 'Team Success (xG) On-Off']],
          df6[['Touches Touches', 'Touches Def Pen', 'Touches Def 3rd', 'Touches Mid 3rd', 'Touches Att 3rd', 'Touches Att Pen', 'Touches Live', 'Take-Ons Att', 'Take-Ons Succ', 'Take-Ons Succ%', 'Take-Ons Tkld', 'Take-Ons Tkld%', 'Carries Carries', 'Carries TotDist', 'Carries PrgDist', 'Carries PrgC', 'Carries 1/3', 'Carries CPA', 'Carries Mis', 'Carries Dis', 'Receiving Rec', 'Receiving PrgR']],
          df7[['Standard Gls', 'Standard Sh', 'Standard SoT', 'Standard SoT%','Standard Sh/90', 'Standard SoT/90', 'Standard G/Sh', 'Standard G/SoT', 'Standard Dist', 'Standard FK', 'Standard PK', 'Standard PKatt', 'Expected xG', 'Expected npxG', 'Expected npxG/Sh', 'Expected G-xG', 'Expected np:G-xG']]]
players = pd.concat(frames, axis=1)

In [18]:
frames = [df8[['Player', 'Nation', 'Pos', 'Squad', 'Age', 'Born', 'Playing Time MP', 'Playing Time Starts', 'Playing Time Min', 'Playing Time 90s', 'Performance GA', 'Performance GA90', 'Performance SoTA', 'Performance Saves', 'Performance Save%', 'Performance W', 'Performance D', 'Performance L', 'Performance CS', 'Performance CS%', 'Penalty Kicks PKatt', 'Penalty Kicks PKA', 'Penalty Kicks PKsv', 'Penalty Kicks PKm']],
          df9[['Goals GA', 'Goals PKA', 'Goals FK', 'Goals CK', 'Goals OG', 'Expected PSxG', 'Expected PSxG/SoT', 'Expected PSxG+/-', 'Expected /90', 'Launched Cmp', 'Launched Att', 'Launched Cmp%', 'Passes Att (GK)', 'Passes Thr', 'Passes Launch%', 'Passes AvgLen', 'Goal Kicks Att', 'Goal Kicks Launch%', 'Goal Kicks AvgLen', 'Crosses Opp', 'Crosses Stp', 'Crosses Stp%', 'Sweeper #OPA', 'Sweeper #OPA/90', 'Sweeper AvgDist']]]
goalkeepers = pd.concat(frames, axis=1)

In [19]:
for col in players.columns.to_list():
    print((col, players[col].isnull().sum()))

('Player', 153)
('Nation', 153)
('Pos', 153)
('Squad', 153)
('Born', 153)
('Tackles Tkl', 153)
('Tackles TklW', 153)
('Tackles Def 3rd', 153)
('Tackles Mid 3rd', 153)
('Tackles Att 3rd', 153)
('Challenges Tkl', 153)
('Challenges Att', 153)
('Challenges Tkl%', 206)
('Challenges Lost', 153)
('Blocks Blocks', 153)
('Blocks Sh', 153)
('Blocks Pass', 153)
('Int', 153)
('Tkl+Int', 153)
('Clr', 153)
('Err', 153)
('SCA SCA', 153)
('SCA SCA90', 153)
('SCA Types PassLive', 153)
('SCA Types PassDead', 153)
('SCA Types TO', 153)
('SCA Types Sh', 153)
('SCA Types Fld', 153)
('SCA Types Def', 153)
('GCA GCA', 153)
('GCA GCA90', 153)
('GCA Types PassLive', 153)
('GCA Types PassDead', 153)
('GCA Types TO', 153)
('GCA Types Sh', 153)
('GCA Types Fld', 153)
('GCA Types Def', 153)
('Performance CrdY', 153)
('Performance CrdR', 153)
('Performance 2CrdY', 153)
('Performance Fls', 153)
('Performance Fld', 153)
('Performance Off', 153)
('Performance Crs', 153)
('Performance Int', 153)
('Performance TklW', 15

In [20]:
for col in players.columns.to_list():
    print((col, players[col].isnull().sum()-153))

('Player', 0)
('Nation', 0)
('Pos', 0)
('Squad', 0)
('Born', 0)
('Tackles Tkl', 0)
('Tackles TklW', 0)
('Tackles Def 3rd', 0)
('Tackles Mid 3rd', 0)
('Tackles Att 3rd', 0)
('Challenges Tkl', 0)
('Challenges Att', 0)
('Challenges Tkl%', 53)
('Challenges Lost', 0)
('Blocks Blocks', 0)
('Blocks Sh', 0)
('Blocks Pass', 0)
('Int', 0)
('Tkl+Int', 0)
('Clr', 0)
('Err', 0)
('SCA SCA', 0)
('SCA SCA90', 0)
('SCA Types PassLive', 0)
('SCA Types PassDead', 0)
('SCA Types TO', 0)
('SCA Types Sh', 0)
('SCA Types Fld', 0)
('SCA Types Def', 0)
('GCA GCA', 0)
('GCA GCA90', 0)
('GCA Types PassLive', 0)
('GCA Types PassDead', 0)
('GCA Types TO', 0)
('GCA Types Sh', 0)
('GCA Types Fld', 0)
('GCA Types Def', 0)
('Performance CrdY', 0)
('Performance CrdR', 0)
('Performance 2CrdY', 0)
('Performance Fls', 0)
('Performance Fld', 0)
('Performance Off', 0)
('Performance Crs', 0)
('Performance Int', 0)
('Performance TklW', 0)
('Performance PKwon', 0)
('Performance PKcon', 0)
('Performance OG', 0)
('Performance Re

In [21]:
cols = ['Challenges Tkl%', 'Aerial Duels Won%', 'Short Cmp%', 'Medium Cmp%', 'Long Cmp%', 'Starts Mn/Start', 'Subs Mn/Sub', 'Team Success On-Off', 'Team Success (xG) On-Off', 'Take-Ons Succ%', 'Take-Ons Tkld%', 'Standard SoT%', 'Standard G/Sh', 'Standard G/SoT', 'Standard Dist', 'Expected npxG/Sh']
for col in cols:
    players[col] = players[col].fillna(0)

In [22]:
for col in goalkeepers.columns.to_list():
    print((col, goalkeepers[col].isnull().sum()))

('Player', 0)
('Nation', 0)
('Pos', 0)
('Squad', 0)
('Age', 0)
('Born', 0)
('Playing Time MP', 0)
('Playing Time Starts', 0)
('Playing Time Min', 0)
('Playing Time 90s', 0)
('Performance GA', 0)
('Performance GA90', 0)
('Performance SoTA', 0)
('Performance Saves', 0)
('Performance Save%', 0)
('Performance W', 0)
('Performance D', 0)
('Performance L', 0)
('Performance CS', 0)
('Performance CS%', 1)
('Penalty Kicks PKatt', 0)
('Penalty Kicks PKA', 0)
('Penalty Kicks PKsv', 0)
('Penalty Kicks PKm', 0)
('Goals GA', 0)
('Goals PKA', 0)
('Goals FK', 0)
('Goals CK', 0)
('Goals OG', 0)
('Expected PSxG', 0)
('Expected PSxG/SoT', 0)
('Expected PSxG+/-', 0)
('Expected /90', 0)
('Launched Cmp', 0)
('Launched Att', 0)
('Launched Cmp%', 0)
('Passes Att (GK)', 0)
('Passes Thr', 0)
('Passes Launch%', 0)
('Passes AvgLen', 0)
('Goal Kicks Att', 0)
('Goal Kicks Launch%', 0)
('Goal Kicks AvgLen', 0)
('Crosses Opp', 0)
('Crosses Stp', 0)
('Crosses Stp%', 0)
('Sweeper #OPA', 0)
('Sweeper #OPA/90', 0)
('Swee

In [23]:
goalkeepers['Performance CS%'] = goalkeepers['Performance CS%'].fillna(0)

In [24]:
players = players.assign(Score = scoring_utils.player_score(players))

In [25]:
players = players.assign(Gk_Score = scoring_utils.goalkeeper_score(players, goalkeepers))

In [26]:
players.to_csv('players.csv')

In [27]:
team_dict = {}
for team, team_df in players.groupby('Squad'):
    team_dict[team] = team_df
team_dict

{'Arsenal':                         Player Nation    Pos    Squad    Born  Tackles Tkl  \
 24   Pierre-Emerick Aubameyang    GAB     FW  Arsenal  1989.0         11.0   
 31             Folarin Balogun    USA     FW  Arsenal  2001.0          1.0   
 76              Calum Chambers    ENG     DF  Arsenal  1995.0          5.0   
 128         Gabriel Dos Santos    BRA     DF  Arsenal  1997.0         50.0   
 141             Mohamed Elneny    EGY     MF  Arsenal  1992.0         16.0   
 216                Rob Holding    ENG     DF  Arsenal  1995.0         13.0   
 261             Sead Kolašinac    BIH     DF  Arsenal  1993.0          0.0   
 271        Alexandre Lacazette    FRA     FW  Arsenal  1991.0         27.0   
 279                 Bernd Leno    GER     GK  Arsenal  1992.0          1.0   
 303     Ainsley Maitland-Niles    ENG     MF  Arsenal  1997.0          8.0   
 308                 Pablo Marí    ESP     DF  Arsenal  1993.0          5.0   
 310         Gabriel Martinelli    BRA  F

In [28]:
feature_coef = pd.read_csv('https://raw.githubusercontent.com/sambuddharay/FinalYearProject/main/model1/feature_coefficients.csv')
for col in feature_coef.columns:
    print((col, feature_coef[col]))

('Unnamed: 0', 0    0
Name: Unnamed: 0, dtype: int64)
('Sweeper AvgDist', 0   -0.012394
Name: Sweeper AvgDist, dtype: float64)
('Passes AvgLen', 0    0.005435
Name: Passes AvgLen, dtype: float64)
('Tackles Tkl', 0    0.000444
Name: Tackles Tkl, dtype: float64)
('Tackles TklW', 0   -0.000284
Name: Tackles TklW, dtype: float64)
('Tackles Def 3rd', 0   -0.000317
Name: Tackles Def 3rd, dtype: float64)
('Tackles Mid 3rd', 0    0.000581
Name: Tackles Mid 3rd, dtype: float64)
('Tackles Att 3rd', 0    0.00018
Name: Tackles Att 3rd, dtype: float64)
('Challenges Tkl', 0    0.00469
Name: Challenges Tkl, dtype: float64)
('Challenges Att', 0    0.000083
Name: Challenges Att, dtype: float64)
('Challenges Tkl%', 0   -0.074273
Name: Challenges Tkl%, dtype: float64)
('Challenges Lost', 0   -0.004607
Name: Challenges Lost, dtype: float64)
('Blocks Blocks', 0   -0.000977
Name: Blocks Blocks, dtype: float64)
('Blocks Sh', 0   -0.000335
Name: Blocks Sh, dtype: float64)
('Blocks Pass', 0   -0.000641
Name: B

In [29]:
arsenal_df = team_dict['Arsenal']
arsenal_df

Unnamed: 0,Player,Nation,Pos,Squad,Born,Tackles Tkl,Tackles TklW,Tackles Def 3rd,Tackles Mid 3rd,Tackles Att 3rd,...,Standard FK,Standard PK,Standard PKatt,Expected xG,Expected npxG,Expected npxG/Sh,Expected G-xG,Expected np:G-xG,Score,Gk_Score
24,Pierre-Emerick Aubameyang,GAB,FW,Arsenal,1989.0,11.0,5.0,6.0,4.0,1.0,...,0.0,0.0,2.0,5.9,4.2,0.13,-1.9,-0.2,,0
31,Folarin Balogun,USA,FW,Arsenal,2001.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.2,0.2,0.07,-0.2,-0.2,,0
76,Calum Chambers,ENG,DF,Arsenal,1995.0,5.0,3.0,3.0,1.0,1.0,...,0.0,0.0,0.0,0.1,0.1,0.04,-0.1,-0.1,,0
128,Gabriel Dos Santos,BRA,DF,Arsenal,1997.0,50.0,30.0,36.0,12.0,2.0,...,0.0,0.0,0.0,3.2,3.2,0.11,1.8,1.8,,0
141,Mohamed Elneny,EGY,MF,Arsenal,1992.0,16.0,8.0,10.0,5.0,1.0,...,0.0,0.0,0.0,0.2,0.2,0.02,-0.2,-0.2,,0
216,Rob Holding,ENG,DF,Arsenal,1995.0,13.0,9.0,11.0,1.0,1.0,...,0.0,0.0,0.0,0.4,0.4,0.09,0.6,0.6,,0
261,Sead Kolašinac,BIH,DF,Arsenal,1993.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0
271,Alexandre Lacazette,FRA,FW,Arsenal,1991.0,27.0,10.0,8.0,13.0,6.0,...,1.0,2.0,3.0,8.4,6.0,0.14,-4.4,-4.0,,0
279,Bernd Leno,GER,GK,Arsenal,1992.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0
303,Ainsley Maitland-Niles,ENG,MF,Arsenal,1997.0,8.0,4.0,5.0,3.0,0.0,...,0.0,0.0,0.0,0.1,0.1,0.03,-0.1,-0.1,,0
