In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import utils
from sklearn.linear_model import LinearRegression, Lasso, Ridge, RidgeCV, SGDRegressor
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score, cross_validate
import warnings
warnings.filterwarnings("ignore")

In [2]:
url = 'https://raw.githubusercontent.com/sambuddharay/FinalYearProject/main/datasets/premier-league-players-22-23.csv'
df1 = pd.read_csv(url)

In [3]:
url = 'https://raw.githubusercontent.com/sambuddharay/FinalYearProject/main/datasets/premier-league-teams-22-23.csv'
df2 = pd.read_csv(url)

In [4]:
df1.info

<bound method DataFrame.info of       Rk               Player    Pos         Squad  Age  Born   90s  Gls  Sh  \
0      1     Brenden Aaronson  MF,FW  Leeds United   21  2000  26.4    1  41   
1      2        George Abbott     MF     Tottenham   16  2005   0.0    0   0   
2      3            Che Adams     FW   Southampton   26  1996  22.1    5  47   
3      4          Tyler Adams     MF  Leeds United   23  1999  24.0    0   4   
4      5     Tosin Adarabioyo     DF        Fulham   24  1997  23.2    1  16   
..   ...                  ...    ...           ...  ...   ...   ...  ...  ..   
564  565        Jordan Zemura     DF   Bournemouth   22  1999  16.2    0   4   
565  566  Oleksandr Zinchenko     DF       Arsenal   25  1996  23.5    1  28   
566  567         Hakim Ziyech  FW,MF       Chelsea   29  1993   7.3    0  16   
567  568           Kurt Zouma     DF      West Ham   27  1994  22.1    2  19   
568  569      Martin Odegaard     MF       Arsenal   23  1998  34.7   15  93   

     So

In [5]:
df2.info

<bound method DataFrame.info of               Squad  NoofPl   Age  Poss  MP  90s   Sh  SoT  SoT%  Sh/90  ...  \
0           Arsenal      26  24.7  59.3  38   38  589  194  32.9  15.50  ...   
1       Aston Villa      26  27.0  49.3  38   38  427  145  34.0  11.24  ...   
2       Bournemouth      31  26.3  40.4  38   38  358  126  35.2   9.42  ...   
3         Brentford      25  26.2  43.8  38   38  400  147  36.8  10.53  ...   
4          Brighton      29  26.3  60.2  38   38  607  219  36.1  15.97  ...   
5           Chelsea      32  26.3  58.7  38   38  481  151  31.4  12.66  ...   
6    Crystal Palace      26  26.7  46.3  38   38  423  133  31.4  11.13  ...   
7           Everton      28  26.6  42.8  38   38  426  143  33.6  11.21  ...   
8            Fulham      29  28.2  48.8  38   38  420  137  32.6  11.05  ...   
9      Leeds United      29  25.3  47.0  38   38  459  141  30.7  12.08  ...   
10   Leicester City      28  26.1  47.7  38   38  412  142  34.5  10.84  ...   
11      

In [6]:
df1.shape

(569, 128)

In [7]:
df2.shape

(20, 134)

In [8]:
list(df1.columns.values)

['Rk',
 'Player',
 'Pos',
 'Squad',
 'Age',
 'Born',
 '90s',
 'Gls',
 'Sh',
 'SoT',
 'SoT%',
 'Sh/90',
 'SoT/90',
 'G/Sh',
 'G/SoT',
 'Dist',
 'FK',
 'PK',
 'PKatt',
 'xG',
 'npxG',
 'npxG/Sh',
 'G-xG',
 'np:G-xG',
 'TotalCmp',
 'TotalAtt',
 'TotalCmp%',
 'TotDist',
 'PrgDist',
 'ShortCmp',
 'ShortAtt',
 'ShortCmp%',
 'MediumCmp',
 'MediumAtt',
 'MediumCmp%',
 'LongCmp',
 'LongAtt',
 'LongCmp%',
 'Ast',
 'xAG',
 'xA',
 'A-xAG',
 'KP',
 'FinThP',
 'PPA',
 'CrsPA',
 'PrgP',
 'T-Tkl',
 'T-TklW',
 'T-Def 3rd',
 'T-Mid 3rd',
 'T-Att 3rd',
 'Ch-Tkl',
 'Ch-Att',
 'Ch-Tkl%',
 'Ch-Lost',
 'Blocks',
 'BlockSh',
 'BlockPass',
 'Int',
 'Tkl+Int',
 'Clr',
 'Err',
 'Touches',
 'Touches Def Pen',
 'Touches Def 3rd',
 'Touches Mid 3rd',
 'Touches Att 3rd',
 'Touches Att Pen',
 'Live Touches',
 'TOAtt',
 'TOSucc',
 'TOSucc%',
 'Tkld',
 'Tkld%',
 'Carries',
 'TotDist.1',
 'PrgDist.1',
 'PrgC',
 'FinThC',
 'CPA',
 'Mis',
 'Dis',
 'Rec',
 'PrgR',
 'CrdY',
 'CrdR',
 '2CrdY',
 'Fls',
 'Fld',
 'Off',
 'Crs',

In [9]:
forwards = df1.loc[df1.Pos.str.contains("FW")]

In [10]:
fw_fd = forwards[[
 'Player',
 'Pos',
 'Squad',
 'Age',
 'Born',
 '90s',
 'Gls',
 'Sh',
 'SoT',
 'SoT%',
 'Sh/90',
 'SoT/90',
 'G/Sh',
 'G/SoT',
 'Dist',
 'FK',
 'PK',
 'PKatt',
 'xG',
 'npxG',
 'npxG/Sh',
 'G-xG',
 'np:G-xG',
]].sort_values(by=['Squad'])

In [11]:
fw_fd.head(60)

Unnamed: 0,Player,Pos,Squad,Age,Born,90s,Gls,Sh,SoT,SoT%,...,G/SoT,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG
379,Ethan Nwaneri,FW,Arsenal,15,2007,0.0,0,0,0,,...,,,0,0,0,0.0,0.0,,0.0,0.0
249,Gabriel Jesus,FW,Arsenal,25,1997,22.9,11,76,31,40.8,...,0.32,10.6,0,1,1,14.0,13.2,0.18,-3.0,-3.2
327,Gabriel Martinelli,FW,Arsenal,21,2001,31.0,15,79,30,38.0,...,0.5,15.4,2,0,0,9.3,9.3,0.12,5.7,5.7
530,Fabio Vieira,"MF,FW",Arsenal,22,2000,5.7,1,15,2,13.3,...,0.5,20.1,0,0,0,0.8,0.8,0.06,0.2,0.2
451,Bukayo Saka,FW,Arsenal,20,2001,35.3,14,86,29,33.7,...,0.41,16.3,1,2,3,11.2,9.1,0.11,2.8,2.9
521,Leandro Trossard,FW,Arsenal,27,1994,10.3,1,17,5,29.4,...,0.2,17.5,0,0,0,1.8,1.8,0.11,-0.8,-0.8
370,Reiss Nelson,FW,Arsenal,22,1999,2.4,3,12,7,58.3,...,0.43,16.4,1,0,0,1.1,1.1,0.09,1.9,1.9
481,Emile Smith Rowe,"MF,FW",Arsenal,22,2000,1.9,0,4,1,25.0,...,0.0,15.8,0,0,0,0.3,0.3,0.07,-0.3,-0.3
376,Eddie Nketiah,FW,Arsenal,23,1999,12.1,4,41,14,34.1,...,0.29,11.3,0,0,0,8.2,8.2,0.2,-4.2,-4.2
110,Philippe Coutinho,"MF,FW",Aston Villa,30,1992,8.8,1,21,8,38.1,...,0.13,21.6,2,0,0,1.4,1.4,0.07,-0.4,-0.4


In [12]:
fw_fd_fil = fw_fd[fw_fd['90s']>=0]

In [13]:
fw_fd_fil.shape

(212, 23)

In [14]:
team_fw = {}
for team, team_df in fw_fd_fil.groupby('Squad'):
    team_fw[team]=team_df
team_fw

{'Arsenal':                  Player    Pos    Squad  Age  Born   90s  Gls  Sh  SoT  SoT%  \
 379       Ethan Nwaneri     FW  Arsenal   15  2007   0.0    0   0    0   NaN   
 249       Gabriel Jesus     FW  Arsenal   25  1997  22.9   11  76   31  40.8   
 327  Gabriel Martinelli     FW  Arsenal   21  2001  31.0   15  79   30  38.0   
 530        Fabio Vieira  MF,FW  Arsenal   22  2000   5.7    1  15    2  13.3   
 451         Bukayo Saka     FW  Arsenal   20  2001  35.3   14  86   29  33.7   
 521    Leandro Trossard     FW  Arsenal   27  1994  10.3    1  17    5  29.4   
 370        Reiss Nelson     FW  Arsenal   22  1999   2.4    3  12    7  58.3   
 481    Emile Smith Rowe  MF,FW  Arsenal   22  2000   1.9    0   4    1  25.0   
 376       Eddie Nketiah     FW  Arsenal   23  1999  12.1    4  41   14  34.1   
 
      ...  G/SoT  Dist  FK  PK  PKatt    xG  npxG  npxG/Sh  G-xG  np:G-xG  
 379  ...    NaN   NaN   0   0      0   0.0   0.0      NaN   0.0      0.0  
 249  ...   0.32  10.6   

In [15]:
fw_fd_fil.sum()[['Sh/90','SoT/90']]/fw_fd_fil['90s'].sum()

Sh/90     0.165021
SoT/90    0.055924
dtype: object

In [16]:
team_fw.keys()

dict_keys(['Arsenal', 'Aston Villa', 'Bournemouth', 'Brentford', 'Brighton', 'Chelsea', 'Crystal Palace', 'Everton', 'Fulham', 'Leeds United', 'Leicester City', 'Liverpool', 'Manchester City', 'Manchester Utd', 'Newcastle Utd', "Nott'ham Forest", 'Southampton', 'Tottenham', 'West Ham', 'Wolves'])

In [17]:
cols = ['90s','Gls','Sh','SoT','SoT%','Sh/90','SoT/90','G/Sh','G/SoT','Dist','FK','PK','PKatt','xG','npxG','npxG/Sh','G-xG','np:G-xG']
X_train = pd.DataFrame(columns=cols)
for key in team_fw.keys():
        team = team_fw[key]
        arr=team[cols].sum()
        weight_sum = team['90s'].sum()
        arr['Sh/90'] = arr['Sh']/weight_sum
        arr['SoT/90'] = arr['SoT']/weight_sum
        arr['SoT%'] = arr['SoT']*100/arr['Sh']
        X_train = X_train.append(arr, ignore_index=True)
X_train

Unnamed: 0,90s,Gls,Sh,SoT,SoT%,Sh/90,SoT/90,G/Sh,G/SoT,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG
0,121.6,49.0,330.0,119.0,36.060606,2.713816,0.978618,0.94,2.65,123.4,4.0,3.0,4.0,46.7,43.8,0.94,2.3,2.2
1,165.2,40.0,321.0,119.0,37.071651,1.943099,0.720339,0.98,2.73,159.4,7.0,3.0,4.0,43.5,40.4,1.0,-3.5,-3.4
2,155.1,28.0,271.0,99.0,36.531365,1.74726,0.638298,1.11,3.0,178.7,11.0,0.0,0.0,31.8,31.8,1.23,-3.8,-3.8
3,108.0,36.0,222.0,92.0,41.441441,2.055556,0.851852,0.49,1.14,123.2,10.0,7.0,8.0,39.4,33.1,1.05,-3.4,-4.1
4,134.6,45.0,348.0,147.0,42.241379,2.585438,1.092125,1.31,2.96,154.0,2.0,0.0,0.0,46.3,46.3,1.33,-1.3,-1.3
5,123.6,24.0,282.0,106.0,37.588652,2.281553,0.857605,0.77,1.83,198.5,6.0,1.0,1.0,32.7,31.9,1.19,-8.7,-8.9
6,173.5,33.0,314.0,101.0,32.165605,1.809798,0.582133,0.72,2.18,134.0,21.0,1.0,3.0,30.9,28.4,0.74,2.1,3.6
7,134.7,20.0,246.0,93.0,37.804878,1.826281,0.690423,0.55,2.07,149.9,10.0,3.0,3.0,27.9,25.6,1.07,-7.9,-8.6
8,122.6,36.0,233.0,82.0,35.193133,1.900489,0.668842,1.22,3.22,143.6,4.0,4.0,8.0,31.4,25.1,1.15,4.6,6.9
9,151.0,35.0,308.0,99.0,32.142857,2.039735,0.655629,0.9,2.84,249.8,14.0,1.0,3.0,36.3,33.9,1.33,-1.3,0.1


In [18]:
list(df2.columns.values)

['Squad',
 'NoofPl',
 'Age',
 'Poss',
 'MP',
 '90s',
 'Sh',
 'SoT',
 'SoT%',
 'Sh/90',
 'SoT/90',
 'G/Sh',
 'G/SoT',
 'Dist',
 'FK',
 'PK',
 'PKatt',
 'xG',
 'npxG',
 'npxG/Sh',
 'G-xG',
 'np:G-xG',
 'TotalCmp',
 'TotalAtt',
 'TotalCmp%',
 'TotDist',
 'PrgDist',
 'ShortCmp',
 'ShortAtt',
 'ShortCmp%',
 'MixedCmp',
 'MixedAtt',
 'MixedCmp%',
 'LongCmp',
 'LongAtt',
 'LongCmp%',
 'Ast',
 'xAG',
 'xA',
 'A-xAG',
 'KP',
 'FinThPasses',
 'PPA',
 'CrsPA',
 'PrgP',
 'LivePasses',
 'DeadPasses',
 'FK.1',
 'TB',
 'Sw',
 'Crs',
 'TI',
 'CK',
 'InCK',
 'OutCK',
 'StrCK',
 'Off',
 'Blocks',
 'SCA',
 'SCA90',
 'SCAPassLive',
 'SCAPassDead',
 'SCATO',
 'SCASh',
 'SCAFld',
 'SCADef',
 'GCA',
 'GCA90',
 'GCAPassLive',
 'GCAPassDead',
 'GCATO',
 'GCASh',
 'GCAFld',
 'GCADef',
 'T-Tkl',
 'T-TklW',
 'T-Def 3rd',
 'T-Mid 3rd',
 'T-Att 3rd',
 'Ch-Tkl',
 'Ch-Att',
 'Ch-Tkl%',
 'Ch-Lost',
 'Blocks.1',
 'BlockSh',
 'BlockPass',
 'Int',
 'Tkl+Int',
 'Clr',
 'Err',
 'Touches',
 'Touches Def Pen',
 'Touches Def 

In [19]:
cols = ['GF','Sh','SoT','PK','xG','npxG','G-xG','np:G-xG']
y_train = df2.filter(cols)
y_train

Unnamed: 0,GF,Sh,SoT,PK,xG,npxG,G-xG,np:G-xG
0,88,589,194,3,71.9,69.1,12.1,11.9
1,51,427,145,3,50.2,47.2,-1.2,-1.2
2,37,358,126,0,38.6,38.6,-1.6,-1.6
3,58,400,147,7,56.8,50.6,-0.8,-1.6
4,72,607,219,6,73.3,68.9,-5.3,-6.9
5,38,481,151,3,49.5,47.2,-12.5,-13.2
6,40,423,133,1,39.3,36.8,-1.3,0.2
7,34,426,143,3,45.2,42.8,-13.2,-13.8
8,55,420,137,5,46.2,39.1,5.8,7.9
9,48,459,141,1,47.4,45.0,-2.4,-1.0


In [20]:
lr = LinearRegression()
lr.fit(X_train, y_train)

In [21]:
lr.score(X_train, y_train)

0.9558364531413117

In [22]:
ridge = Ridge()
ridge.fit(X_train, y_train)

lasso = Lasso()
lasso.fit(X_train, y_train)

In [23]:
ridge.score(X_train, y_train)

0.9045082080635665

In [24]:
lasso.score(X_train, y_train)

0.8505341677810061