In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline



import tools.helpers as th
import tools.feature_eng as fe

# Looking at our Dataset

In [2]:
df = pd.read_excel('data/final_dataset.xlsx')

In [3]:
df

Unnamed: 0,Player,Club,Age,Position,Nation,Value,Contract Years Left,League,Squad (20/21),MP (20/21),...,Offsides (17/18),Crosses (17/18),Interceptions (17/18),Penalty Kicks Won (17/18),Penalties Conceded (17/18),Own Goals (17/18),Total Loose Balls Recovered (17/18),Aerial Duel Won (17/18),Aerial Duel Lost (17/18),% Aerial Duels Won (17/18)
0,Kylian Mbappe,Paris Saint-Germain,22,attack,France,144000000,1,Ligue 1,Paris S-G,31.0,...,23.0,62.0,1.0,1.0,0.0,0.0,98.0,1.0,4.0,20.0
1,Erling Haaland,Borussia Dortmund,21,attack,Norway,117000000,3,Bundesliga,Dortmund,28.0,...,,,,,,,,,,
2,Harry Kane,Tottenham Hotspur,28,attack,England,108000000,3,Premier League,Tottenham,35.0,...,43.0,24.0,7.0,1.0,0.0,0.0,124.0,69.0,111.0,38.3
3,Jadon Sancho,Manchester United,21,attack,England,90000000,5,Premier League,Dortmund,26.0,...,1.0,15.0,6.0,0.0,0.0,0.0,57.0,3.0,14.0,17.6
4,Mohamed Salah,Liverpool FC,29,attack,Egypt,90000000,2,Premier League,Liverpool,37.0,...,18.0,50.0,13.0,1.0,0.0,0.0,219.0,19.0,58.0,24.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2070,Matis Carvalho,Montpellier HSC,22,Goalkeeper,Portugal,180000,1,Ligue 1,,,...,,,,,,,,,,
2071,Lorenzo Andrenacci,Genoa CFC,26,Goalkeeper,Italy,135000,3,Serie A,,,...,,,,,,,,,,
2072,Mamadou Doucoure,Borussia Mönchengladbach,23,Defender,France,90000,3,Bundesliga,,,...,,,,,,,,,,
2073,Michael Langer,FC Schalke 04,36,Goalkeeper,Austria,90000,1,Bundesliga,Schalke 04,3.0,...,,,,,,,,,,


# Separating data by Position

In [4]:
attack = df[df['Position']=='attack']
midfield = df[df['Position']=='midfield']
defence = df[df['Position']=='Defender']

# Finding the Features with the highest correlation to Price

In [5]:
attack.corr()['Value'].sort_values(ascending=False)[1:11]

Carries into Attacking Penalty Box (20/21)    0.639664
Non-penalty xG+ xA (20/21)                    0.638520
Touches in Attacking Penalty Box (20/21)      0.635978
Goal Creating Actions (20/21)                 0.628638
Touches in Attacking 3rd (20/21)              0.604891
Passes Leading to Goals (20/21)               0.592585
Shot-Creating Actions (20/21)                 0.580837
Passes Leading to Shot Attempt (20/21)        0.578736
Non-Penalty xG (20/21)                        0.578728
Progressive Passes Received (20/21)           0.572992
Name: Value, dtype: float64

In [6]:
#List of features for to include in dataset for modeling.
top_features_attack = ['Value',
                    'Carries into Attacking Penalty Box (20/21)',   
                    'Non-penalty xG+ xA (20/21)',                    
                    'Touches in Attacking Penalty Box (20/21)',      
                    'Goal Creating Actions (20/21)',                 
                    'Touches in Attacking 3rd (20/21)',             
                    'Passes Leading to Goals (20/21)',             
                    'Shot-Creating Actions (20/21)',                
                    'Passes Leading to Shot Attempt (20/21)',     
                    'Non-Penalty xG (20/21)',                       
                    'Progressive Passes Received (20/21)']    

# First Linear Regression Model

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

In [8]:
#Using top features identified earlier
attack_df = attack[top_features_attack]
attack_df = attack_df.dropna()

In [9]:
X = attack_df.drop('Value',axis=1)
y = attack_df['Value']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,)

ss= StandardScaler()
lr = LinearRegression()

X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.transform(X_test)

lr.fit(X_train_scaled,y_train);

In [10]:
lr.score(X_test_scaled,y_test)

0.562724155180486

In [11]:
cross_val_score(lr, X_test_scaled, y_test, cv=5, scoring='r2')

array([0.65238629, 0.5853321 , 0.43534675, 0.09001192, 0.14076495])

In [12]:
predictions = lr.predict(X_test_scaled)


RMSE = mean_squared_error(y_test, predictions, squared=False) # False returns RMSE value
print('RMSE: $', round(RMSE, 2))


RMSE: $ 13681450.69
