In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# Looking at our Dataset

In [2]:
df = pd.read_excel('data/final_dataset.xlsx')

In [3]:
df.head(30)

Unnamed: 0,Player,Club,Age,Position,Nation,Value,Contract Years Left,League,Squad (20/21),MP (20/21),...,Offsides (17/18),Crosses (17/18),Interceptions (17/18),Penalty Kicks Won (17/18),Penalties Conceded (17/18),Own Goals (17/18),Total Loose Balls Recovered (17/18),Aerial Duel Won (17/18),Aerial Duel Lost (17/18),% Aerial Duels Won (17/18)
0,Kylian Mbappe,Paris Saint-Germain,22,attack,France,144000000,1,Ligue 1,Paris S-G,31.0,...,23.0,62.0,1.0,1.0,0.0,0.0,98.0,1.0,4.0,20.0
1,Erling Haaland,Borussia Dortmund,21,attack,Norway,117000000,3,Bundesliga,Dortmund,28.0,...,,,,,,,,,,
2,Harry Kane,Tottenham Hotspur,28,attack,England,108000000,3,Premier League,Tottenham,35.0,...,43.0,24.0,7.0,1.0,0.0,0.0,124.0,69.0,111.0,38.3
3,Jadon Sancho,Manchester United,21,attack,England,90000000,5,Premier League,Dortmund,26.0,...,1.0,15.0,6.0,0.0,0.0,0.0,57.0,3.0,14.0,17.6
4,Mohamed Salah,Liverpool FC,29,attack,Egypt,90000000,2,Premier League,Liverpool,37.0,...,18.0,50.0,13.0,1.0,0.0,0.0,219.0,19.0,58.0,24.7
5,Neymar,Paris Saint-Germain,29,attack,Brazil,90000000,4,Ligue 1,Paris S-G,18.0,...,10.0,58.0,3.0,2.0,0.0,0.0,110.0,2.0,2.0,50.0
6,Kevin De Bruyne,Manchester City,30,midfield,Belgium,90000000,4,Premier League,Manchester City,25.0,...,2.0,146.0,42.0,1.0,0.0,0.0,278.0,14.0,18.0,43.8
7,Romelu Lukaku,Chelsea FC,28,attack,Belgium,90000000,5,Premier League,Inter,36.0,...,19.0,37.0,6.0,0.0,0.0,0.0,101.0,127.0,122.0,51.0
8,Joshua Kimmich,Bayern Munich,26,midfield,Germany,81000000,4,Bundesliga,Bayern Munich,27.0,...,2.0,107.0,17.0,1.0,1.0,0.0,253.0,16.0,18.0,47.1
9,Bruno Fernandes,Manchester United,26,midfield,Portugal,81000000,4,Premier League,Manchester Utd,37.0,...,,,,,,,,,,


# Baseline Linear Regression Model

In [4]:
def baseline_linear_regression(df):

    attack = df[df['Position']=='attack']
    midfield = df[df['Position']=='midfield']
    defence = df[df['Position']=='Defender']

    positions =  [attack,midfield,defence]

    scores_train = []
    scores_test = []

    for position in positions:

        top_features = [a for a in position.corr()['Value'].sort_values(ascending=False)[:11].keys()]

        #Using top features identified earlier
        model_df = position[top_features]
        model_df = model_df.dropna()

        X = model_df.drop('Value',axis=1)
        y = model_df['Value']

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,)

        ss= StandardScaler()
        lr = LinearRegression()

        X_train_scaled = ss.fit_transform(X_train)
        X_test_scaled = ss.transform(X_test)

        lr.fit(X_train_scaled,y_train);

        cross_val_train = cross_val_score(lr, X_train_scaled, y_train, scoring="neg_root_mean_squared_error")
        cross_val_test = cross_val_score(lr, X_test_scaled, y_test, scoring="neg_root_mean_squared_error")
        
        scores_train.append(-(cross_val_train.mean()))
        scores_test.append(-(cross_val_test.mean()))
        
        
    print(f'Attackers Train RMSE = ${round(scores_train[0],2)}')
    print(f'Attackers Test RMSE = ${round(scores_test[0],2)}')
    print("----------------------------------------")
    print(f'Midfielders Train RMSE = ${round(scores_train[1],2)}')
    print(f'Midfielders Test RMSE = {round(scores_test[1],2)}')
    print("----------------------------------------")
    print(f'Defenders Train RMSE = ${round(scores_train[2],2)}')
    print(f'Defenders Test RMSE = ${round(scores_test[2],2)}')

In [5]:
baseline_linear_regression(df)

Attackers Train RMSE = $13773308.04
Attackers Test RMSE = $10278223.03
----------------------------------------
Midfielders Train RMSE = $12753372.38
Midfielders Test RMSE = 12687462.32
----------------------------------------
Defenders Train RMSE = $10415505.1
Defenders Test RMSE = $9262393.02
