In [1]:
# Widen the notebook container to 90% of the window so wide tables fit.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
import warnings
import itertools

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import mean_squared_error

# Looking at our Dataset

In [3]:
# Read the player table, then cast every 'Contract Years Left' entry to int.
df = pd.read_excel('data/no_nans_data.xlsx')
df['Contract Years Left'] = df['Contract Years Left'].map(int)

In [4]:
# Preview the loaded frame; the rendered output elides the middle rows and
# columns with "...".
df

Unnamed: 0,Player,Club,Age,Position,Nation,Value,Contract Years Left,League,MP (20/21),Starts (20/21),...,Offsides (17/18),Crosses (17/18),Interceptions (17/18),Penalty Kicks Won (17/18),Penalties Conceded (17/18),Own Goals (17/18),Total Loose Balls Recovered (17/18),Aerial Duel Won (17/18),Aerial Duel Lost (17/18),% Aerial Duels Won (17/18)
0,Kylian Mbappe,Paris Saint-Germain,22,attack,France,144000000,1,Ligue 1,31.0,27.0,...,23.0,62.0,1.0,1.0,0.0,0.0,98.0,1.0,4.0,20.0
1,Erling Haaland,Borussia Dortmund,21,attack,Norway,117000000,3,Bundesliga,28.0,27.0,...,9.0,8.5,5.0,0.5,0.0,0.0,70.0,29.0,31.5,47.9
2,Harry Kane,Tottenham Hotspur,28,attack,England,108000000,3,Premier League,35.0,35.0,...,43.0,24.0,7.0,1.0,0.0,0.0,124.0,69.0,111.0,38.3
3,Jadon Sancho,Manchester United,21,attack,England,90000000,5,Premier League,26.0,24.0,...,1.0,15.0,6.0,0.0,0.0,0.0,57.0,3.0,14.0,17.6
4,Mohamed Salah,Liverpool FC,29,attack,Egypt,90000000,2,Premier League,37.0,34.0,...,18.0,50.0,13.0,1.0,0.0,0.0,219.0,19.0,58.0,24.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1720,Phil Bardsley,Burnley FC,36,Defender,Scotland,270000,1,Premier League,4.0,3.0,...,0.0,20.0,24.0,0.0,1.0,0.0,141.0,27.0,32.0,45.8
1721,Luca Siligardi,Parma Calcio 1913,33,attack,Italy,270000,1,Serie A,20.5,11.0,...,1.0,21.5,11.5,1.0,0.0,0.0,81.5,10.5,14.0,50.2
1722,Mikel Rico,SD Huesca,36,midfield,Spain,270000,1,La Liga,32.0,27.0,...,0.0,7.0,17.0,0.0,0.0,0.0,156.0,2.0,7.0,22.2
1723,Jerome Hergault,FC Lorient,35,Defender,France,270000,1,Ligue 1,22.0,19.0,...,5.0,29.0,25.0,0.0,0.0,0.0,131.0,25.0,25.0,50.0


# Separating Data by Player Positions

In [5]:
# One sub-frame per label found in the 'Position' column.
attack = df.loc[df['Position'].eq('attack')]
midfield = df.loc[df['Position'].eq('midfield')]
defence = df.loc[df['Position'].eq('Defender')]

# Linear Regression

In [6]:
def linear_regression(df, test_size=0.2, cv=20, random_state=None):
    """Cross-validate an ordinary least-squares model that predicts ``Value``.

    Pipeline: one-hot encode every object-dtype column except ``Player``,
    split into train/test, min-max scale all features (fitted on the train
    split only), standardise the numeric features (fitted on the train split
    only), then score a ``LinearRegression`` with k-fold cross-validation on
    each split.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing an object-dtype ``Player`` column, a ``Value`` target
        column and any mix of numeric and object-dtype feature columns.
    test_size : float, default 0.2
        Fraction of rows held out by ``train_test_split``.
    cv : int, default 20
        Number of cross-validation folds used on each split.
    random_state : int or None, default None
        Seed for the train/test split; ``None`` keeps the split random.

    Returns
    -------
    pandas.DataFrame
        Rows ``'Train'`` and ``'Test'`` with the columns ``'Train/Test'``,
        ``'Avg RMSE'``, ``'Max RMSE'`` and ``'Min RMSE'`` (rounded to 2 dp).

    Notes
    -----
    The ``'Test'`` row is a cross-validation *within* the held-out split (the
    estimator is refitted on folds of the test rows); it is not the error of a
    model trained on the train split.
    """
    # One-hot encode every categorical column except the player name.
    cat_cols = df.select_dtypes(include='object').drop(['Player'], axis=1).columns
    dummies = [pd.get_dummies(df[col]) for col in cat_cols]
    dummy_df = pd.concat(dummies, axis=1) if dummies else pd.DataFrame(index=df.index)

    # Non-object columns: the target plus the numeric features.
    df_num = df.select_dtypes(exclude='object')
    y = df_num['Value']
    X_num = df_num.drop('Value', axis=1)

    # Numeric features first, dummy columns last. The boundary is computed
    # from the data because the number of dummy columns depends on which
    # categories are present in `df` (it differs per position subset).
    n_num = X_num.shape[1]
    X = pd.concat([X_num, dummy_df], axis=1)

    X_train1, X_test1, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Min-max scaling rescales each feature to the range seen in the train
    # split; it does not change the shape of the distribution.
    norm = MinMaxScaler()
    X_train_norm = pd.DataFrame(norm.fit_transform(X_train1), columns=X_train1.columns)
    X_test_norm = pd.DataFrame(norm.transform(X_test1), columns=X_test1.columns)

    # Standardise only the numeric block; dummy columns skip StandardScaler.
    # The split is positional so duplicate dummy names cannot break it.
    num_names = X_train_norm.columns[:n_num]
    ss = StandardScaler()
    X_train_num_scaled = pd.DataFrame(
        ss.fit_transform(X_train_norm.iloc[:, :n_num]), columns=num_names
    )
    X_test_num_scaled = pd.DataFrame(
        ss.transform(X_test_norm.iloc[:, :n_num]), columns=num_names
    )

    # All frames above were built from arrays, so they share a fresh
    # RangeIndex and concatenate row-for-row without any reset_index.
    X_train_scaled = pd.concat([X_train_num_scaled, X_train_norm.iloc[:, n_num:]], axis=1)
    X_test_scaled = pd.concat([X_test_num_scaled, X_test_norm.iloc[:, n_num:]], axis=1)

    # cross_val_score fits clones of the estimator on each fold, so no
    # separate up-front fit is needed.
    lr = LinearRegression()
    cross_val_train = cross_val_score(
        lr, X_train_scaled, y_train, scoring="neg_root_mean_squared_error", cv=cv
    )
    cross_val_test = cross_val_score(
        lr, X_test_scaled, y_test, scoring="neg_root_mean_squared_error", cv=cv
    )

    def _summary_row(label, neg_scores):
        # The scorer returns negated RMSE, so the smallest score is the
        # largest RMSE and vice versa.
        return [
            label,
            round(-(neg_scores.mean()), 2),
            round(-(neg_scores.min()), 2),
            round(-(neg_scores.max()), 2),
        ]

    results = pd.DataFrame(columns=['Train/Test', 'Avg RMSE', 'Max RMSE', 'Min RMSE'])
    results.loc[0] = _summary_row('Train', cross_val_train)
    results.loc[1] = _summary_row('Test', cross_val_test)

    return results


In [7]:
# Score the linear model once per position subset, in the order
# attack -> midfield -> defence.
attack_lr, midfield_lr, defence_lr = [
    linear_regression(subset) for subset in (attack, midfield, defence)
]

# Lasso Regression

In [8]:
def lasso_regression(df, test_size=0.2, cv=20, random_state=None):
    """Cross-validate a Lasso (L1) regression that predicts ``Value``.

    Pipeline: one-hot encode every object-dtype column except ``Player``,
    split into train/test, min-max scale all features (fitted on the train
    split only), standardise the numeric features (fitted on the train split
    only), then score a default ``Lasso`` with k-fold cross-validation on
    each split.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing an object-dtype ``Player`` column, a ``Value`` target
        column and any mix of numeric and object-dtype feature columns.
    test_size : float, default 0.2
        Fraction of rows held out by ``train_test_split``.
    cv : int, default 20
        Number of cross-validation folds used on each split.
    random_state : int or None, default None
        Seed for the train/test split; ``None`` keeps the split random.

    Returns
    -------
    pandas.DataFrame
        Rows ``'Train'`` and ``'Test'`` with the columns ``'Train/Test'``,
        ``'Avg RMSE'``, ``'Max RMSE'`` and ``'Min RMSE'`` (rounded to 2 dp).

    Notes
    -----
    The ``'Test'`` row is a cross-validation *within* the held-out split (the
    estimator is refitted on folds of the test rows); it is not the error of a
    model trained on the train split.
    """
    # One-hot encode every categorical column except the player name.
    cat_cols = df.select_dtypes(include='object').drop(['Player'], axis=1).columns
    dummies = [pd.get_dummies(df[col]) for col in cat_cols]
    dummy_df = pd.concat(dummies, axis=1) if dummies else pd.DataFrame(index=df.index)

    # Non-object columns: the target plus the numeric features.
    df_num = df.select_dtypes(exclude='object')
    y = df_num['Value']
    X_num = df_num.drop('Value', axis=1)

    # Numeric features first, dummy columns last. The boundary is computed
    # from the data because the number of dummy columns depends on which
    # categories are present in `df` (it differs per position subset).
    n_num = X_num.shape[1]
    X = pd.concat([X_num, dummy_df], axis=1)

    X_train1, X_test1, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Min-max scaling rescales each feature to the range seen in the train
    # split; it does not change the shape of the distribution.
    norm = MinMaxScaler()
    X_train_norm = pd.DataFrame(norm.fit_transform(X_train1), columns=X_train1.columns)
    X_test_norm = pd.DataFrame(norm.transform(X_test1), columns=X_test1.columns)

    # Standardise only the numeric block; dummy columns skip StandardScaler.
    # The split is positional so duplicate dummy names cannot break it.
    num_names = X_train_norm.columns[:n_num]
    ss = StandardScaler()
    X_train_num_scaled = pd.DataFrame(
        ss.fit_transform(X_train_norm.iloc[:, :n_num]), columns=num_names
    )
    X_test_num_scaled = pd.DataFrame(
        ss.transform(X_test_norm.iloc[:, :n_num]), columns=num_names
    )

    # All frames above were built from arrays, so they share a fresh
    # RangeIndex and concatenate row-for-row without any reset_index.
    X_train_scaled = pd.concat([X_train_num_scaled, X_train_norm.iloc[:, n_num:]], axis=1)
    X_test_scaled = pd.concat([X_test_num_scaled, X_test_norm.iloc[:, n_num:]], axis=1)

    l1 = Lasso()

    # Silence solver warnings only while the model is being fitted, instead
    # of switching warnings off for the rest of the session.
    # cross_val_score fits clones of the estimator on each fold, so no
    # separate up-front fit is needed.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        cross_val_train = cross_val_score(
            l1, X_train_scaled, y_train, scoring="neg_root_mean_squared_error", cv=cv
        )
        cross_val_test = cross_val_score(
            l1, X_test_scaled, y_test, scoring="neg_root_mean_squared_error", cv=cv
        )

    def _summary_row(label, neg_scores):
        # The scorer returns negated RMSE, so the smallest score is the
        # largest RMSE and vice versa.
        return [
            label,
            round(-(neg_scores.mean()), 2),
            round(-(neg_scores.min()), 2),
            round(-(neg_scores.max()), 2),
        ]

    results = pd.DataFrame(columns=['Train/Test', 'Avg RMSE', 'Max RMSE', 'Min RMSE'])
    results.loc[0] = _summary_row('Train', cross_val_train)
    results.loc[1] = _summary_row('Test', cross_val_test)

    return results

In [9]:
# Score the Lasso model once per position subset, in the order
# attack -> midfield -> defence.
attack_l1, midfield_l1, defence_l1 = [
    lasso_regression(subset) for subset in (attack, midfield, defence)
]

# Ridge Regression

In [10]:
def ridge_regression(df, test_size=0.2, cv=20, random_state=None):
    """Cross-validate a Ridge (L2) regression that predicts ``Value``.

    Pipeline: one-hot encode every object-dtype column except ``Player``,
    split into train/test, min-max scale all features (fitted on the train
    split only), standardise the numeric features (fitted on the train split
    only), then score a default ``Ridge`` with k-fold cross-validation on
    each split.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing an object-dtype ``Player`` column, a ``Value`` target
        column and any mix of numeric and object-dtype feature columns.
    test_size : float, default 0.2
        Fraction of rows held out by ``train_test_split``.
    cv : int, default 20
        Number of cross-validation folds used on each split.
    random_state : int or None, default None
        Seed for the train/test split; ``None`` keeps the split random.

    Returns
    -------
    pandas.DataFrame
        Rows ``'Train'`` and ``'Test'`` with the columns ``'Train/Test'``,
        ``'Avg RMSE'``, ``'Max RMSE'`` and ``'Min RMSE'`` (rounded to 2 dp).

    Notes
    -----
    The ``'Test'`` row is a cross-validation *within* the held-out split (the
    estimator is refitted on folds of the test rows); it is not the error of a
    model trained on the train split.
    """
    # One-hot encode every categorical column except the player name.
    cat_cols = df.select_dtypes(include='object').drop(['Player'], axis=1).columns
    dummies = [pd.get_dummies(df[col]) for col in cat_cols]
    dummy_df = pd.concat(dummies, axis=1) if dummies else pd.DataFrame(index=df.index)

    # Non-object columns: the target plus the numeric features.
    df_num = df.select_dtypes(exclude='object')
    y = df_num['Value']
    X_num = df_num.drop('Value', axis=1)

    # Numeric features first, dummy columns last. The boundary is computed
    # from the data because the number of dummy columns depends on which
    # categories are present in `df` (it differs per position subset).
    n_num = X_num.shape[1]
    X = pd.concat([X_num, dummy_df], axis=1)

    X_train1, X_test1, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Min-max scaling rescales each feature to the range seen in the train
    # split; it does not change the shape of the distribution.
    norm = MinMaxScaler()
    X_train_norm = pd.DataFrame(norm.fit_transform(X_train1), columns=X_train1.columns)
    X_test_norm = pd.DataFrame(norm.transform(X_test1), columns=X_test1.columns)

    # Standardise only the numeric block; dummy columns skip StandardScaler.
    # The split is positional so duplicate dummy names cannot break it.
    num_names = X_train_norm.columns[:n_num]
    ss = StandardScaler()
    X_train_num_scaled = pd.DataFrame(
        ss.fit_transform(X_train_norm.iloc[:, :n_num]), columns=num_names
    )
    X_test_num_scaled = pd.DataFrame(
        ss.transform(X_test_norm.iloc[:, :n_num]), columns=num_names
    )

    # All frames above were built from arrays, so they share a fresh
    # RangeIndex and concatenate row-for-row without any reset_index.
    X_train_scaled = pd.concat([X_train_num_scaled, X_train_norm.iloc[:, n_num:]], axis=1)
    X_test_scaled = pd.concat([X_test_num_scaled, X_test_norm.iloc[:, n_num:]], axis=1)

    l2 = Ridge()

    # Silence solver warnings only while the model is being fitted, instead
    # of switching warnings off for the rest of the session.
    # cross_val_score fits clones of the estimator on each fold, so no
    # separate up-front fit is needed.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        cross_val_train = cross_val_score(
            l2, X_train_scaled, y_train, scoring="neg_root_mean_squared_error", cv=cv
        )
        cross_val_test = cross_val_score(
            l2, X_test_scaled, y_test, scoring="neg_root_mean_squared_error", cv=cv
        )

    def _summary_row(label, neg_scores):
        # The scorer returns negated RMSE, so the smallest score is the
        # largest RMSE and vice versa.
        return [
            label,
            round(-(neg_scores.mean()), 2),
            round(-(neg_scores.min()), 2),
            round(-(neg_scores.max()), 2),
        ]

    results = pd.DataFrame(columns=['Train/Test', 'Avg RMSE', 'Max RMSE', 'Min RMSE'])
    results.loc[0] = _summary_row('Train', cross_val_train)
    results.loc[1] = _summary_row('Test', cross_val_test)

    return results

In [11]:
# Score the Ridge model once per position subset, in the order
# attack -> midfield -> defence.
attack_l2, midfield_l2, defence_l2 = [
    ridge_regression(subset) for subset in (attack, midfield, defence)
]

In [12]:
# RMSE summary table returned by linear_regression for the 'attack' subset.
attack_lr

Unnamed: 0,Train/Test,Avg RMSE,Max RMSE,Min RMSE
0,Train,19813902.24,33709278.12,14167081.56
1,Test,11736934.36,25543992.0,4117587.13


In [13]:
# RMSE summary table returned by lasso_regression for the 'attack' subset.
attack_l1

Unnamed: 0,Train/Test,Avg RMSE,Max RMSE,Min RMSE
0,Train,21942048.17,28348415.75,14244968.62
1,Test,13986969.76,25101802.35,6355441.53


In [14]:
# RMSE summary table returned by ridge_regression for the 'attack' subset.
attack_l2

Unnamed: 0,Train/Test,Avg RMSE,Max RMSE,Min RMSE
0,Train,15280525.32,20505918.59,11100957.13
1,Test,16063074.58,35345699.03,4362285.71


# Decision Tree Regressor

In [15]:
def decision_tree_regression(df, test_size=0.2, cv=20, random_state=None):
    """Cross-validate a decision-tree regressor that predicts ``Value``.

    Pipeline: one-hot encode every object-dtype column except ``Player``,
    split into train/test, min-max scale all features (fitted on the train
    split only), standardise the numeric features (fitted on the train split
    only), then score a ``DecisionTreeRegressor`` with k-fold
    cross-validation on each split.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing an object-dtype ``Player`` column, a ``Value`` target
        column and any mix of numeric and object-dtype feature columns.
    test_size : float, default 0.2
        Fraction of rows held out by ``train_test_split``.
    cv : int, default 20
        Number of cross-validation folds used on each split.
    random_state : int or None, default None
        Seed passed to both the train/test split and the tree; ``None``
        keeps both random.

    Returns
    -------
    pandas.DataFrame
        Rows ``'Train'`` and ``'Test'`` with the columns ``'Train/Test'``,
        ``'Avg RMSE'``, ``'Max RMSE'`` and ``'Min RMSE'`` (rounded to 2 dp).

    Notes
    -----
    The ``'Test'`` row is a cross-validation *within* the held-out split (the
    estimator is refitted on folds of the test rows); it is not the error of a
    model trained on the train split.
    """
    # One-hot encode every categorical column except the player name.
    cat_cols = df.select_dtypes(include='object').drop(['Player'], axis=1).columns
    dummies = [pd.get_dummies(df[col]) for col in cat_cols]
    dummy_df = pd.concat(dummies, axis=1) if dummies else pd.DataFrame(index=df.index)

    # Non-object columns: the target plus the numeric features.
    df_num = df.select_dtypes(exclude='object')
    y = df_num['Value']
    X_num = df_num.drop('Value', axis=1)

    # Numeric features first, dummy columns last. The boundary is computed
    # from the data because the number of dummy columns depends on which
    # categories are present in `df` (it differs per position subset).
    n_num = X_num.shape[1]
    X = pd.concat([X_num, dummy_df], axis=1)

    X_train1, X_test1, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Min-max scaling rescales each feature to the range seen in the train
    # split; it does not change the shape of the distribution.
    norm = MinMaxScaler()
    X_train_norm = pd.DataFrame(norm.fit_transform(X_train1), columns=X_train1.columns)
    X_test_norm = pd.DataFrame(norm.transform(X_test1), columns=X_test1.columns)

    # Standardise only the numeric block; dummy columns skip StandardScaler.
    # The split is positional so duplicate dummy names cannot break it.
    num_names = X_train_norm.columns[:n_num]
    ss = StandardScaler()
    X_train_num_scaled = pd.DataFrame(
        ss.fit_transform(X_train_norm.iloc[:, :n_num]), columns=num_names
    )
    X_test_num_scaled = pd.DataFrame(
        ss.transform(X_test_norm.iloc[:, :n_num]), columns=num_names
    )

    # All frames above were built from arrays, so they share a fresh
    # RangeIndex and concatenate row-for-row without any reset_index.
    X_train_scaled = pd.concat([X_train_num_scaled, X_train_norm.iloc[:, n_num:]], axis=1)
    X_test_scaled = pd.concat([X_test_num_scaled, X_test_norm.iloc[:, n_num:]], axis=1)

    # cross_val_score fits clones of the estimator on each fold, so no
    # separate up-front fit is needed.
    DtReg = DecisionTreeRegressor(random_state=random_state)
    cross_val_train = cross_val_score(
        DtReg, X_train_scaled, y_train, scoring="neg_root_mean_squared_error", cv=cv
    )
    cross_val_test = cross_val_score(
        DtReg, X_test_scaled, y_test, scoring="neg_root_mean_squared_error", cv=cv
    )

    def _summary_row(label, neg_scores):
        # The scorer returns negated RMSE, so the smallest score is the
        # largest RMSE and vice versa.
        return [
            label,
            round(-(neg_scores.mean()), 2),
            round(-(neg_scores.min()), 2),
            round(-(neg_scores.max()), 2),
        ]

    results = pd.DataFrame(columns=['Train/Test', 'Avg RMSE', 'Max RMSE', 'Min RMSE'])
    results.loc[0] = _summary_row('Train', cross_val_train)
    results.loc[1] = _summary_row('Test', cross_val_test)

    return results

In [16]:
# Score the decision tree once per position subset, in the order
# attack -> midfield -> defence.
attack_dt, midfield_dt, defence_dt = [
    decision_tree_regression(subset) for subset in (attack, midfield, defence)
]

In [17]:
# RMSE summary table returned by decision_tree_regression for the 'attack' subset.
attack_dt

Unnamed: 0,Train/Test,Avg RMSE,Max RMSE,Min RMSE
0,Train,18318519.37,31590433.45,8521825.76
1,Test,12378780.96,50945088.09,2645525.47


In [18]:
# RMSE summary table returned by decision_tree_regression for the 'midfield' subset.
midfield_dt

Unnamed: 0,Train/Test,Avg RMSE,Max RMSE,Min RMSE
0,Train,15505069.61,22625364.39,8782559.68
1,Test,17056411.82,37895536.41,4589117.56


In [19]:
# RMSE summary table returned by decision_tree_regression for the 'Defender' subset.
defence_dt

Unnamed: 0,Train/Test,Avg RMSE,Max RMSE,Min RMSE
0,Train,12299364.92,17508981.65,9049760.52
1,Test,12405034.28,19408342.28,4228084.67
