In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

In [2]:
df = pd.read_csv('../data/data.csv')

In [3]:
df

Unnamed: 0,overall_rating,potential,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,free_kick_accuracy,...,defensive_work_rate_8,defensive_work_rate_9,defensive_work_rate_ean,defensive_work_rate_es,defensive_work_rate_high,defensive_work_rate_low,defensive_work_rate_medium,defensive_work_rate_o,defensive_work_rate_ormal,defensive_work_rate_tocky
0,67.0,71.0,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,...,0,0,0,0,0,0,1,0,0,0
1,67.0,71.0,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,...,0,0,0,0,0,0,1,0,0,0
2,62.0,66.0,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,...,0,0,0,0,0,0,1,0,0,0
3,61.0,65.0,48.0,43.0,70.0,60.0,43.0,50.0,44.0,38.0,...,0,0,0,0,0,0,1,0,0,0
4,61.0,65.0,48.0,43.0,70.0,60.0,43.0,50.0,44.0,38.0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180349,83.0,85.0,84.0,77.0,59.0,89.0,77.0,84.0,86.0,78.0,...,0,0,0,0,0,1,0,0,0,0
180350,78.0,80.0,74.0,76.0,53.0,84.0,77.0,85.0,86.0,74.0,...,0,0,0,0,0,1,0,0,0,0
180351,77.0,80.0,74.0,71.0,53.0,84.0,77.0,85.0,86.0,74.0,...,0,0,0,0,0,1,0,0,0,0
180352,78.0,81.0,74.0,64.0,57.0,86.0,77.0,87.0,86.0,73.0,...,0,0,0,0,0,1,0,0,0,0


In [4]:
# Model with scaling the data

In [5]:
X = df.drop(columns=["overall_rating"],axis=1)
y = df['overall_rating']

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.33)

In [8]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(120837, 59)
(59517, 59)
(120837,)
(59517,)


In [9]:
def rmse_cv(model):
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=5)).mean()
    return rmse
    

def evaluation(y, predictions):
    mae = mean_absolute_error(y, predictions)
    mse = mean_squared_error(y, predictions)
    rmse = np.sqrt(mean_squared_error(y, predictions))
    r_squared = r2_score(y, predictions)
    return mae, mse, rmse, r_squared

In [10]:
models = pd.DataFrame(columns=["Model","MAE","MSE","RMSE","R2 Score","RMSE (Cross-Validation)"])

In [11]:
X_train.head()

Unnamed: 0,potential,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,free_kick_accuracy,long_passing,...,defensive_work_rate_8,defensive_work_rate_9,defensive_work_rate_ean,defensive_work_rate_es,defensive_work_rate_high,defensive_work_rate_low,defensive_work_rate_medium,defensive_work_rate_o,defensive_work_rate_ormal,defensive_work_rate_tocky
93959,78.0,60.0,65.0,84.0,62.0,29.0,42.0,30.0,43.0,62.0,...,0,0,0,0,0,0,1,0,0,0
98627,81.0,46.0,64.0,60.0,60.0,52.0,65.0,50.0,42.0,52.0,...,0,0,0,0,0,0,1,0,0,0
2222,73.0,59.0,31.0,68.0,61.0,35.0,47.0,31.0,36.0,49.0,...,0,0,0,0,0,0,1,0,0,0
72245,77.0,51.0,59.0,65.0,69.0,55.0,64.0,61.0,54.0,73.0,...,0,0,0,0,0,0,1,0,0,0
134132,76.0,62.0,56.0,52.0,59.0,57.0,72.0,79.0,36.0,56.0,...,0,0,0,0,0,0,1,0,0,0


## Scaling Data

In [12]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_train=pd.DataFrame(X_train,columns=X.columns)
X_train.head()

Unnamed: 0,potential,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,free_kick_accuracy,long_passing,...,defensive_work_rate_8,defensive_work_rate_9,defensive_work_rate_ean,defensive_work_rate_es,defensive_work_rate_high,defensive_work_rate_low,defensive_work_rate_medium,defensive_work_rate_o,defensive_work_rate_ormal,defensive_work_rate_tocky
0,0.684092,0.282434,0.789893,1.61879,-0.032681,-1.117642,-0.969049,-1.255351,-0.354578,0.340308,...,-0.020141,-0.029631,-0.020346,-0.022102,-0.419995,-0.336684,0.614359,-0.085011,-0.042219,-0.022289
1,1.140488,-0.527102,0.737462,0.165806,-0.17327,0.138459,0.324325,-0.161796,-0.410566,-0.351607,...,-0.020141,-0.029631,-0.020346,-0.022102,-0.419995,-0.336684,0.614359,-0.085011,-0.042219,-0.022289
2,-0.076568,0.22461,-0.992763,0.650134,-0.102975,-0.789963,-0.687881,-1.200673,-0.746495,-0.559181,...,-0.020141,-0.029631,-0.020346,-0.022102,-0.419995,-0.336684,0.614359,-0.085011,-0.042219,-0.022289
3,0.53196,-0.237982,0.475306,0.468511,0.45938,0.302298,0.268091,0.43966,0.261293,1.101414,...,-0.020141,-0.029631,-0.020346,-0.022102,-0.419995,-0.336684,0.614359,-0.085011,-0.042219,-0.022289
4,0.379828,0.398082,0.318013,-0.318523,-0.243564,0.411524,0.71796,1.423859,-0.746495,-0.074841,...,-0.020141,-0.029631,-0.020346,-0.022102,-0.419995,-0.336684,0.614359,-0.085011,-0.042219,-0.022289


In [13]:
X_test=scaler.transform(X_test)
X_test=pd.DataFrame(X_test,columns=X.columns)

In [14]:
X_test.head()

Unnamed: 0,potential,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,free_kick_accuracy,long_passing,...,defensive_work_rate_8,defensive_work_rate_9,defensive_work_rate_ean,defensive_work_rate_es,defensive_work_rate_high,defensive_work_rate_low,defensive_work_rate_medium,defensive_work_rate_o,defensive_work_rate_ormal,defensive_work_rate_tocky
0,-1.597887,-1.683582,-1.097625,0.286888,-2.000925,-0.844576,-1.868787,-1.03664,-1.306377,-1.873819,...,-0.020141,-0.029631,-0.020346,-0.022102,2.380982,-0.336684,-1.627714,-0.085011,-0.042219,-0.022289
1,1.596883,-2.030526,-1.674367,0.710675,-0.946509,-1.336094,-1.700086,-1.419384,-1.586318,-0.97433,...,-0.020141,-0.029631,-0.020346,-0.022102,-0.419995,-0.336684,0.614359,-0.085011,-0.042219,-0.022289
2,2.509675,1.496738,1.15691,-0.1369,1.373208,1.613011,1.224063,1.423859,0.541234,0.824648,...,-0.020141,-0.029631,-0.020346,-0.022102,-0.419995,-0.336684,0.614359,-0.085011,-0.042219,-0.022289
3,1.140488,1.034146,0.475306,-0.439605,1.09203,1.066881,0.999128,1.041115,0.877163,1.308989,...,-0.020141,-0.029631,-0.020346,-0.022102,-0.419995,-0.336684,0.614359,-0.085011,-0.042219,-0.022289
4,-1.445755,0.80285,-0.101435,-0.015818,0.107908,0.903042,0.155624,0.603693,0.877163,0.547883,...,-0.020141,-0.029631,-0.020346,-0.022102,-0.419995,-0.336684,0.614359,-0.085011,-0.042219,-0.022289


In [15]:
from pickle import dump
dump(scaler,open('../scaler.pkl','wb'))

## Linear Regression

In [16]:
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
predictions = lin_reg.predict(X_test)

mae, mse, rmse, r_squared = evaluation(y_test, predictions)
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2 Score:", r_squared)
print("-"*30)
rmse_cross_val = rmse_cv(lin_reg)
print("RMSE Cross-Validation:", rmse_cross_val)

new_row = {"Model": "LinearRegression","MAE": mae, "MSE": mse, "RMSE": rmse, "R2 Score": r_squared, "RMSE (Cross-Validation)": rmse_cross_val}
models = models.append(new_row, ignore_index=True)

MAE: 2.120756538646383
MSE: 7.723807850286692
RMSE: 2.77917395106652
R2 Score: 0.8444093040311428
------------------------------
RMSE Cross-Validation: 2.7831552375605013


## Ridge Regression

In [17]:
ridge = Ridge()
ridge.fit(X_train, y_train)
predictions = ridge.predict(X_test)

mae, mse, rmse, r_squared = evaluation(y_test, predictions)
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2 Score:", r_squared)
print("-"*30)
rmse_cross_val = rmse_cv(ridge)
print("RMSE Cross-Validation:", rmse_cross_val)

new_row = {"Model": "Ridge","MAE": mae, "MSE": mse, "RMSE": rmse, "R2 Score": r_squared, "RMSE (Cross-Validation)": rmse_cross_val}
models = models.append(new_row, ignore_index=True)

MAE: 2.1206790531678563
MSE: 7.723903358308285
RMSE: 2.7791911338208255
R2 Score: 0.8444073800889839
------------------------------
RMSE Cross-Validation: 2.7830920757103557


## Lasso Regression

In [18]:
lasso = Lasso()
lasso.fit(X_train, y_train)
predictions = lasso.predict(X_test)

mae, mse, rmse, r_squared = evaluation(y_test, predictions)
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2 Score:", r_squared)
print("-"*30)
rmse_cross_val = rmse_cv(lasso)
print("RMSE Cross-Validation:", rmse_cross_val)

new_row = {"Model": "Lasso","MAE": mae, "MSE": mse, "RMSE": rmse, "R2 Score": r_squared, "RMSE (Cross-Validation)": rmse_cross_val}
models = models.append(new_row, ignore_index=True)

MAE: 2.771303936682735
MSE: 13.383294100636377
RMSE: 3.6583184799353345
R2 Score: 0.7304029199280735
------------------------------
RMSE Cross-Validation: 2.835982496155512


## ElasticNet

In [19]:
elastic_net = ElasticNet()
elastic_net.fit(X_train, y_train)
predictions = elastic_net.predict(X_test)

mae, mse, rmse, r_squared = evaluation(y_test, predictions)
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2 Score:", r_squared)
print("-"*30)
rmse_cross_val = rmse_cv(elastic_net)
print("RMSE Cross-Validation:", rmse_cross_val)

new_row = {"Model": "ElasticNet","MAE": mae, "MSE": mse, "RMSE": rmse, "R2 Score": r_squared, "RMSE (Cross-Validation)": rmse_cross_val}
models = models.append(new_row, ignore_index=True)

MAE: 2.7809771103854097
MSE: 13.257170253850429
RMSE: 3.6410397215425196
R2 Score: 0.7329435964285858
------------------------------
RMSE Cross-Validation: 2.8176096293384503


## Support Vector Machine

In [20]:
svr = SVR()
svr.fit(X_train, y_train)
predictions = svr.predict(X_test)

mae, mse, rmse, r_squared = evaluation(y_test, predictions)
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2 Score:", r_squared)
print("-"*30)
rmse_cross_val = rmse_cv(svr)
print("RMSE Cross-Validation:", rmse_cross_val)

new_row = {"Model": "SVR","MAE": mae, "MSE": mse, "RMSE": rmse, "R2 Score": r_squared, "RMSE (Cross-Validation)": rmse_cross_val}
models = models.append(new_row, ignore_index=True)

## Random Forest Regression

In [None]:
random_forest = RandomForestRegressor(n_estimators=100)
random_forest.fit(X_train, y_train)
predictions = random_forest.predict(X_test)

mae, mse, rmse, r_squared = evaluation(y_test, predictions)
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2 Score:", r_squared)
print("-"*30)
rmse_cross_val = rmse_cv(random_forest)
print("RMSE Cross-Validation:", rmse_cross_val)

new_row = {"Model": "RandomForestRegressor","MAE": mae, "MSE": mse, "RMSE": rmse, "R2 Score": r_squared, "RMSE (Cross-Validation)": rmse_cross_val}
models = models.append(new_row, ignore_index=True)

## XGBoost Regressor

In [None]:
xgb = XGBRegressor(n_estimators=1000, learning_rate=0.01)
xgb.fit(X_train, y_train)
predictions = xgb.predict(X_test)

mae, mse, rmse, r_squared = evaluation(y_test, predictions)
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)
print("R2 Score:", r_squared)
print("-"*30)
rmse_cross_val = rmse_cv(xgb)
print("RMSE Cross-Validation:", rmse_cross_val)

new_row = {"Model": "XGBRegressor","MAE": mae, "MSE": mse, "RMSE": rmse, "R2 Score": r_squared, "RMSE (Cross-Validation)": rmse_cross_val}
models = models.append(new_row, ignore_index=True)

## Model Comparison

In [None]:
models.sort_values(by="RMSE (Cross-Validation)")

In [None]:
plt.figure(figsize=(12,8))
sns.barplot(x=models["Model"], y=models["RMSE (Cross-Validation)"])
plt.title("Models' RMSE Scores (Cross-Validated)", size=15)
plt.xticks(rotation=30, size=12)
plt.show()