In [2]:
import sqlite3
import pandas as pd
import numpy as np
import time
import sys
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, explained_variance_score
from joblib import dump, load

In [3]:
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv')
y_test = pd.read_csv('y_test.csv')

y_train = y_train["overall_rating"].ravel()
y_test = y_test["overall_rating"].ravel()

In [4]:
print(X_train.info())
print(X_test.info())
print(y_train.info())
print(y_test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146513 entries, 0 to 146512
Data columns (total 40 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   preferred_foot_left         146513 non-null  float64
 1   preferred_foot_right        146513 non-null  float64
 2   defensive_work_rate_high    146513 non-null  float64
 3   defensive_work_rate_low     146513 non-null  float64
 4   defensive_work_rate_medium  146513 non-null  float64
 5   attacking_work_rate_high    146513 non-null  float64
 6   attacking_work_rate_low     146513 non-null  float64
 7   attacking_work_rate_medium  146513 non-null  float64
 8   crossing                    146513 non-null  float64
 9   finishing                   146513 non-null  float64
 10  heading_accuracy            146513 non-null  float64
 11  short_passing               146513 non-null  float64
 12  volleys                     146513 non-null  float64
 13  dribbling     

AttributeError: 'numpy.ndarray' object has no attribute 'info'

# Evaluation Metrics 
## 1. Mean Squared Error (MSE)
This is the average squared difference between the estimated values (predicted values) and the actual values (true values). IT is high sensitivity to outliers. In football, outliers can often represent superstar players or unusual talents, MSE helps to highlight if these players. 
## 2. R-squared (Coefficient of Determination)
The proportion of variance in the prediction. Essentially, how well observed outcomes are replicated by the model. A high R-squared means our model accounts well for the factors (like 'crossing', 'finishing', 'dribbling', etc.) that determine a player’s overall performance.
## 3. Mean Absolute Error (MAE)
A linear score that reflects the typical prediction error. This is straightforward for football clubs to understand and use eg. MAE of 2.0 means that typically, the model's player ratings are within two points of their true rating.
## 4. Explained Variance Score
Metric used to captures the variance or fluctuations in player ratings, regardless of whether there is a systematic bias (like consistently predicting higher or lower) unlike R-squared which focuses on predictive power directly without any offsets. 

# Linear Regression

In [25]:
linear_reg = LinearRegression()

start_time = time.time()
linear_reg.fit(X_train, y_train)

# Predict with Linear Regression
lr_pred = linear_reg.predict(X_test)

# Evaluate Linear Regression
lr_mse = mean_squared_error(y_test, lr_pred)
lr_r2 = r2_score(y_test, lr_pred)
lr_mae = mean_absolute_error(y_test, lr_pred)
lr_explained_variance = explained_variance_score(y_test, lr_pred)
lr_time = time.time() - start_time

print(f"Mean Squared Error: {lr_mse:.4f}")
print(f"R-squared: {lr_r2:.4f}")
print(f"Mean Absolute Error: {lr_mae:.4f}")
print(f"Explained Variance Score: {lr_explained_variance:.4f}")
print(f"Total time taken: {lr_time} seconds")


Mean Squared Error: 10.5375
R-squared: 0.7886
Mean Absolute Error: 2.5010
Explained Variance Score: 0.7886
Total time taken: 0.3300049304962158 seconds


In [12]:
print(X_test.iloc[0])

preferred_foot_left           1.000000
preferred_foot_right          0.000000
defensive_work_rate_high      0.000000
defensive_work_rate_low       0.000000
defensive_work_rate_medium    1.000000
attacking_work_rate_high      0.000000
attacking_work_rate_low       0.000000
attacking_work_rate_medium    1.000000
crossing                      2.140873
finishing                     0.634442
heading_accuracy              0.711631
short_passing                 1.096963
volleys                       1.791112
dribbling                     1.286295
free_kick_accuracy            2.053593
long_passing                  0.967744
ball_control                  0.829863
acceleration                  0.950502
sprint_speed                  0.791488
agility                       0.933356
reactions                     0.862475
balance                       0.908777
shot_power                    0.693617
jumping                      -0.819079
stamina                       0.528776
strength                 

# Random Forest

In [5]:
random_forest = RandomForestRegressor()

start_time = time.time()
random_forest.fit(X_train, y_train)

# Predict with Random Forest
rf_pred = random_forest.predict(X_test)

# Evaluate Random Forest
rf_mse = mean_squared_error(y_test, rf_pred)
rf_r2 = r2_score(y_test, rf_pred)
rf_mae = mean_absolute_error(y_test, rf_pred)
rf_explained_variance = explained_variance_score(y_test, rf_pred)
rf_time = time.time() - start_time

print(f"Mean Squared Error: {rf_mse:.4f}")
print(f"R-squared: {rf_r2:.4f}")
print(f"Mean Absolute Error: {rf_mae:.4f}")
print(f"Explained Variance Score: {rf_explained_variance:.4f}")
print(f"Total time taken : {rf_time} seconds")


Mean Squared Error: 0.8493
R-squared: 0.9830
Mean Absolute Error: 0.5006
Explained Variance Score: 0.9830
Total time taken : 350.6340398788452 seconds


# Support Vector Machine

In [23]:
svm_reg = SVR()

start_time = time.time()
svm_reg.fit(X_train, y_train)

# Predict with SVM
svm_pred = svm_reg.predict(X_test)

# Evaluate SVM
svm_mse = mean_squared_error(y_test, svm_pred)
svm_r2 = r2_score(y_test, svm_pred)
svm_mae = mean_absolute_error(y_test, svm_pred)
svm_explained_variance = explained_variance_score(y_test, svm_pred)
svm_time = time.time() - start_time

print(f"Mean Squared Error: {svm_mse:.4f}")
print(f"R-squared: {svm_r2:.4f}")
print(f"Mean Absolute Error: {svm_mae:.4f}")
print(f"Explained Variance Score: {svm_explained_variance:.4f}")
print(f"Total time taken : {svm_time} seconds")


Mean Squared Error: 1.3952
R-squared: 0.9720
Mean Absolute Error: 0.7564
Explained Variance Score: 0.9721
Total time taken : 6639.946455478668 seconds


In [7]:
dump(random_forest, 'predictionmodel.joblib')

['predictionmodel.joblib']