In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

df = pd.read_csv("../Week-4_Feature_Scaling/minmax_scaled_epl.csv")
df.head()


Unnamed: 0,Name,Jersey Number,Nationality,Age,Appearances,Wins,Losses,Goals,Goals per match,Headed goals,...,Catches,Sweeper clearances,Throw outs,Goal Kicks,Yellow cards,Red cards,Fouls,Offsides,Club_encoded,Position_encoded
0,Bernd Leno,1.0,Germany,28.0,0.1171,28,16,0.068149,,,...,17.0,28.0,375.0,489.0,2,0,0,,0,2
1,Matt Macey,33.0,England,26.0,0.150001,0,0,0.068149,,,...,0.0,0.0,0.0,0.0,0,0,0,,0,2
2,Rúnar Alex Rúnarsson,13.0,Iceland,25.0,0.150001,0,0,0.068149,,,...,0.0,0.0,0.0,0.0,0,0,0,,0,2
3,Héctor Bellerín,2.0,Spain,25.0,0.295539,90,37,0.03352,,0.0,...,,,,,23,0,125,8.0,0,0
4,Kieran Tierney,3.0,Scotland,23.0,0.027881,7,5,0.0,,0.0,...,,,,,2,0,9,0.0,0,0


In [2]:
X = df[['Appearances', 'Shots', 'Passes', 'Assists']]
y = df['Goals']


In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [4]:
rf = RandomForestRegressor(
    n_estimators=100,
    random_state=42
)
rf.fit(X_train, y_train)


0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [5]:
y_pred_rf = rf.predict(X_test)

mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = mean_squared_error(y_test, y_pred_rf) ** 0.5
r2_rf = r2_score(y_test, y_pred_rf)

print("Random Forest MAE:", mae_rf)
print("Random Forest RMSE:", rmse_rf)
print("Random Forest R2:", r2_rf)


Random Forest MAE: 0.020961157438305004
Random Forest RMSE: 0.03323933733763887
Random Forest R2: 0.6985636000979295


In [6]:
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10, 20]
}

grid_rf = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=3,
    scoring='r2'
)

grid_rf.fit(X_train, y_train)

best_rf = grid_rf.best_estimator_



In [7]:
y_pred_best = best_rf.predict(X_test)

print("Tuned RF R2:", r2_score(y_test, y_pred_best))


Tuned RF R2: 0.7031845627822346


In [8]:
df['Match_Result'] = df['Goals'].apply(
    lambda x: 2 if x > 1 else (1 if x == 1 else 0)
)


In [9]:
df['Match_Result'] = df['Goals'].apply(
    lambda x: 2 if x > 1 else (1 if x == 1 else 0)
)


In [11]:
# Features and target for classification
X_cls = df[['Shots', 'Passes', 'Assists']]
y_cls = df['Match_Result']


In [12]:
from sklearn.model_selection import train_test_split

Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    X_cls, y_cls, test_size=0.2, random_state=42
)


In [13]:
gb = GradientBoostingClassifier(random_state=42)
gb.fit(Xc_train, yc_train)


0,1,2
,loss,'log_loss'
,learning_rate,0.1
,n_estimators,100
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [14]:
yc_pred = gb.predict(Xc_test)

print("Accuracy:", accuracy_score(yc_test, yc_pred))
print("Precision:", precision_score(yc_test, yc_pred, average='weighted'))
print("Recall:", recall_score(yc_test, yc_pred, average='weighted'))
print("F1 Score:", f1_score(yc_test, yc_pred, average='weighted'))


Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0


In [15]:
import joblib

joblib.dump(best_rf, "rf_regression_model.pkl")
joblib.dump(gb, "gb_classification_model.pkl")


['gb_classification_model.pkl']