In [2]:
# Mount Google Drive and load the dataset CSV into `df`.
import pandas as pd
from google.colab import drive

drive.mount('/content/drive')

# Location of the CSV inside the mounted Drive
dataset_csv = "/content/drive/MyDrive/infosys internship/dataset - 2020-09-24.csv"
df = pd.read_csv(dataset_csv)

# Preview the first rows of the frame
df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,Name,Jersey Number,Club,Position,Nationality,Age,Appearances,Wins,Losses,Goals,...,Punches,High Claims,Catches,Sweeper clearances,Throw outs,Goal Kicks,Yellow cards,Red cards,Fouls,Offsides
0,Bernd Leno,1.0,Arsenal,Goalkeeper,Germany,28.0,64,28,16,0,...,34.0,26.0,17.0,28.0,375.0,489.0,2,0,0,
1,Matt Macey,33.0,Arsenal,Goalkeeper,England,26.0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,
2,Rúnar Alex Rúnarsson,13.0,Arsenal,Goalkeeper,Iceland,25.0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,
3,Héctor Bellerín,2.0,Arsenal,Defender,Spain,25.0,160,90,37,7,...,,,,,,,23,0,125,8.0
4,Kieran Tierney,3.0,Arsenal,Defender,Scotland,23.0,16,7,5,1,...,,,,,,,2,0,9,0.0


In [3]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [12]:
# Regression task: predict `Goals` from four columns of `df`.
feature_cols = ['Appearances', 'Shots', 'Passes', 'Assists']
X = df[feature_cols]
y = df['Goals']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest (fit() returns the estimator itself).
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predictions for the held-out rows
y_pred_rf = rf.predict(X_test)

In [15]:
import numpy as np

# Score the random-forest predictions against the held-out targets.
mae_rf = mean_absolute_error(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)  # RMSE is the square root of the MSE
r2_rf = r2_score(y_test, y_pred_rf)

print("MAE:", mae_rf)
print("RMSE:", rmse_rf)
print("R2:", r2_rf)

MAE: 2.3315652173913044
RMSE: 4.5881635049141485
R2: 0.8586311800017874


In [16]:
# Hyperparameter tuning: grid search over forest size and maximum tree depth.
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10, 20],
}

# Search with cv=3, ranking candidates by R2.
grid_rf = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=3,
    scoring='r2',
)
grid_rf.fit(X_train, y_train)

# Evaluate the best estimator found by the search on the held-out rows.
best_rf = grid_rf.best_estimator_
y_pred_best = best_rf.predict(X_test)
tuned_r2 = r2_score(y_test, y_pred_best)
print("Tuned RF R2:", tuned_r2)


Tuned RF R2: 0.854337836690892


In [19]:
# Classification inputs: three numeric columns with NaNs replaced by the column mean.
# NOTE(review): the means are computed over every row, before the train/test
# split performed in the following cell, so test rows influence the fill value.
cls_features = df[['Shots', 'Passes', 'Assists']]
X_cls = cls_features.fillna(cls_features.mean())

# Classification target.
# NOTE(review): no visible cell creates 'Match_Result' and it does not appear in
# the (truncated) df.head() output - confirm the column exists in the CSV.
y_cls = df['Match_Result']


In [20]:
# Split the classification data 80/20 with a fixed seed.
Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    X_cls,
    y_cls,
    test_size=0.2,
    random_state=42,
)

# Fit the gradient-boosting classifier and predict the held-out rows.
gb = GradientBoostingClassifier(random_state=42).fit(Xc_train, yc_train)
yc_pred = gb.predict(Xc_test)

# Accuracy first, then the three metrics that use weighted averaging.
print("Accuracy:", accuracy_score(yc_test, yc_pred))
weighted_metrics = {
    "Precision:": precision_score,
    "Recall:": recall_score,
    "F1 Score:": f1_score,
}
for label, metric in weighted_metrics.items():
    print(label, metric(yc_test, yc_pred, average='weighted'))


Accuracy: 0.7913043478260869
Precision: 0.7756219192482485
Recall: 0.7913043478260869
F1 Score: 0.7814688502155616


In [25]:

# Save and download the models.
# The fitted estimators are written to Drive first so that this cell works on a
# fresh kernel run instead of relying on .pkl files left behind by earlier sessions.
import os
import pickle

from google.colab import files

# Drive folder that holds the model files
folder_path = "/content/drive/MyDrive/infosys internship"

# Target files
rf_model_path = os.path.join(folder_path, "rf_regression_model.pkl")
gb_model_path = os.path.join(folder_path, "gb_classification_model.pkl")

# Save: serialise each fitted estimator to its .pkl file (overwrites any existing file).
# NOTE(review): this stores the untuned `rf`; use `best_rf` here if the
# grid-searched model is the one meant to be shipped.
for fitted_model, model_path in ((rf, rf_model_path), (gb, gb_model_path)):
    with open(model_path, "wb") as model_file:
        pickle.dump(fitted_model, model_file)

# Download: hand both files to the browser
files.download(rf_model_path)
files.download(gb_model_path)



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>