# Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, accuracy_score

# Dataset Prep

In [2]:
df = pd.read_csv('EPL_Set.csv')
df = df.dropna()

## Encode team names using Label Encoding (HomeTeam and AwayTeam)


In [3]:
le = LabelEncoder()
df['HomeTeam'] = le.fit_transform(df['HomeTeam'])
df['AwayTeam'] = le.fit_transform(df['AwayTeam'])

In [4]:
le_result = LabelEncoder()
df['FTR'] = le_result.fit_transform(df['FTR'])  # 'H' -> 1, 'D' -> 0, 'A' -> 2
df['HTR'] = le_result.fit_transform(df['HTR'])  # 'H' -> 1, 'D' -> 0, 'A' -> 2

In [5]:
# Adding new features
df['GoalDiffHome'] = df['FTHG'] - df['FTAG']  # Goal difference at full time
df['GoalDiffHalf'] = df['HTHG'] - df['HTAG']  # Goal difference at half time

# Adding rolling average of goals in the last 5 matches
df['HomeGoalsLast5'] = df.groupby('HomeTeam')['FTHG'].shift().rolling(5).mean()
df['AwayGoalsLast5'] = df.groupby('AwayTeam')['FTAG'].shift().rolling(5).mean()

## Define features and target variable

In [6]:
X = df[['HomeTeam', 'AwayTeam', 'GoalDiffHome', 'GoalDiffHalf', 'HomeGoalsLast5', 'AwayGoalsLast5']]
y = df[['FTHG', 'FTAG', 'HTHG', 'HTAG', 'FTR', 'HTR']]

In [7]:
df.head()

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Season,GoalDiffHome,GoalDiffHalf,HomeGoalsLast5,AwayGoalsLast5
924,E0,19/08/95,1,26,3,1,2,3.0,0.0,2,1995-96,2,3.0,,
925,E0,19/08/95,4,33,1,0,2,1.0,0.0,2,1995-96,1,1.0,,
926,E0,19/08/95,13,17,0,0,1,0.0,0.0,1,1995-96,0,0.0,,
927,E0,19/08/95,24,36,1,0,2,0.0,0.0,1,1995-96,1,0.0,,
928,E0,19/08/95,25,41,1,1,1,0.0,1.0,0,1995-96,0,-1.0,,


# Model

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

from sklearn.impute import SimpleImputer

# Impute missing values by filling with the mean
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

In [19]:
model = MultiOutputRegressor(RandomForestRegressor(random_state=1))
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

## Extract predicted values for each target


In [20]:
y_pred_fthg = y_pred[:, 0]  # Predicted Full-Time Home Goals
y_pred_ftag = y_pred[:, 1]  # Predicted Full-Time Away Goals
y_pred_hthg = y_pred[:, 2]  # Predicted Half-Time Home Goals
y_pred_htag = y_pred[:, 3]  # Predicted Half-Time Away Goals
y_pred_ftr = y_pred[:, 4]   # Predicted Full-Time Result
y_pred_htr = y_pred[:, 5]   # Predicted Half-Time Result

## Evaluate the regression targets (FTHG, FTAG, HTHG, HTAG) using MSE


In [21]:
mse_fthg = mean_squared_error(y_test['FTHG'], y_pred_fthg)
mse_ftag = mean_squared_error(y_test['FTAG'], y_pred_ftag)
mse_hthg = mean_squared_error(y_test['HTHG'], y_pred_hthg)
mse_htag = mean_squared_error(y_test['HTAG'], y_pred_htag)

In [22]:
print(f'Mean Squared Error for FTHG: {mse_fthg}')
print(f'Mean Squared Error for FTAG: {mse_ftag}')
print(f'Mean Squared Error for HTHG: {mse_hthg}')
print(f'Mean Squared Error for HTAG: {mse_htag}')

Mean Squared Error for FTHG: 0.5262170043055362
Mean Squared Error for FTAG: 0.5222589178455325
Mean Squared Error for HTHG: 0.19622780098765288
Mean Squared Error for HTAG: 0.19691513952745476


In [23]:
y_pred_ftr = y_pred_ftr.round().astype(int)
y_pred_htr = y_pred_htr.round().astype(int)
accuracy_ftr = accuracy_score(y_test['FTR'], y_pred_ftr)
accuracy_htr = accuracy_score(y_test['HTR'], y_pred_htr)


In [24]:
print(f'Accuracy for Full-Time Result (FTR): {accuracy_ftr}')
print(f'Accuracy for Half-Time Result (HTR): {accuracy_htr}')

Accuracy for Full-Time Result (FTR): 1.0
Accuracy for Half-Time Result (HTR): 1.0
