# Notebook 04 - Baseline Model

### Import libraries and data

In [1]:
import sys
sys.path.append('../src')
from paths import TRANSFORMED_DATA_DIR

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, mean_absolute_error
from sklearn.preprocessing import LabelEncoder

pd.set_option('display.max_columns', None)
data_transformed = pd.read_csv(TRANSFORMED_DATA_DIR / 'data_transformed.csv')
data_transformed

Unnamed: 0,week,date,home,score,away,xG,xG_1,venue,referee,home_goals,away_goals,season_start,result,day_Friday,day_Monday,day_Saturday,day_Sunday,day_Thursday,day_Tuesday,day_Wednesday,home_rolling_avg_goals,away_rolling_avg_goals,home_rolling_avg_xG,away_rolling_avg_xG
0,2,2018-08-24,Getafe,2–0,Eibar,1.4,0.6,Coliseum Alfonso Pérez,David Medié,2.0,0.0,2018,Home win,True,False,False,False,False,False,False,0.0,1.0,0.20,1.30
1,2,2018-08-24,Leganés,2–2,Real Sociedad,1.7,1.6,Estadio Municipal de Butarque,José Luis Munuera,2.0,2.0,2018,Draw,True,False,False,False,False,False,False,1.0,2.0,1.00,0.70
2,2,2018-08-25,Alavés,0–0,Betis,0.6,0.9,Estadio de Mendizorroza,Pablo González,0.0,0.0,2018,Draw,False,False,True,False,False,False,False,0.0,0.0,0.30,0.90
3,2,2018-08-25,Atlético Madrid,1–0,Rayo Vallecano,0.9,1.5,Estadio Wanda Metropolitano,José González,1.0,0.0,2018,Home win,False,False,True,False,False,False,False,1.0,1.0,0.90,2.10
4,2,2018-08-25,Valladolid,0–1,Barcelona,0.4,1.0,Estadio Municipal José Zorrilla,Ricardo de Burgos,0.0,1.0,2018,Away win,False,False,True,False,False,False,False,0.0,3.0,0.00,3.20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2258,38,2024-05-25,Real Sociedad,0–2,Atlético Madrid,0.6,2.2,Reale Arena,José Sánchez,0.0,2.0,2023,Away win,False,False,True,False,False,False,False,1.0,1.8,0.72,1.16
2259,38,2024-05-26,Las Palmas,1–1,Alavés,1.0,2.5,Estadio de Gran Canaria,Francisco Hernández,1.0,1.0,2023,Draw,False,False,False,True,False,False,False,0.4,1.4,1.08,1.46
2260,38,2024-05-26,Celta Vigo,2–2,Valencia,1.5,2.0,Estadio Abanca Balaídos,Miguel Ángel Ortiz Arias,2.0,2.0,2023,Draw,False,False,False,True,False,False,False,1.4,0.6,1.38,1.66
2261,38,2024-05-26,Getafe,1–2,Mallorca,0.9,1.4,Coliseum Alfonso Pérez,Víctor García,1.0,2.0,2023,Away win,False,False,False,True,False,False,False,0.6,1.0,1.52,1.08


### Split data into train and test

In [2]:
# Split it into sets based on season_start
train_data = data_transformed[data_transformed['season_start'] <= 2022]
test_data = data_transformed[data_transformed['season_start'] == 2023]

features = [column for column in data_transformed.columns if column not in ['date', 'xG', 'xG_1', 'home', 'away', 'referee', 'venue', 'score', 'result', 'home_goals', 'away_goals', 'season_start']]

X_train = train_data[features]
y_train = train_data['result']
X_test = test_data[features]
y_test = test_data['result']

print(f'X_train shape: {X_train.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_test shape: {y_test.shape}')

X_train shape: (1884, 12)
y_train shape: (1884,)
X_test shape: (379, 12)
y_test shape: (379,)


### Train baseline model

In [3]:
# Train a baseline Random Forest model
clf = RandomForestClassifier(random_state=1)
clf.fit(X_train, y_train)

In [4]:
# Make predictions
predictions = clf.predict(X_test)

In [5]:
# Evaluate model
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.4380


In [6]:
# Confusion matrix
conf_matrix = confusion_matrix(y_test, predictions)
print('Confusion Matrix:')
print(conf_matrix)

Confusion Matrix:
[[ 32  18  56]
 [ 30  16  60]
 [ 23  26 118]]


In [10]:
# Baseline model accuracy
baseline_accuracy = data_transformed['result'].value_counts(normalize=True).max()
print(f'Baseline Accuracy: {baseline_accuracy:.4f}')

Baseline Accuracy: 0.4463


In [11]:
data_transformed['result'].value_counts(normalize=True)

result
Home win    0.446310
Away win    0.277066
Draw        0.276624
Name: proportion, dtype: float64

#### Improving the baseline model with hyperparameter tuning using Grid Search

In [12]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15, 20]
}

grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

In [13]:
grid_search.best_params_

{'max_depth': 5, 'n_estimators': 50}

In [14]:
# Train the Random Forest model with the best hyperparameters
clf = RandomForestClassifier(random_state=1, n_estimators=200, max_depth=5)
clf.fit(X_train, y_train)

# Make predictions
predictions = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
accuracy

0.47757255936675463

In [15]:
data_transformed_new_features = pd.get_dummies(data_transformed, columns=['home', 'away', 'referee', 'venue'])
data_transformed_new_features.head()

Unnamed: 0,week,date,score,xG,xG_1,home_goals,away_goals,season_start,result,day_Friday,day_Monday,day_Saturday,day_Sunday,day_Thursday,day_Tuesday,day_Wednesday,home_rolling_avg_goals,away_rolling_avg_goals,home_rolling_avg_xG,away_rolling_avg_xG,home_Alavés,home_Almería,home_Athletic Club,home_Atlético Madrid,home_Barcelona,home_Betis,home_Celta Vigo,home_Cádiz,home_Eibar,home_Elche,home_Espanyol,home_Getafe,home_Girona,home_Granada,home_Huesca,home_Las Palmas,home_Leganés,home_Levante,home_Mallorca,home_Osasuna,home_Rayo Vallecano,home_Real Madrid,home_Real Sociedad,home_Sevilla,home_Valencia,home_Valladolid,home_Villarreal,away_Alavés,away_Almería,away_Athletic Club,away_Atlético Madrid,away_Barcelona,away_Betis,away_Celta Vigo,away_Cádiz,away_Eibar,away_Elche,away_Espanyol,away_Getafe,away_Girona,away_Granada,away_Huesca,away_Las Palmas,away_Leganés,away_Levante,away_Mallorca,away_Osasuna,away_Rayo Vallecano,away_Real Madrid,away_Real Sociedad,away_Sevilla,away_Valencia,away_Valladolid,away_Villarreal,referee_Adrián Cordero,referee_Alberto Undiano,referee_Alejandro Hernández,referee_Alejandro Muñíz,referee_Antonio Matéu,referee_Antonio Matéu Lahoz,referee_Carlos del Cerro,referee_César Soto,referee_Daniel Ask,referee_David Medié,referee_Eduardo Prieto,referee_Francisco Hernández,referee_Guillermo Cuadra,referee_Hsu Jason,referee_Ignacio Iglesias,referee_Isidro Díaz de Mera,referee_Javier Alberola,referee_Javier Villanueva,referee_Jesús Gil,referee_Jorge Figueroa,referee_José González,referee_José Luis Munuera,referee_José Sánchez,referee_Juan Martínez,referee_Juan Pulido,referee_Mario Melero,referee_Mateo Busquets,referee_Miguel Ángel Ortiz Arias,referee_Pablo González,referee_Ricardo de Burgos,referee_Santiago Jaime,referee_Valentín Pizarro,referee_Víctor García,venue_Camp Nou,venue_Coliseum Alfonso Pérez,venue_Estadi Mallorca Son Moix,venue_Estadi Municipal de Montilivi,venue_Estadi Olímpic Lluís Companys,venue_Estadio Abanca Balaídos,venue_Estadio Alfredo Di Stéfano,venue_Estadio Benito Villamarín,venue_Estadio Ciudad de Valencia,venue_Estadio Cívitas Metropolitano,venue_Estadio El Alcoraz,venue_Estadio El Sadar,venue_Estadio Manuel Martínez Valero,venue_Estadio Municipal José Zorrilla,venue_Estadio Municipal de Anoeta,venue_Estadio Municipal de Butarque,venue_Estadio Municipal de Ipurúa,venue_Estadio Nuevo Los Cármenes,venue_Estadio Nuevo Mirandilla,venue_Estadio Ramón Sánchez Pizjuán,venue_Estadio Ramón de Carranza,venue_Estadio San Mamés,venue_Estadio Santiago Bernabéu,venue_Estadio Wanda Metropolitano,venue_Estadio de Balaídos,venue_Estadio de Gran Canaria,venue_Estadio de Mendizorroza,venue_Estadio de Mestalla,venue_Estadio de la Cerámica,venue_Estadio del Rayo Vallecano,venue_Iberostar Estadi,venue_Power Horse Stadium,venue_RCDE Stadium,venue_Reale Arena,venue_San Mamés
0,2,2018-08-24,2–0,1.4,0.6,2.0,0.0,2018,Home win,True,False,False,False,False,False,False,0.0,1.0,0.2,1.3,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,2,2018-08-24,2–2,1.7,1.6,2.0,2.0,2018,Draw,True,False,False,False,False,False,False,1.0,2.0,1.0,0.7,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,2,2018-08-25,0–0,0.6,0.9,0.0,0.0,2018,Draw,False,False,True,False,False,False,False,0.0,0.0,0.3,0.9,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False
3,2,2018-08-25,1–0,0.9,1.5,1.0,0.0,2018,Home win,False,False,True,False,False,False,False,1.0,1.0,0.9,2.1,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False
4,2,2018-08-25,0–1,0.4,1.0,0.0,1.0,2018,Away win,False,False,True,False,False,False,False,0.0,3.0,0.0,3.2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [16]:
# Define the features and target variable
features = [column for column in data_transformed_new_features.drop(columns=['date', 'xG', 'xG_1', 'score', 'result', 'home_goals', 'away_goals', 'season_start']).columns]

# Split the data into train and test sets
train_data = data_transformed_new_features[data_transformed_new_features['season_start'] <= 2022]
test_data = data_transformed_new_features[data_transformed_new_features['season_start'] == 2023]

X_train = train_data[features]
y_train = train_data['result']
X_test = test_data[features]
y_test = test_data['result']

In [17]:
# Find the best hyperparameters for the Random Forest model
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

grid_search.best_params_

{'max_depth': 10, 'n_estimators': 100}

In [18]:
# Train a Random Forest model with the new features
clf = RandomForestClassifier(random_state=1, n_estimators=100, max_depth=10)
clf.fit(X_train, y_train)

# Make predictions
predictions = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
accuracy

0.5171503957783641

The accuracy of the model has increased after adding the features we ignored at first.
The process of improving the model can be very iterative, it can be improved by:

- Adding more features.
- Tuning the hyperparameters.
- Using different models.

Next we'll see if we can improve the model by using different models but obtaining the best hyperparameters with Optuna.