# Notebook 05 - Model Optimization with Optuna

In [58]:
import warnings
warnings.filterwarnings("ignore")

import sys
sys.path.append('../src')
from paths import TRANSFORMED_DATA_DIR

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, mean_absolute_error
from sklearn.preprocessing import LabelEncoder

import optuna
from optuna_integration.xgboost import XGBoostPruningCallback

import xgboost as xgb
import lightgbm as lgb

In [56]:
# Load data
pd.set_option('display.max_columns', None)
data_transformed = pd.read_csv(TRANSFORMED_DATA_DIR / 'data_transformed.csv')
data_transformed

Unnamed: 0,week,date,home,score,away,xG,xG_1,venue,referee,home_goals,away_goals,season_start,result,day_Friday,day_Monday,day_Saturday,day_Sunday,day_Thursday,day_Tuesday,day_Wednesday,home_rolling_avg_goals,away_rolling_avg_goals,home_rolling_avg_xG,away_rolling_avg_xG
0,2,2018-08-24,Getafe,2–0,Eibar,1.4,0.6,Coliseum Alfonso Pérez,David Medié,2.0,0.0,2018,Home win,True,False,False,False,False,False,False,0.0,1.0,0.20,1.30
1,2,2018-08-24,Leganés,2–2,Real Sociedad,1.7,1.6,Estadio Municipal de Butarque,José Luis Munuera,2.0,2.0,2018,Draw,True,False,False,False,False,False,False,1.0,2.0,1.00,0.70
2,2,2018-08-25,Alavés,0–0,Betis,0.6,0.9,Estadio de Mendizorroza,Pablo González,0.0,0.0,2018,Draw,False,False,True,False,False,False,False,0.0,0.0,0.30,0.90
3,2,2018-08-25,Atlético Madrid,1–0,Rayo Vallecano,0.9,1.5,Estadio Wanda Metropolitano,José González,1.0,0.0,2018,Home win,False,False,True,False,False,False,False,1.0,1.0,0.90,2.10
4,2,2018-08-25,Valladolid,0–1,Barcelona,0.4,1.0,Estadio Municipal José Zorrilla,Ricardo de Burgos,0.0,1.0,2018,Away win,False,False,True,False,False,False,False,0.0,3.0,0.00,3.20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2258,38,2024-05-25,Real Sociedad,0–2,Atlético Madrid,0.6,2.2,Reale Arena,José Sánchez,0.0,2.0,2023,Away win,False,False,True,False,False,False,False,1.0,1.8,0.72,1.16
2259,38,2024-05-26,Las Palmas,1–1,Alavés,1.0,2.5,Estadio de Gran Canaria,Francisco Hernández,1.0,1.0,2023,Draw,False,False,False,True,False,False,False,0.4,1.4,1.08,1.46
2260,38,2024-05-26,Celta Vigo,2–2,Valencia,1.5,2.0,Estadio Abanca Balaídos,Miguel Ángel Ortiz Arias,2.0,2.0,2023,Draw,False,False,False,True,False,False,False,1.4,0.6,1.38,1.66
2261,38,2024-05-26,Getafe,1–2,Mallorca,0.9,1.4,Coliseum Alfonso Pérez,Víctor García,1.0,2.0,2023,Away win,False,False,False,True,False,False,False,0.6,1.0,1.52,1.08


In [57]:
data_transformed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2263 entries, 0 to 2262
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   week                    2263 non-null   int64  
 1   date                    2263 non-null   object 
 2   home                    2263 non-null   object 
 3   score                   2263 non-null   object 
 4   away                    2263 non-null   object 
 5   xG                      2263 non-null   float64
 6   xG_1                    2263 non-null   float64
 7   venue                   2263 non-null   object 
 8   referee                 2263 non-null   object 
 9   home_goals              2263 non-null   float64
 10  away_goals              2263 non-null   float64
 11  season_start            2263 non-null   int64  
 12  result                  2263 non-null   object 
 13  day_Friday              2263 non-null   bool   
 14  day_Monday              2263 non-null   

In [3]:
# Split into train and test sets based on season start
train_data = data_transformed[data_transformed['season_start'] <= 2022]
test_data = data_transformed[data_transformed['season_start'] == 2023]

features = [column for column in data_transformed.columns if column not in ['date', 'xG', 'xG_1', 'home', 'away', 'referee', 'venue', 'score', 'result', 'home_goals', 'away_goals', 'season_start']]

X_train = train_data[features]
y_train = train_data['result']
X_test = test_data[features]
y_test = test_data['result']

print(f'X_train shape: {X_train.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_test shape: {y_test.shape}')

X_train shape: (1884, 12)
y_train shape: (1884,)
X_test shape: (379, 12)
y_test shape: (379,)


In [4]:
# Encode target variable
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [5]:
#Function for defining the model pipeline
def get_pipeline(model_type="xgboost", **hyperparams):
    if model_type == "xgboost":
        model = xgb.XGBClassifier(**hyperparams, use_label_encoder=False, eval_metric='mlogloss')
    elif model_type == "lightgbm":
        model = lgb.LGBMClassifier(**hyperparams)
    else:
        raise ValueError("Unsupported model type. Choose 'xgboost' or 'lightgbm'.")
    
    return model

In [8]:
#Objective function for Optuna
def objective(trial, model_type="xgboost"):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0)
    }
    
    if model_type == "xgboost":
        model = xgb.XGBClassifier(**params, use_label_encoder=False, eval_metric='mlogloss')
    elif model_type == "lightgbm":
        model = lgb.LGBMClassifier(**params)
    
    skf = StratifiedKFold(n_splits=3)
    scores = []
    
    for train_idx, val_idx in skf.split(X_train, y_train_encoded):
        X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_train_fold, y_val_fold = y_train_encoded[train_idx], y_train_encoded[val_idx]
        
        model.fit(X_train_fold, y_train_fold)
        preds = model.predict(X_val_fold)
        score = accuracy_score(y_val_fold, preds)
        scores.append(score)
    
    return np.mean(scores)

In [9]:
# Parameters optimization for XGBoost
study_xgb = optuna.create_study(direction="maximize")
study_xgb.optimize(lambda trial: objective(trial, model_type="xgboost"), n_trials=5)
best_params_xgb = study_xgb.best_trial.params
print(f'Best parameters for XGBoost: {best_params_xgb}')

# Parameters optimization for LightGBM
study_lgbm = optuna.create_study(direction="maximize")
study_lgbm.optimize(lambda trial: objective(trial, model_type="lightgbm"), n_trials=5)
best_params_lgbm = study_lgbm.best_trial.params
print(f'Best parameters for LightGBM: {best_params_lgbm}')

[I 2024-08-13 08:05:45,061] A new study created in memory with name: no-name-fb573050-1db0-49d8-aea8-4d045aafc0d2
[I 2024-08-13 08:05:46,622] Trial 0 finished with value: 0.4150743099787686 and parameters: {'n_estimators': 79, 'max_depth': 10, 'learning_rate': 0.05447147969282871, 'subsample': 0.6140164829196976, 'colsample_bytree': 0.9655335346332299}. Best is trial 0 with value: 0.4150743099787686.
[I 2024-08-13 08:05:49,285] Trial 1 finished with value: 0.41188959660297236 and parameters: {'n_estimators': 157, 'max_depth': 9, 'learning_rate': 0.039488494819035644, 'subsample': 0.7075831977553658, 'colsample_bytree': 0.926363744248963}. Best is trial 0 with value: 0.4150743099787686.
[I 2024-08-13 08:05:51,324] Trial 2 finished with value: 0.40498938428874737 and parameters: {'n_estimators': 160, 'max_depth': 10, 'learning_rate': 0.12175233755181632, 'subsample': 0.7546631274104418, 'colsample_bytree': 0.8626402888530498}. Best is trial 0 with value: 0.4150743099787686.
[I 2024-08-13

Best parameters for XGBoost: {'n_estimators': 79, 'max_depth': 10, 'learning_rate': 0.05447147969282871, 'subsample': 0.6140164829196976, 'colsample_bytree': 0.9655335346332299}
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000141 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 314
[LightGBM] [Info] Number of data points in the train set: 1256, number of used features: 12
[LightGBM] [Info] Start training from score -1.283485
[LightGBM] [Info] Start training from score -1.289249
[LightGBM] [Info] Start training from score -0.804185
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000069 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 322
[LightGBM] [Info] Number of data points in the train set: 1256, number of used features: 12
[LightGBM] [Info] Start training from score -1.286363
[LightGBM] [Info] Start training

[I 2024-08-13 08:05:56,141] Trial 0 finished with value: 0.42091295116772826 and parameters: {'n_estimators': 116, 'max_depth': 4, 'learning_rate': 0.0710773114602441, 'subsample': 0.7161749628883616, 'colsample_bytree': 0.6797728927502955}. Best is trial 0 with value: 0.42091295116772826.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000032 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 314
[LightGBM] [Info] Number of data points in the train set: 1256, number of used features: 12
[LightGBM] [Info] Start training from score -1.283485
[LightGBM] [Info] Start training from score -1.289249
[LightGBM] [Info] Start training from score -0.804185
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000069 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 322
[LightGBM] [Info] Number of data points in the train set: 1256, number of used features: 12
[LightGBM] [Info] Start training from score -1.286363
[LightGBM] [Info] Start training from score -1.286363
[LightGBM] [Info] Start training from score -0.804185
[LightGBM] [Info] Auto-choosing col-wi

[I 2024-08-13 08:05:57,015] Trial 1 finished with value: 0.3949044585987261 and parameters: {'n_estimators': 181, 'max_depth': 6, 'learning_rate': 0.053951959337111347, 'subsample': 0.8711024743099818, 'colsample_bytree': 0.9433507350356806}. Best is trial 0 with value: 0.42091295116772826.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000169 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 314
[LightGBM] [Info] Number of data points in the train set: 1256, number of used features: 12
[LightGBM] [Info] Start training from score -1.283485
[LightGBM] [Info] Start training from score -1.289249
[LightGBM] [Info] Start training from score -0.804185
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000069 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 322
[LightGBM] [Info] Number of data points in the train set: 1256, number of used features: 12
[LightGBM] [Info] Start training from score -1.286363
[LightGBM] [Info] Start training from score -1.286363
[LightGBM] [Info] Start training from score -0.804185
[LightGBM] [Info] Auto-choosing row-wi

[I 2024-08-13 08:05:58,003] Trial 2 finished with value: 0.39118895966029726 and parameters: {'n_estimators': 138, 'max_depth': 5, 'learning_rate': 0.17556220487079102, 'subsample': 0.7066569930924711, 'colsample_bytree': 0.7059166382726338}. Best is trial 0 with value: 0.42091295116772826.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000205 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 314
[LightGBM] [Info] Number of data points in the train set: 1256, number of used features: 12
[LightGBM] [Info] Start training from score -1.283485
[LightGBM] [Info] Start training from score -1.289249
[LightGBM] [Info] Start training from score -0.804185
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000068 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 322
[LightGBM] [Info] Number of data points in the train set: 1256, number of used features: 12
[LightGBM] [Info] Start training from score -1.286363
[LightGBM] [Info] Start training from score -1.286363
[LightGBM] [Info] Start training from score -0.804185


[I 2024-08-13 08:05:58,502] Trial 3 finished with value: 0.38322717622080676 and parameters: {'n_estimators': 186, 'max_depth': 4, 'learning_rate': 0.2552986962948357, 'subsample': 0.7972505751749638, 'colsample_bytree': 0.6896607502477107}. Best is trial 0 with value: 0.42091295116772826.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000068 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 309
[LightGBM] [Info] Number of data points in the train set: 1256, number of used features: 12
[LightGBM] [Info] Start training from score -1.286363
[LightGBM] [Info] Start training from score -1.286363
[LightGBM] [Info] Start training from score -0.804185
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000107 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 314
[LightGBM] [Info] Number of data points in the train set: 1256, number of used features: 12
[LightGBM] [Info] Start training from score -1.283485
[LightGBM] [Info] Start training from score -1.289249
[LightGBM] [Info] Start training from score -0.804185
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000035 seconds

[I 2024-08-13 08:05:59,121] Trial 4 finished with value: 0.4071125265392781 and parameters: {'n_estimators': 83, 'max_depth': 7, 'learning_rate': 0.07806383301394001, 'subsample': 0.667990999831088, 'colsample_bytree': 0.6126144897862742}. Best is trial 0 with value: 0.42091295116772826.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000068 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 309
[LightGBM] [Info] Number of data points in the train set: 1256, number of used features: 12
[LightGBM] [Info] Start training from score -1.286363
[LightGBM] [Info] Start training from score -1.286363
[LightGBM] [Info] Start training from score -0.804185
Best parameters for LightGBM: {'n_estimators': 116, 'max_depth': 4, 'learning_rate': 0.0710773114602441, 'subsample': 0.7161749628883616, 'colsample_bytree': 0.6797728927502955}


In [10]:
pipeline = get_pipeline(**best_params_xgb)
pipeline.fit(X_train, y_train_encoded)

In [20]:
predictions = pipeline.predict(X_test)

In [21]:
from sklearn.metrics import f1_score
f1 = f1_score(y_test_encoded, predictions, average='weighted')
print(f'F1 Score: {f1:.4f}')

F1 Score: 0.3908


In [22]:
# Define a function to train and evaluate a model
def train_and_evaluate(model_type, best_params, X_train, y_train, X_test, y_test):
    pipeline = get_pipeline(model_type=model_type, **best_params)
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    
    f1 = f1_score(y_test, predictions, average='weighted')
    conf_matrix = confusion_matrix(y_test, predictions)
    
    print(f'{model_type} F1 Score: {f1:.4f}')
    print(f'{model_type} Confusion Matrix:')
    print(conf_matrix)
    
    return f1, conf_matrix

In [23]:
# Train and evaluate XGBoost model
xgb_f1, xgb_conf_matrix = train_and_evaluate(
    model_type="xgboost", 
    best_params=best_params_xgb, 
    X_train=X_train, 
    y_train=y_train_encoded, 
    X_test=X_test, 
    y_test=y_test_encoded
)

# Train and evaluate LightGBM model
lgbm_f1, lgbm_conf_matrix = train_and_evaluate(
    model_type="lightgbm", 
    best_params=best_params_lgbm, 
    X_train=X_train, 
    y_train=y_train_encoded, 
    X_test=X_test, 
    y_test=y_test_encoded
)

xgboost F1 Score: 0.3908
xgboost Confusion Matrix:
[[ 35  15  56]
 [ 29  11  66]
 [ 22  30 115]]
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000034 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 353
[LightGBM] [Info] Number of data points in the train set: 1884, number of used features: 12
[LightGBM] [Info] Start training from score -1.285402
[LightGBM] [Info] Start training from score -1.287324
[LightGBM] [Info] Start training from score -0.804185
lightgbm F1 Score: 0.4236
lightgbm Confusion Matrix:
[[ 41  12  53]
 [ 29  12  65]
 [ 23  22 122]]


### Improve the models with feature engineering and hyperparameter tuning

In [24]:
# Include new dummy variables that were left out
data_transformed_new_features = pd.get_dummies(data_transformed, columns=['home', 'away', 'referee', 'venue'])
data_transformed_new_features 


Unnamed: 0,week,date,score,xG,xG_1,home_goals,away_goals,season_start,result,day_Friday,day_Monday,day_Saturday,day_Sunday,day_Thursday,day_Tuesday,day_Wednesday,home_rolling_avg_goals,away_rolling_avg_goals,home_rolling_avg_xG,away_rolling_avg_xG,home_Alavés,home_Almería,home_Athletic Club,home_Atlético Madrid,home_Barcelona,home_Betis,home_Celta Vigo,home_Cádiz,home_Eibar,home_Elche,home_Espanyol,home_Getafe,home_Girona,home_Granada,home_Huesca,home_Las Palmas,home_Leganés,home_Levante,home_Mallorca,home_Osasuna,home_Rayo Vallecano,home_Real Madrid,home_Real Sociedad,home_Sevilla,home_Valencia,home_Valladolid,home_Villarreal,away_Alavés,away_Almería,away_Athletic Club,away_Atlético Madrid,away_Barcelona,away_Betis,away_Celta Vigo,away_Cádiz,away_Eibar,away_Elche,away_Espanyol,away_Getafe,away_Girona,away_Granada,away_Huesca,away_Las Palmas,away_Leganés,away_Levante,away_Mallorca,away_Osasuna,away_Rayo Vallecano,away_Real Madrid,away_Real Sociedad,away_Sevilla,away_Valencia,away_Valladolid,away_Villarreal,referee_Adrián Cordero,referee_Alberto Undiano,referee_Alejandro Hernández,referee_Alejandro Muñíz,referee_Antonio Matéu,referee_Antonio Matéu Lahoz,referee_Carlos del Cerro,referee_César Soto,referee_Daniel Ask,referee_David Medié,referee_Eduardo Prieto,referee_Francisco Hernández,referee_Guillermo Cuadra,referee_Hsu Jason,referee_Ignacio Iglesias,referee_Isidro Díaz de Mera,referee_Javier Alberola,referee_Javier Villanueva,referee_Jesús Gil,referee_Jorge Figueroa,referee_José González,referee_José Luis Munuera,referee_José Sánchez,referee_Juan Martínez,referee_Juan Pulido,referee_Mario Melero,referee_Mateo Busquets,referee_Miguel Ángel Ortiz Arias,referee_Pablo González,referee_Ricardo de Burgos,referee_Santiago Jaime,referee_Valentín Pizarro,referee_Víctor García,venue_Camp Nou,venue_Coliseum Alfonso Pérez,venue_Estadi Mallorca Son Moix,venue_Estadi Municipal de Montilivi,venue_Estadi Olímpic Lluís Companys,venue_Estadio Abanca Balaídos,venue_Estadio Alfredo Di Stéfano,venue_Estadio Benito Villamarín,venue_Estadio Ciudad de Valencia,venue_Estadio Cívitas Metropolitano,venue_Estadio El Alcoraz,venue_Estadio El Sadar,venue_Estadio Manuel Martínez Valero,venue_Estadio Municipal José Zorrilla,venue_Estadio Municipal de Anoeta,venue_Estadio Municipal de Butarque,venue_Estadio Municipal de Ipurúa,venue_Estadio Nuevo Los Cármenes,venue_Estadio Nuevo Mirandilla,venue_Estadio Ramón Sánchez Pizjuán,venue_Estadio Ramón de Carranza,venue_Estadio San Mamés,venue_Estadio Santiago Bernabéu,venue_Estadio Wanda Metropolitano,venue_Estadio de Balaídos,venue_Estadio de Gran Canaria,venue_Estadio de Mendizorroza,venue_Estadio de Mestalla,venue_Estadio de la Cerámica,venue_Estadio del Rayo Vallecano,venue_Iberostar Estadi,venue_Power Horse Stadium,venue_RCDE Stadium,venue_Reale Arena,venue_San Mamés
0,2,2018-08-24,2–0,1.4,0.6,2.0,0.0,2018,Home win,True,False,False,False,False,False,False,0.0,1.0,0.20,1.30,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,2,2018-08-24,2–2,1.7,1.6,2.0,2.0,2018,Draw,True,False,False,False,False,False,False,1.0,2.0,1.00,0.70,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,2,2018-08-25,0–0,0.6,0.9,0.0,0.0,2018,Draw,False,False,True,False,False,False,False,0.0,0.0,0.30,0.90,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False
3,2,2018-08-25,1–0,0.9,1.5,1.0,0.0,2018,Home win,False,False,True,False,False,False,False,1.0,1.0,0.90,2.10,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False
4,2,2018-08-25,0–1,0.4,1.0,0.0,1.0,2018,Away win,False,False,True,False,False,False,False,0.0,3.0,0.00,3.20,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2258,38,2024-05-25,0–2,0.6,2.2,0.0,2.0,2023,Away win,False,False,True,False,False,False,False,1.0,1.8,0.72,1.16,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
2259,38,2024-05-26,1–1,1.0,2.5,1.0,1.0,2023,Draw,False,False,False,True,False,False,False,0.4,1.4,1.08,1.46,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False
2260,38,2024-05-26,2–2,1.5,2.0,2.0,2.0,2023,Draw,False,False,False,True,False,False,False,1.4,0.6,1.38,1.66,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2261,38,2024-05-26,1–2,0.9,1.4,1.0,2.0,2023,Away win,False,False,False,True,False,False,False,0.6,1.0,1.52,1.08,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [25]:
# Train-test split
train_data = data_transformed_new_features[data_transformed_new_features['season_start'] <= 2022]
test_data = data_transformed_new_features[data_transformed_new_features['season_start'] == 2023]

features = [column for column in data_transformed_new_features.columns if column not in ['date', 'xG', 'xG_1', 'score', 'result', 'home_goals', 'away_goals', 'season_start']]

X_train = train_data[features]
y_train = train_data['result']
X_test = test_data[features]
y_test = test_data['result']

In [26]:
# Encode the labels to use it with Optuna
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [29]:
def objective_tuned(trial, model_type="xgboost"):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0)
    }
    
    if model_type == "xgboost":
        model = xgb.XGBClassifier(**params, use_label_encoder=False, eval_metric='mlogloss')
    elif model_type == "lightgbm":
        model = lgb.LGBMClassifier(**params)
    
    skf = StratifiedKFold(n_splits=3)
    scores = []
    
    for train_idx, val_idx in skf.split(X_train, y_train_encoded):
        X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_train_fold, y_val_fold = y_train_encoded[train_idx], y_train_encoded[val_idx]
        
        model.fit(X_train_fold, y_train_fold)
        preds = model.predict(X_val_fold)
        score = accuracy_score(y_val_fold, preds)
        scores.append(score)
    
    return np.mean(scores)


In [30]:
# Optimize for XGBoost
study_xgb = optuna.create_study(direction="maximize")
study_xgb.optimize(lambda trial: objective_tuned(trial, model_type="xgboost"), n_trials=5)
best_params_xgb = study_xgb.best_trial.params
print(f'Best parameters for XGBoost: {best_params_xgb}')

# Optimize for LightGBM
study_lgbm = optuna.create_study(direction="maximize")
study_lgbm.optimize(lambda trial: objective_tuned(trial, model_type="lightgbm"), n_trials=5)
best_params_lgbm = study_lgbm.best_trial.params
print(f'Best parameters for LightGBM: {best_params_lgbm}')

[I 2024-08-13 08:19:29,794] A new study created in memory with name: no-name-23bd5cba-d8b5-4abe-9713-ae61c1cb0f04
[I 2024-08-13 08:19:33,547] Trial 0 finished with value: 0.47929936305732485 and parameters: {'n_estimators': 174, 'max_depth': 7, 'learning_rate': 0.041690379656141204, 'subsample': 0.6290019169137278, 'colsample_bytree': 0.8185903898081278}. Best is trial 0 with value: 0.47929936305732485.
[I 2024-08-13 08:19:35,766] Trial 1 finished with value: 0.4479830148619957 and parameters: {'n_estimators': 107, 'max_depth': 7, 'learning_rate': 0.23114461759288082, 'subsample': 0.6585163280720169, 'colsample_bytree': 0.9788502793499718}. Best is trial 0 with value: 0.47929936305732485.
[I 2024-08-13 08:19:38,404] Trial 2 finished with value: 0.4792993630573248 and parameters: {'n_estimators': 83, 'max_depth': 10, 'learning_rate': 0.017657929760233296, 'subsample': 0.9070716938147114, 'colsample_bytree': 0.87507026692653}. Best is trial 0 with value: 0.47929936305732485.
[I 2024-08-1

Best parameters for XGBoost: {'n_estimators': 174, 'max_depth': 7, 'learning_rate': 0.041690379656141204, 'subsample': 0.6290019169137278, 'colsample_bytree': 0.8185903898081278}
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000041 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 496
[LightGBM] [Info] Number of data points in the train set: 1256, number of used features: 103
[LightGBM] [Info] Start training from score -1.283485
[LightGBM] [Info] Start training from score -1.289249
[LightGBM] [Info] Start training from score -0.804185
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000135 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 520
[LightGBM] [Info] Number of data points in the train set: 1256, number of used features: 111
[LightGBM] [Info] S

[I 2024-08-13 08:19:44,770] Trial 0 finished with value: 0.43842887473460723 and parameters: {'n_estimators': 144, 'max_depth': 7, 'learning_rate': 0.2189774471854501, 'subsample': 0.7841555568425067, 'colsample_bytree': 0.5814953233724056}. Best is trial 0 with value: 0.43842887473460723.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000106 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 496
[LightGBM] [Info] Number of data points in the train set: 1256, number of used features: 103
[LightGBM] [Info] Start training from score -1.283485
[LightGBM] [Info] Start training from score -1.289249
[LightGBM] [Info] Start training from score -0.804185
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001660 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 520
[LightGBM] [Info] Number of data points in the train set: 1256, number of used features: 111
[LightGBM] [Info] Start training from score -1.286363
[LightGBM] [Info] Start training from score -1.286363
[LightGBM] [Info] Start training from score -0.804185
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000123 secon

[I 2024-08-13 08:19:45,110] Trial 1 finished with value: 0.482484076433121 and parameters: {'n_estimators': 76, 'max_depth': 4, 'learning_rate': 0.09603487529633754, 'subsample': 0.8198099483773269, 'colsample_bytree': 0.6668403410829029}. Best is trial 1 with value: 0.482484076433121.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002732 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 496
[LightGBM] [Info] Number of data points in the train set: 1256, number of used features: 103
[LightGBM] [Info] Start training from score -1.283485
[LightGBM] [Info] Start training from score -1.289249
[LightGBM] [Info] Start training from score -0.804185
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000123 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 520
[LightGBM] [Info] Number of data points in the train set: 1256, number of used features: 111
[LightGBM] [Info] Start training from score -1.286363
[LightGBM] [Info] Start training from score -1.286363
[LightGBM] [Info] Start training from score -0.804185


[I 2024-08-13 08:19:45,501] Trial 2 finished with value: 0.4527600849256901 and parameters: {'n_estimators': 97, 'max_depth': 4, 'learning_rate': 0.1874287469270809, 'subsample': 0.6673232619014557, 'colsample_bytree': 0.7425606490717529}. Best is trial 1 with value: 0.482484076433121.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000052 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 499
[LightGBM] [Info] Number of data points in the train set: 1256, number of used features: 107
[LightGBM] [Info] Start training from score -1.286363
[LightGBM] [Info] Start training from score -1.286363
[LightGBM] [Info] Start training from score -0.804185
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000058 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 496
[LightGBM] [Info] Number of data points in the train set: 1256, number of used features: 103
[LightGBM] [Info] Start training from score -1.283485
[LightGBM] [Info] Start training from score -1.289249
[LightGBM] [Info] Start tr

[I 2024-08-13 08:19:45,959] Trial 3 finished with value: 0.47399150743099794 and parameters: {'n_estimators': 70, 'max_depth': 9, 'learning_rate': 0.07005093568168494, 'subsample': 0.686229080439396, 'colsample_bytree': 0.5707753818662827}. Best is trial 1 with value: 0.482484076433121.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000923 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 499
[LightGBM] [Info] Number of data points in the train set: 1256, number of used features: 107
[LightGBM] [Info] Start training from score -1.286363
[LightGBM] [Info] Start training from score -1.286363
[LightGBM] [Info] Start training from score -0.804185
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001366 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 496
[LightGBM] [Info] Number of data points in the train set: 1256, number of used features: 103
[LightGBM] [Info] Start training from score -1.283485
[LightGBM] [Info] Start training from score -1.289249
[LightGBM] [Info] Start training from score -0.804185
[LightGBM] [Info] Auto-choosing col-

[I 2024-08-13 08:19:46,267] Trial 4 finished with value: 0.48779193205944793 and parameters: {'n_estimators': 107, 'max_depth': 3, 'learning_rate': 0.07220524680758891, 'subsample': 0.5693190585825629, 'colsample_bytree': 0.5538370214909929}. Best is trial 4 with value: 0.48779193205944793.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000043 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 499
[LightGBM] [Info] Number of data points in the train set: 1256, number of used features: 107
[LightGBM] [Info] Start training from score -1.286363
[LightGBM] [Info] Start training from score -1.286363
[LightGBM] [Info] Start training from score -0.804185
Best parameters for LightGBM: {'n_estimators': 107, 'max_depth': 3, 'learning_rate': 0.07220524680758891, 'subsample': 0.5693190585825629, 'colsample_bytree': 0.5538370214909929}


In [32]:
# Train and evaluate XGBoost model with best parameters
pipeline_xgb = xgb.XGBClassifier(**best_params_xgb)
pipeline_xgb.fit(X_train, y_train_encoded)
predictions_xgb = pipeline_xgb.predict(X_test)
accuracy_xgb = accuracy_score(y_test_encoded, predictions_xgb)
conf_matrix_xgb = confusion_matrix(y_test_encoded, predictions_xgb)
print(f'XGBoost Accuracy: {accuracy_xgb:.4f}')
print('XGBoost Confusion Matrix:')
print(conf_matrix_xgb)

XGBoost Accuracy: 0.4855
XGBoost Confusion Matrix:
[[ 37  22  47]
 [ 23  19  64]
 [ 21  18 128]]


In [59]:
predictions_xgb 

array([2, 1, 0, 1, 1, 2, 0, 2, 2, 1, 1, 0, 1, 0, 1, 2, 0, 2, 1, 0, 0, 2,
       2, 1, 0, 1, 2, 2, 0, 1, 2, 2, 2, 0, 2, 1, 0, 0, 2, 0, 0, 0, 2, 2,
       0, 2, 2, 1, 1, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 1, 2, 1, 2, 1, 2,
       2, 0, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 0, 1, 1, 0,
       1, 0, 0, 2, 2, 2, 2, 2, 2, 1, 2, 2, 0, 2, 1, 2, 0, 2, 2, 0, 0, 2,
       2, 0, 0, 1, 2, 2, 1, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 1, 0, 2, 2, 2,
       0, 2, 2, 1, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 0, 1, 2, 2,
       1, 2, 1, 1, 2, 0, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2,
       0, 2, 2, 2, 2, 1, 2, 2, 0, 1, 0, 2, 2, 2, 0, 2, 2, 2, 0, 1, 2, 2,
       2, 1, 2, 0, 2, 0, 0, 0, 2, 2, 0, 2, 2, 2, 2, 2, 0, 0, 2, 2, 0, 2,
       2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 0, 2, 2, 1, 2, 2,
       0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 1, 2, 2, 0, 2, 2, 2, 2,
       0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 1, 2, 0, 2, 2, 0, 2, 2, 2,
       2, 0, 0, 2, 2, 2, 0, 2, 2, 0, 2, 1, 2, 2, 0,

In [60]:
# Turn preds into original format
predictions_xgb_decoded = label_encoder.inverse_transform(predictions_xgb)

In [61]:
predictions_xgb_decoded 

array(['Home win', 'Draw', 'Away win', 'Draw', 'Draw', 'Home win',
       'Away win', 'Home win', 'Home win', 'Draw', 'Draw', 'Away win',
       'Draw', 'Away win', 'Draw', 'Home win', 'Away win', 'Home win',
       'Draw', 'Away win', 'Away win', 'Home win', 'Home win', 'Draw',
       'Away win', 'Draw', 'Home win', 'Home win', 'Away win', 'Draw',
       'Home win', 'Home win', 'Home win', 'Away win', 'Home win', 'Draw',
       'Away win', 'Away win', 'Home win', 'Away win', 'Away win',
       'Away win', 'Home win', 'Home win', 'Away win', 'Home win',
       'Home win', 'Draw', 'Draw', 'Home win', 'Away win', 'Home win',
       'Home win', 'Home win', 'Home win', 'Home win', 'Home win',
       'Home win', 'Home win', 'Away win', 'Draw', 'Home win', 'Draw',
       'Home win', 'Draw', 'Home win', 'Home win', 'Away win', 'Home win',
       'Home win', 'Draw', 'Home win', 'Home win', 'Home win', 'Home win',
       'Home win', 'Home win', 'Home win', 'Draw', 'Home win', 'Draw',
       'Ho

In [62]:
# Add predictions to test data
test_data['predictions'] = predictions_xgb_decoded

In [64]:
test_data[['result', 'predictions']]

Unnamed: 0,result,predictions
1884,Away win,Home win
1885,Away win,Draw
1886,Away win,Away win
1887,Draw,Draw
1888,Away win,Draw
...,...,...
2258,Away win,Draw
2259,Draw,Home win
2260,Draw,Away win
2261,Away win,Home win


In [68]:
from sklearn.metrics import classification_report

# Generar un reporte de clasificación
report = classification_report(y_test_encoded, predictions_xgb, target_names=label_encoder.classes_)
print(report)

              precision    recall  f1-score   support

    Away win       0.46      0.35      0.40       106
        Draw       0.32      0.18      0.23       106
    Home win       0.54      0.77      0.63       167

    accuracy                           0.49       379
   macro avg       0.44      0.43      0.42       379
weighted avg       0.45      0.49      0.45       379



In [65]:
import plotly.figure_factory as ff

def plot_confusion_matrix(conf_matrix, model_name):
    # Normalize the confusion matrix
    conf_matrix_normalized = conf_matrix.astype('float') / conf_matrix.sum(axis=1)[:, np.newaxis]
    
    # Define the plotly heatmap
    z = conf_matrix_normalized
    x = ['Predicted: Home win', 'Predicted: Draw', 'Predicted: Away win']
    y = ['Actual: Home win', 'Actual: Draw', 'Actual: Away win']

    fig = ff.create_annotated_heatmap(z, x=x, y=y, colorscale='Blues', showscale=True)

    fig.update_layout(
        title=f'Confusion Matrix for {model_name}',
        xaxis_title="Predicted",
        yaxis_title="Actual"
    )

    fig.show();

In [66]:
plot_confusion_matrix(conf_matrix_xgb, "XGBoost")

#### Conclusions drawn from the result of the model

- Correctly predicts most away wins but struggles more with draws and home wins.

- Bias: towards predicting away wins.

- Confusion: the model confuses draws with away wins and also shows some confusion between home wins and draws.

- Further improvements: more and better features with tuning of hyperparameters to improve its accuracy.

#### Test with a new hypothetical match

In [36]:
data_transformed['referee'].unique()

array(['David Medié', 'José Luis Munuera', 'Pablo González',
       'José González', 'Ricardo de Burgos', 'Juan Martínez',
       'Alberto Undiano', 'Carlos del Cerro', 'Alejandro Hernández',
       'Hsu Jason', 'Adrián Cordero', 'Eduardo Prieto', 'José Sánchez',
       'Antonio Matéu Lahoz', 'Santiago Jaime', 'Ignacio Iglesias',
       'Jesús Gil', 'Daniel Ask', 'Mario Melero', 'Guillermo Cuadra',
       'César Soto', 'Valentín Pizarro', 'Jorge Figueroa',
       'Isidro Díaz de Mera', 'Miguel Ángel Ortiz Arias',
       'Alejandro Muñíz', 'Javier Villanueva', 'Juan Pulido',
       'Antonio Matéu', 'Javier Alberola', 'Francisco Hernández',
       'Víctor García', 'Mateo Busquets'], dtype=object)

In [43]:
data_transformed['venue'].unique()

array(['Coliseum Alfonso Pérez', 'Estadio Municipal de Butarque',
       'Estadio de Mendizorroza', 'Estadio Wanda Metropolitano',
       'Estadio Municipal José Zorrilla', 'Estadi Municipal de Montilivi',
       'Estadio Ramón Sánchez Pizjuán', 'RCDE Stadium',
       'Estadio Ciudad de Valencia', 'San Mamés',
       'Estadio de la Cerámica', 'Estadio Municipal de Ipurúa',
       'Estadio de Balaídos', 'Estadio Santiago Bernabéu',
       'Estadio Benito Villamarín', 'Camp Nou', 'Estadio El Alcoraz',
       'Estadio de Mestalla', 'Estadio Municipal de Anoeta',
       'Estadio San Mamés', 'Estadio del Rayo Vallecano',
       'Estadio Nuevo Los Cármenes', 'Estadio El Sadar',
       'Iberostar Estadi', 'Estadio Alfredo Di Stéfano',
       'Estadio Ramón de Carranza', 'Estadio Manuel Martínez Valero',
       'Estadio Nuevo Mirandilla', 'Estadio Cívitas Metropolitano',
       'Reale Arena', 'Power Horse Stadium', 'Estadio Abanca Balaídos',
       'Estadi Mallorca Son Moix', 'Estadi Olímpic L

In [44]:
home_team = 'Real Madrid'
away_team = 'Getafe'
venue = 'Estadio Santiago Bernabéu'
referee = 'Jesús Gil'

In [53]:
# Create a row of data for the hypothetical match
data = {
    'Wk': [25],
    'home_rolling_avg_goals': [1.1],
    'away_rolling_avg_goals': [1.9],
    'home_rolling_avg_xG': [1.3],
    'away_rolling_avg_xG': [1.9],
    'Day_Saturday': [1],
    'Home_Madrid': [1],
    'Away_Gegate': [1],
    'Referee_Gil': [1],
    'Venue_Coliseum': [1]
}

match = pd.DataFrame(columns=X_train.columns, data=data)

match.fillna(0, inplace=True)

In [54]:
new_prediction = pipeline_xgb.predict(match)
new_prediction

array([0])

In [55]:
# Print them
print(f"Encoded classes: {label_encoder.classes_}")

# Two
print(f"Corresponding class por prediction: {label_encoder.inverse_transform([2])}")

Encoded classes: ['Away win' 'Draw' 'Home win']
Corresponding class por prediction: ['Home win']


#### Real results vs. predicted results

In [48]:
data_transformed[data_transformed['home'] == 'Real Madrid'] 

Unnamed: 0,week,date,home,score,away,xG,xG_1,venue,referee,home_goals,away_goals,season_start,result,day_Friday,day_Monday,day_Saturday,day_Sunday,day_Thursday,day_Tuesday,day_Wednesday,home_rolling_avg_goals,away_rolling_avg_goals,home_rolling_avg_xG,away_rolling_avg_xG
14,3,2018-09-01,Real Madrid,4–1,Leganés,2.5,1.1,Estadio Santiago Bernabéu,Santiago Jaime,4.0,1.0,2018,Home win,False,False,True,False,False,False,False,3.00,1.50,2.45,1.350
34,5,2018-09-22,Real Madrid,1–0,Espanyol,1.0,0.4,Estadio Santiago Bernabéu,Antonio Matéu Lahoz,1.0,0.0,2018,Home win,False,False,True,False,False,False,False,2.75,1.25,2.30,1.175
50,7,2018-09-29,Real Madrid,0–0,Atlético Madrid,1.2,0.8,Estadio Santiago Bernabéu,Juan Martínez,0.0,0.0,2018,Draw,False,False,True,False,False,False,False,2.00,1.40,2.14,1.120
70,9,2018-10-20,Real Madrid,1–2,Levante,2.9,2.0,Estadio Santiago Bernabéu,Guillermo Cuadra,1.0,2.0,2018,Away win,False,False,True,False,False,False,False,0.40,1.20,1.26,1.300
91,11,2018-11-03,Real Madrid,2–0,Valladolid,2.2,0.9,Estadio Santiago Bernabéu,Jesús Gil,2.0,0.0,2018,Home win,False,False,True,False,False,False,False,0.40,1.20,1.70,0.820
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2180,30,2024-03-31,Real Madrid,2–0,Athletic Club,1.1,0.3,Estadio Santiago Bernabéu,Javier Alberola,2.0,0.0,2023,Home win,False,False,False,True,False,False,False,2.40,1.60,2.12,1.040
2201,32,2024-04-21,Real Madrid,3–2,Barcelona,2.3,1.4,Estadio Santiago Bernabéu,César Soto,3.0,2.0,2023,Home win,False,False,False,True,False,False,False,2.60,1.20,2.22,1.360
2215,34,2024-05-04,Real Madrid,3–0,Cádiz,2.3,0.9,Estadio Santiago Bernabéu,Javier Villanueva,3.0,0.0,2023,Home win,False,False,True,False,False,False,False,2.20,0.60,1.64,1.080
2234,36,2024-05-14,Real Madrid,5–0,Alavés,2.5,1.3,Estadio Santiago Bernabéu,Mateo Busquets,5.0,0.0,2023,Home win,False,False,False,False,False,True,False,2.40,1.60,1.60,1.100


In [49]:
test_data[(test_data['home_Real Madrid'] == True) & (test_data['venue_Estadio Santiago Bernabéu'] == True)]

Unnamed: 0,week,date,score,xG,xG_1,home_goals,away_goals,season_start,result,day_Friday,day_Monday,day_Saturday,day_Sunday,day_Thursday,day_Tuesday,day_Wednesday,home_rolling_avg_goals,away_rolling_avg_goals,home_rolling_avg_xG,away_rolling_avg_xG,home_Alavés,home_Almería,home_Athletic Club,home_Atlético Madrid,home_Barcelona,home_Betis,home_Celta Vigo,home_Cádiz,home_Eibar,home_Elche,home_Espanyol,home_Getafe,home_Girona,home_Granada,home_Huesca,home_Las Palmas,home_Leganés,home_Levante,home_Mallorca,home_Osasuna,home_Rayo Vallecano,home_Real Madrid,home_Real Sociedad,home_Sevilla,home_Valencia,home_Valladolid,home_Villarreal,away_Alavés,away_Almería,away_Athletic Club,away_Atlético Madrid,away_Barcelona,away_Betis,away_Celta Vigo,away_Cádiz,away_Eibar,away_Elche,away_Espanyol,away_Getafe,away_Girona,away_Granada,away_Huesca,away_Las Palmas,away_Leganés,away_Levante,away_Mallorca,away_Osasuna,away_Rayo Vallecano,away_Real Madrid,away_Real Sociedad,away_Sevilla,away_Valencia,away_Valladolid,away_Villarreal,referee_Adrián Cordero,referee_Alberto Undiano,referee_Alejandro Hernández,referee_Alejandro Muñíz,referee_Antonio Matéu,referee_Antonio Matéu Lahoz,referee_Carlos del Cerro,referee_César Soto,referee_Daniel Ask,referee_David Medié,referee_Eduardo Prieto,referee_Francisco Hernández,referee_Guillermo Cuadra,referee_Hsu Jason,referee_Ignacio Iglesias,referee_Isidro Díaz de Mera,referee_Javier Alberola,referee_Javier Villanueva,referee_Jesús Gil,referee_Jorge Figueroa,referee_José González,referee_José Luis Munuera,referee_José Sánchez,referee_Juan Martínez,referee_Juan Pulido,referee_Mario Melero,referee_Mateo Busquets,referee_Miguel Ángel Ortiz Arias,referee_Pablo González,referee_Ricardo de Burgos,referee_Santiago Jaime,referee_Valentín Pizarro,referee_Víctor García,venue_Camp Nou,venue_Coliseum Alfonso Pérez,venue_Estadi Mallorca Son Moix,venue_Estadi Municipal de Montilivi,venue_Estadi Olímpic Lluís Companys,venue_Estadio Abanca Balaídos,venue_Estadio Alfredo Di Stéfano,venue_Estadio Benito Villamarín,venue_Estadio Ciudad de Valencia,venue_Estadio Cívitas Metropolitano,venue_Estadio El Alcoraz,venue_Estadio El Sadar,venue_Estadio Manuel Martínez Valero,venue_Estadio Municipal José Zorrilla,venue_Estadio Municipal de Anoeta,venue_Estadio Municipal de Butarque,venue_Estadio Municipal de Ipurúa,venue_Estadio Nuevo Los Cármenes,venue_Estadio Nuevo Mirandilla,venue_Estadio Ramón Sánchez Pizjuán,venue_Estadio Ramón de Carranza,venue_Estadio San Mamés,venue_Estadio Santiago Bernabéu,venue_Estadio Wanda Metropolitano,venue_Estadio de Balaídos,venue_Estadio de Gran Canaria,venue_Estadio de Mendizorroza,venue_Estadio de Mestalla,venue_Estadio de la Cerámica,venue_Estadio del Rayo Vallecano,venue_Iberostar Estadi,venue_Power Horse Stadium,venue_RCDE Stadium,venue_Reale Arena,venue_San Mamés
1916,4,2023-09-02,2–1,2.8,0.4,2.0,1.0,2023,Home win,False,False,True,False,False,False,False,1.8,0.6,1.38,1.3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False
1930,5,2023-09-17,2–1,2.0,1.6,2.0,1.0,2023,Home win,False,False,False,True,False,False,False,1.8,1.8,1.78,1.36,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False
1945,7,2023-09-27,2–0,1.7,0.7,2.0,0.0,2023,Home win,False,False,False,False,False,False,True,1.8,0.2,1.84,0.82,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False
1965,9,2023-10-07,4–0,3.2,0.5,4.0,0.0,2023,Home win,False,False,True,False,False,False,False,2.0,1.0,1.96,1.08,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False
1998,12,2023-11-05,0–0,2.2,0.1,0.0,0.0,2023,Draw,False,False,False,True,False,False,False,2.4,1.4,1.98,1.2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False
2003,13,2023-11-11,5–1,1.7,1.7,5.0,1.0,2023,Home win,False,False,True,False,False,False,False,2.0,1.2,2.08,1.16,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False
2025,15,2023-12-02,2–0,1.8,0.1,2.0,0.0,2023,Home win,False,False,True,False,False,False,False,2.2,0.8,1.64,1.2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False
2050,17,2023-12-17,4–1,2.3,0.8,4.0,1.0,2023,Home win,False,False,False,True,False,False,False,2.2,1.4,1.66,1.74,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False
2067,19,2024-01-03,1–0,1.7,0.6,1.0,0.0,2023,Home win,False,False,False,False,False,False,True,2.2,1.0,1.46,1.08,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False
2086,21,2024-01-21,3–2,2.2,0.9,3.0,2.0,2023,Home win,False,False,False,True,False,False,False,1.8,0.6,1.48,1.02,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False
