## Notebook 03 - Feature Engineering

In [6]:
# Load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from category_encoders import TargetEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MaxAbsScaler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

import sys
sys.path.append('../src')
from paths import TRANSFORMED_DATA_DIR

# Autocomplete
%config IPCompleter.greedy=True

In [7]:
# Read the cleaned dataset from the transformed-data directory
cleaned_matches_path = TRANSFORMED_DATA_DIR / 'data_quality.csv'
data_quality = pd.read_csv(cleaned_matches_path)

In [8]:
# Display the loaded dataset to inspect its columns and rows
data_quality

Unnamed: 0,week,date,day,home,score,away,xG,xG_1,venue,referee
0,1,2018-08-17,Fri,Girona,0–0,Valladolid,0.7,0.0,Estadi Municipal de Montilivi,Guillermo Cuadra
1,1,2018-08-17,Fri,Betis,0–3,Levante,0.9,2.5,Estadio Benito Villamarín,Ignacio Iglesias
2,1,2018-08-18,Sat,Celta Vigo,1–1,Espanyol,0.6,0.6,Estadio de Balaídos,Santiago Jaime
3,1,2018-08-18,Sat,Villarreal,1–2,Real Sociedad,1.6,0.7,Estadio de la Cerámica,Mario Melero
4,1,2018-08-18,Sat,Barcelona,3–0,Alavés,3.2,0.3,Camp Nou,José Sánchez
...,...,...,...,...,...,...,...,...,...,...
2275,38,2024-05-25,Sat,Real Madrid,0–0,Betis,1.0,0.7,Estadio Santiago Bernabéu,Isidro Díaz de Mera
2276,38,2024-05-26,Sun,Getafe,1–2,Mallorca,0.9,1.4,Coliseum Alfonso Pérez,Víctor García
2277,38,2024-05-26,Sun,Las Palmas,1–1,Alavés,1.0,2.5,Estadio de Gran Canaria,Francisco Hernández
2278,38,2024-05-26,Sun,Celta Vigo,2–2,Valencia,1.5,2.0,Estadio Abanca Balaídos,Miguel Ángel Ortiz Arias


### Create features and the target variable

The objective of the training is to predict match results, either Home wins or Away wins, so in this section we'll be creating the features and target variables needed to build and train the model prediction dataset.

An important note: I decided to train the model on the seasons before 2023/2024 and validate it on that last season. My initial idea was to predict the matches of the upcoming La Liga 2024/2025 season, but at the time of this project there is no official match calendar yet and the Segunda División playoffs have not finished, so we don't know which teams will be promoted.

### Feature engineering and data transformation

In [10]:
# Break the 'score' string (e.g. '0–3') into two numeric columns:
# goals for the home side and goals for the away side.
goals_by_side = data_quality['score'].str.split('–', expand=True)
data_quality[['home_goals', 'away_goals']] = goals_by_side.astype(float)

In [11]:
# Check the new home_goals / away_goals columns
data_quality

Unnamed: 0,week,date,day,home,score,away,xG,xG_1,venue,referee,home_goals,away_goals
0,1,2018-08-17,Fri,Girona,0–0,Valladolid,0.7,0.0,Estadi Municipal de Montilivi,Guillermo Cuadra,0.0,0.0
1,1,2018-08-17,Fri,Betis,0–3,Levante,0.9,2.5,Estadio Benito Villamarín,Ignacio Iglesias,0.0,3.0
2,1,2018-08-18,Sat,Celta Vigo,1–1,Espanyol,0.6,0.6,Estadio de Balaídos,Santiago Jaime,1.0,1.0
3,1,2018-08-18,Sat,Villarreal,1–2,Real Sociedad,1.6,0.7,Estadio de la Cerámica,Mario Melero,1.0,2.0
4,1,2018-08-18,Sat,Barcelona,3–0,Alavés,3.2,0.3,Camp Nou,José Sánchez,3.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2275,38,2024-05-25,Sat,Real Madrid,0–0,Betis,1.0,0.7,Estadio Santiago Bernabéu,Isidro Díaz de Mera,0.0,0.0
2276,38,2024-05-26,Sun,Getafe,1–2,Mallorca,0.9,1.4,Coliseum Alfonso Pérez,Víctor García,1.0,2.0
2277,38,2024-05-26,Sun,Las Palmas,1–1,Alavés,1.0,2.5,Estadio de Gran Canaria,Francisco Hernández,1.0,1.0
2278,38,2024-05-26,Sun,Celta Vigo,2–2,Valencia,1.5,2.0,Estadio Abanca Balaídos,Miguel Ángel Ortiz Arias,2.0,2.0


In [13]:
# Create a season_start column (the calendar year in which the match's season began)
# so we can later train on the earlier seasons and validate on the last one.
data_quality['date'] = pd.to_datetime(data_quality['date'])

# Matches played before August belong to the season that started the previous year,
# so subtract 1 from the year for those rows (vectorized instead of a row-wise apply).
played_before_august = data_quality['date'].dt.month < 8
data_quality['season_start'] = data_quality['date'].dt.year - played_before_august.astype(int)

In [14]:
# Check the new season_start column
data_quality

Unnamed: 0,week,date,day,home,score,away,xG,xG_1,venue,referee,home_goals,away_goals,season_start
0,1,2018-08-17,Fri,Girona,0–0,Valladolid,0.7,0.0,Estadi Municipal de Montilivi,Guillermo Cuadra,0.0,0.0,2018
1,1,2018-08-17,Fri,Betis,0–3,Levante,0.9,2.5,Estadio Benito Villamarín,Ignacio Iglesias,0.0,3.0,2018
2,1,2018-08-18,Sat,Celta Vigo,1–1,Espanyol,0.6,0.6,Estadio de Balaídos,Santiago Jaime,1.0,1.0,2018
3,1,2018-08-18,Sat,Villarreal,1–2,Real Sociedad,1.6,0.7,Estadio de la Cerámica,Mario Melero,1.0,2.0,2018
4,1,2018-08-18,Sat,Barcelona,3–0,Alavés,3.2,0.3,Camp Nou,José Sánchez,3.0,0.0,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2275,38,2024-05-25,Sat,Real Madrid,0–0,Betis,1.0,0.7,Estadio Santiago Bernabéu,Isidro Díaz de Mera,0.0,0.0,2023
2276,38,2024-05-26,Sun,Getafe,1–2,Mallorca,0.9,1.4,Coliseum Alfonso Pérez,Víctor García,1.0,2.0,2023
2277,38,2024-05-26,Sun,Las Palmas,1–1,Alavés,1.0,2.5,Estadio de Gran Canaria,Francisco Hernández,1.0,1.0,2023
2278,38,2024-05-26,Sun,Celta Vigo,2–2,Valencia,1.5,2.0,Estadio Abanca Balaídos,Miguel Ángel Ortiz Arias,2.0,2.0,2023


In [15]:
# Create a target variable for the match result
def determine_result(row):
    """Return the outcome label of a single match.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'home_goals' and 'away_goals'.

    Returns
    -------
    str or None
        'Home win', 'Away win' or 'Draw'. None when either goal count is
        missing: comparisons against NaN are always False, so without this
        guard a match with no score would be silently labelled 'Draw'.
    """
    home_goals = row['home_goals']
    away_goals = row['away_goals']

    # A missing score must not fall through to the 'Draw' branch
    if pd.isna(home_goals) or pd.isna(away_goals):
        return None

    if home_goals > away_goals:
        return 'Home win'
    if home_goals < away_goals:
        return 'Away win'
    return 'Draw'

# Run determine_result on every row (axis=1) and store the labels as the target column
result_labels = data_quality.apply(determine_result, axis=1)
data_quality['result'] = result_labels

In [16]:
# Check the new result (target) column
data_quality

Unnamed: 0,week,date,day,home,score,away,xG,xG_1,venue,referee,home_goals,away_goals,season_start,result
0,1,2018-08-17,Fri,Girona,0–0,Valladolid,0.7,0.0,Estadi Municipal de Montilivi,Guillermo Cuadra,0.0,0.0,2018,Draw
1,1,2018-08-17,Fri,Betis,0–3,Levante,0.9,2.5,Estadio Benito Villamarín,Ignacio Iglesias,0.0,3.0,2018,Away win
2,1,2018-08-18,Sat,Celta Vigo,1–1,Espanyol,0.6,0.6,Estadio de Balaídos,Santiago Jaime,1.0,1.0,2018,Draw
3,1,2018-08-18,Sat,Villarreal,1–2,Real Sociedad,1.6,0.7,Estadio de la Cerámica,Mario Melero,1.0,2.0,2018,Away win
4,1,2018-08-18,Sat,Barcelona,3–0,Alavés,3.2,0.3,Camp Nou,José Sánchez,3.0,0.0,2018,Home win
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2275,38,2024-05-25,Sat,Real Madrid,0–0,Betis,1.0,0.7,Estadio Santiago Bernabéu,Isidro Díaz de Mera,0.0,0.0,2023,Draw
2276,38,2024-05-26,Sun,Getafe,1–2,Mallorca,0.9,1.4,Coliseum Alfonso Pérez,Víctor García,1.0,2.0,2023,Away win
2277,38,2024-05-26,Sun,Las Palmas,1–1,Alavés,1.0,2.5,Estadio de Gran Canaria,Francisco Hernández,1.0,1.0,2023,Draw
2278,38,2024-05-26,Sun,Celta Vigo,2–2,Valencia,1.5,2.0,Estadio Abanca Balaídos,Miguel Ángel Ortiz Arias,2.0,2.0,2023,Draw


#### One Hot Encoding

In [17]:
# One-hot encode the weekday of each match.
# 'day' is first rebuilt from 'date' with full weekday names; pd.get_dummies then
# replaces it with one day_<Weekday> indicator column per weekday.
weekday_names = data_quality['date'].dt.day_name()
data_quality = pd.get_dummies(data_quality.assign(day=weekday_names), columns=['day'])

# Preview the dataset with the new indicator columns
data_quality.head()

Unnamed: 0,week,date,home,score,away,xG,xG_1,venue,referee,home_goals,away_goals,season_start,result,day_Friday,day_Monday,day_Saturday,day_Sunday,day_Thursday,day_Tuesday,day_Wednesday
0,1,2018-08-17,Girona,0–0,Valladolid,0.7,0.0,Estadi Municipal de Montilivi,Guillermo Cuadra,0.0,0.0,2018,Draw,True,False,False,False,False,False,False
1,1,2018-08-17,Betis,0–3,Levante,0.9,2.5,Estadio Benito Villamarín,Ignacio Iglesias,0.0,3.0,2018,Away win,True,False,False,False,False,False,False
2,1,2018-08-18,Celta Vigo,1–1,Espanyol,0.6,0.6,Estadio de Balaídos,Santiago Jaime,1.0,1.0,2018,Draw,False,False,True,False,False,False,False
3,1,2018-08-18,Villarreal,1–2,Real Sociedad,1.6,0.7,Estadio de la Cerámica,Mario Melero,1.0,2.0,2018,Away win,False,False,True,False,False,False,False
4,1,2018-08-18,Barcelona,3–0,Alavés,3.2,0.3,Camp Nou,José Sánchez,3.0,0.0,2018,Home win,False,False,True,False,False,False,False


### Creating new features

In [19]:
# Prepare for the new features: give the rows a fresh positional index,
# then order the matches chronologically (the index labels travel with the rows).
data_quality = data_quality.reset_index(drop=True).sort_values(['date'])

In [20]:
# Check the dataset now that it is ordered by date
data_quality

Unnamed: 0,week,date,home,score,away,xG,xG_1,venue,referee,home_goals,away_goals,season_start,result,day_Friday,day_Monday,day_Saturday,day_Sunday,day_Thursday,day_Tuesday,day_Wednesday
0,1,2018-08-17,Girona,0–0,Valladolid,0.7,0.0,Estadi Municipal de Montilivi,Guillermo Cuadra,0.0,0.0,2018,Draw,True,False,False,False,False,False,False
1,1,2018-08-17,Betis,0–3,Levante,0.9,2.5,Estadio Benito Villamarín,Ignacio Iglesias,0.0,3.0,2018,Away win,True,False,False,False,False,False,False
2,1,2018-08-18,Celta Vigo,1–1,Espanyol,0.6,0.6,Estadio de Balaídos,Santiago Jaime,1.0,1.0,2018,Draw,False,False,True,False,False,False,False
3,1,2018-08-18,Villarreal,1–2,Real Sociedad,1.6,0.7,Estadio de la Cerámica,Mario Melero,1.0,2.0,2018,Away win,False,False,True,False,False,False,False
4,1,2018-08-18,Barcelona,3–0,Alavés,3.2,0.3,Camp Nou,José Sánchez,3.0,0.0,2018,Home win,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2272,38,2024-05-25,Real Sociedad,0–2,Atlético Madrid,0.6,2.2,Reale Arena,José Sánchez,0.0,2.0,2023,Away win,False,False,True,False,False,False,False
2277,38,2024-05-26,Las Palmas,1–1,Alavés,1.0,2.5,Estadio de Gran Canaria,Francisco Hernández,1.0,1.0,2023,Draw,False,False,False,True,False,False,False
2278,38,2024-05-26,Celta Vigo,2–2,Valencia,1.5,2.0,Estadio Abanca Balaídos,Miguel Ángel Ortiz Arias,2.0,2.0,2023,Draw,False,False,False,True,False,False,False
2276,38,2024-05-26,Getafe,1–2,Mallorca,0.9,1.4,Coliseum Alfonso Pérez,Víctor García,1.0,2.0,2023,Away win,False,False,False,True,False,False,False


### Rolling averages

Since we're predicting match outcomes, this is a feature that'll help us achieve that. These are averages calculated over a moving window of data points, helping to smooth out short-term variations and reveal longer-term trends.

In [25]:
# Try the per-team filter on a single team first: keep every match in which that
# team played (home or away), then stop after the first team.
for team in data_quality['home'].unique():
    involves_team = data_quality['home'].eq(team) | data_quality['away'].eq(team)
    temp_df = data_quality.loc[involves_team]
    break

In [26]:
# Inspect the matches selected for the first team
temp_df

Unnamed: 0,week,date,home,score,away,xG,xG_1,venue,referee,home_goals,away_goals,season_start,result,day_Friday,day_Monday,day_Saturday,day_Sunday,day_Thursday,day_Tuesday,day_Wednesday
0,1,2018-08-17,Girona,0–0,Valladolid,0.7,0.0,Estadi Municipal de Montilivi,Guillermo Cuadra,0.0,0.0,2018,Draw,True,False,False,False,False,False,False
16,2,2018-08-26,Girona,1–4,Real Madrid,0.4,3.9,Estadi Municipal de Montilivi,Juan Martínez,1.0,4.0,2018,Away win,False,False,False,True,False,False,False
21,3,2018-08-31,Villarreal,0–1,Girona,1.1,1.0,Estadio de la Cerámica,Eduardo Prieto,0.0,1.0,2018,Away win,True,False,False,False,False,False,False
38,4,2018-09-17,Girona,3–2,Celta Vigo,1.7,1.0,Estadi Municipal de Montilivi,Hsu Jason,3.0,2.0,2018,Home win,False,True,False,False,False,False,False
48,5,2018-09-23,Barcelona,2–2,Girona,1.4,1.5,Camp Nou,Jesús Gil,2.0,2.0,2018,Draw,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2233,34,2024-05-04,Girona,4–2,Barcelona,1.8,2.6,Estadi Municipal de Montilivi,Alejandro Hernández,4.0,2.0,2023,Home win,False,False,True,False,False,False,False
2240,35,2024-05-10,Alavés,2–2,Girona,1.7,1.7,Estadio de Mendizorroza,José Luis Munuera,2.0,2.0,2023,Draw,True,False,False,False,False,False,False
2252,36,2024-05-14,Girona,0–1,Villarreal,1.6,0.6,Estadi Municipal de Montilivi,José Sánchez,0.0,1.0,2023,Away win,False,False,False,False,False,True,False
2267,37,2024-05-19,Valencia,1–3,Girona,3.2,2.6,Estadio de Mestalla,Guillermo Cuadra,1.0,3.0,2023,Away win,False,False,False,True,False,False,False


In [29]:
# Rolling average of the goals scored by each team, written back to the main dataset
# (data_quality): home_rolling_avg_goals for the row's home team and
# away_rolling_avg_goals for the row's away team.
for x in data_quality['home'].unique():
    # Every match played by team x (home or away), in chronological order
    temp_df = data_quality[(data_quality['home'] == x) | (data_quality['away'] == x)]
    temp_df = temp_df.sort_values(['date'])

    # Goals scored by team x: home_goals when it played at home, away_goals otherwise
    played_at_home = temp_df['home'] == x
    temp_df['goal_value_to_calculate'] = temp_df['home_goals'].where(played_at_home, temp_df['away_goals'])

    # Mean over up to 5 matches; closed="left" leaves the current match out of its own
    # window, so the team's first match has no history and gets NaN
    temp_df['rolling_avg_goals'] = temp_df['goal_value_to_calculate'].rolling(window=5, closed="left", min_periods=1).mean()

    # temp_df shares its index labels with data_quality, so each side can be written
    # back with one aligned assignment instead of a row-by-row loop
    home_index = temp_df.index[played_at_home]
    away_index = temp_df.index[~played_at_home]
    data_quality.loc[home_index, 'home_rolling_avg_goals'] = temp_df.loc[home_index, 'rolling_avg_goals']
    data_quality.loc[away_index, 'away_rolling_avg_goals'] = temp_df.loc[away_index, 'rolling_avg_goals']

In [31]:
# Spot-check the goal rolling averages on one team: every Real Madrid match, home or away
real_madrid_match = data_quality['home'].eq('Real Madrid') | data_quality['away'].eq('Real Madrid')
goal_check_columns = ['week', 'date', 'home', 'away', 'home_goals', 'away_goals', 'home_rolling_avg_goals', 'away_rolling_avg_goals']
data_quality.loc[real_madrid_match, goal_check_columns]

Unnamed: 0,week,date,home,away,home_goals,away_goals,home_rolling_avg_goals,away_rolling_avg_goals
7,1,2018-08-19,Real Madrid,Getafe,2.0,0.0,,
16,2,2018-08-26,Girona,Real Madrid,1.0,4.0,0.00,2.000000
24,3,2018-09-01,Real Madrid,Leganés,4.0,1.0,3.00,1.500000
33,4,2018-09-15,Athletic Club,Real Madrid,1.0,1.0,2.00,3.333333
44,5,2018-09-22,Real Madrid,Espanyol,1.0,0.0,2.75,1.250000
...,...,...,...,...,...,...,...,...
2232,34,2024-05-04,Real Madrid,Cádiz,3.0,0.0,2.20,0.600000
2243,35,2024-05-11,Granada,Real Madrid,0.0,4.0,1.20,2.000000
2251,36,2024-05-14,Real Madrid,Alavés,5.0,0.0,2.40,1.600000
2269,37,2024-05-19,Villarreal,Real Madrid,4.0,4.0,2.20,3.200000


In [33]:
# Rolling average of each team's xG, written back to the main dataset (data_quality):
# home_rolling_avg_xG for the row's home team and away_rolling_avg_xG for the row's
# away team.
for x in data_quality['home'].unique():
    # Every match played by team x (home or away), in chronological order
    temp_df = data_quality[(data_quality['home'] == x) | (data_quality['away'] == x)]
    temp_df = temp_df.sort_values(['date'])

    # xG of team x: the 'xG' column when it played at home, 'xG_1' otherwise
    played_at_home = temp_df['home'] == x
    temp_df['xG_value_to_calculate'] = temp_df['xG'].where(played_at_home, temp_df['xG_1'])

    # Mean over up to 5 matches; closed="left" leaves the current match out of its own
    # window, so the team's first match has no history and gets NaN
    temp_df['rolling_avg_xG'] = temp_df['xG_value_to_calculate'].rolling(window=5, closed="left", min_periods=1).mean()

    # temp_df shares its index labels with data_quality, so each side can be written
    # back with one aligned assignment instead of a row-by-row loop
    home_index = temp_df.index[played_at_home]
    away_index = temp_df.index[~played_at_home]
    data_quality.loc[home_index, 'home_rolling_avg_xG'] = temp_df.loc[home_index, 'rolling_avg_xG']
    data_quality.loc[away_index, 'away_rolling_avg_xG'] = temp_df.loc[away_index, 'rolling_avg_xG']

In [36]:
# Final clean-up: keep only the matches where all four rolling averages are available
rolling_avg_columns = ['home_rolling_avg_goals', 'away_rolling_avg_goals', 'home_rolling_avg_xG', 'away_rolling_avg_xG']
has_all_rolling_averages = data_quality[rolling_avg_columns].notna().all(axis=1)
data_quality = data_quality[has_all_rolling_averages]

In [38]:
# Spot-check the xG rolling averages on one team: every Real Madrid match, home or away
real_madrid_match = data_quality['home'].eq('Real Madrid') | data_quality['away'].eq('Real Madrid')
xg_check_columns = ['date', 'home', 'away', 'xG', 'xG_1', 'home_rolling_avg_xG', 'away_rolling_avg_xG']
data_quality.loc[real_madrid_match, xg_check_columns]

Unnamed: 0,date,home,away,xG,xG_1,home_rolling_avg_xG,away_rolling_avg_xG
16,2018-08-26,Girona,Real Madrid,0.4,3.9,0.70,1.000000
24,2018-09-01,Real Madrid,Leganés,2.5,1.1,2.45,1.350000
33,2018-09-15,Athletic Club,Real Madrid,1.4,1.8,1.40,2.466667
44,2018-09-22,Real Madrid,Espanyol,1.0,0.4,2.30,1.175000
54,2018-09-26,Sevilla,Real Madrid,2.4,1.5,1.94,2.040000
...,...,...,...,...,...,...,...
2232,2024-05-04,Real Madrid,Cádiz,2.3,0.9,1.64,1.080000
2243,2024-05-11,Granada,Real Madrid,1.0,1.5,1.02,1.520000
2251,2024-05-14,Real Madrid,Alavés,2.5,1.3,1.60,1.100000
2269,2024-05-19,Villarreal,Real Madrid,1.7,1.0,1.46,1.800000


In [40]:
# Lift pandas' column display limit so every feature column is visible, then show the dataset
pd.options.display.max_columns = None
data_quality

Unnamed: 0,week,date,home,score,away,xG,xG_1,venue,referee,home_goals,away_goals,season_start,result,day_Friday,day_Monday,day_Saturday,day_Sunday,day_Thursday,day_Tuesday,day_Wednesday,home_rolling_avg_goals,away_rolling_avg_goals,home_rolling_avg_xG,away_rolling_avg_xG
10,2,2018-08-24,Getafe,2–0,Eibar,1.4,0.6,Coliseum Alfonso Pérez,David Medié,2.0,0.0,2018,Home win,True,False,False,False,False,False,False,0.0,1.0,0.20,1.30
11,2,2018-08-24,Leganés,2–2,Real Sociedad,1.7,1.6,Estadio Municipal de Butarque,José Luis Munuera,2.0,2.0,2018,Draw,True,False,False,False,False,False,False,1.0,2.0,1.00,0.70
12,2,2018-08-25,Alavés,0–0,Betis,0.6,0.9,Estadio de Mendizorroza,Pablo González,0.0,0.0,2018,Draw,False,False,True,False,False,False,False,0.0,0.0,0.30,0.90
13,2,2018-08-25,Atlético Madrid,1–0,Rayo Vallecano,0.9,1.5,Estadio Wanda Metropolitano,José González,1.0,0.0,2018,Home win,False,False,True,False,False,False,False,1.0,1.0,0.90,2.10
14,2,2018-08-25,Valladolid,0–1,Barcelona,0.4,1.0,Estadio Municipal José Zorrilla,Ricardo de Burgos,0.0,1.0,2018,Away win,False,False,True,False,False,False,False,0.0,3.0,0.00,3.20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2272,38,2024-05-25,Real Sociedad,0–2,Atlético Madrid,0.6,2.2,Reale Arena,José Sánchez,0.0,2.0,2023,Away win,False,False,True,False,False,False,False,1.0,1.8,0.72,1.16
2277,38,2024-05-26,Las Palmas,1–1,Alavés,1.0,2.5,Estadio de Gran Canaria,Francisco Hernández,1.0,1.0,2023,Draw,False,False,False,True,False,False,False,0.4,1.4,1.08,1.46
2278,38,2024-05-26,Celta Vigo,2–2,Valencia,1.5,2.0,Estadio Abanca Balaídos,Miguel Ángel Ortiz Arias,2.0,2.0,2023,Draw,False,False,False,True,False,False,False,1.4,0.6,1.38,1.66
2276,38,2024-05-26,Getafe,1–2,Mallorca,0.9,1.4,Coliseum Alfonso Pérez,Víctor García,1.0,2.0,2023,Away win,False,False,False,True,False,False,False,0.6,1.0,1.52,1.08
