# Training the model

Imports.

In [94]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score

Paths to the data.

In [95]:
# Root data directory, given as a path relative to the current working directory.
data_path = '../data'
# Sub-directory of the data root from which the cleaned CSV is read.
clean_data_path = f'{data_path}/csv/clean'

Open cleaned data.

In [96]:
# Build the location of the cleaned matches file first, then load it into a dataframe.
matches_csv_path = f'{clean_data_path}/bundesliga_matches.csv'
matches_df = pd.read_csv(matches_csv_path)

Check types of columns to make sure they are numeric.

In [97]:
# Show the dtype of every column (columns reported as `object` are not numeric).
matches_df.dtypes

date                       object
time                       object
comp                       object
round                      object
day                        object
                           ...   
misc_aerial_duels_won     float64
misc_aerial_duels_lost    float64
misc_aerial_duels_won%    float64
pgf                       float64
pga                       float64
Length: 159, dtype: object

Convert `date` column to datetime.

In [98]:
# Parse the raw date values into datetimes and store them back in the same column.
parsed_dates = pd.to_datetime(matches_df['date'])
matches_df['date'] = parsed_dates
# Display the resulting dtype to confirm the conversion.
matches_df['date'].dtype

dtype('<M8[ns]')

Encode the `opponent` column as integer category codes in a new `opponent_code` column.

In [99]:
# Give every distinct opponent name an integer code via a categorical view of the column.
opponent_as_category = matches_df['opponent'].astype('category')
matches_df['opponent_code'] = opponent_as_category.cat.codes
# Show the distinct (opponent name, code) pairs.
matches_df[['opponent', 'opponent_code']].drop_duplicates()

Unnamed: 0,opponent,opponent_code
0,Arminia,0
1,Hertha BSC,12
2,Hoffenheim,13
3,Koln,14
4,Bayern Munich,3
5,Werder Bremen,23
6,Stuttgart,21
7,RB Leipzig,19
8,Union Berlin,22
9,Dortmund,5


Create a function to compute rolling averages. We will split the matches dataframe by team, because we want the rolling averages to be computed per team - how did this team perform in its previous three games?

In [100]:
def rolling_averages(data, cols, new_cols, window=3):
    """
    Compute rolling averages over the preceding matches for the specified columns.

    For every row, the average is taken over the `window` rows that come directly
    before it in date order; the row's own values are never part of its average.

    :param data: the dataframe to use. It must contain a `date` column and every column in `cols`.
        The dataframe passed in is not modified.
    :param cols: the columns to compute rolling averages for.
    :param new_cols: the names of the new columns that will contain the rolling averages
        (same length and order as `cols`).
    :param window: the number of preceding rows to average over (defaults to 3).
    :return: a date-sorted copy of the dataframe with the new columns added, without the rows
        for which a rolling average is missing.
    :raises ValueError: if `cols` and `new_cols` do not have the same length.
    """
    if len(cols) != len(new_cols):
        raise ValueError(
            f'cols and new_cols must have the same length, got {len(cols)} and {len(new_cols)}'
        )

    # Sort by date because we want to look at the most recent matches.
    # sort_values returns a new dataframe, so the caller's dataframe is left untouched.
    data = data.sort_values('date')

    # Compute rolling averages for the specified columns.
    # The closed parameter is set to 'left' so that the current match is not included in the average.
    rolling_stats = data[cols].rolling(window, closed='left').mean()

    # Rename explicitly so that each average lands in its intended column by name.
    rolling_stats.columns = new_cols

    # Add the rolling averages to the dataframe.
    data[new_cols] = rolling_stats

    # The first `window` matches have fewer than `window` preceding matches, so their rolling
    # averages are NaN; drop them (along with any row whose average is NaN for another reason,
    # e.g. missing values in the source columns).
    data = data.dropna(subset=new_cols)
    return data

Create new predictor columns.

In [101]:
# Encode the basic predictors as numbers.
# venue_code: integer category code of the venue value.
matches_df['venue_code'] = matches_df['venue'].astype('category').cat.codes
# hour: everything from the first ':' onwards is stripped from the time string, leaving the hour.
matches_df['hour'] = matches_df['time'].str.replace(':.+', '', regex=True).astype('int')
# day_code: day of the week as an integer (pandas uses Monday=0 ... Sunday=6).
matches_df['day_code'] = matches_df['date'].dt.dayofweek
# target: 1 when the result is a win ('W'), 0 for anything else.
matches_df['target'] = (matches_df['result'] == 'W').astype('int')

predictors = ['opponent_code', 'venue_code', 'hour', 'day_code']

# Create a dataframe for each team. We do this so we can compute rolling averages for each team.
grouped_matches_df = matches_df.groupby('team')
cols = [
    'gf',
    'ga',
    'shooting_standard_sh',
    'shooting_standard_sot',
    'shooting_standard_dist',
    'shooting_standard_fk',
    'shooting_standard_pk',
    'shooting_standard_pkatt'
]
new_cols = [f'{col}_rolling_avg' for col in cols]

# Apply the rolling_averages function to each team dataframe and stack the results.
# Iterating over the groups (instead of groupby.apply) always hands the full group, including
# the `team` column, to rolling_averages; groupby.apply on the grouping columns is deprecated
# in recent pandas versions and `team` is still needed by later cells.
# ignore_index=True replaces the per-team indices with a fresh 0..n-1 index, so there are no
# duplicate index values.
rolling_matches_df = pd.concat(
    [rolling_averages(team_matches, cols, new_cols) for _, team_matches in grouped_matches_df],
    ignore_index=True
)
predictors += new_cols
predictors

['opponent_code',
 'venue_code',
 'hour',
 'day_code',
 'gf_rolling_avg',
 'ga_rolling_avg',
 'shooting_standard_sh_rolling_avg',
 'shooting_standard_sot_rolling_avg',
 'shooting_standard_dist_rolling_avg',
 'shooting_standard_fk_rolling_avg',
 'shooting_standard_pk_rolling_avg',
 'shooting_standard_pkatt_rolling_avg']

Create a function that splits the data by date, trains the model, and returns the predictions for the test set.

In [102]:
def make_predictions(data, predictors, split_date='2022-01-01'):
    """
    Make predictions using the random forest classifier.

    The data is split chronologically: rows dated before `split_date` are used for training,
    rows dated on or after it are used for testing. The split sizes and the precision score
    on the test set are printed.

    :param data: the dataframe to use; it must contain the `date` and `target` columns and
        every column listed in `predictors`
    :param predictors: the predictor columns
    :param split_date: the first date that belongs to the test set (defaults to '2022-01-01')
    :return: combined: a dataframe containing the actual and predicted values, indexed like
        the test rows of `data`
    """
    # Split the data into train and test sets.
    train_set = data[data['date'] < split_date]
    test_set = data[data['date'] >= split_date]
    print(f'Train: {len(train_set)} matches ({len(train_set) / len(data):.2%})')
    print(f'Test: {len(test_set)} matches ({len(test_set) / len(data):.2%})')

    # Create and fit (train) the model.
    model = RandomForestClassifier(
        n_estimators=50,  # number of trees in the forest
        min_samples_split=2,  # number of samples required to split an internal node
        random_state=1  # seed, so repeated runs give the same model
    )
    model.fit(train_set[predictors], train_set['target'])

    # Make predictions on the test dataset and calculate the precision score
    # (the share of predicted wins that were actual wins).
    predictions = model.predict(test_set[predictors])
    precision = precision_score(test_set['target'], predictions)
    print(f'Precision: {precision:.2%}')

    # Create a dataframe containing the actual and predicted values.
    # The test set's index is reused so the result can be joined back to `data` by index.
    combined = pd.DataFrame(
        dict(actual=test_set['target'], prediction=predictions),
        index=test_set.index
    )
    return combined

Make predictions on the matches dataframe.

In [103]:
# Train the model and collect the actual vs. predicted outcomes for the test matches.
combined = make_predictions(data=rolling_matches_df, predictors=predictors)

Train: 2676 matches (74.42%)
Test: 920 matches (25.58%)
Precision: 51.32%


In [104]:
# Human-readable labels for the two classes, so the confusion matrix is easier to read.
mapping = {0: 'Loss/Draw', 1: 'Win'}

# Build a relabelled copy of the results; `combined` itself keeps its numeric values.
matrix_df = combined.assign(
    actual=combined['actual'].replace(mapping),
    prediction=combined['prediction'].replace(mapping)
)

# Confusion matrix: rows are the actual outcomes, columns are the predicted outcomes.
pd.crosstab(index=matrix_df['actual'], columns=matrix_df['prediction'])

prediction,Loss/Draw,Win
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
Loss/Draw,481,92
Win,250,97


Merge to see predictions.

In [105]:
# Attach the match details to every prediction by joining on the shared row index.
# Only the `actual` and `prediction` columns are taken from `combined` so that this cell can be
# re-run safely: without the selection, a second run would merge the detail columns again and
# produce suffixed duplicates (date_x, date_y, ...), breaking the cells that follow.
combined = combined[['actual', 'prediction']].merge(
    rolling_matches_df[['date', 'team', 'opponent', 'result']],
    left_index=True,
    right_index=True
)
combined

Unnamed: 0,actual,prediction,date,team,opponent,result
45,0,0,2022-01-08,Arminia,Freiburg,D
46,0,1,2022-01-16,Arminia,Greuther Furth,D
47,1,0,2022-01-21,Arminia,Eintracht Frankfurt,W
48,0,0,2022-02-05,Arminia,Monchengladbach,D
49,0,0,2022-02-13,Arminia,Hoffenheim,L
...,...,...,...,...,...,...
3591,1,0,2023-04-30,Wolfsburg,Mainz 05,W
3592,0,0,2023-05-07,Wolfsburg,Dortmund,L
3593,1,0,2023-05-13,Wolfsburg,Hoffenheim,W
3594,0,0,2023-05-19,Wolfsburg,Freiburg,L


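The dataframe above has two rows per match - one containing the prediction for the home team, and one containing the prediction for the away team. We merge the dataframe with itself so that each row holds both predictions for the same match: the team's own (`_x` columns) and its opponent's (`_y` columns).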

In [106]:
# Self-join: pair every row with the row for the same date whose `opponent` is this row's `team`,
# i.e. the same match seen from the other side.
# Columns suffixed _x come from the left copy, columns suffixed _y from the right copy.
# The default inner join drops rows that have no such counterpart.
merged = pd.merge(
    combined,
    combined,
    left_on=['date', 'team'],
    right_on=['date', 'opponent']
)
merged

Unnamed: 0,actual_x,prediction_x,date,team_x,opponent_x,result_x,actual_y,prediction_y,team_y,opponent_y,result_y
0,0,0,2022-01-08,Arminia,Freiburg,D,0,0,Freiburg,Arminia,D
1,0,1,2022-01-16,Arminia,Greuther Furth,D,0,0,Greuther Furth,Arminia,D
2,1,0,2022-01-21,Arminia,Eintracht Frankfurt,W,0,0,Eintracht Frankfurt,Arminia,L
3,0,0,2022-02-05,Arminia,Monchengladbach,D,0,0,Monchengladbach,Arminia,D
4,0,0,2022-02-13,Arminia,Hoffenheim,L,1,0,Hoffenheim,Arminia,W
...,...,...,...,...,...,...,...,...,...,...,...
911,1,0,2023-04-30,Wolfsburg,Mainz 05,W,0,0,Mainz 05,Wolfsburg,L
912,0,0,2023-05-07,Wolfsburg,Dortmund,L,1,0,Dortmund,Wolfsburg,W
913,1,0,2023-05-13,Wolfsburg,Hoffenheim,W,0,0,Hoffenheim,Wolfsburg,L
914,0,0,2023-05-19,Wolfsburg,Freiburg,L,1,0,Freiburg,Wolfsburg,W


Get the rows where we predicted that **the team would win and its opponent would not win**, and compute how often the team actually won.

In [113]:
# Keep the rows where a win was predicted for the team (x side) and no win was predicted
# for its opponent (y side), then count the actual outcomes of the team.
confident_win_rows = merged[(merged['prediction_x'] == 1) & (merged['prediction_y'] == 0)]
accuracy_df = confident_win_rows['actual_x'].value_counts()

# value_counts() orders its result by frequency, not by label, so positional access would
# silently pick the majority class. Look up the number of actual wins by its label (1) instead;
# .get() also covers the case where no such row exists.
n_correct = accuracy_df.get(1, 0)
n_total = accuracy_df.sum()
# Guard against an empty selection to avoid a division by zero.
accuracy = n_correct / n_total * 100 if n_total else float('nan')
print(f'Accuracy: {accuracy:.2f}%')

Accuracy: 54.88%
