# Training the model

Imports.

In [32]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score

Paths to the data.

In [33]:
# Relative path of the data root folder.
data_path = '../data'
# The prepared CSV files live in the 'csv/prepared' sub-folder of the data root.
prepared_data_path = '/'.join([data_path, 'csv', 'prepared'])

Open the prepared data.

In [34]:
# Build the CSV location first, then load it and display the resulting frame.
matches_csv_path = f'{prepared_data_path}/bundesliga_matches.csv'
matches_df = pd.read_csv(matches_csv_path)
matches_df

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling_avg,ga_rolling_avg,shooting_standard_sh_rolling_avg,shooting_standard_sot_rolling_avg,shooting_standard_dist_rolling_avg,shooting_standard_fk_rolling_avg,shooting_standard_pk_rolling_avg,shooting_standard_pkatt_rolling_avg
0,2020-10-17,18:30,Bundesliga,Matchweek 4,Sat,Home,L,1,4,Bayern Munich,...,5,0,0.666667,0.666667,9.333333,3.333333,22.233333,0.000000,0.0,0.000000
1,2020-10-25,15:30,Bundesliga,Matchweek 5,Sun,Away,L,1,2,Wolfsburg,...,6,0,0.666667,1.666667,9.000000,3.666667,21.400000,0.333333,0.0,0.000000
2,2020-10-31,15:30,Bundesliga,Matchweek 6,Sat,Home,L,0,2,Dortmund,...,5,0,0.666667,2.333333,9.666667,3.333333,20.566667,0.666667,0.0,0.000000
3,2020-11-07,15:30,Bundesliga,Matchweek 7,Sat,Away,L,0,5,Union Berlin,...,5,0,0.666667,2.666667,8.333333,2.666667,21.166667,0.666667,0.0,0.000000
4,2020-11-21,15:30,Bundesliga,Matchweek 8,Sat,Home,L,1,2,Bayer Leverkusen,...,5,0,0.333333,3.000000,6.666667,1.333333,21.266667,0.666667,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3591,2023-04-30,17:30,Bundesliga,Matchweek 30,Sun,Home,W,3,0,Mainz 05,...,6,1,1.666667,1.000000,11.666667,4.000000,17.266667,0.000000,0.0,0.333333
3592,2023-05-07,17:30,Bundesliga,Matchweek 31,Sun,Away,L,0,6,Dortmund,...,6,0,2.666667,0.333333,11.666667,4.666667,15.500000,0.000000,0.0,0.333333
3593,2023-05-13,15:30,Bundesliga,Matchweek 32,Sat,Home,W,2,1,Hoffenheim,...,5,1,2.666667,2.333333,11.333333,4.666667,15.200000,0.000000,0.0,0.333333
3594,2023-05-19,20:30,Bundesliga,Matchweek 33,Fri,Away,L,0,2,Freiburg,...,4,0,1.666667,2.333333,9.666667,3.333333,17.033333,0.000000,0.0,0.000000


Create a helper function that splits the data, trains the model, and makes predictions.

In [35]:
def make_predictions(data, predictors, split_date='2022-01-01'):
    """
    Make predictions using the random forest classifier.

    The data is split chronologically: rows dated before ``split_date`` are
    used to train the model, rows dated on or after it are used to test it.
    The split sizes and the precision score are printed.

    :param data: the dataframe to use; it must contain the 'date' and 'target'
        columns as well as every column listed in ``predictors``
    :param predictors: the predictor columns
    :param split_date: first date that belongs to the test set; it is compared
        against the 'date' column with ``<`` and ``>=``
    :return: combined: a dataframe containing the actual and predicted values,
        indexed like the test rows of ``data``
    :raises ValueError: if the split leaves the train set or the test set empty
    """
    # Split the data into train and test sets.
    train_set = data[data['date'] < split_date]
    test_set = data[data['date'] >= split_date]

    # Fail early with a clear message: an empty split would otherwise surface
    # later as an unrelated error (division by zero or a fit/predict failure).
    if len(train_set) == 0 or len(test_set) == 0:
        raise ValueError(
            f'Splitting at {split_date} leaves {len(train_set)} train and '
            f'{len(test_set)} test matches; both sets must be non-empty.'
        )

    print(f'Train: {len(train_set)} matches ({len(train_set) / len(data):.2%})')
    print(f'Test: {len(test_set)} matches ({len(test_set) / len(data):.2%})')

    # Create and fit (train) the model.
    model = RandomForestClassifier(
        n_estimators=50,  # number of trees in the forest
        min_samples_split=2,  # number of samples required to split an internal node
        random_state=1  # seed
    )
    model.fit(train_set[predictors], train_set['target'])

    # Make predictions on the test dataset and calculate the precision score.
    predictions = model.predict(test_set[predictors])
    precision = precision_score(test_set['target'], predictions)
    print(f'Precision: {precision:.2%}')

    # Create a dataframe containing the actual and predicted values.
    # Reusing the test set's index lets callers align the result with `data`.
    combined = pd.DataFrame(
        dict(actual=test_set['target'], prediction=predictions),
        index=test_set.index
    )
    return combined

Make predictions on the matches dataframe.

In [36]:
# Columns handed to the classifier as model inputs.
predictors = ['opponent_code', 'venue_code', 'day_code', 'hour']
# Train/test on the matches frame and keep the actual-vs-predicted table.
combined = make_predictions(data=matches_df, predictors=predictors)

Train: 2676 matches (74.42%)
Test: 920 matches (25.58%)
Precision: 50.92%


In [37]:
# Readable labels for the two target classes, so that the confusion matrix
# is easier to read.
mapping = {0: 'Loss/Draw', 1: 'Win'}

# Build a relabelled frame; `assign` returns a new dataframe, so `combined`
# itself keeps its numeric values.
matrix_df = combined.assign(
    actual=combined['actual'].replace(mapping),
    prediction=combined['prediction'].replace(mapping),
)

# Create a confusion matrix (rows: actual outcome, columns: predicted outcome).
pd.crosstab(index=matrix_df['actual'], columns=matrix_df['prediction'])

prediction,Loss/Draw,Win
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
Loss/Draw,466,107
Win,236,111


Merge the match details (date, team, opponent, result) onto the predictions so they are easier to inspect.

In [38]:
# Attach the match details to the predictions by aligning both frames on
# their index.
# Only the 'actual' and 'prediction' columns are taken from `combined` before
# merging. This keeps the cell idempotent: re-running it would otherwise merge
# the detail columns a second time and produce suffixed duplicates
# ('date_x', 'date_y', ...) instead of 'date', 'team', 'opponent', 'result'.
combined = combined[['actual', 'prediction']].merge(
    matches_df[['date', 'team', 'opponent', 'result']],
    left_index=True,
    right_index=True
)
combined

Unnamed: 0,actual,prediction,date,team,opponent,result
45,0,0,2022-01-08,Arminia,Freiburg,D
46,0,0,2022-01-16,Arminia,Greuther Furth,D
47,1,0,2022-01-21,Arminia,Eintracht Frankfurt,W
48,0,0,2022-02-05,Arminia,Monchengladbach,D
49,0,0,2022-02-13,Arminia,Hoffenheim,L
...,...,...,...,...,...,...
3591,1,0,2023-04-30,Wolfsburg,Mainz 05,W
3592,0,0,2023-05-07,Wolfsburg,Dortmund,L
3593,1,0,2023-05-13,Wolfsburg,Hoffenheim,W
3594,0,0,2023-05-19,Wolfsburg,Freiburg,L


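The dataframe above has two rows per match - one containing the prediction for the home team, and one containing the prediction for the away team. We want to join each row to its counterpart (same date, with `team` and `opponent` swapped) so that both predictions for a match sit side by side in one row. Note that this self-merge keeps one row per team perspective, so a match can still appear twice - once for each side.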

In [39]:
# Self-join: pair every row with the row of the same date whose `opponent`
# equals this row's `team`. Columns present on both sides receive the
# `_x` (left) and `_y` (right) suffixes.
merged = pd.merge(
    left=combined,
    right=combined,
    how='inner',
    left_on=['date', 'team'],
    right_on=['date', 'opponent'],
    suffixes=('_x', '_y'),
)
merged

Unnamed: 0,actual_x,prediction_x,date,team_x,opponent_x,result_x,actual_y,prediction_y,team_y,opponent_y,result_y
0,0,0,2022-01-08,Arminia,Freiburg,D,0,0,Freiburg,Arminia,D
1,0,0,2022-01-16,Arminia,Greuther Furth,D,0,0,Greuther Furth,Arminia,D
2,1,0,2022-01-21,Arminia,Eintracht Frankfurt,W,0,1,Eintracht Frankfurt,Arminia,L
3,0,0,2022-02-05,Arminia,Monchengladbach,D,0,0,Monchengladbach,Arminia,D
4,0,0,2022-02-13,Arminia,Hoffenheim,L,1,0,Hoffenheim,Arminia,W
...,...,...,...,...,...,...,...,...,...,...,...
911,1,0,2023-04-30,Wolfsburg,Mainz 05,W,0,0,Mainz 05,Wolfsburg,L
912,0,0,2023-05-07,Wolfsburg,Dortmund,L,1,0,Dortmund,Wolfsburg,W
913,1,0,2023-05-13,Wolfsburg,Hoffenheim,W,0,0,Hoffenheim,Wolfsburg,L
914,0,0,2023-05-19,Wolfsburg,Freiburg,L,1,0,Freiburg,Wolfsburg,W


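Get the rows where we predicted that **the team (`_x`) would win and its opponent (`_y`) would not win** (a prediction of `0` means loss or draw). Note that `_x` is the row's `team`, which can be either the home or the away side.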

In [40]:
# Rows where the model predicted a win for the team (`_x`) and no win for
# its opponent (`_y`).
team_win_predicted = (merged['prediction_x'] == 1) & (merged['prediction_y'] == 0)

# Count how those matches actually ended for the team (1 = win, 0 = loss/draw).
# A single `.loc` call selects rows and column in one step.
accuracy_df = merged.loc[team_win_predicted, 'actual_x'].value_counts()

# Share of the selected predictions that were correct. The win count is looked
# up by its label (1) rather than by position, because value_counts() orders
# its result by frequency and the first entry is not guaranteed to be the wins.
total = accuracy_df.sum()
accuracy = accuracy_df.get(1, 0) / total * 100 if total else float('nan')
print(f'Accuracy: {accuracy:.2f}%')
accuracy_df

1    96
0    85
Name: actual_x, dtype: int64