# Training the model

Imports.

In [213]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score
from remove_collinear_features import remove_collinear_features

Paths to the data.

In [214]:
# Root folder of the data (a relative path).
data_path = '../data'
# Folder that holds the prepared CSV files, built from the root above.
prepared_data_path = '/'.join((data_path, 'csv', 'prepared'))

Open the prepared data.

In [215]:
# Read the prepared Bundesliga matches and show the dtype of every column.
matches_csv = f'{prepared_data_path}/bundesliga_matches.csv'
matches_df = pd.read_csv(matches_csv)
matches_df.dtypes

date                                   object
time                                   object
comp                                   object
round                                  object
day                                    object
                                       ...   
misc_aerial_duels_won_rolling_avg     float64
misc_aerial_duels_lost_rolling_avg    float64
misc_aerial_duels_won%_rolling_avg    float64
pgf_rolling_avg                       float64
pga_rolling_avg                       float64
Length: 311, dtype: object

Create a helper function to ease the process of predicting.

In [216]:
def make_predictions(data, predictors, split_date='2023-04-01'):
    """
    Train a random forest on the older matches and predict the newer ones.

    Rows whose 'date' is before ``split_date`` form the training set; all
    other rows form the test set. The sizes of both sets and the precision
    on the test set are printed.

    :param data: the dataframe to use; it must contain a 'date' column, a
        'target' column and every column listed in ``predictors``
    :param predictors: the predictor columns
    :param split_date: first date that belongs to the test set; it is compared
        directly against the 'date' column, so it must be in the same format
    :return: combined: a dataframe containing the actual and predicted values,
        indexed like the test rows of ``data``
    :raises ValueError: if the split leaves the train or the test set empty
    """
    # Split the data into train and test sets.
    is_train = data['date'] < split_date
    train_set = data[is_train]
    test_set = data[~is_train]

    # Fail early with a clear message instead of dividing by zero below or
    # handing the model an empty dataframe.
    if train_set.empty or test_set.empty:
        raise ValueError(
            f'Splitting at {split_date!r} leaves {len(train_set)} training and '
            f'{len(test_set)} test rows; both sets must be non-empty.'
        )

    print(f'Train: {len(train_set)} matches ({len(train_set) / len(data):.2%})')
    print(f'Test: {len(test_set)} matches ({len(test_set) / len(data):.2%})')

    # Create and fit (train) the model.
    model = RandomForestClassifier(
        n_estimators=50,  # number of trees in the forest
        min_samples_split=10,  # number of samples required to split an internal node
        random_state=1  # seed, so that repeated runs give the same forest
    )
    model.fit(train_set[predictors], train_set['target'])

    # Make predictions on the test dataset and calculate the precision score.
    predictions = model.predict(test_set[predictors])
    precision = precision_score(test_set['target'], predictions)
    print(f'Precision: {precision:.2%}')

    # Create a dataframe containing the actual and predicted values.
    # Reusing the test index lets callers join the result back onto `data`.
    combined = pd.DataFrame(
        dict(actual=test_set['target'], prediction=predictions),
        index=test_set.index
    )
    return combined

Remove collinear features so that the model generalizes better.

In [217]:
# Threshold handed to remove_collinear_features.
# NOTE(review): presumably the correlation above which a column is dropped -
# confirm against the helper's implementation.
CORRELATION_THRESHOLD = 0.95
matches_df = remove_collinear_features(matches_df, CORRELATION_THRESHOLD)

  corr_matrix = df.corr()


Removed Columns {'passing_total_att', 'possession_receiving_prgr_rolling_avg', 'passing_short_att', 'possession_touches_live_rolling_avg', 'possession_touches_live', 'gca_gca_types_gca_rolling_avg', 'passing_short_att_rolling_avg', 'gca_sca_types_sca', 'shooting_expected_npxg_rolling_avg', 'passing_medium_cmp_rolling_avg', 'possession_receiving_rec_rolling_avg', 'passing_medium_att_rolling_avg', 'possession_carries_prgdist', 'passing_types_pass_types_live', 'gca_gca_types_gca', 'keeper_passes_avglen', 'misc_performance_off_rolling_avg', 'shooting_expected_np:g-xg', 'passing_total_totdist_rolling_avg', 'misc_performance_off', 'passing_medium_att', 'shooting_expected_np:g-xg_rolling_avg', 'passing_total_att_rolling_avg', 'possession_receiving_rec', 'possession_receiving_prgr', 'passing_types_pass_types_live_rolling_avg', 'passing_total_totdist', 'keeper_passes_avglen_rolling_avg', 'misc_performance_pkcon_rolling_avg', 'passing_medium_cmp', 'shooting_standard_gls_rolling_avg', 'gca_sca_ty

Make predictions on the matches dataframe.

In [218]:
# Fixed predictors that describe the fixture itself.
base_predictors = ['opponent_code', 'venue_code', 'hour', 'day_code']
# Every remaining column whose name marks it as a rolling average.
rolling_predictors = [name for name in matches_df.columns if '_rolling_avg' in name]
predictors = base_predictors + rolling_predictors
combined = make_predictions(matches_df, predictors)

Train: 3435 matches (95.44%)
Test: 164 matches (4.56%)
Precision: 62.07%


In [219]:
# Human-readable labels for the two target values,
# so that the confusion matrix is easier to read.
mapping = {0: 'Loss/Draw', 1: 'Win'}

# Build a relabelled copy; `combined` itself is left untouched.
matrix_df = combined.assign(
    actual=combined['actual'].replace(mapping),
    prediction=combined['prediction'].replace(mapping),
)

# Confusion matrix: actual outcomes as rows, predicted outcomes as columns.
pd.crosstab(index=matrix_df['actual'], columns=matrix_df['prediction'])

prediction,Loss/Draw,Win
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
Loss/Draw,93,11
Win,42,18


Merge to see predictions.

In [220]:
# Attach the match details to the predictions by joining on the row index.
# Only the 'actual' and 'prediction' columns are taken from `combined`, so
# re-running this cell gives the same result instead of merging the match
# details a second time (which would produce suffixed duplicate columns and
# break the cells below).
combined = combined[['actual', 'prediction']].merge(
    matches_df[['date', 'team', 'opponent', 'result']],
    left_index=True,
    right_index=True
)
combined

Unnamed: 0,actual,prediction,date,team,opponent,result
257,0,0,2023-04-01,Augsburg,Wolfsburg,D
258,0,0,2023-04-08,Augsburg,Koln,L
259,0,0,2023-04-15,Augsburg,RB Leipzig,L
260,0,0,2023-04-21,Augsburg,Stuttgart,D
261,0,0,2023-04-29,Augsburg,Eintracht Frankfurt,D
...,...,...,...,...,...,...
3594,1,0,2023-04-30,Wolfsburg,Mainz 05,W
3595,0,0,2023-05-07,Wolfsburg,Dortmund,L
3596,1,0,2023-05-13,Wolfsburg,Hoffenheim,W
3597,0,0,2023-05-19,Wolfsburg,Freiburg,L


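The dataframe above has two rows per match - one containing the prediction for the home team, and one containing the prediction for the away team. We want to join each row with the row of its opponent, so that every row of the result carries both predictions for the match (each match still appears twice, once from each team's perspective).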

In [221]:
# Pair every row with the row of its opponent on the same date: the left
# side is matched on its 'team', the right side on its 'opponent'.
# Columns present on both sides get the default '_x' / '_y' suffixes.
left_keys = ['date', 'team']
right_keys = ['date', 'opponent']
merged = pd.merge(combined, combined, left_on=left_keys, right_on=right_keys)
merged

Unnamed: 0,actual_x,prediction_x,date,team_x,opponent_x,result_x,actual_y,prediction_y,team_y,opponent_y,result_y
0,0,0,2023-04-01,Augsburg,Wolfsburg,D,0,0,Wolfsburg,Augsburg,D
1,0,0,2023-04-08,Augsburg,Koln,L,1,0,Koln,Augsburg,W
2,0,0,2023-04-15,Augsburg,RB Leipzig,L,1,0,RB Leipzig,Augsburg,W
3,0,0,2023-04-21,Augsburg,Stuttgart,D,0,0,Stuttgart,Augsburg,D
4,0,0,2023-04-29,Augsburg,Eintracht Frankfurt,D,0,1,Eintracht Frankfurt,Augsburg,D
...,...,...,...,...,...,...,...,...,...,...,...
155,1,0,2023-04-30,Wolfsburg,Mainz 05,W,0,0,Mainz 05,Wolfsburg,L
156,0,0,2023-05-07,Wolfsburg,Dortmund,L,1,1,Dortmund,Wolfsburg,W
157,1,0,2023-05-13,Wolfsburg,Hoffenheim,W,0,0,Hoffenheim,Wolfsburg,L
158,0,0,2023-05-19,Wolfsburg,Freiburg,L,1,0,Freiburg,Wolfsburg,W


Get the rows where we predicted that **the team would win and its opponent would not win (loss or draw)**.

In [222]:
# Rows where the model predicted a win for team_x and no win for team_y.
win_only_for_x = (merged['prediction_x'] == 1) & (merged['prediction_y'] == 0)

# Number of selected rows per actual outcome of team_x (0 or 1).
precision_df = merged.loc[win_only_for_x, 'actual_x'].value_counts()

# value_counts() orders its result by frequency, not by label, so the number
# of actual wins is looked up by its label (1) rather than by position.
# .get() also covers the case where none of the selected rows was a win, and
# the guard covers the case where no row was selected at all.
n_selected = precision_df.sum()
precision = precision_df.get(1, 0) / n_selected * 100 if n_selected else float('nan')
print(f'Precision: {precision:.2f}%')

Precision: 62.96%
