# Training the model

Imports.

In [365]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score

Paths to the data.

In [366]:
# Root data folder (relative to this notebook) and the cleaned-CSV folder inside it.
data_path = '../data'
clean_data_path = '/'.join([data_path, 'csv', 'clean'])

Open cleaned data.

In [367]:
# Load the cleaned match table from the clean-CSV folder.
matches_csv = f'{clean_data_path}/bundesliga_matches.csv'
matches_df = pd.read_csv(matches_csv)

In [368]:
# Sanity check: count how many rows fall under each 'round' (matchweek) label.
matches_df['round'].value_counts()

Matchweek 1     110
Matchweek 4     110
Matchweek 5     110
Matchweek 2     110
Matchweek 3     110
Matchweek 33    108
Matchweek 32    108
Matchweek 31    108
Matchweek 30    108
Matchweek 20    108
Matchweek 28    108
Matchweek 26    108
Matchweek 25    108
Matchweek 24    108
Matchweek 23    108
Matchweek 22    108
Matchweek 21    108
Matchweek 29    108
Matchweek 18    108
Matchweek 19    108
Matchweek 17    108
Matchweek 16    108
Matchweek 15    108
Matchweek 14    108
Matchweek 13    108
Matchweek 12    108
Matchweek 11    108
Matchweek 10    108
Matchweek 9     108
Matchweek 8     108
Matchweek 7     108
Matchweek 6     108
Matchweek 34    108
Matchweek 27    106
Name: round, dtype: int64

In [369]:
# Inspect the column dtypes as loaded from the CSV, before any conversion.
matches_df.dtypes

date                       object
time                       object
comp                       object
round                      object
day                        object
                           ...   
misc_aerial_duels_won     float64
misc_aerial_duels_lost    float64
misc_aerial_duels_won%    float64
pgf                       float64
pga                       float64
Length: 159, dtype: object

In [370]:
# Parse the 'date' column into datetimes so it can be compared and used with .dt accessors.
matches_df['date'] = matches_df['date'].pipe(pd.to_datetime)
# Display the dtypes again to confirm the conversion.
matches_df.dtypes

date                      datetime64[ns]
time                              object
comp                              object
round                             object
day                               object
                               ...      
misc_aerial_duels_won            float64
misc_aerial_duels_lost           float64
misc_aerial_duels_won%           float64
pgf                              float64
pga                              float64
Length: 159, dtype: object

Create new predictor columns.

In [371]:
# Encode every opponent name as an integer category code.
matches_df['opponent_code'] = pd.Categorical(matches_df['opponent']).codes
# Show the name -> code mapping, one row per distinct pair.
matches_df.loc[:, ['opponent', 'opponent_code']].drop_duplicates()

Unnamed: 0,opponent,opponent_code
0,Arminia,0
1,Hertha BSC,12
2,Hoffenheim,13
3,Koln,14
4,Bayern Munich,3
5,Werder Bremen,23
6,Stuttgart,21
7,RB Leipzig,19
8,Union Berlin,22
9,Dortmund,5


In [372]:
# Venue as an integer category code.
matches_df['venue_code'] = pd.Categorical(matches_df['venue']).codes
# Kick-off hour: strip everything from the first colon onward, then cast to int.
kickoff_hour = matches_df['time'].str.replace(':.+', '', regex=True)
matches_df['hour'] = kickoff_hour.astype('int')
# Day of the week as an integer, taken from the parsed date.
matches_df['day_code'] = matches_df['date'].dt.weekday
# Binary target: 1 when the result is 'W', 0 for anything else.
matches_df['target'] = matches_df['result'].eq('W').astype('int')

# Baseline feature set used by the first model.
predictors = ['opponent_code', 'venue_code', 'hour', 'day_code']

Split data into train and test sets.

In [373]:
# Time-based split: matches before 2023 train the model, matches from 2023 on evaluate it.
train = matches_df.loc[matches_df['date'] < '2023-01-01']
test = matches_df.loc[matches_df['date'] >= '2023-01-01']

# Report the size of each subset and its share of the full table.
total_rows = len(matches_df)
for label, subset in (('Train', train), ('Test', test)):
    print(f'{label}: {len(subset)} ({len(subset) / total_rows:.2%})')

Train: 3328 (90.43%)
Test: 352 (9.57%)


Create and train model.

In [374]:
# Forest hyper-parameters:
#   n_estimators      - number of trees in the forest
#   min_samples_split - number of samples required to split an internal node
#   random_state      - seed, so repeated runs give the same forest
forest_params = dict(n_estimators=50, min_samples_split=2, random_state=1)
rf = RandomForestClassifier(**forest_params)
# Fit on the baseline predictors; the fitted estimator is the cell's displayed value.
rf.fit(train[predictors], train['target'])

Test model.

In [375]:
# Predict the held-out matches and measure the share of correct predictions.
test_features = test[predictors]
predictions = rf.predict(test_features)
accuracy = accuracy_score(y_true=test['target'], y_pred=predictions)
print(f'Accuracy: {accuracy:.2%}')

Accuracy: 63.07%


Create a confusion matrix.

In [376]:
# Put actual and predicted labels side by side (rows keep the test set's index).
combined = pd.DataFrame({'actual': test['target'], 'prediction': predictions})
# Cross-tabulate: rows are the actual outcome, columns are the predicted outcome.
pd.crosstab(combined['actual'], combined['prediction'])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,184,40
1,90,38


When predicting a draw or loss, we were right 184 times and wrong 90 times.
When predicting a win, we were right 38 times and wrong 40 times.

In [377]:
# Precision: of the matches predicted as wins, the share that really were wins.
precision = precision_score(y_true=test['target'], y_pred=predictions)
print(f'Precision: {precision:.2%}')

Precision: 48.72%


Our precision is only 48.72% which is not ideal.

Let's create more predictors in order to improve the accuracy of the model.

We will split the matches dataframe by team, because what we want to do is compute rolling averages for each team - how did this team perform in the past few games?

In [378]:
# Group the matches by team so per-team statistics can be computed group by group.
grouped_matches_df = matches_df.groupby(by='team')

In [379]:
def rolling_averages(group, cols, new_cols, window=3):
    """Add rolling-mean features built from the matches preceding each row.

    Args:
        group: DataFrame of matches containing a 'date' column and every
            column named in `cols` (one team's matches when used through
            `groupby(...).apply`).
        cols: Names of the numeric columns to average.
        new_cols: Names of the columns that receive the averages, in the same
            order as `cols`.
        window: Number of preceding matches to average over. Defaults to 3.

    Returns:
        A date-sorted copy of `group` with `new_cols` added. Rows whose
        rolling averages are NaN are dropped; this always includes the first
        `window` rows, which have fewer than `window` earlier matches.
        The input frame is not modified.
    """
    # Sort by date so that "previous matches" really are the earlier ones.
    group = group.sort_values('date')
    # closed='left' excludes the current match from its own window, so the
    # average only uses information available before kick-off.
    rolling_stats = group[cols].rolling(window, closed='left').mean()
    # Columns are assigned positionally: cols[i] -> new_cols[i].
    group[new_cols] = rolling_stats
    # Because the current row is excluded, the first `window` rows (not
    # window - 1) lack a full window and come out as NaN; drop them.
    group = group.dropna(subset=new_cols)
    return group

In [380]:
# Per-match stats that feed the rolling-average features.
cols = [
    'gf', 'ga',
    'shooting_standard_sh', 'shooting_standard_sot',
    'shooting_standard_dist', 'shooting_standard_fk',
    'shooting_standard_pk', 'shooting_standard_pkatt',
]
# One derived column name per source stat, in the same order.
new_cols = [f'{stat}_rolling_avg' for stat in cols]
new_cols

['gf_rolling_avg',
 'ga_rolling_avg',
 'shooting_standard_sh_rolling_avg',
 'shooting_standard_sot_rolling_avg',
 'shooting_standard_dist_rolling_avg',
 'shooting_standard_fk_rolling_avg',
 'shooting_standard_pk_rolling_avg',
 'shooting_standard_pkatt_rolling_avg']

In [381]:
# Run rolling_averages on every team's group; apply() forwards the keyword
# arguments to the function after the group itself.
rolling_matches_df = grouped_matches_df.apply(rolling_averages, cols=cols, new_cols=new_cols)
rolling_matches_df

Unnamed: 0_level_0,Unnamed: 1_level_0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling_avg,ga_rolling_avg,shooting_standard_sh_rolling_avg,shooting_standard_sot_rolling_avg,shooting_standard_dist_rolling_avg,shooting_standard_fk_rolling_avg,shooting_standard_pk_rolling_avg,shooting_standard_pkatt_rolling_avg
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arminia,828,2020-10-17,18:30,Bundesliga,Matchweek 4,Sat,Home,L,1,4,Bayern Munich,...,5,0,0.666667,0.666667,9.333333,3.333333,22.233333,0.000000,0.0,0.000000
Arminia,829,2020-10-25,15:30,Bundesliga,Matchweek 5,Sun,Away,L,1,2,Wolfsburg,...,6,0,0.666667,1.666667,9.000000,3.666667,21.400000,0.333333,0.0,0.000000
Arminia,830,2020-10-31,15:30,Bundesliga,Matchweek 6,Sat,Home,L,0,2,Dortmund,...,5,0,0.666667,2.333333,9.666667,3.333333,20.566667,0.666667,0.0,0.000000
Arminia,831,2020-11-07,15:30,Bundesliga,Matchweek 7,Sat,Away,L,0,5,Union Berlin,...,5,0,0.666667,2.666667,8.333333,2.666667,21.166667,0.666667,0.0,0.000000
Arminia,832,2020-11-21,15:30,Bundesliga,Matchweek 8,Sat,Home,L,1,2,Bayer Leverkusen,...,5,0,0.333333,3.000000,6.666667,1.333333,21.266667,0.666667,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolfsburg,2010,2023-04-30,17:30,Bundesliga,Matchweek 30,Sun,Home,W,3,0,Mainz 05,...,6,1,1.666667,1.000000,11.666667,4.000000,17.266667,0.000000,0.0,0.333333
Wolfsburg,2011,2023-05-07,17:30,Bundesliga,Matchweek 31,Sun,Away,L,0,6,Dortmund,...,6,0,2.666667,0.333333,11.666667,4.666667,15.500000,0.000000,0.0,0.333333
Wolfsburg,2012,2023-05-13,15:30,Bundesliga,Matchweek 32,Sat,Home,W,2,1,Hoffenheim,...,5,1,2.666667,2.333333,11.333333,4.666667,15.200000,0.000000,0.0,0.333333
Wolfsburg,2013,2023-05-19,20:30,Bundesliga,Matchweek 33,Fri,Away,L,0,2,Freiburg,...,4,0,1.666667,2.333333,9.666667,3.333333,17.033333,0.000000,0.0,0.000000


Drop the team index level because we don't need it.

In [382]:
# Remove the outer ('team') level of the row index, keeping only the inner level.
rolling_matches_df = rolling_matches_df.droplevel(level=0, axis=0)
rolling_matches_df

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling_avg,ga_rolling_avg,shooting_standard_sh_rolling_avg,shooting_standard_sot_rolling_avg,shooting_standard_dist_rolling_avg,shooting_standard_fk_rolling_avg,shooting_standard_pk_rolling_avg,shooting_standard_pkatt_rolling_avg
828,2020-10-17,18:30,Bundesliga,Matchweek 4,Sat,Home,L,1,4,Bayern Munich,...,5,0,0.666667,0.666667,9.333333,3.333333,22.233333,0.000000,0.0,0.000000
829,2020-10-25,15:30,Bundesliga,Matchweek 5,Sun,Away,L,1,2,Wolfsburg,...,6,0,0.666667,1.666667,9.000000,3.666667,21.400000,0.333333,0.0,0.000000
830,2020-10-31,15:30,Bundesliga,Matchweek 6,Sat,Home,L,0,2,Dortmund,...,5,0,0.666667,2.333333,9.666667,3.333333,20.566667,0.666667,0.0,0.000000
831,2020-11-07,15:30,Bundesliga,Matchweek 7,Sat,Away,L,0,5,Union Berlin,...,5,0,0.666667,2.666667,8.333333,2.666667,21.166667,0.666667,0.0,0.000000
832,2020-11-21,15:30,Bundesliga,Matchweek 8,Sat,Home,L,1,2,Bayer Leverkusen,...,5,0,0.333333,3.000000,6.666667,1.333333,21.266667,0.666667,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2010,2023-04-30,17:30,Bundesliga,Matchweek 30,Sun,Home,W,3,0,Mainz 05,...,6,1,1.666667,1.000000,11.666667,4.000000,17.266667,0.000000,0.0,0.333333
2011,2023-05-07,17:30,Bundesliga,Matchweek 31,Sun,Away,L,0,6,Dortmund,...,6,0,2.666667,0.333333,11.666667,4.666667,15.500000,0.000000,0.0,0.333333
2012,2023-05-13,15:30,Bundesliga,Matchweek 32,Sat,Home,W,2,1,Hoffenheim,...,5,1,2.666667,2.333333,11.333333,4.666667,15.200000,0.000000,0.0,0.333333
2013,2023-05-19,20:30,Bundesliga,Matchweek 33,Fri,Away,L,0,2,Freiburg,...,4,0,1.666667,2.333333,9.666667,3.333333,17.033333,0.000000,0.0,0.000000


Fix the index because there might be duplicate indices.

In [383]:
# Replace the row labels with a fresh 0..n-1 range so every row has a unique index.
rolling_matches_df.index = pd.RangeIndex(len(rolling_matches_df))
rolling_matches_df

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling_avg,ga_rolling_avg,shooting_standard_sh_rolling_avg,shooting_standard_sot_rolling_avg,shooting_standard_dist_rolling_avg,shooting_standard_fk_rolling_avg,shooting_standard_pk_rolling_avg,shooting_standard_pkatt_rolling_avg
0,2020-10-17,18:30,Bundesliga,Matchweek 4,Sat,Home,L,1,4,Bayern Munich,...,5,0,0.666667,0.666667,9.333333,3.333333,22.233333,0.000000,0.0,0.000000
1,2020-10-25,15:30,Bundesliga,Matchweek 5,Sun,Away,L,1,2,Wolfsburg,...,6,0,0.666667,1.666667,9.000000,3.666667,21.400000,0.333333,0.0,0.000000
2,2020-10-31,15:30,Bundesliga,Matchweek 6,Sat,Home,L,0,2,Dortmund,...,5,0,0.666667,2.333333,9.666667,3.333333,20.566667,0.666667,0.0,0.000000
3,2020-11-07,15:30,Bundesliga,Matchweek 7,Sat,Away,L,0,5,Union Berlin,...,5,0,0.666667,2.666667,8.333333,2.666667,21.166667,0.666667,0.0,0.000000
4,2020-11-21,15:30,Bundesliga,Matchweek 8,Sat,Home,L,1,2,Bayer Leverkusen,...,5,0,0.333333,3.000000,6.666667,1.333333,21.266667,0.666667,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3591,2023-04-30,17:30,Bundesliga,Matchweek 30,Sun,Home,W,3,0,Mainz 05,...,6,1,1.666667,1.000000,11.666667,4.000000,17.266667,0.000000,0.0,0.333333
3592,2023-05-07,17:30,Bundesliga,Matchweek 31,Sun,Away,L,0,6,Dortmund,...,6,0,2.666667,0.333333,11.666667,4.666667,15.500000,0.000000,0.0,0.333333
3593,2023-05-13,15:30,Bundesliga,Matchweek 32,Sat,Home,W,2,1,Hoffenheim,...,5,1,2.666667,2.333333,11.333333,4.666667,15.200000,0.000000,0.0,0.333333
3594,2023-05-19,20:30,Bundesliga,Matchweek 33,Fri,Away,L,0,2,Freiburg,...,4,0,1.666667,2.333333,9.666667,3.333333,17.033333,0.000000,0.0,0.000000


In [384]:
def make_predictions(data, predictors, cutoff_date='2023-02-01', model=None):
    """Fit a classifier on matches before a cutoff date and score it on the rest.

    Args:
        data: DataFrame with a 'date' column, a 'target' column and every
            column named in `predictors`.
        predictors: Names of the feature columns to train and predict on.
        cutoff_date: Rows with 'date' before this value form the training
            set; rows on or after it form the test set. Defaults to
            '2023-02-01'.
        model: Estimator exposing `fit` and `predict`. When omitted, the
            notebook-level `rf` is used.

    Returns:
        A `(combined, precision)` tuple: `combined` is a DataFrame indexed
        like the test rows with 'actual' and 'prediction' columns, and
        `precision` is the precision score of the predictions on the test set.

    Note:
        The estimator is refit in place. With the default `model=None` this
        refits the shared `rf`, so afterwards `rf` expects the columns in
        `predictors`, not the ones it was fitted on earlier.
    """
    if model is None:
        model = rf
    train = data[data['date'] < cutoff_date]
    test = data[data['date'] >= cutoff_date]
    model.fit(train[predictors], train['target'])
    predictions = model.predict(test[predictors])
    # Reuse the test index so the result can be merged back onto the match rows.
    combined = pd.DataFrame(dict(actual=test['target'], prediction=predictions), index=test.index)
    precision = precision_score(test['target'], predictions)
    return combined, precision

In [385]:
# Retrain using the baseline predictors plus the rolling-average features.
all_predictors = predictors + new_cols
combined, precision = make_predictions(rolling_matches_df, all_predictors)
print(f'Precision: {precision:.2%}')

Precision: 61.22%


Merge to see predictions.

In [386]:
# Attach match details to each prediction by joining on the shared row index.
match_info = rolling_matches_df[['date', 'team', 'opponent', 'result']]
combined = combined.merge(match_info, left_index=True, right_index=True)
combined

Unnamed: 0,actual,prediction,date,team,opponent,result
247,1,0,2023-02-03,Augsburg,Bayer Leverkusen,W
248,0,0,2023-02-11,Augsburg,Mainz 05,L
249,1,1,2023-02-17,Augsburg,Hoffenheim,W
250,0,0,2023-02-25,Augsburg,Hertha BSC,L
251,1,1,2023-03-04,Augsburg,Werder Bremen,W
...,...,...,...,...,...,...
3591,1,0,2023-04-30,Wolfsburg,Mainz 05,W
3592,0,0,2023-05-07,Wolfsburg,Dortmund,L
3593,1,0,2023-05-13,Wolfsburg,Hoffenheim,W
3594,0,0,2023-05-19,Wolfsburg,Freiburg,L


In [387]:
# Pair each team's row with its opponent's row for the same match: the left
# side matches on (date, team), the right side on (date, opponent), so the
# *_x columns describe one team and the *_y columns describe the other.
# NOTE(review): this only pairs rows when 'team' and 'opponent' spell a club's
# name identically - confirm no matches are lost to naming differences.
merged = combined.merge(
    combined,
    left_on=['date', 'team'],
    right_on=['date', 'opponent'],
)
merged

Unnamed: 0,actual_x,prediction_x,date,team_x,opponent_x,result_x,actual_y,prediction_y,team_y,opponent_y,result_y
0,1,0,2023-02-03,Augsburg,Bayer Leverkusen,W,0,0,Bayer Leverkusen,Augsburg,L
1,0,0,2023-02-11,Augsburg,Mainz 05,L,1,0,Mainz 05,Augsburg,W
2,1,1,2023-02-17,Augsburg,Hoffenheim,W,0,0,Hoffenheim,Augsburg,L
3,0,0,2023-02-25,Augsburg,Hertha BSC,L,1,0,Hertha BSC,Augsburg,W
4,1,1,2023-03-04,Augsburg,Werder Bremen,W,0,0,Werder Bremen,Augsburg,L
...,...,...,...,...,...,...,...,...,...,...,...
283,1,0,2023-04-30,Wolfsburg,Mainz 05,W,0,0,Mainz 05,Wolfsburg,L
284,0,0,2023-05-07,Wolfsburg,Dortmund,L,1,0,Dortmund,Wolfsburg,W
285,1,0,2023-05-13,Wolfsburg,Hoffenheim,W,0,0,Hoffenheim,Wolfsburg,L
286,0,0,2023-05-19,Wolfsburg,Freiburg,L,1,0,Freiburg,Wolfsburg,W


In [388]:
# Keep only matches where one side was predicted to win and the other side was
# predicted not to win, then count how often that side actually won.
confident_win = (merged['prediction_x'] == 1) & (merged['prediction_y'] == 0)
merged.loc[confident_win, 'actual_x'].value_counts()

1    29
0    17
Name: actual_x, dtype: int64

In [394]:
# Share of "one side predicted to win, the other predicted not to win" matches
# that the predicted winner actually won. Computed from `merged` rather than
# typed in by hand, so the figure stays correct when the notebook is re-run.
confident_win_actuals = merged[(merged['prediction_x'] == 1) & (merged['prediction_y'] == 0)]['actual_x']
# 'actual_x' is 0/1, so its mean is wins / (wins + non-wins).
print(f'{confident_win_actuals.mean() * 100:.2f}%')

63.04%
