# Preparing the data for training

Imports.

In [76]:
from pathlib import Path

import pandas as pd

Paths to the data.

In [77]:
# Root of the data directory, given relative to the notebook's working directory.
data_path = '../data'
# Directory the cleaned input CSV is read from.
clean_data_path = f'{data_path}/csv/clean'
# Directory the prepared output CSV is written to.
prepared_data_path = f'{data_path}/csv/prepared'

Open the cleaned data.

In [78]:
# Read the cleaned matches CSV into a dataframe with pandas' default parsing;
# no dtypes are specified here, so `date` is converted explicitly further down.
matches_df = pd.read_csv(f'{clean_data_path}/bundesliga_matches.csv')

Check types of columns to make sure they are numeric.

In [79]:
# Show the dtype pandas inferred for every column (last expression, so it is rendered as the cell output).
matches_df.dtypes

date                       object
time                       object
comp                       object
round                      object
day                        object
                           ...   
misc_aerial_duels_won     float64
misc_aerial_duels_lost    float64
misc_aerial_duels_won%    float64
pgf                       float64
pga                       float64
Length: 159, dtype: object

Convert `date` column to datetime.

In [80]:
# Parse the `date` values into pandas datetimes, then echo the column dtype to confirm the conversion.
matches_df['date'] = matches_df['date'].pipe(pd.to_datetime)
matches_df['date'].dtype

dtype('<M8[ns]')

Encode the `opponent` column as integer category codes in a new `opponent_code` column.

In [81]:
# Give every distinct opponent an integer category code and store it in `opponent_code`.
matches_df = matches_df.assign(
    opponent_code=matches_df['opponent'].astype('category').cat.codes
)
# List each opponent/code pair once to inspect the mapping.
matches_df.loc[:, ['opponent', 'opponent_code']].drop_duplicates()

Unnamed: 0,opponent,opponent_code
0,Arminia,0
1,Hertha BSC,12
2,Hoffenheim,13
3,Koln,14
4,Bayern Munich,3
5,Werder Bremen,23
6,Stuttgart,21
7,RB Leipzig,19
8,Union Berlin,22
9,Dortmund,5


Create a function to compute rolling averages. We split the matches dataframe by team because the rolling averages must be computed per team: how did this team perform in its previous few games?

In [82]:
def rolling_averages(data, cols, new_cols, window=3):
    """
    Compute rolling averages over the preceding matches for the specified columns.

    For every row, the average is taken over the ``window`` rows that come
    immediately before it in date order. The row itself is excluded, so each
    value only reflects earlier matches.

    :param data: the dataframe to use. It must contain a ``date`` column and the columns in ``cols``.
        The dataframe passed in is not modified.
    :param cols: the columns to compute rolling averages for.
    :param new_cols: the names of the new columns that will contain the rolling averages,
        in the same order as ``cols``.
    :param window: the number of preceding rows to average over. Defaults to 3.
    :return: a new dataframe, sorted by date, with the new columns added and every row
        that has no complete rolling average removed.
    """
    # Sort by date so the window covers the matches played just before each row.
    # sort_values returns a new dataframe, so the caller's dataframe stays untouched.
    data = data.sort_values('date')

    # Compute rolling averages for the specified columns.
    # closed='left' excludes the current row from its own window, so the average
    # never includes the match it is attached to.
    rolling_stats = data[cols].rolling(window, closed='left').mean()

    # Add the rolling averages to the dataframe.
    data[new_cols] = rolling_stats

    # The first `window` rows (three with the default) do not have a full window of
    # earlier rows, so their rolling averages are NaN and they are dropped. Rows whose
    # window contains a missing value in one of `cols` are NaN as well and are dropped too.
    data = data.dropna(subset=new_cols)
    return data

Create new predictor columns.

In [83]:
# Derive the predictor columns and the label in a single step:
#   venue_code - integer category code of `venue`
#   hour       - the part of `time` before the first colon, as an integer
#   day_code   - day of the week of `date` as an integer (Monday=0 ... Sunday=6)
#   target     - 1 where `result` is 'W', otherwise 0
matches_df = matches_df.assign(
    venue_code=matches_df['venue'].astype('category').cat.codes,
    hour=matches_df['time'].str.replace(':.+', '', regex=True).astype('int'),
    day_code=matches_df['date'].dt.dayofweek,
    target=(matches_df['result'] == 'W').astype('int'),
)

In [84]:
# Split the matches by team, because the rolling averages must be computed per team.
grouped_teams_df = matches_df.groupby('team')
# TODO: Do this for all columns.
cols = [
    'gf',
    'ga',
    'shooting_standard_sh',
    'shooting_standard_sot',
    'shooting_standard_dist',
    'shooting_standard_fk',
    'shooting_standard_pk',
    'shooting_standard_pkatt'
]
new_cols = [f'{col}_rolling_avg' for col in cols]

# Run rolling_averages on each team's matches and stack the results into one dataframe.
# The groups are iterated explicitly instead of using groupby.apply: apply's handling of
# the grouping column and of the group-key index level differs between pandas versions
# (operating on the grouping column is deprecated since pandas 2.2), which could drop the
# `team` column or break a later droplevel call. Iteration yields the groups in sorted
# team order, the same order apply would produce.
# ignore_index=True replaces the per-team indices with a fresh 0..n-1 index.
# NOTE: this cell overwrites matches_df, so re-running it without re-running the cells
# above drops further rows per team.
matches_df = pd.concat(
    [rolling_averages(team_df, cols, new_cols) for _, team_df in grouped_teams_df],
    ignore_index=True,
)
matches_df

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling_avg,ga_rolling_avg,shooting_standard_sh_rolling_avg,shooting_standard_sot_rolling_avg,shooting_standard_dist_rolling_avg,shooting_standard_fk_rolling_avg,shooting_standard_pk_rolling_avg,shooting_standard_pkatt_rolling_avg
0,2020-10-17,18:30,Bundesliga,Matchweek 4,Sat,Home,L,1,4,Bayern Munich,...,5,0,0.666667,0.666667,9.333333,3.333333,22.233333,0.000000,0.0,0.000000
1,2020-10-25,15:30,Bundesliga,Matchweek 5,Sun,Away,L,1,2,Wolfsburg,...,6,0,0.666667,1.666667,9.000000,3.666667,21.400000,0.333333,0.0,0.000000
2,2020-10-31,15:30,Bundesliga,Matchweek 6,Sat,Home,L,0,2,Dortmund,...,5,0,0.666667,2.333333,9.666667,3.333333,20.566667,0.666667,0.0,0.000000
3,2020-11-07,15:30,Bundesliga,Matchweek 7,Sat,Away,L,0,5,Union Berlin,...,5,0,0.666667,2.666667,8.333333,2.666667,21.166667,0.666667,0.0,0.000000
4,2020-11-21,15:30,Bundesliga,Matchweek 8,Sat,Home,L,1,2,Bayer Leverkusen,...,5,0,0.333333,3.000000,6.666667,1.333333,21.266667,0.666667,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3591,2023-04-30,17:30,Bundesliga,Matchweek 30,Sun,Home,W,3,0,Mainz 05,...,6,1,1.666667,1.000000,11.666667,4.000000,17.266667,0.000000,0.0,0.333333
3592,2023-05-07,17:30,Bundesliga,Matchweek 31,Sun,Away,L,0,6,Dortmund,...,6,0,2.666667,0.333333,11.666667,4.666667,15.500000,0.000000,0.0,0.333333
3593,2023-05-13,15:30,Bundesliga,Matchweek 32,Sat,Home,W,2,1,Hoffenheim,...,5,1,2.666667,2.333333,11.333333,4.666667,15.200000,0.000000,0.0,0.333333
3594,2023-05-19,20:30,Bundesliga,Matchweek 33,Fri,Away,L,0,2,Freiburg,...,4,0,1.666667,2.333333,9.666667,3.333333,17.033333,0.000000,0.0,0.000000


Save the prepared data.

In [85]:
# Create the output directory if it is missing: to_csv does not create parent
# directories, so the save would otherwise fail on a fresh checkout.
Path(prepared_data_path).mkdir(parents=True, exist_ok=True)
# index=False keeps the row index out of the written file.
matches_df.to_csv(f'{prepared_data_path}/bundesliga_matches.csv', index=False)