# Preparing the data for training

Imports.

In [101]:
import os

import pandas as pd

Paths to the data.

In [102]:
# Root folder that holds all project data, relative to this notebook.
data_path = '../data'
# Folder with the cleaned input CSVs, and folder the prepared CSVs are written to.
clean_data_path, prepared_data_path = (
    f'{data_path}/csv/clean',
    f'{data_path}/csv/prepared',
)

Open the cleaned data.

In [103]:
# Read the cleaned Bundesliga matches into a dataframe.
clean_matches_file = f'{clean_data_path}/bundesliga_matches.csv'
matches_df = pd.read_csv(clean_matches_file)

Check the column types to see which columns are numeric and which are not.

In [104]:
# Display the dtype that pandas assigned to each column when reading the CSV.
matches_df.dtypes

date                       object
time                       object
comp                       object
round                      object
day                        object
                           ...   
misc_aerial_duels_won     float64
misc_aerial_duels_lost    float64
misc_aerial_duels_won%    float64
pgf                       float64
pga                       float64
Length: 159, dtype: object

Select all numeric columns and create the names of the new columns that will contain their rolling averages.

In [105]:
# Names of every numeric column currently in the matches dataframe.
cols = (
    matches_df
    .select_dtypes(include='number')
    .columns
)
# One '<column>_rolling_avg' name per numeric column, in the same order as `cols`.
new_cols = [f'{numeric_col}_rolling_avg' for numeric_col in cols]

Convert `date` column to datetime.

In [106]:
# Parse the `date` column into datetimes so it can be sorted and queried with `.dt`.
matches_df = matches_df.assign(date=pd.to_datetime(matches_df['date']))
# Display the resulting dtype to confirm the conversion.
matches_df['date'].dtype

dtype('<M8[ns]')

Encode the `opponent` column as integer category codes in a new `opponent_code` column.

In [107]:
# Treat the opponent names as a categorical and store each category's integer code.
opponent_as_category = matches_df['opponent'].astype('category')
matches_df['opponent_code'] = opponent_as_category.cat.codes
# Show each distinct opponent next to the code it was assigned.
matches_df.loc[:, ['opponent', 'opponent_code']].drop_duplicates()

Unnamed: 0,opponent,opponent_code
0,Arminia,0
1,Hertha BSC,12
2,Hoffenheim,13
3,Koln,14
4,Bayern Munich,3
5,Werder Bremen,23
6,Stuttgart,21
7,RB Leipzig,19
8,Union Berlin,22
9,Dortmund,5


Create a function to compute rolling averages. We will split the matches dataframe by team, because we want the rolling averages for each team separately: how did this team perform in its past few games?

In [108]:
def rolling_averages(data, cols, new_cols, window=3):
    """
    Compute rolling averages over the preceding rows for the specified columns.

    The rows are sorted by `date`, and for every row the average of each column is
    taken over the `window` rows that come before it; the row itself is excluded.
    Rows that end up with a NaN in any of the new columns (at least the first
    `window` rows) are dropped. The input dataframe is not modified.

    :param data: the dataframe to use; it must contain a `date` column and every column in `cols`.
    :param cols: the columns to compute rolling averages for.
    :param new_cols: the names of the new columns that will contain the rolling averages,
        one per entry of `cols` and in the same order.
    :param window: the number of preceding rows to average over (defaults to 3).
    :return: a new dataframe, sorted by date, with the new columns added.
    :raises ValueError: if `cols` and `new_cols` do not have the same length.
    """
    cols = list(cols)
    new_cols = list(new_cols)
    # Fail early with a clear message instead of pandas' generic length-mismatch error.
    if len(cols) != len(new_cols):
        raise ValueError(
            f'cols and new_cols must have the same length, got {len(cols)} and {len(new_cols)}'
        )

    # Sort by date because we want to look at the most recent previous matches.
    data = data.sort_values('date')

    # Compute rolling averages for the specified columns.
    # The closed parameter is set to 'left' so that the current match is not included in the average.
    rolling_stats = data[cols].rolling(window, closed='left').mean()

    # Add the rolling averages to the dataframe; concat aligns the two frames on the index.
    rolling_stats.columns = new_cols
    data = pd.concat([data, rolling_stats], axis=1)

    # The first `window` matches have no complete history and therefore NaN averages, so we drop them.
    data = data.dropna(subset=new_cols)

    return data

Create new predictor columns.

In [109]:
# Add the extra predictor columns in one step:
#   venue_code - integer category code of the venue,
#   hour       - the part of the kick-off time before the first ':' as an integer,
#   day_code   - day of the week taken from the match date,
#   target     - 1 when the result is 'W', otherwise 0.
matches_df = matches_df.assign(
    venue_code=matches_df['venue'].astype('category').cat.codes,
    hour=matches_df['time'].str.replace(':.+', '', regex=True).astype('int'),
    day_code=matches_df['date'].dt.dayofweek,
    target=(matches_df['result'] == 'W').astype('int'),
)

In [110]:
# Replace every NaN with 0 so that rolling_averages does not drop
# large numbers of rows because of missing statistics.
matches_df = matches_df.fillna(0)

# Group the matches by team so the rolling averages are computed per team.
grouped_teams_df = matches_df.groupby('team')

rolling_matches_df = (
    grouped_teams_df
    # Run rolling_averages on each team's matches.
    .apply(lambda team_matches: rolling_averages(team_matches, cols, new_cols))
    # The team level added to the index by the groupby is not needed.
    .droplevel(0)
    # Renumber the rows 0..n-1, because the remaining index might contain duplicates.
    .reset_index(drop=True)
)
rolling_matches_df

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,misc_performance_off_rolling_avg,misc_performance_pkwon_rolling_avg,misc_performance_pkcon_rolling_avg,misc_performance_og_rolling_avg,misc_performance_recov_rolling_avg,misc_aerial_duels_won_rolling_avg,misc_aerial_duels_lost_rolling_avg,misc_aerial_duels_won%_rolling_avg,pgf_rolling_avg,pga_rolling_avg
0,2020-10-17,18:30,Bundesliga,Matchweek 4,Sat,Home,L,1,4,Bayern Munich,...,1.333333,0.000000,0.000000,0.000000,51.333333,21.333333,18.666667,52.866667,0.0,0.0
1,2020-10-25,15:30,Bundesliga,Matchweek 5,Sun,Away,L,1,2,Wolfsburg,...,1.666667,0.000000,0.000000,0.000000,51.000000,18.666667,15.333333,53.900000,0.0,0.0
2,2020-10-31,15:30,Bundesliga,Matchweek 6,Sat,Home,L,0,2,Dortmund,...,1.666667,0.000000,0.000000,0.000000,54.000000,17.333333,18.333333,48.800000,0.0,0.0
3,2020-11-07,15:30,Bundesliga,Matchweek 7,Sat,Away,L,0,5,Union Berlin,...,1.333333,0.000000,0.000000,0.000000,54.666667,14.000000,16.000000,47.933333,0.0,0.0
4,2020-11-21,15:30,Bundesliga,Matchweek 8,Sat,Home,L,1,2,Bayer Leverkusen,...,1.000000,0.000000,0.333333,0.000000,49.000000,15.333333,20.333333,43.100000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3594,2023-04-30,17:30,Bundesliga,Matchweek 30,Sun,Home,W,3,0,Mainz 05,...,1.000000,0.333333,0.000000,0.000000,53.666667,9.666667,14.666667,41.400000,0.0,0.0
3595,2023-05-07,17:30,Bundesliga,Matchweek 31,Sun,Away,L,0,6,Dortmund,...,1.000000,0.333333,0.000000,0.000000,57.333333,11.333333,17.000000,40.366667,0.0,0.0
3596,2023-05-13,15:30,Bundesliga,Matchweek 32,Sat,Home,W,2,1,Hoffenheim,...,1.333333,0.333333,0.333333,0.000000,53.666667,11.333333,16.333333,42.433333,0.0,0.0
3597,2023-05-19,20:30,Bundesliga,Matchweek 33,Fri,Away,L,0,2,Freiburg,...,1.333333,0.000000,0.333333,0.333333,53.666667,14.333333,15.333333,47.700000,0.0,0.0


Save the prepared data.

In [111]:
# Make sure the output folder exists: DataFrame.to_csv does not create missing directories.
os.makedirs(prepared_data_path, exist_ok=True)
# Write the prepared matches without the index column.
rolling_matches_df.to_csv(f'{prepared_data_path}/bundesliga_matches.csv', index=False)