Author: Roel Faber

Goal of this notebook: take the raw match data from Eredivisiedata.ipynb and use it to create useful features, such as match history, standings, and recent form.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
sns.set()

In [None]:
# Load the raw match table; the first CSV column is used as the row index.
df = pd.read_csv(filepath_or_buffer='Data/matches.csv', index_col=0)
df.head(n=5)

# Compute columns

In [None]:
# Total number of goals in the match (home + away).
df['GoalsScored'] = df.HomeGoals + df.AwayGoals

In [None]:
# Goal difference from the home side's point of view (positive = home scored more).
df['HomeAdvantage'] = df.HomeGoals - df.AwayGoals

# Get more informative features

Features to include:

1) Standings in previous season(s)

2) Recent results against the opponent

3) Recent form (results of previous matches)

## Standings in seasons

In [None]:
# Two empty team x season tables (final rank and total points); filled in by get_standings.
standings_df, points_df = (
    pd.DataFrame(index=df['Home'].unique(), columns=df['Season'].unique())
    for _ in range(2)
)

def get_standings(seasondf,seasonname):
    """Compute the final table of one season and store it in the global frames.

    Builds a team x round table of points, sums it per team and ranks the
    totals (highest total gets rank 1). Teams with equal totals share the
    average of their ranks (pandas' default ``rank`` method), so fractional
    ranks are possible.

    Nothing is returned: the results are written into the module-level
    ``standings_df`` (final rank) and ``points_df`` (total points), in the
    column named ``seasonname``.

    Parameters
    -------------
    seasondf, pd.DataFrame
        Dataframe containing the matches in a season. Must have the columns
        'Home', 'Away', 'Round', 'HomePoints' and 'AwayPoints'.

    seasonname, str
        string of season, should match value in seasondf; used as the column
        label written to in ``standings_df`` and ``points_df``."""
    teamlist = seasondf['Home'].unique()
    # One row per team, one column per round (1..34); each cell holds the
    # points the team earned in that round.
    standings = pd.DataFrame(index=teamlist, columns=range(1, 35))
    fixtures = zip(
        seasondf['Home'],
        seasondf['Away'],
        seasondf['Round'],
        seasondf['HomePoints'],
        seasondf['AwayPoints'],
    )
    for home, away, match_round, home_points, away_points in fixtures:
        standings.loc[home, match_round] = home_points
        standings.loc[away, match_round] = away_points
    # Rounds a team did not play stay NaN and are skipped by sum().
    standings['Total'] = standings.sum(axis=1)
    # Assignment aligns on the team index, so no sorting is needed here.
    standings['Final'] = standings['Total'].rank(ascending=False)
    season_results = zip(standings.index, standings['Total'], standings['Final'])
    for club, total_points, final_rank in season_results:
        standings_df.at[club, seasonname] = final_rank
        points_df.at[club, seasonname] = total_points

In [None]:
%%time
for season in df['Season'].unique():
    # Restrict to this season's matches, then record its final table.
    seasondf = df.loc[df['Season'] == season]
    get_standings(seasondf, seasonname=season)

In [None]:
# Persist both season tables (team index included) next to the match data.
for season_table, csv_path in (
    (standings_df, 'Data/final_standings.csv'),
    (points_df, 'Data/final_points.csv'),
):
    season_table.to_csv(csv_path)

In [None]:
# Reload the season tables. They were written with the team names as index,
# so the first CSV column must be restored as the index; otherwise the teams
# end up in an "Unnamed: 0" data column and the frames get a RangeIndex.
standings_df = pd.read_csv('Data/final_standings.csv', index_col=0)
points_df = pd.read_csv('Data/final_points.csv', index_col=0)

## Add information to matches df

In [None]:
# Placeholder feature columns, initialised to NaN until they are filled in.
for feature_col in (
    'standings_last_season',
    'standings_this_season',
    'standings_form',
    'points_last_season',
    'points_this_season',
    'points_form',
):
    df[feature_col] = np.nan

In [None]:
# Preview the first rows, now including the placeholder feature columns.
df.head(n=5)

## Results from last 5 matches

In [None]:
# Preview the match table before the form columns are added.
df.head(n=5)

In [None]:
def get_form(df, team, window, outputcol, include_current=True):
    """Add a rolling points total ("form") for one team to the match table.

    For every row of ``df`` in which ``team`` plays, the points the team
    earned are summed over ``window`` consecutive matches of that team, taken
    in the row order of ``df``. The result is written in place to
    ``df['Home_' + outputcol]`` on the team's home rows and to
    ``df['Away_' + outputcol]`` on its away rows. Rows of other teams are not
    modified. The first ``window - 1`` matches of the team (``window`` matches
    when ``include_current`` is False) get NaN because the window is not full.

    Parameters
    -------------
    df, pd.DataFrame
        Match table with the columns 'Home', 'Away', 'HomePoints' and
        'AwayPoints'. Modified in place.
        NOTE(review): the rolling window follows row order, so the rows are
        presumably expected to be sorted chronologically - confirm.

    team, str
        Team name as it appears in the 'Home' / 'Away' columns.

    window, int
        Number of matches in the rolling window.

    outputcol, str
        Suffix of the two output columns ('Home_<outputcol>', 'Away_<outputcol>').

    include_current, bool, default True
        If True (the original behaviour) the window ends at, and includes, the
        match on the row itself. If False the window covers only the
        ``window`` matches *before* the row, which avoids leaking the result of
        the match into its own feature.

    Returns
    -------------
    None
    """
    home_col = 'Home_' + outputcol
    away_col = 'Away_' + outputcol
    # Make sure both output columns exist, even if the team never plays.
    for col in (home_col, away_col):
        if col not in df.columns:
            df[col] = float('nan')

    # Positional boolean masks, so the logic does not depend on index labels.
    is_home = (df['Home'] == team).to_numpy()
    is_away = (df['Away'] == team).to_numpy()
    played = is_home | is_away
    if not played.any():
        return

    # Points earned by `team` in each of its matches: the home points on home
    # rows, the away points otherwise.
    team_points = df['HomePoints'].where(is_home, df['AwayPoints'])[played]
    if not include_current:
        # Shift by one match so the window ends at the previous match.
        team_points = team_points.shift(1)
    form = team_points.rolling(window=window).sum()

    # Write the values back positionally; `form` only covers the played rows.
    df.loc[is_home, home_col] = form[is_home[played]].to_numpy()
    df.loc[is_away, away_col] = form[is_away[played]].to_numpy()

In [None]:
# Try the form feature on a single team first (5-match window).
get_form(df, team='AFC Ajax', window=5, outputcol='RecentForm')

In [None]:
# get_form_opponent(df, team, opponent, window, outputcol):
#     Option 1
#     ---------------
#     establish recentdf
#     build pointslist
#     add it to df
#     compute the rolling sum
#     assign to all combinations
    
#     Option 2
#     ----------------
#     Iterate over the rows with a function
    
    
    

In [None]:
# Example pairing used while prototyping the head-to-head feature.
team, opponent = 'AFC Ajax', 'PSV'

In [None]:
# All matches between `team` and `opponent`, regardless of who played at home.
# NOTE(review): the original line was an unfinished expression (syntax error);
# this completion follows the head-to-head plan sketched above - confirm intent.
teamdf = df.loc[
    ((df.Home == team) & (df.Away == opponent))
    | ((df.Home == opponent) & (df.Away == team))
]

In [None]:
# Every team that appears as a home side, in order of first appearance.
teamlist = df['Home'].unique()

In [None]:
%%time
for team in teamlist:
    # Fill Home_RecentForm / Away_RecentForm for this team's matches.
    get_form(df, team=team, window=5, outputcol='RecentForm')

In [None]:
# get_form only writes 'Home_RecentForm' / 'Away_RecentForm', so a plain
# 'RecentForm' column may not exist; errors='ignore' keeps this cell from
# raising KeyError on a fresh run or when it is re-executed.
df = df.drop(columns='RecentForm', errors='ignore')

In [None]:
# Save the match table together with the new form columns.
df.to_csv(path_or_buf='Data/matcheswithform.csv')

In [None]:
# Reload the saved table; the first CSV column is the row index.
df = pd.read_csv(filepath_or_buffer='Data/matcheswithform.csv', index_col=0)

In [None]:
# Display the reloaded match table as the cell output.
df