Author: Roel Faber

Goal of this script: take the raw matchdata from Eredivisiedata.ipynb and use it to create useful features, such as match history, standings, form, etc.

In [34]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
sns.set()

In [2]:
df = pd.read_csv('Data/matches.csv',index_col=0)
df.head()

Unnamed: 0,Season,Round,Home,Away,HomeGoals,AwayGoals,Result,HomePoints,AwayPoints
0,1959-1960,1,ADO Den Haag,Willem II,3,3,3,1,1
1,1959-1960,1,AFC Ajax,NAC Breda,3,0,1,3,0
2,1959-1960,1,Blauw Wit,Sittardia,5,2,1,3,0
3,1959-1960,1,USV Elinkwijk,MVV Maastricht,2,0,1,3,0
4,1959-1960,1,Feijenoord,Sparta Rotterdam,0,1,2,0,3


# Compute columns

In [3]:
df['GoalsScored'] = df['HomeGoals'] + df['AwayGoals']

In [4]:
df['HomeAdvantage'] = df['HomeGoals'] - df['AwayGoals']

# Get more informative features

Features to include:

1) Standings in previous season(s)

2) Recent results against the opponent

3) Recent form (results of previous matches)

## Standings in seasons

In [108]:
standings_df = pd.DataFrame(columns=df['Season'].unique(),index=df['Home'].unique())
points_df = pd.DataFrame(columns=df['Season'].unique(),index=df['Home'].unique())

def get_standings(seasondf,seasonname):
    """Get the final results table for a season
    
    Parameters
    -------------
    seasondf, pd.DataFrame
        Dataframe containing the matches in a season
        
    seasonname, str
        string of season, should match value in seasondf"""
    teamlist = seasondf['Home'].unique()
    standings = pd.DataFrame(index=teamlist,columns=range(1,35))
    for index, row in seasondf.iterrows():
        standings.loc[row['Home'],row['Round']] = row['HomePoints']
        standings.loc[row['Away'],row['Round']] = row['AwayPoints']
    standings['Total'] = standings.sum(axis=1)
    standings['Total'].rank(ascending=False).sort_values()
    standings['Final'] = standings['Total'].rank(ascending=False).sort_values()
    for index,row in standings.iterrows():
        standings_df.at[index,seasonname] = row['Final']
        points_df.at[index,seasonname] = row['Total']

In [109]:
%%time
for season in df['Season'].unique():
    seasondf = df[df['Season']==season]
    get_standings(seasondf,season)

Wall time: 8.97 s


In [112]:
standings_df.to_csv('Data/final_standings.csv')
points_df.to_csv('Data/final_points.csv')

## Results from last 5 matches

In [21]:
df.head()

Unnamed: 0,Season,Round,Home,Away,HomeGoals,AwayGoals,Result,HomePoints,AwayPoints,GoalsScored,HomeAdvantage
0,1959-1960,1,ADO Den Haag,Willem II,3,3,3,1,1,6,0
1,1959-1960,1,AFC Ajax,NAC Breda,3,0,1,3,0,3,3
2,1959-1960,1,Blauw Wit,Sittardia,5,2,1,3,0,7,3
3,1959-1960,1,USV Elinkwijk,MVV Maastricht,2,0,1,3,0,2,2
4,1959-1960,1,Feijenoord,Sparta Rotterdam,0,1,2,0,3,1,-1


In [99]:
def get_form(df, team, window, outputcol):
    teamdf = df.loc[(df.Home==team) | (df.Away==team)]
    pointslist = []
    for index, row in df.iterrows():
        if row.Home==team:
            pointslist.append(row.HomePoints)
        elif row.Away==team:
            pointslist.append(row.AwayPoints)
    teamdf.loc[:,'TeamPoints'] = pointslist
    teamdf.loc[:,outputcol] = teamdf.TeamPoints.rolling(window=window).sum()
    df.loc[teamdf[teamdf.Home==team].index,'Home_'+outputcol] = teamdf.loc[:,outputcol]
    df.loc[teamdf[teamdf.Away==team].index,'Away_'+outputcol] = teamdf.loc[:,outputcol]

In [100]:
get_form(df, 'AFC Ajax', 5, 'RecentForm')

In [93]:
teamlist = df.Home.unique()

In [102]:
%%time
for team in teamlist:
    get_form(df, team, 5, 'RecentForm')

Wall time: 1min 38s


In [115]:
df = df.drop('RecentForm',axis=1)

In [116]:
df.to_csv('Data/matcheswithform.csv')