In [3]:
import pandas as pd
import numpy as np

In [64]:
# Disable pandas' display truncation: render every column and every row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [40]:
## Features to be used

# - Fatigue
#   Days since last match
#   Distance covered in the last x matches?

# - Home Team Form
#   Goal difference of home team in the last x matches
#   Goal difference of home team in the last x home matches
#   Average number of points gained by home team in the last x matches
#   Number of home matches won by home team in its last x home matches
#   Home Team Win streak  
#   Home Team Newly Promoted Team?

# - Away Team Form
#   Goal difference of away team in the last x matches
#   Goal difference of away team in the last x away matches
#   Average number of points gained by away team in the last x matches
#   Number of away matches won by away team in its last x away matches
#   Away Team Win streak
#   Away Team Newly Promoted Team?

# - Home Team Performance Index
#   Home Defense Performance Index
#   Home Midfield Performance Index
#   Home Attack Performance Index

# - Away Team Performance Index
#   Away Defense Performance Index
#   Away Midfield Performance Index
#   Away Attack Performance Index

# - Betting Odds
#     B365H
#     B365D
#     B365A

# - Head to Head
#   Goal difference in previous x encounters

## Data Preparation

In [36]:
# Start from an empty DataFrame (no rows, no columns).
df = pd.DataFrame(data=None)

In [38]:
# Load the 2017-2018 match statistics CSV.
match_statistics_17_18_df = pd.read_csv('datasets/2017-2018/2017-2018-match-statistics.csv')

# Drop irrelevant columns: every column from 'BWH' through the last one,
# then the division, half-time and referee fields.
match_statistics_17_18_df = (
    match_statistics_17_18_df
    .drop(columns=match_statistics_17_18_df.loc[:, 'BWH':].columns)
    .drop(columns=['Div', 'HTHG', 'HTAG', 'HTR', 'Referee'])
)

match_statistics_17_18_df.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A
0,11/08/2017,Arsenal,Leicester,4,3,H,27,6,10,3,9,12,9,4,0,1,0,0,1.53,4.5,6.5
1,12/08/2017,Brighton,Man City,0,2,A,6,14,2,4,6,9,3,10,0,2,0,0,11.0,5.5,1.33
2,12/08/2017,Chelsea,Burnley,2,3,A,19,10,6,5,16,11,8,5,3,3,2,0,1.25,6.5,15.0
3,12/08/2017,Crystal Palace,Huddersfield,0,3,A,14,8,4,6,7,19,12,9,1,3,0,0,1.83,3.6,5.0
4,12/08/2017,Everton,Stoke,1,0,H,9,9,4,1,13,10,6,7,1,1,0,0,1.7,3.8,5.75


In [86]:
# Load the 2017-2018 fixtures CSV for a single team (Arsenal).
arsenal_fixtures_17_18_df = pd.read_csv(
    filepath_or_buffer='datasets/2017-2018/arsenal-fixtures.csv'
)
arsenal_fixtures_17_18_df.head(n=5)

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes
0,2017-08-06,14:00 (21:00),Community Shield,FA Community Shield,Sun,Neutral,D,1 (4),1 (1),Chelsea,,,,83325,Per Mertesacker,3-4-3,Robert Madley,Match Report,
1,2017-08-11,19:45 (02:45),Premier League,Matchweek 1,Fri,Home,W,4,3,Leicester City,2.5,1.5,68.0,59387,Petr Čech,3-4-3,Mike Dean,Match Report,
2,2017-08-19,17:30 (00:30),Premier League,Matchweek 2,Sat,Away,L,0,1,Stoke City,1.5,0.7,76.0,29459,Petr Čech,3-4-3,Andre Marriner,Match Report,
3,2017-08-27,16:00 (23:00),Premier League,Matchweek 3,Sun,Away,L,0,4,Liverpool,0.6,3.1,52.0,53206,Laurent Koscielny,3-4-3,Craig Pawson,Match Report,
4,2017-09-09,15:00 (22:00),Premier League,Matchweek 4,Sat,Home,W,3,0,Bournemouth,2.2,0.6,58.0,59262,Laurent Koscielny,3-4-3,Anthony Taylor,Match Report,


In [87]:
# Drop irrelevant columns from the fixtures frame (mutates it in place).
arsenal_fixtures_17_18_df.drop(
    columns=[
        'Time',
        'Day',
        'Attendance',
        'Captain',
        'Formation',
        'Referee',
        'Match Report',
        'Notes',
    ],
    inplace=True,
)
arsenal_fixtures_17_18_df.head(n=5)

Unnamed: 0,Date,Comp,Round,Venue,Result,GF,GA,Opponent,xG,xGA,Poss
0,2017-08-06,Community Shield,FA Community Shield,Neutral,D,1 (4),1 (1),Chelsea,,,
1,2017-08-11,Premier League,Matchweek 1,Home,W,4,3,Leicester City,2.5,1.5,68.0
2,2017-08-19,Premier League,Matchweek 2,Away,L,0,1,Stoke City,1.5,0.7,76.0
3,2017-08-27,Premier League,Matchweek 3,Away,L,0,4,Liverpool,0.6,3.1,52.0
4,2017-09-09,Premier League,Matchweek 4,Home,W,3,0,Bournemouth,2.2,0.6,58.0


In [88]:
# Add new feature 'DaysLastPlayed': the time elapsed between each row's date
# and the date on the row directly above it (the first row has no
# predecessor, so it gets NaT).

# Parse 'Date' into datetimes so the row-to-row difference is a timedelta.
arsenal_fixtures_17_18_df['Date'] = pd.to_datetime(arsenal_fixtures_17_18_df['Date'])
arsenal_fixtures_17_18_df['DaysLastPlayed'] = arsenal_fixtures_17_18_df['Date'].diff()
arsenal_fixtures_17_18_df.head()

Unnamed: 0,Date,Comp,Round,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,DaysLastPlayed
0,2017-08-06,Community Shield,FA Community Shield,Neutral,D,1 (4),1 (1),Chelsea,,,,NaT
1,2017-08-11,Premier League,Matchweek 1,Home,W,4,3,Leicester City,2.5,1.5,68.0,5 days
2,2017-08-19,Premier League,Matchweek 2,Away,L,0,1,Stoke City,1.5,0.7,76.0,8 days
3,2017-08-27,Premier League,Matchweek 3,Away,L,0,4,Liverpool,0.6,3.1,52.0,8 days
4,2017-09-09,Premier League,Matchweek 4,Home,W,3,0,Bournemouth,2.2,0.6,58.0,13 days


In [89]:
# Keep only the rows that are both Premier League fixtures and home matches.
arsenal_fixtures_17_18_df = arsenal_fixtures_17_18_df.loc[
    arsenal_fixtures_17_18_df['Comp'].eq('Premier League')
    & arsenal_fixtures_17_18_df['Venue'].eq('Home')
]
arsenal_fixtures_17_18_df.head(n=5)

Unnamed: 0,Date,Comp,Round,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,DaysLastPlayed
1,2017-08-11,Premier League,Matchweek 1,Home,W,4,3,Leicester City,2.5,1.5,68.0,5 days
4,2017-09-09,Premier League,Matchweek 4,Home,W,3,0,Bournemouth,2.2,0.6,58.0,13 days
8,2017-09-25,Premier League,Matchweek 6,Home,W,2,0,West Brom,2.2,0.9,69.0,5 days
10,2017-10-01,Premier League,Matchweek 7,Home,W,2,0,Brighton,2.4,0.4,64.0,3 days
15,2017-10-28,Premier League,Matchweek 10,Home,W,2,1,Swansea City,2.0,0.9,72.0,4 days


In [90]:
# Rename team-relative columns to home/away names:
#   xG -> HxG, xGA -> AxG, Poss -> HPoss, Opponent -> AwayTeam
arsenal_fixtures_17_18_df = arsenal_fixtures_17_18_df.rename(
    {
        'xG': 'HxG',
        'xGA': 'AxG',
        'Poss': 'HPoss',
        'Opponent': 'AwayTeam',
    },
    axis='columns',
)
arsenal_fixtures_17_18_df.head(n=5)

Unnamed: 0,Date,Comp,Round,Venue,Result,GF,GA,AwayTeam,HxG,AxG,HPoss,DaysLastPlayed
1,2017-08-11,Premier League,Matchweek 1,Home,W,4,3,Leicester City,2.5,1.5,68.0,5 days
4,2017-09-09,Premier League,Matchweek 4,Home,W,3,0,Bournemouth,2.2,0.6,58.0,13 days
8,2017-09-25,Premier League,Matchweek 6,Home,W,2,0,West Brom,2.2,0.9,69.0,5 days
10,2017-10-01,Premier League,Matchweek 7,Home,W,2,0,Brighton,2.4,0.4,64.0,3 days
15,2017-10-28,Premier League,Matchweek 10,Home,W,2,1,Swansea City,2.0,0.9,72.0,4 days


In [92]:
# Add new feature 'HomeTeam': the constant 'Arsenal' on every row.
arsenal_fixtures_17_18_df.loc[:, 'HomeTeam'] = 'Arsenal'
arsenal_fixtures_17_18_df.head(n=5)

Unnamed: 0,Date,Comp,Round,Venue,Result,GF,GA,AwayTeam,HxG,AxG,HPoss,DaysLastPlayed,HomeTeam
1,2017-08-11,Premier League,Matchweek 1,Home,W,4,3,Leicester City,2.5,1.5,68.0,5 days,Arsenal
4,2017-09-09,Premier League,Matchweek 4,Home,W,3,0,Bournemouth,2.2,0.6,58.0,13 days,Arsenal
8,2017-09-25,Premier League,Matchweek 6,Home,W,2,0,West Brom,2.2,0.9,69.0,5 days,Arsenal
10,2017-10-01,Premier League,Matchweek 7,Home,W,2,0,Brighton,2.4,0.4,64.0,3 days,Arsenal
15,2017-10-28,Premier League,Matchweek 10,Home,W,2,1,Swansea City,2.0,0.9,72.0,4 days,Arsenal


In [94]:
# Add new feature 'APoss', computed as 100 minus 'HPoss'.
arsenal_fixtures_17_18_df['APoss'] = arsenal_fixtures_17_18_df['HPoss'].rsub(100)
arsenal_fixtures_17_18_df.head(n=5)

Unnamed: 0,Date,Comp,Round,Venue,Result,GF,GA,AwayTeam,HxG,AxG,HPoss,DaysLastPlayed,HomeTeam,APoss
1,2017-08-11,Premier League,Matchweek 1,Home,W,4,3,Leicester City,2.5,1.5,68.0,5 days,Arsenal,32.0
4,2017-09-09,Premier League,Matchweek 4,Home,W,3,0,Bournemouth,2.2,0.6,58.0,13 days,Arsenal,42.0
8,2017-09-25,Premier League,Matchweek 6,Home,W,2,0,West Brom,2.2,0.9,69.0,5 days,Arsenal,31.0
10,2017-10-01,Premier League,Matchweek 7,Home,W,2,0,Brighton,2.4,0.4,64.0,3 days,Arsenal,36.0
15,2017-10-28,Premier League,Matchweek 10,Home,W,2,1,Swansea City,2.0,0.9,72.0,4 days,Arsenal,28.0
