In [1]:
import pandas as pd
import numpy as np

In [2]:
# Disable truncation when DataFrames are displayed: a limit of None means
# every column and every row is rendered.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [3]:
## Features to be used

# - Fatigue
#   Days since last match
#   Distance covered in the last x matches?

# - Home Team Form
#   Goals difference of home team in the last x matches    
#   Goals difference of home team in the last x home matches    
#   Average number of points gained by home team in the last x matches
#   Number of home matches won by home team in its last x home matches
#   Home Team Win streak  
#   Home Team Newly Promoted Team?

# - Away Team Form
#   Goals difference of away team in the last x matches  
#   Goals difference of away team in the last x away matches
#   Average number of points gained by away team in the last x matches
#   Number of away matches won by away team in its last x away matches
#   Away Team Win streak
#   Away Team Newly Promoted Team?

# - Home Team Performance Index
#   Home Defense Performance Index
#   Home Midfield Performance Index
#   Home Attack Performance Index

# - Away Team Performance Index
#   Away Defense Performance Index
#   Away Midfield Performance Index
#   Away Attack Performance Index

# - Betting Odds
#     B365H
#     B365D
#     B365A

# - Head to Head
#   Goals difference in previous x encounters

## Data Preparation

There are two main data sources for this project: www.football-data.co.uk/ and www.fbref.com/en/. 

Both data sources provide match statistics for Premier League matches. We will extract the match statistics from both websites and integrate them into one DataFrame.

First we create an empty DataFrame that will store the integrated data from the various data sources.

In [4]:
# Empty DataFrame intended to hold the integrated data from the data sources below.
df = pd.DataFrame()

### Data Source 1

www.football-data.co.uk

In [5]:
# Load the per-season match statistics of data source 1 and stack them into
# one DataFrame with a fresh 0..n-1 index.
seasons = ['2017-2018', '2018-2019', '2019-2020', '2020-2021', '2021-2022']

# Columns removed by name; errors='ignore' below tolerates season files that
# lack some of them.
irrelevant_columns = ['Div', 'HTHG', 'HTAG', 'HTR', 'Referee', 'Time']

# Collect the cleaned season frames and concatenate once at the end, instead
# of re-copying the accumulated frame on every iteration.
season_frames = []
for season in seasons:
    # read csv file for match statistics
    temp_df = pd.read_csv(f'datasets/{season}/data-source-1/{season}-match-statistics.csv')

    # drop irrelevant columns: everything from 'BWH' through the last column,
    # then the individually named ones
    temp_df = (
        temp_df
        .drop(columns=temp_df.loc[:, 'BWH':].columns)
        .drop(columns=irrelevant_columns, errors='ignore')
    )
    season_frames.append(temp_df)

# pd.concat raises on an empty list, so fall back to an empty frame.
datasource1_df = pd.concat(season_frames, ignore_index=True) if season_frames else pd.DataFrame()

In [6]:
# Make sure we have 5 x 380 = 1900 matches in the DataFrame.
# The check is an assertion so a short or duplicated season fails loudly on
# Run All instead of depending on someone reading the displayed shape.
expected_matches = 380 * len(seasons)
assert len(datasource1_df) == expected_matches, (
    f'expected {expected_matches} matches, found {len(datasource1_df)}'
)
datasource1_df.shape

(1900, 21)

In [7]:
# standardize the teams names across all datasets

rename_teams = {'Arsenal': 'arsenal', 'Brighton': 'brighton', 'Chelsea': 'chelsea', 'Crystal Palace': 'palace', 'Everton': 'everton', 
                'Southampton': 'southampton', 'Watford': 'watford', 'West Brom': 'west-brom', 'Man United': 'united', 'Newcastle': 'newcastle',
                'Bournemouth': 'bournemouth', 'Burnley': 'burnley', 'Leicester': 'leicester', 'Liverpool': 'liverpool', 'Stoke': 'stoke',
                'Swansea': 'swansea', 'Huddersfield': 'huddersfield', 'Tottenham': 'tottenham', 'Man City': 'city', 'West Ham': 'west-ham',
                'Fulham': 'fulham', 'Wolves': 'wolves', 'Cardiff': 'cardiff', 'Aston Villa': 'aston-villa', 'Norwich': 'norwich',
                'Sheffield United': 'sheffield', 'Leeds': 'leeds', 'Brentford': 'brentford'}


def _standardize_team_names(names, mapping):
    """Return `names` with each team name replaced by its standardized form.

    Names that are already standardized (present among the values of
    `mapping`) pass through unchanged, so re-running the cell on an already
    converted column is a no-op instead of a KeyError.

    Args:
        names: pandas Series of team names.
        mapping: dict from source-specific team name to standardized name.

    Returns:
        A new Series with the same index holding the standardized names.

    Raises:
        KeyError: if any entry (including a missing value) is neither a key
            nor a value of `mapping`.
    """
    already_standard = set(mapping.values())
    unknown = {name for name in names.unique()
               if name not in mapping and name not in already_standard}
    if unknown:
        # str() so that a stray NaN can be sorted alongside strings
        raise KeyError(f'unknown team name(s): {sorted(map(str, unknown))}')
    return names.map(lambda name: mapping.get(name, name))


datasource1_df['HomeTeam'] = _standardize_team_names(datasource1_df['HomeTeam'], rename_teams)
datasource1_df['AwayTeam'] = _standardize_team_names(datasource1_df['AwayTeam'], rename_teams)
datasource1_df.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A
0,11/08/2017,arsenal,leicester,4,3,H,27,6,10,3,9,12,9,4,0,1,0,0,1.53,4.5,6.5
1,12/08/2017,brighton,city,0,2,A,6,14,2,4,6,9,3,10,0,2,0,0,11.0,5.5,1.33
2,12/08/2017,chelsea,burnley,2,3,A,19,10,6,5,16,11,8,5,3,3,2,0,1.25,6.5,15.0
3,12/08/2017,palace,huddersfield,0,3,A,14,8,4,6,7,19,12,9,1,3,0,0,1.83,3.6,5.0
4,12/08/2017,everton,stoke,1,0,H,9,9,4,1,13,10,6,7,1,1,0,0,1.7,3.8,5.75


### Data Source 2
www.fbref.com/en

In [8]:
def load_home_fixtures(csv_path, home_team):
    """Load one team's fixture list and return its Premier League home matches.

    Steps, in order:
      1. Read the CSV and drop the columns not used as features.
      2. Parse 'Date' and add 'DaysLastPlayed', the gap between each row's
         date and the previous row's date. This is computed before filtering,
         so the previous row may belong to any competition or venue.
         NOTE(review): assumes the CSV rows are in chronological order - confirm.
      3. Keep only rows where Comp is 'Premier League' and Venue is 'Home'.
      4. Rename the team-relative columns to home/away names and add
         'HomeTeam' and 'APoss' (100 minus the home possession).

    Args:
        csv_path: path to the team's fixtures CSV file.
        home_team: value written to the 'HomeTeam' column of every row.

    Returns:
        A DataFrame with a fresh 0..n-1 index, one row per Premier League
        home match.

    Raises:
        KeyError: if one of the expected columns is missing from the CSV.
    """
    fixtures = pd.read_csv(csv_path)

    # drop irrelevant columns
    fixtures = fixtures.drop(
        columns=['Time', 'Day', 'Attendance', 'Captain', 'Formation', 'Referee', 'Match Report', 'Notes']
    )

    # add new feature: 'DaysLastPlayed'
    fixtures['Date'] = pd.to_datetime(fixtures['Date'])
    fixtures['DaysLastPlayed'] = fixtures['Date'] - fixtures['Date'].shift(1)

    # filter by Premier League home matches only
    is_league_home_match = (fixtures['Comp'] == 'Premier League') & (fixtures['Venue'] == 'Home')
    home_fixtures = fixtures[is_league_home_match].reset_index(drop=True)

    # rename features from the team's point of view to home/away names
    home_fixtures = home_fixtures.rename(
        columns={'xG': 'HxG', 'xGA': 'AxG', 'Poss': 'HPoss', 'Opponent': 'AwayTeam'}
    )

    # add new feature: 'HomeTeam'
    home_fixtures['HomeTeam'] = home_team

    # drop the venue column: every remaining row is a home match
    home_fixtures = home_fixtures.drop(columns=['Venue'])

    # add new feature: 'APoss'
    home_fixtures['APoss'] = 100 - home_fixtures['HPoss']

    return home_fixtures


# read csv file for each team
arsenal_fixtures_17_18_df = load_home_fixtures('datasets/2017-2018/data-source-2/arsenal-fixtures.csv', 'Arsenal')

arsenal_fixtures_17_18_df.head()

Unnamed: 0,Date,Comp,Round,Result,GF,GA,AwayTeam,HxG,AxG,HPoss,DaysLastPlayed,HomeTeam,APoss
0,2017-08-11,Premier League,Matchweek 1,W,4,3,Leicester City,2.5,1.5,68.0,5 days,Arsenal,32.0
1,2017-09-09,Premier League,Matchweek 4,W,3,0,Bournemouth,2.2,0.6,58.0,13 days,Arsenal,42.0
2,2017-09-25,Premier League,Matchweek 6,W,2,0,West Brom,2.2,0.9,69.0,5 days,Arsenal,31.0
3,2017-10-01,Premier League,Matchweek 7,W,2,0,Brighton,2.4,0.4,64.0,3 days,Arsenal,36.0
4,2017-10-28,Premier League,Matchweek 10,W,2,1,Swansea City,2.0,0.9,72.0,4 days,Arsenal,28.0


In [None]:
# rename teams