# 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# 2. Features to be Used

Below are the preliminary features that will be used to build the model. More will be added later.

**Fatigue**
- Days since last match

**Home Team Form**
- Goal difference of home team in the last x matches
- Goal difference of home team in the last x home matches
- Average number of points gained by home team in the last x matches
- Number of home matches won by home team in its last x home matches
- Home Team Win streak  
- Home Team Newly Promoted Team?

**Away Team Form**
- Goal difference of away team in the last x matches
- Goal difference of away team in the last x away matches
- Average number of points gained by away team in the last x matches
- Number of away matches won by away team in its last x away matches
- Away Team Win streak
- Away Team Newly Promoted Team?

**Home Team Performance Index**
- Home Defense Performance Index
- Home Midfield Performance Index
- Home Attack Performance Index

**Away Team Performance Index**
- Away Defense Performance Index
- Away Midfield Performance Index
- Away Attack Performance Index

**Betting Odds**
- B365H
- B365D
- B365A

# 3. Data Preprocessing

There are two main datasets (Dataset 1 and Dataset 2) for each season that will be used to extract the features needed for the model. 

First we create an empty DataFrame. This DataFrame will eventually contain the data integrated from the two datasets.

In [3]:
df = pd.DataFrame()

## 3.1 Dataset 1

Data Source: www.football-data.co.uk

First, we concatenate the Dataset 1 files of all seasons into a single DataFrame. Then we apply some data preprocessing steps to that DataFrame.

In [4]:
# Standardize the team names across all datasets: this mapping translates the
# names found in Dataset 1's 'HomeTeam' / 'AwayTeam' columns into the short
# lowercase names used throughout the notebook.
# NOTE(review): the name `rename_teams` is rebound to different mappings in the
# Dataset 2 and Dataset 3 sections, so this cell must be re-run before the
# Dataset 1 loading cell if those later cells have already been executed.

rename_teams = {'Arsenal': 'arsenal', 'Brighton': 'brighton', 'Chelsea': 'chelsea', 'Crystal Palace': 'palace', 'Everton': 'everton', 
                'Southampton': 'southampton', 'Watford': 'watford', 'West Brom': 'west-brom', 'Man United': 'united', 'Newcastle': 'newcastle',
                'Bournemouth': 'bournemouth', 'Burnley': 'burnley', 'Leicester': 'leicester', 'Liverpool': 'liverpool', 'Stoke': 'stoke',
                'Swansea': 'swansea', 'Huddersfield': 'huddersfield', 'Tottenham': 'tottenham', 'Man City': 'city', 'West Ham': 'west-ham',
                'Fulham': 'fulham', 'Wolves': 'wolves', 'Cardiff': 'cardiff', 'Aston Villa': 'aston-villa', 'Norwich': 'norwich',
                'Sheffield United': 'sheffield', 'Leeds': 'leeds', 'Brentford':'brentford'}

In [5]:
seasons = ['2017-2018', '2018-2019', '2019-2020', '2020-2021', '2021-2022']

# Load each season into its own frame and concatenate once after the loop;
# calling pd.concat inside the loop would re-copy every previously loaded row
# on each iteration.
season_frames = []
for season in seasons:
    # read csv file for match statistics
    temp_df = pd.read_csv(f'datasets/{season}/dataset1.csv')

    # rename team names in the 'HomeTeam' and 'AwayTeam' columns for standardized team names
    # (indexing the dict directly raises KeyError for a team that has no standardized name)
    temp_df['HomeTeam'] = temp_df['HomeTeam'].apply(lambda word: rename_teams[word])
    temp_df['AwayTeam'] = temp_df['AwayTeam'].apply(lambda word: rename_teams[word])

    season_frames.append(temp_df)

# single concatenation with a fresh 0..n-1 index
dataset1_df = pd.concat(season_frames, ignore_index=True)

In [6]:
# Make sure we have 5 seasons x 380 matches = 1900 matches in the DataFrame
dataset1_df.shape

(1900, 21)

In [7]:
dataset1_df.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A
0,11/08/2017,arsenal,leicester,4,3,H,27,6,10,3,9,12,9,4,0,1,0,0,1.53,4.5,6.5
1,12/08/2017,brighton,city,0,2,A,6,14,2,4,6,9,3,10,0,2,0,0,11.0,5.5,1.33
2,12/08/2017,chelsea,burnley,2,3,A,19,10,6,5,16,11,8,5,3,3,2,0,1.25,6.5,15.0
3,12/08/2017,palace,huddersfield,0,3,A,14,8,4,6,7,19,12,9,1,3,0,0,1.83,3.6,5.0
4,12/08/2017,everton,stoke,1,0,H,9,9,4,1,13,10,6,7,1,1,0,0,1.7,3.8,5.75


### 3.1.1 Converting Data Types

In [8]:
dataset1_df.dtypes

Date         object
HomeTeam     object
AwayTeam     object
FTHG          int64
FTAG          int64
FTR          object
HS            int64
AS            int64
HST           int64
AST           int64
HF            int64
AF            int64
HC            int64
AC            int64
HY            int64
AY            int64
HR            int64
AR            int64
B365H       float64
B365D       float64
B365A       float64
dtype: object

In [9]:
# convert 'Date' column to datetime object
dataset1_df['Date'] =  pd.to_datetime(dataset1_df['Date'], format="%d/%m/%Y")
dataset1_df.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A
0,2017-08-11,arsenal,leicester,4,3,H,27,6,10,3,9,12,9,4,0,1,0,0,1.53,4.5,6.5
1,2017-08-12,brighton,city,0,2,A,6,14,2,4,6,9,3,10,0,2,0,0,11.0,5.5,1.33
2,2017-08-12,chelsea,burnley,2,3,A,19,10,6,5,16,11,8,5,3,3,2,0,1.25,6.5,15.0
3,2017-08-12,palace,huddersfield,0,3,A,14,8,4,6,7,19,12,9,1,3,0,0,1.83,3.6,5.0
4,2017-08-12,everton,stoke,1,0,H,9,9,4,1,13,10,6,7,1,1,0,0,1.7,3.8,5.75


### 3.1.2 Checking for Missing Values

In [10]:
dataset1_df.isnull().sum()

Date        0
HomeTeam    0
AwayTeam    0
FTHG        0
FTAG        0
FTR         0
HS          0
AS          0
HST         0
AST         0
HF          0
AF          0
HC          0
AC          0
HY          0
AY          0
HR          0
AR          0
B365H       0
B365D       0
B365A       0
dtype: int64

### 3.1.3 Data Transformation

In [11]:
dataset1_df['FTR'].unique()

array(['H', 'A', 'D'], dtype=object)

In [12]:
def transform_FTR(word):
    """Translate a full-time result code into a W/L/D label.

    'H' becomes 'W', 'A' becomes 'L' and 'D' stays 'D'. Any other value
    falls through and yields None.
    """
    code_to_label = (('H', 'W'), ('A', 'L'), ('D', 'D'))
    for code, label in code_to_label:
        if word == code:
            return label
    return None

dataset1_df['FTR'] = dataset1_df['FTR'].apply(lambda x: transform_FTR(x))
dataset1_df.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A
0,2017-08-11,arsenal,leicester,4,3,W,27,6,10,3,9,12,9,4,0,1,0,0,1.53,4.5,6.5
1,2017-08-12,brighton,city,0,2,L,6,14,2,4,6,9,3,10,0,2,0,0,11.0,5.5,1.33
2,2017-08-12,chelsea,burnley,2,3,L,19,10,6,5,16,11,8,5,3,3,2,0,1.25,6.5,15.0
3,2017-08-12,palace,huddersfield,0,3,L,14,8,4,6,7,19,12,9,1,3,0,0,1.83,3.6,5.0
4,2017-08-12,everton,stoke,1,0,W,9,9,4,1,13,10,6,7,1,1,0,0,1.7,3.8,5.75


## 3.2 Dataset 2

Data Source: www.fbref.com/en

First, we concatenate the Dataset 2 files of all seasons into a single DataFrame. Then we apply some data preprocessing steps to that DataFrame.

In [13]:
# Standardize the team names across all datasets: this mapping translates the
# names found in Dataset 2's 'Opponent' column into the short lowercase names
# used throughout the notebook.
# NOTE(review): this rebinds `rename_teams`, replacing the Dataset 1 mapping
# defined earlier; the Dataset 1 spellings (e.g. 'Man United') are not keys here.

rename_teams = {'Leicester City':'leicester', 'Bournemouth':'bournemouth', 'West Brom':'west-brom', 'Brighton': 'brighton', 'Swansea City':'swansea', 
                'Tottenham':'tottenham', 'Huddersfield':'huddersfield', 'Manchester Utd': 'united', 'Newcastle Utd':'newcastle', 'Liverpool':'liverpool', 
                'Chelsea': 'chelsea', 'Crystal Palace': 'palace', 'Everton': 'everton', 'Manchester City':'city', 'Watford': 'watford', 
                'Stoke City':'stoke', 'Southampton': 'southampton', 'West Ham':'west-ham', 'Burnley':'burnley', 'Arsenal': 'arsenal', 
                'Wolves':'wolves', 'Fulham':'fulham', 'Cardiff City':'cardiff', 'Aston Villa':'aston-villa', 'Sheffield Utd':'sheffield',
                'Norwich City':'norwich', 'Leeds United':'leeds', 'Brentford':'brentford'}

### 3.2.1 Feature Engineering

While preprocessing Dataset 2, we will also create two new features along the way: **HDaysLastPlayed** and **ADaysLastPlayed**.

The **HDaysLastPlayed** feature indicates the number of days since the home team's last match.

The **ADaysLastPlayed** feature indicates the number of days since the away team's last match.

Ideally, new features would be created in the Feature Engineering step later. However, these two features must be derived from information in the raw datasets, and that is no longer possible once the Dataset 2 files of all seasons have been concatenated into a single DataFrame. Therefore, we create these two features before concatenating the per-season Dataset 2 files.

In [14]:
# returns the number of days since home team's last match
def getHDaysLastPlayed(row):
    """Return the leading token of the row's 'DaysLastPlayed' value as a string.

    The value is converted with str() and cut at the first whitespace, so
    '5 days 00:00:00' gives '5' and a missing value gives 'NaT'.
    """
    elapsed_text = str(row['DaysLastPlayed'])
    leading_token = elapsed_text.split(maxsplit=1)[0]
    return leading_token

In [15]:
# returns the number of days since away team's last match
def getADaysLastPlayed(row, matches=None):
    """Return the opponent's 'DaysLastPlayed' for the fixture described by `row`.

    The opponent's entry is the row of `matches` with the same 'Date' whose
    'team' is this row's 'Opponent' and whose 'Opponent' is this row's 'team'.

    Parameters
    ----------
    row : mapping / Series providing 'Date', 'team' and 'Opponent'.
    matches : DataFrame with 'Date', 'team', 'Opponent' and 'DaysLastPlayed'
        columns. When omitted, the notebook-level `concatenated_df` (the
        season currently being processed) is used, as before.

    Returns
    -------
    str
        The leading token of the matched value's string form (e.g. '5' or
        '-275'), the same format getHDaysLastPlayed produces. 'NaT' is
        returned when the matched value is missing or no row matches.
    """
    if matches is None:
        matches = concatenated_df

    is_reverse_fixture = (
        (matches['Date'] == row['Date'])
        & (matches['team'] == row['Opponent'])
        & (matches['Opponent'] == row['team'])
    )
    opponent_days = matches.loc[is_reverse_fixture, 'DaysLastPlayed']

    # Read the value itself rather than parsing the printed form of the Series:
    # for an empty selection the printed form yields the unrelated token 'Name:'.
    if opponent_days.empty:
        return 'NaT'
    return str(opponent_days.iloc[0]).split()[0]

In [16]:
seasons = ['2017-2018', '2018-2019', '2019-2020', '2020-2021', '2021-2022']

# Build one cleaned frame per season and concatenate once after the loop;
# calling pd.concat inside the loop would re-copy every previously loaded row
# on each iteration.
season_frames = []
for season in seasons:

    concatenated_df = pd.read_csv(f'datasets/{season}/dataset2.csv')

    # convert 'Date' column to datetime object
    concatenated_df['Date'] = pd.to_datetime(concatenated_df['Date'], format="%Y/%m/%d")

    # get DaysLastPlayed for all matches: the gap between each row's date and the
    # date of the row directly above it (computed before non-league rows are removed)
    # NOTE(review): the shift is not grouped by team, so the first row of each team
    # is compared with whatever row precedes it in the file -- presumably the source
    # of the large negative values handled later; confirm against the raw CSV.
    concatenated_df['DaysLastPlayed'] = concatenated_df['Date'] - concatenated_df['Date'].shift(1)

    # filter by Premier League matches only; .copy() makes the result an independent
    # frame so the column assignments below do not write to a slice of the raw data
    concatenated_df = concatenated_df[concatenated_df['Comp'] == 'Premier League'].copy()

    # rename team names in the 'Opponent' column for standardized team names
    # (indexing the dict directly raises KeyError for a team that has no standardized name)
    concatenated_df['Opponent'] = concatenated_df['Opponent'].apply(lambda word: rename_teams[word])

    # add a new feature: HDaysLastPlayed (number of days since home team's last match)
    concatenated_df['HDaysLastPlayed'] = concatenated_df.apply(getHDaysLastPlayed, axis=1)

    # add a new feature: ADaysLastPlayed (number of days since away team's last match)
    # getADaysLastPlayed looks the opponent's row up in `concatenated_df`, so this has
    # to run while the away rows are still present, i.e. before the home-only filter
    concatenated_df['ADaysLastPlayed'] = concatenated_df.apply(getADaysLastPlayed, axis=1)

    # filter by home matches only
    concatenated_df = concatenated_df[concatenated_df['Venue'] == 'Home'].reset_index(drop=True)

    # drop the 'Comp', 'Venue' and 'DaysLastPlayed' columns
    concatenated_df = concatenated_df.drop(columns=['Comp', 'Venue', 'DaysLastPlayed'])

    # rename features
    concatenated_df = concatenated_df.rename(columns={'xG': 'HxG', 'xGA': 'AxG', 'Poss': 'HPoss', 'Opponent': 'AwayTeam', 'team': 'HomeTeam'})

    season_frames.append(concatenated_df)

# single concatenation with a fresh 0..n-1 index
dataset2_df = pd.concat(season_frames, ignore_index=True)

In [17]:
# Make sure we have 5 x 380 = 1900 matches in the DataFrame
dataset2_df.shape

(1900, 11)

In [18]:
dataset2_df.head()

Unnamed: 0,Date,Result,GF,GA,AwayTeam,HxG,AxG,HPoss,HomeTeam,HDaysLastPlayed,ADaysLastPlayed
0,2017-08-11,W,4,3,leicester,2.5,1.5,68.0,arsenal,5,-275
1,2017-09-09,W,3,0,bournemouth,2.2,0.6,58.0,arsenal,13,14
2,2017-09-25,W,2,0,west-brom,2.2,0.9,69.0,arsenal,5,5
3,2017-10-01,W,2,0,brighton,2.4,0.4,64.0,arsenal,3,7
4,2017-10-28,W,2,1,swansea,2.0,0.9,72.0,arsenal,4,4


### 3.2.2 Converting Data Types

In [19]:
dataset2_df.dtypes

Date               datetime64[ns]
Result                     object
GF                         object
GA                         object
AwayTeam                   object
HxG                       float64
AxG                       float64
HPoss                     float64
HomeTeam                   object
HDaysLastPlayed            object
ADaysLastPlayed            object
dtype: object

In [20]:
dataset2_df['GF'] = pd.to_numeric(dataset2_df['GF'])
dataset2_df['GA'] = pd.to_numeric(dataset2_df['GA'])
dataset2_df.head()

Unnamed: 0,Date,Result,GF,GA,AwayTeam,HxG,AxG,HPoss,HomeTeam,HDaysLastPlayed,ADaysLastPlayed
0,2017-08-11,W,4,3,leicester,2.5,1.5,68.0,arsenal,5,-275
1,2017-09-09,W,3,0,bournemouth,2.2,0.6,58.0,arsenal,13,14
2,2017-09-25,W,2,0,west-brom,2.2,0.9,69.0,arsenal,5,5
3,2017-10-01,W,2,0,brighton,2.4,0.4,64.0,arsenal,3,7
4,2017-10-28,W,2,1,swansea,2.0,0.9,72.0,arsenal,4,4


### 3.2.3 Checking for Missing Values

In [21]:
dataset2_df.isnull().sum()

Date               0
Result             0
GF                 0
GA                 0
AwayTeam           0
HxG                0
AxG                0
HPoss              0
HomeTeam           0
HDaysLastPlayed    0
ADaysLastPlayed    0
dtype: int64

### 3.2.4 Data Transformation

In [22]:
dataset2_df['HDaysLastPlayed'].unique()

array(['5', '13', '3', '4', '6', '17', '7', '14', '8', '-274', '9', '16',
       '2', '11', '10', '-286', '21', '15', '-280', 'NaT', '18', '-291',
       '22', '-273', '-294', '12', '-295', '100', '-351', '105', '101',
       '-350', '106', '104', '-372', '-251', '-253', '-252', '-246',
       '-255', '-282', '-281'], dtype=object)

In [23]:
dataset2_df['ADaysLastPlayed'].unique()

array(['-275', '14', '5', '7', '4', '13', '3', '8', '15', '6', '-280',
       '16', '2', '9', '11', '-274', '19', '-273', '10', '21', '17',
       '-279', '22', '12', '102', '105', '-351', '18', '107', '-352',
       'NaT', '104', '-371', '-357', '99', '-350', '103', '100', '-251',
       '23', '-253', '-258', '-252', '-250', '20', '-281'], dtype=object)

In [24]:
def transform_DaysLastPlayed(word, missing_days=7, max_days=10):
    """Turn a raw days-since-last-match token into a bounded integer.

    Parameters
    ----------
    word : str or int
        Either the literal 'NaT' or something int() can parse (e.g. '5', '-275').
    missing_days : int, default 7
        Value returned when `word` is 'NaT'.
    max_days : int, default 10
        Upper bound; also returned for negative inputs.

    Returns
    -------
    int
        `missing_days` for 'NaT'; `max_days` when the parsed value is negative
        or at least `max_days`; otherwise the parsed value itself.

    Raises
    ------
    ValueError
        If `word` is neither 'NaT' nor parseable by int().
    """
    if word == 'NaT':
        return missing_days

    # parse once instead of once per comparison
    days = int(word)
    if days < 0 or days >= max_days:
        return max_days
    return days

dataset2_df['HDaysLastPlayed'] = dataset2_df['HDaysLastPlayed'].apply(lambda x: transform_DaysLastPlayed(x))
dataset2_df['ADaysLastPlayed'] = dataset2_df['ADaysLastPlayed'].apply(lambda x: transform_DaysLastPlayed(x))
dataset2_df.head()

Unnamed: 0,Date,Result,GF,GA,AwayTeam,HxG,AxG,HPoss,HomeTeam,HDaysLastPlayed,ADaysLastPlayed
0,2017-08-11,W,4,3,leicester,2.5,1.5,68.0,arsenal,5,10
1,2017-09-09,W,3,0,bournemouth,2.2,0.6,58.0,arsenal,10,10
2,2017-09-25,W,2,0,west-brom,2.2,0.9,69.0,arsenal,5,5
3,2017-10-01,W,2,0,brighton,2.4,0.4,64.0,arsenal,3,7
4,2017-10-28,W,2,1,swansea,2.0,0.9,72.0,arsenal,4,4


## 3.3 Dataset 3

In [25]:
# Standardize the team names across all datasets: this mapping translates the
# names found in Dataset 3's 'Team' column into the short lowercase names used
# throughout the notebook. Its keys are also used to decide which Dataset 3 rows
# are kept, so it lists some teams (e.g. 'Sunderland', 'Hull City') that do not
# appear in the Dataset 1 / Dataset 2 mappings.
# NOTE(review): this rebinds `rename_teams` again, replacing the Dataset 2 mapping.

rename_teams = {'Liverpool': 'liverpool', 'Manchester City': 'city', 'Chelsea': 'chelsea', 'Manchester United': 'united', 'Tottenham Hotspur': 'tottenham', 
                'Leicester City': 'leicester', 'Arsenal': 'arsenal', 'West Ham United': 'west-ham', 'Aston Villa': 'aston-villa', 'Wolverhampton Wanderers': 'wolves',
                'Everton': 'everton', 'Newcastle United': 'newcastle', 'Leeds United': 'leeds', 'Crystal Palace': 'palace', 'Burnley': 'burnley', 
                'Southampton': 'southampton', 'Brighton & Hove Albion': 'brighton', 'Brentford': 'brentford', 'Watford': 'watford', 'Norwich City': 'norwich', 
                'Fulham': 'fulham', 'West Bromwich Albion': 'west-brom', 'Sheffield United': 'sheffield', 'AFC Bournemouth': 'bournemouth', 'Bournemouth': 'bournemouth', 
                'Huddersfield Town': 'huddersfield', 'Cardiff City': 'cardiff', 'Stoke City': 'stoke', 'Swansea City': 'swansea', 'Sunderland': 'sunderland', 
                'Hull City': 'hull', 'Middlesbrough': 'middlesbrough'}

In [26]:
dataset3_df = pd.read_csv('datasets/data-source-3/dataset3.csv')

# remove irrelevant data: keep only the teams that have a standardized name.
# .copy() makes the filtered result an independent frame, so the assignment below
# does not write to a slice of the frame that was read from disk.
dataset3_df = dataset3_df[dataset3_df['Team'].isin(list(rename_teams.keys()))].copy()

# rename team names in the 'Team' column for standardized team names
# (every remaining name is a key of rename_teams because of the filter above)
dataset3_df['Team'] = dataset3_df['Team'].map(rename_teams)

dataset3_df.head()

Unnamed: 0,Version,Date,Team,Attack,Midfield,Defense
0,FIFA 22,"Aug. 18, 2022",liverpool,86,84,85
1,FIFA 22,"Aug. 18, 2022",city,84,87,86
2,FIFA 22,"Aug. 18, 2022",chelsea,84,85,83
3,FIFA 22,"Aug. 18, 2022",united,82,83,81
4,FIFA 22,"Aug. 18, 2022",tottenham,83,81,78


In [27]:
dataset3_df.Team.unique()

array(['liverpool', 'city', 'chelsea', 'united', 'tottenham', 'leicester',
       'arsenal', 'west-ham', 'aston-villa', 'wolves', 'everton',
       'newcastle', 'leeds', 'palace', 'burnley', 'southampton',
       'brighton', 'brentford', 'watford', 'norwich', 'fulham',
       'west-brom', 'sheffield', 'bournemouth', 'huddersfield', 'cardiff',
       'stoke', 'swansea', 'sunderland', 'hull', 'middlesbrough'],
      dtype=object)

### 3.3.2 Checking for Missing Values

In [28]:
dataset3_df.isnull().sum()

Version     0
Date        0
Team        0
Attack      0
Midfield    0
Defense     0
dtype: int64

### 3.3.3 Converting Data Types

In [29]:
dataset3_df.dtypes

Version     object
Date        object
Team        object
Attack       int64
Midfield     int64
Defense      int64
dtype: object

In [30]:
def refactor_date(date):
    """Rewrite a textual date such as "Aug. 18, 2022" as "08/18/2022".

    Commas and periods are removed, the leading month word is replaced by its
    two-digit number and the remaining tokens are joined with '/'. Day and year
    are passed through unchanged (no zero padding is added).

    The month is recognised by its first three letters, case-insensitively, so
    "Sept.", "Sep", "March", "Mar." and full month names are all accepted.

    Parameters
    ----------
    date : str
        Date text whose first whitespace-separated token is a month name.

    Returns
    -------
    str
        The tokens joined as "MM/<day>/<year>".

    Raises
    ------
    KeyError
        If the first token does not start with a known month.
    IndexError
        If `date` contains no tokens.
    """
    months_dict = {'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05', 'Jun': '06',
                   'Jul': '07', 'Aug': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'}

    tokens = date.replace(',', '').replace('.', '').split()

    # Replace only the month token, not every occurrence of its text in the string,
    # and key on the first three letters so every spelling of a month maps alike.
    tokens[0] = months_dict[tokens[0][:3].capitalize()]

    # Joining tokens (rather than replacing spaces) keeps repeated or leading
    # whitespace from producing empty '/' fields.
    return '/'.join(tokens)

In [31]:
dataset3_df['Date'] = dataset3_df['Date'].apply(lambda date: refactor_date(date))
dataset3_df['Date'] = pd.to_datetime(dataset3_df['Date'], format="%m/%d/%Y")
dataset3_df.head()

Unnamed: 0,Version,Date,Team,Attack,Midfield,Defense
0,FIFA 22,2022-08-18,liverpool,86,84,85
1,FIFA 22,2022-08-18,city,84,87,86
2,FIFA 22,2022-08-18,chelsea,84,85,83
3,FIFA 22,2022-08-18,united,82,83,81
4,FIFA 22,2022-08-18,tottenham,83,81,78


In [44]:
dataset3_df.to_csv(f'datasets/data-source-3/cleaned_dataset3.csv', index=False)

# 4. Integration of Datasets

## 4.1 Integrating Dataset 1 and Dataset 2

In [35]:
df

In [36]:
# merge two data sources into one DataFrame, one row per match.
# how='inner' (the default, spelled out here) keeps only matches present in both
# sources; validate='one_to_one' makes pandas raise MergeError if the key
# (Date, HomeTeam, AwayTeam) is duplicated on either side, instead of silently
# multiplying rows.
df = dataset1_df.merge(
    dataset2_df,
    on=['Date', 'HomeTeam', 'AwayTeam'],
    how='inner',
    validate='one_to_one',
)

In [37]:
df.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A,Result,GF,GA,HxG,AxG,HPoss,HDaysLastPlayed,ADaysLastPlayed
0,2017-08-11,arsenal,leicester,4,3,W,27,6,10,3,9,12,9,4,0,1,0,0,1.53,4.5,6.5,W,4,3,2.5,1.5,68.0,5,10
1,2017-08-12,brighton,city,0,2,L,6,14,2,4,6,9,3,10,0,2,0,0,11.0,5.5,1.33,L,0,2,0.3,1.9,23.0,10,10
2,2017-08-12,chelsea,burnley,2,3,L,19,10,6,5,16,11,8,5,3,3,2,0,1.25,6.5,15.0,L,2,3,1.5,0.6,62.0,6,10
3,2017-08-12,palace,huddersfield,0,3,L,14,8,4,6,7,19,12,9,1,3,0,0,1.83,3.6,5.0,L,0,3,1.1,1.5,56.0,10,10
4,2017-08-12,everton,stoke,1,0,W,9,9,4,1,13,10,6,7,1,1,0,0,1.7,3.8,5.75,W,1,0,0.6,0.4,60.0,9,10


In [38]:
df.shape

(1900, 29)

## 4.2 Dropping Redundant Columns

In [39]:
df.drop(['FTR', 'GF', 'GA'], axis=1, inplace = True)
df.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A,Result,HxG,AxG,HPoss,HDaysLastPlayed,ADaysLastPlayed
0,2017-08-11,arsenal,leicester,4,3,27,6,10,3,9,12,9,4,0,1,0,0,1.53,4.5,6.5,W,2.5,1.5,68.0,5,10
1,2017-08-12,brighton,city,0,2,6,14,2,4,6,9,3,10,0,2,0,0,11.0,5.5,1.33,L,0.3,1.9,23.0,10,10
2,2017-08-12,chelsea,burnley,2,3,19,10,6,5,16,11,8,5,3,3,2,0,1.25,6.5,15.0,L,1.5,0.6,62.0,6,10
3,2017-08-12,palace,huddersfield,0,3,14,8,4,6,7,19,12,9,1,3,0,0,1.83,3.6,5.0,L,1.1,1.5,56.0,10,10
4,2017-08-12,everton,stoke,1,0,9,9,4,1,13,10,6,7,1,1,0,0,1.7,3.8,5.75,W,0.6,0.4,60.0,9,10


In [41]:
df.to_csv('datasets/dataset.csv', index=False)