# Import Libraries

In [1]:
import pandas as pd
import numpy as np

import glob
import os

In [2]:
# Lift pandas' display truncation for both columns and rows.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Features to be used

**Fatigue**
- Days since last match
- Distance covered in the last x matches?

**Home Team Form**
- Goals difference of home team in the last x matches    
- Goals difference of home team in the last x home matches    
- Average number of points gained by home team in the last x matches
- Number of home matches won by home team in its last x home matches
- Home Team Win streak  
- Home Team Newly Promoted Team?

**Away Team Form**
- Goals difference of away team in the last x matches  
- Goals difference of away team in the last x away matches
- Average number of points gained by away team in the last x matches
- Number of away matches won by away team in its last x away matches
- Away Team Win streak
- Away Team Newly Promoted Team?

**Home Team Performance Index**
- Home Defense Performance Index
- Home Midfield Performance Index
- Home Attack Performance Index

**Away Team Performance Index**
- Away Defense Performance Index
- Away Midfield Performance Index
- Away Attack Performance Index

**Betting Odds**
- B365H
- B365D
- B365A

# Data Preparation

There are two main data sources for this project: 
- www.football-data.co.uk 
- www.fbref.com/en

Both data sources provide match statistics for Premier League matches. We will extract the match statistics from each website and integrate them into one DataFrame.

First we create an empty DataFrame that will eventually contain the data integrated from the various data sources.

In [3]:
# Starts empty; this is the frame intended to hold the data integrated from
# the individual data sources.
df = pd.DataFrame()

## Data Source 1

www.football-data.co.uk

In [4]:
seasons = ['2017-2018', '2018-2019', '2019-2020', '2020-2021', '2021-2022']

# Columns removed by name; errors='ignore' below tolerates season files in
# which some of them are absent.
datasource1_unused_columns = ['Div', 'HTHG', 'HTAG', 'HTR', 'Referee', 'Time']

datasource1_frames = []
for season in seasons:
    # load the match statistics of this season
    season_df = pd.read_csv(f'datasets/{season}/data-source-1/{season}-match-statistics.csv')

    # discard 'BWH' and every column to its right, then the unused columns
    season_df = (
        season_df
        .drop(columns=season_df.loc[:, 'BWH':].columns)
        .drop(columns=datasource1_unused_columns, errors='ignore')
    )

    # parse 'Date' (day/month/year) into datetime values
    season_df['Date'] = pd.to_datetime(season_df['Date'], format="%d/%m/%Y")

    datasource1_frames.append(season_df)

# stack all seasons and renumber the rows from 0
datasource1_df = pd.concat(datasource1_frames).reset_index(drop=True)

In [5]:
# Make sure we have 380 matches per season (5 x 380 = 1900) in the DataFrame.
# The assertion stops the notebook if rows are missing or duplicated instead
# of relying on someone reading the displayed shape.
expected_matches = len(seasons) * 380
assert len(datasource1_df) == expected_matches, (
    f'expected {expected_matches} matches, found {len(datasource1_df)}'
)
datasource1_df.shape

(1900, 21)

In [6]:
# standardize the teams names across all datasets

rename_teams = {'Arsenal': 'arsenal', 'Brighton': 'brighton', 'Chelsea': 'chelsea', 'Crystal Palace': 'palace', 'Everton': 'everton', 
                'Southampton': 'southampton', 'Watford': 'watford', 'West Brom': 'west-brom', 'Man United': 'united', 'Newcastle': 'newcastle',
                'Bournemouth': 'bournemouth', 'Burnley': 'burnley', 'Leicester': 'leicester', 'Liverpool': 'liverpool', 'Stoke': 'stoke',
                'Swansea': 'swansea', 'Huddersfield': 'huddersfield', 'Tottenham': 'tottenham', 'Man City': 'city', 'West Ham': 'west-ham',
                'Fulham': 'fulham', 'Wolves': 'wolves', 'Cardiff': 'cardiff', 'Aston Villa': 'aston-villa', 'Norwich': 'norwich',
                'Sheffield United': 'sheffield', 'Leeds': 'leeds', 'Brentford':'brentford'}

# Names that are already in standardized form are passed through unchanged, so
# running this cell a second time does not raise KeyError. Any other name that
# is missing from the mapping still raises KeyError, as before.
standardized_names = set(rename_teams.values())

for team_column in ('HomeTeam', 'AwayTeam'):
    datasource1_df[team_column] = datasource1_df[team_column].apply(
        lambda word: word if word in standardized_names else rename_teams[word]
    )

In [7]:
datasource1_df.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A
0,2017-08-11,arsenal,leicester,4,3,H,27,6,10,3,9,12,9,4,0,1,0,0,1.53,4.5,6.5
1,2017-08-12,brighton,city,0,2,A,6,14,2,4,6,9,3,10,0,2,0,0,11.0,5.5,1.33
2,2017-08-12,chelsea,burnley,2,3,A,19,10,6,5,16,11,8,5,3,3,2,0,1.25,6.5,15.0
3,2017-08-12,palace,huddersfield,0,3,A,14,8,4,6,7,19,12,9,1,3,0,0,1.83,3.6,5.0
4,2017-08-12,everton,stoke,1,0,H,9,9,4,1,13,10,6,7,1,1,0,0,1.7,3.8,5.75


## Data Source 2
www.fbref.com/en

In [8]:
# Lookup table: team name as written in data source 2 -> standardized team name
# shared by all datasets.

rename_teams = {
    'Leicester City': 'leicester',
    'Bournemouth': 'bournemouth',
    'West Brom': 'west-brom',
    'Brighton': 'brighton',
    'Swansea City': 'swansea',
    'Tottenham': 'tottenham',
    'Huddersfield': 'huddersfield',
    'Manchester Utd': 'united',
    'Newcastle Utd': 'newcastle',
    'Liverpool': 'liverpool',
    'Chelsea': 'chelsea',
    'Crystal Palace': 'palace',
    'Everton': 'everton',
    'Manchester City': 'city',
    'Watford': 'watford',
    'Stoke City': 'stoke',
    'Southampton': 'southampton',
    'West Ham': 'west-ham',
    'Burnley': 'burnley',
    'Arsenal': 'arsenal',
    'Wolves': 'wolves',
    'Fulham': 'fulham',
    'Cardiff City': 'cardiff',
    'Aston Villa': 'aston-villa',
    'Sheffield Utd': 'sheffield',
    'Norwich City': 'norwich',
    'Leeds United': 'leeds',
    'Brentford': 'brentford',
}

In [9]:
def getHDaysLastPlayed(row):
    """Return the leading token of the row's 'DaysLastPlayed' value as a string.

    The value is converted with str() and split on whitespace; only the first
    piece is kept. For a Timedelta such as '7 days 00:00:00' this is '7'; for a
    missing value (NaT) it is 'NaT'.
    """
    return str(row['DaysLastPlayed']).split(maxsplit=1)[0]

In [10]:
def getADaysLastPlayed(row, matches_df=None):
    """Return the opponent's 'DaysLastPlayed' for the fixture described by `row`.

    The opponent's own entry for the fixture is the row of `matches_df` with the
    same 'Date', whose 'team' equals this row's 'AwayTeam' and whose 'AwayTeam'
    equals this row's 'team'.

    Parameters
    ----------
    row : pandas.Series or mapping
        Must provide 'Date', 'team' and 'AwayTeam'.
    matches_df : pandas.DataFrame, optional
        Frame to search; needs the columns 'Date', 'team', 'AwayTeam' and
        'DaysLastPlayed'. Defaults to the module-level `concatenated_df`, which
        is what the original implementation always used.

    Returns
    -------
    str
        First whitespace-separated token of the string form of the opponent's
        'DaysLastPlayed' (e.g. '10' for a 10-day Timedelta). 'NaT' when the
        value is missing or when no mirrored row exists.
    """
    if matches_df is None:
        matches_df = concatenated_df

    date = row['Date']
    team = row['team']
    opponent = row['AwayTeam']

    filter_condition = (
        (matches_df['Date'] == date)
        & (matches_df['team'] == opponent)
        & (matches_df['AwayTeam'] == team)
    )
    opponent_rest = matches_df.loc[filter_condition, 'DaysLastPlayed']

    # Without a mirrored row there is nothing to report. Parsing the printed
    # form of an empty Series (the previous approach) yielded the token 'Name:'.
    if opponent_rest.empty:
        return 'NaT'

    # Read the value itself rather than tokenizing the Series' printed form,
    # whose layout depends on the index and on pandas' display formatting.
    return str(opponent_rest.iloc[0]).split()[0]

In [None]:
seasons = ['2017-2018', '2018-2019', '2019-2020', '2020-2021', '2021-2022']

# columns of the per-team fixture files that are not used afterwards
datasource2_unused_columns = ['Time', 'Day', 'Attendance', 'Captain', 'Formation',
                              'Referee', 'Match Report', 'Notes', 'Round']

datasource2_frames = []

for season in seasons:

    # get all the datasets from data-source-2 folder (sorted so the row order
    # does not depend on the order in which the file system lists the files)
    path = f'datasets/{season}/data-source-2'
    csv_files = sorted(glob.glob(os.path.join(path, "*.csv")))
    if not csv_files:
        raise FileNotFoundError(f'no fixture files found in {path}')

    team_frames = []
    for file in csv_files:
        # The team name is the file name up to '-fixtures'. basename() handles
        # the platform's path separator; splitting on a backslash only worked
        # for Windows-style paths.
        team = os.path.basename(file).split('-fixtures')[0]

        # read csv file for each team
        temp_df = pd.read_csv(file)

        # drop irrelevant columns
        temp_df = temp_df.drop(columns=datasource2_unused_columns)

        # convert 'Date' column to datetime object
        temp_df['Date'] = pd.to_datetime(temp_df['Date'], format="%Y/%m/%d")

        # rename features
        temp_df = temp_df.rename(columns={'xG': 'HxG', 'xGA': 'AxG', 'Poss': 'HPoss', 'Opponent': 'AwayTeam'})

        # add new feature: 'team' (the team this fixture file belongs to)
        temp_df['team'] = team

        # Days since this team's previous match. This is computed per file,
        # i.e. per team, before the teams are stacked: diffing the stacked
        # frame compared each team's first match with the previous team's
        # last one. The first match of a team therefore gets NaT.
        temp_df = temp_df.sort_values('Date').reset_index(drop=True)
        temp_df['DaysLastPlayed'] = temp_df['Date'].diff()

        team_frames.append(temp_df)

    # stack all teams of the season once instead of growing the frame per file
    concatenated_df = pd.concat(team_frames, ignore_index=True)

    # filter by Premier League matches only; .copy() gives an independent frame
    # so the column assignments below do not write to a slice
    concatenated_df = concatenated_df[concatenated_df['Comp'] == 'Premier League'].copy()

    # rename team names in the 'AwayTeam' column for standardized team names
    concatenated_df['AwayTeam'] = concatenated_df['AwayTeam'].apply(lambda word: rename_teams[word])

    # add two new features: 'HDaysLastPlayed' and 'ADaysLastPlayed'.
    # getADaysLastPlayed looks up the opponent's row in the global
    # concatenated_df, so this must run while the frame still holds both the
    # home and the away rows.
    concatenated_df['HDaysLastPlayed'] = concatenated_df.apply(getHDaysLastPlayed, axis=1)
    concatenated_df['ADaysLastPlayed'] = concatenated_df.apply(getADaysLastPlayed, axis=1)

    # filter by home matches only
    concatenated_df = concatenated_df[concatenated_df['Venue'] == 'Home'].reset_index(drop=True)

    # drop the 'Venue' column (every remaining row is a home match)
    concatenated_df = concatenated_df.drop(columns=['Venue'])

    # add new feature: 'APoss'
    concatenated_df['APoss'] = 100 - concatenated_df['HPoss']

    # rename 'team' to 'HomeTeam'
    concatenated_df = concatenated_df.rename(columns={'team': 'HomeTeam'})

    datasource2_frames.append(concatenated_df)

# stack all seasons and renumber the rows from 0
datasource2_df = pd.concat(datasource2_frames, ignore_index=True)

In [None]:
# Make sure we have 380 matches per season (5 x 380 = 1900) in the DataFrame.
# The assertion stops the notebook if rows are missing or duplicated instead
# of relying on someone reading the displayed shape.
expected_matches = len(seasons) * 380
assert len(datasource2_df) == expected_matches, (
    f'expected {expected_matches} matches, found {len(datasource2_df)}'
)
datasource2_df.shape

In [None]:
datasource2_df.head()

# Integration of Data Sources 

In [None]:
# df

In [None]:
# merge two data sources into one DataFrame

# df = datasource1_df
# df = pd.merge(df, datasource2_df, on=['Date', 'HomeTeam', 'AwayTeam'])
# df.head()

# Feature Engineering

In [None]:
# add new feature: 'DaysLastPlayed'
# temp_df['DaysLastPlayed'] = temp_df['Date'] - temp_df['Date'].shift(1)