# Import Libraries

In [1]:
import pandas as pd
import numpy as np

import glob
import os

import requests
from bs4 import BeautifulSoup

In [2]:
# Show every column and every row when a DataFrame is displayed
# (a value of None removes the corresponding display limit).
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Data Preparation

There are two main data sources for this project: 
- www.football-data.co.uk 
- www.fbref.com/en

Both data sources provide the match statistics for Premier League matches that are needed to predict football matches.

The relevant datasets for 10 seasons of the English Premier League ('2012-2013' through '2021-2022', matching the `seasons` list below) have been collected from these two data sources.

In [3]:
# Season labels in 'YYYY-YYYY' form, from '2012-2013' up to and including '2021-2022'.
seasons = [f'{start_year}-{start_year + 1}' for start_year in range(2012, 2022)]

## Data Source 1

www.football-data.co.uk

Data Source 1 provides one dataset for each season. 

This dataset contains match statistics for each English Premier League match played during the season, but it also contains a lot of betting odds information that is irrelevant to this research. Only 3 betting odds features will be kept ('B365H', 'B365D', 'B365A'), while the rest will be dropped. Some match statistics features that are deemed useless to this research will be dropped from the dataset as well.

In [4]:
# make sure the output folder exists before writing:
# DataFrame.to_csv does not create missing directories
os.makedirs('data/processed/data-source-1', exist_ok=True)

for season in seasons:
    # read csv file for match statistics
    temp_df = pd.read_csv(f'data/raw/data-source-1/{season}-match-statistics.csv')

    # drop irrelevant columns (new frame instead of in-place mutation):
    # 1) every column from 'BWH' to the last column of the file
    # 2) match statistics that are not needed; errors='ignore' tolerates
    #    files in which some of these columns are absent
    temp_df = (
        temp_df
        .drop(columns=temp_df.loc[:, 'BWH':].columns)
        .drop(columns=['Div', 'HTHG', 'HTAG', 'HTR', 'Referee', 'Time'], errors='ignore')
    )

    temp_df.to_csv(f'data/processed/data-source-1/{season}-data-source-1.csv', index=False)

A sneak peek at what Dataset 1 looks like for the 2017-2018 EPL season:

In [5]:
# Load the processed 2017-2018 file and display its first five rows.
season_2017_ds1 = pd.read_csv('data/processed/data-source-1/2017-2018-data-source-1.csv')
season_2017_ds1.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A
0,11/08/2017,Arsenal,Leicester,4,3,H,27,6,10,3,9,12,9,4,0,1,0,0,1.53,4.5,6.5
1,12/08/2017,Brighton,Man City,0,2,A,6,14,2,4,6,9,3,10,0,2,0,0,11.0,5.5,1.33
2,12/08/2017,Chelsea,Burnley,2,3,A,19,10,6,5,16,11,8,5,3,3,2,0,1.25,6.5,15.0
3,12/08/2017,Crystal Palace,Huddersfield,0,3,A,14,8,4,6,7,19,12,9,1,3,0,0,1.83,3.6,5.0
4,12/08/2017,Everton,Stoke,1,0,H,9,9,4,1,13,10,6,7,1,1,0,0,1.7,3.8,5.75


## Data Source 2

www.fbref.com/en

Unlike Data Source 1, Data Source 2 provides one match statistics dataset for EVERY team in a season. This means that there are 20 datasets containing match statistics (1 for each team) for each season. 

Each dataset contains match statistics for every match played by the team during the season (not just English Premier League matches). Each dataset also contains additional match statistics that are not provided by Data Source 1, like Expected Goals (xG) and Possession. Match statistics features that are deemed useless to this research will be dropped from the dataset.

We will also concatenate these datasets for each season, so that we end up with 1 dataset per season instead of 20.

In [6]:
# make sure the output folder exists before writing:
# DataFrame.to_csv does not create missing directories
os.makedirs('data/processed/data-source-2', exist_ok=True)

for season in seasons:

    # get all the datasets from a season in data-source-2 folder
    # (sorted, because glob returns matches in arbitrary order)
    path = f'data/raw/data-source-2/{season}'
    csv_files = sorted(glob.glob(os.path.join(path, "*.csv")))

    # per-team frames are collected here and concatenated once per season
    team_frames = []

    for file in csv_files:

        # the team name is the part of the file name before '-fixtures';
        # os.path.basename handles the platform's path separator, whereas
        # splitting on '\\' only worked for Windows-style paths
        team = os.path.basename(file).split('-fixtures')[0]

        # read csv file for each team
        temp_df = pd.read_csv(file)

        # drop irrelevant columns
        temp_df = temp_df.drop(
            columns=['Time', 'Round', 'Day', 'Attendance', 'Captain', 'Formation',
                     'Referee', 'Match Report', 'Notes']
        )

        # add new feature: 'team'
        temp_df['team'] = team

        team_frames.append(temp_df)

    # a single concat avoids re-copying the growing frame on every iteration;
    # with no input files an empty frame is written, as before
    if team_frames:
        concatenated_df = pd.concat(team_frames, ignore_index=True)
    else:
        concatenated_df = pd.DataFrame()

    concatenated_df.to_csv(f'data/processed/data-source-2/{season}-data-source-2.csv', index=False)

A sneak peek at what Dataset 2 (2017-2018-data-source-2.csv) looks like for the 2017-2018 EPL season:

In [7]:
pd.read_csv('data/processed/data-source-2/2017-2018-data-source-2.csv').head()

Unnamed: 0,Date,Comp,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,team
0,2017-08-06,Community Shield,Neutral,D,1 (4),1 (1),Chelsea,,,,arsenal
1,2017-08-11,Premier League,Home,W,4,3,Leicester City,2.5,1.5,68.0,arsenal
2,2017-08-19,Premier League,Away,L,0,1,Stoke City,1.5,0.7,76.0,arsenal
3,2017-08-27,Premier League,Away,L,0,4,Liverpool,0.6,3.1,52.0,arsenal
4,2017-09-09,Premier League,Home,W,3,0,Bournemouth,2.2,0.6,58.0,arsenal


## Data Source 3

www.fifaindex.com

Data Source 3 provides FIFA team ratings and player ratings for every football team each year. We are mainly interested in collecting data for the team ratings. For every team, we will collect the Defense rating, Midfield rating and Attack rating.

We will do web scraping to get the ratings for every team that is relevant to this research.

In [8]:
def getTeamsData(soup):
    """Collect one ratings row per team from a parsed team-listing page.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of a page that contains ``<a class="dropdown-toggle">``
        links and ``<td>`` cells with ``data-title`` set to ``Name``, ``ATT``,
        ``MID`` and ``DEF``.

    Returns
    -------
    list of list of str
        One ``[version, date, name, attack, midfield, defense]`` entry per
        ``Name`` cell, in page order. ``version`` and ``date`` are the stripped
        texts of the second-to-last and last ``dropdown-toggle`` links and are
        identical for every row of the page.

    Raises
    ------
    IndexError
        If the page has fewer than two ``dropdown-toggle`` links.
    ValueError
        If the Name/ATT/MID/DEF columns do not have the same number of cells,
        since pairing them by position would then mix up teams and ratings.
    """
    # query the dropdown links once; find_all replaces the deprecated findAll alias
    dropdowns = soup.find_all('a', attrs={'class': "dropdown-toggle"})
    version = dropdowns[-2].text.strip()
    date = dropdowns[-1].text.strip()

    name_rows = soup.find_all('td', attrs={'data-title': "Name"})
    att_rows = soup.find_all('td', attrs={'data-title': "ATT"})
    mid_rows = soup.find_all('td', attrs={'data-title': "MID"})
    def_rows = soup.find_all('td', attrs={'data-title': "DEF"})

    # the four columns are paired by position, so they must line up exactly
    column_lengths = {len(name_rows), len(att_rows), len(mid_rows), len(def_rows)}
    if len(column_lengths) != 1:
        raise ValueError(
            'Name/ATT/MID/DEF columns have different lengths: '
            f'{len(name_rows)}/{len(att_rows)}/{len(mid_rows)}/{len(def_rows)}'
        )

    return [
        [version, date, name.text, attack.text, midfield.text, defense.text]
        for name, attack, midfield, defense in zip(name_rows, att_rows, mid_rows, def_rows)
    ]

In [9]:
# Accumulator for the scraped rows; the scraping cells extend it with the
# [version, date, name, attack, midfield, defense] lists returned by getTeamsData.
data = list()

In [10]:
start_page = 555

# Visit page ids start_page, start_page - 1, ..., 1 for league 13.
# A bounded range replaces the original `while True` loop, whose 404 branch
# decremented the counter and skipped the `== 0` exit check, so it never
# terminated when page 1 was missing.
# NOTE: every run of this cell appends to `data`; re-create `data` before re-running.
for page in range(start_page, 0, -1):
    url = f'https://www.fifaindex.com/teams/fifa22_{page}/?league=13&order=desc'
    # timeout so that a stalled connection cannot hang the notebook
    response = requests.get(url, timeout=30)

    # a page id that does not exist is skipped
    if response.status_code == 404:
        continue

    # any other HTTP error is raised here instead of being parsed as a team page
    response.raise_for_status()

    # explicit parser: the result no longer depends on which parsers are installed
    soup = BeautifulSoup(response.content, 'html.parser')

    data.extend(getTeamsData(soup))

In [11]:
start_page = 555

# Visit page ids start_page, start_page - 1, ..., 1 for league 14.
# A bounded range replaces the original `while True` loop, whose 404 branch
# decremented the counter and skipped the `== 0` exit check, so it never
# terminated when page 1 was missing.
# NOTE: every run of this cell appends to `data`; re-create `data` before re-running.
for page in range(start_page, 0, -1):
    url = f'https://www.fifaindex.com/teams/fifa22_{page}/?league=14&order=desc'
    # timeout so that a stalled connection cannot hang the notebook
    response = requests.get(url, timeout=30)

    # a page id that does not exist is skipped
    if response.status_code == 404:
        continue

    # any other HTTP error is raised here instead of being parsed as a team page
    response.raise_for_status()

    # explicit parser: the result no longer depends on which parsers are installed
    soup = BeautifulSoup(response.content, 'html.parser')

    data.extend(getTeamsData(soup))

In [12]:
# Turn the scraped rows into a DataFrame and save it as the raw Data Source 3 file.
rating_columns = ['Version', 'Date', 'Team', 'Attack', 'Midfield', 'Defense']
datasource3_df = pd.DataFrame(data=data, columns=rating_columns)
datasource3_df.to_csv('data/raw/data-source-3/team-ratings.csv', index=False)

In [14]:
# datasource3_df.tail()