# Import Libraries

In [1]:
import pandas as pd
import numpy as np

import glob
import os

In [2]:
# Set both pandas display limits to None (pandas treats None as "no limit"),
# so displayed frames are not truncated by column or row count.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Data Preparation

There are two main data sources for this project: 
- www.football-data.co.uk 
- www.fbref.com/en

Both sources provide match statistics for Premier League matches. We will extract the match statistics from each website and integrate them into one DataFrame.

First we create an empty DataFrame that will eventually contain the data integrated from the various data sources.

## Data Source 1

www.football-data.co.uk

In [3]:
# Empty frame created up front; it is not filled anywhere in this cell.
datasource1_df = pd.DataFrame()
seasons = ['2017-2018', '2018-2019', '2019-2020', '2020-2021', '2021-2022']

# For every season: load the raw match statistics, strip the columns that are
# not needed, and write the trimmed table to datasets/<season>/dataset1.csv.
for season in seasons:
    match_stats = pd.read_csv(f'datasets/{season}/data-source-1/{season}-match-statistics.csv')

    # Every column from 'BWH' through the last one is discarded
    # (presumably the bookmaker-odds columns -- TODO confirm against the raw files).
    # A KeyError is raised here if a file has no 'BWH' column.
    trailing_columns = match_stats.loc[:, 'BWH':].columns

    # The named columns are removed only when present (errors='ignore'),
    # so a file that lacks some of them is still processed.
    match_stats = (
        match_stats
        .drop(columns=trailing_columns)
        .drop(columns=['Div', 'HTHG', 'HTAG', 'HTR', 'Referee', 'Time'], errors='ignore')
    )

    match_stats.to_csv(f'datasets/{season}/dataset1.csv', index=False)

## Data Source 2

www.fbref.com/en

In [4]:
seasons = ['2017-2018', '2018-2019', '2019-2020', '2020-2021', '2021-2022']

# For every season: read each per-team fixtures CSV in the data-source-2 folder,
# drop the columns that are not needed, tag the rows with the team name taken
# from the file name, and write all teams stacked together to
# datasets/<season>/dataset2.csv.
for season in seasons:

    # get all the datasets from a season in data-source-2 folder
    # (sorted: glob returns matches in arbitrary order, which would otherwise
    # make the row order of dataset2.csv differ between machines/runs)
    path = f'datasets/{season}/data-source-2'
    csv_files = sorted(glob.glob(os.path.join(path, "*.csv")))

    team_frames = []

    for file in csv_files:

        # Team name is the file name up to '-fixtures'.
        # os.path.basename is used instead of splitting on '\\' so this works
        # with forward-slash paths too (the old split raised IndexError there).
        team = os.path.basename(file).split('-fixtures')[0]

        # read csv file for each team and drop irrelevant columns
        # (raises KeyError if any of these columns is missing from a file)
        team_df = pd.read_csv(file).drop(
            columns=['Time', 'Round', 'Day', 'Attendance', 'Captain', 'Formation', 'Referee', 'Match Report', 'Notes']
        )

        # add new feature: 'team' (the team this fixtures file belongs to)
        team_df['team'] = team

        team_frames.append(team_df)

    # Concatenate once instead of growing the frame inside the loop.
    # With no CSV files an empty DataFrame is written, as before.
    if team_frames:
        concatenated_df = pd.concat(team_frames, ignore_index=True)
    else:
        concatenated_df = pd.DataFrame()

    concatenated_df.to_csv(f'datasets/{season}/dataset2.csv', index=False)