In [1]:
# import libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import time
import matplotlib.pyplot as plt
from scipy.stats import poisson

### Step 1: Get EPL Team Overall Stats from web
Get EPL teams' stats from the database at www.pesmaster.com.
The result will hold 6 seasons of EPL stats, from 13/14 to 18/19, one table per season.
These correspond to PES 14 through PES 19.

In [2]:
# Team-name normalisation table.
# The scraped sources spell club names differently, so every variant seen in
# the data is mapped by hand onto one canonical spelling.
team_name = {
    # club prefix / suffix dropped
    'Arsenal FC': 'Arsenal',
    'Burnley FC': 'Burnley',
    'Chelsea FC': 'Chelsea',
    'Everton FC': 'Everton',
    'Fulham FC': 'Fulham',
    'Liverpool FC': 'Liverpool',
    'Middlesbrough FC': 'Middlesbrough',
    'Reading FC': 'Reading',
    'Southampton FC': 'Southampton',
    'Watford FC': 'Watford',
    'AFC Bournemouth': 'Bournemouth',
    'Sunderland AFC': 'Sunderland',

    # long names shortened
    'Brighton & Hove Albion': 'Brighton & Hove',
    'Huddersfield Town': 'Huddersfield',
    'Manchester United': 'Manchester Utd',
    'Newcastle United': 'Newcastle Utd',
    'Queens Park Rangers': 'QP Rangers',
    'Tottenham Hotspur': 'Tottenham',
    'West Bromwich Albion': 'West Bromwich',
    'West Ham United': 'West Ham Utd',
    'Wigan Athletic': 'Wigan',
    'Wolverhampton Wanderers': 'Wolves',

    # abbreviated names expanded
    'Brighton': 'Brighton & Hove',
    'Cardiff': 'Cardiff City',
    'Hull': 'Hull City',
    'Leicester': 'Leicester City',
    'Man City': 'Manchester City',
    'Man United': 'Manchester Utd',
    'Newcastle': 'Newcastle Utd',
    'Norwich': 'Norwich City',
    'QPR': 'QP Rangers',
    'Stoke': 'Stoke City',
    'Swansea': 'Swansea City',
    'West Brom': 'West Bromwich',
    'West Ham': 'West Ham Utd',
}

In [3]:
teams_seasons = []

# labels for the first seven <td> cells of a row, in left-to-right order
stat_labels = ['Team', 'Ovr', 'Def', 'Mid', 'Fwd', 'Phy', 'Spd']

for year in range(2014, 2020):
    url = 'https://www.pesmaster.com/english-league/pes-' + str(year) + '/league/9/'
    res = requests.get(url, headers={'User-agent': 'slsl'})

    # give up on the remaining years as soon as one request fails
    if res.status_code != 200:
        print('Status not 200', res.status_code)
        break

    soup = BeautifulSoup(res.content, 'lxml')
    table = soup.find("table", {"id" : "search-result-table"})

    teams = []
    # the first <tr> is skipped; rows that contain no <td> cells are ignored
    for row in table.find_all('tr')[1:]:
        cells = row.find_all('td')
        if not cells:
            continue

        # values are kept as the scraped cell text (no numeric conversion)
        record = {label: cells[pos].text for pos, label in enumerate(stat_labels)}
        record['Season'] = year - 2000
        teams.append(record)

    # pause between requests
    time.sleep(3)

    # one frame per season: indexed by normalised team name, sorted alphabetically
    teams_df = (
        pd.DataFrame(teams, columns=['Season', 'Team', 'Ovr', 'Def', 'Mid', 'Fwd', 'Phy', 'Spd'])
        .set_index('Team')
        .rename(index=team_name)
        .sort_index()
    )

    # put all seasons into a list
    teams_seasons.append(teams_df)

In [4]:
# Merge all PES data in one dataframe.
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the per-season
# frames row-wise in list order and keeps each frame's Team index.
# The check mirrors the six seasons the original indexed explicitly, so a
# scrape that stopped early still fails here instead of merging partial data.
assert len(teams_seasons) == 6, 'expected 6 PES seasons, got {}'.format(len(teams_seasons))
pes_data = pd.concat(teams_seasons)

In [5]:
# Write the merged PES table to disk as CSV (the Team index becomes the first column).
pes_data.to_csv(path_or_buf='./Data/pes_data.csv')

### Step 2: Get EPL Tables of Last 6 Seasons (2012/13~2017/18)

In [6]:
season_tables = []

# (column label, <td> position) for every integer cell, in the order the cells are read
int_cells = [
    ('P', 2),       # number of games
    ('W', 3),       # wins
    ('D', 4),       # draws
    ('L', 5),       # losses
    ('HW', 13),     # home wins
    ('HGF', 16),    # home goals scored
    ('HGA', 17),    # home goals conceded
    ('AGF', 27),    # away goals scored
    ('AGA', 28),    # away goals conceded
    ('AW', 24),     # away wins
    ('GF', 6),      # goals for
    ('GA', 7),      # goals allowed
    ('GD', 8),      # goal difference
    ('PTS', 9),     # total points at the end of the season
]

# derived strength column -> goal column it is computed from
strength_sources = [('H_Att', 'HGF'), ('A_Att', 'AGF'), ('H_Def', 'HGA'), ('A_Def', 'AGA')]

for year in range(2013, 2019):
    # load data from soccerstats.com by years (2013 will mean 12/13)
    table_url = 'https://www.soccerstats.com/widetable.asp?league=england_' + str(year)
    res = requests.get(table_url, headers={'User-agent': 'slsl'})

    # stop at the first failed request
    if res.status_code != 200:
        print('Status not 200', res.status_code)
        break

    soup = BeautifulSoup(res.content, 'lxml')

    # only rows carrying the 'trow8' class are read from the league table
    table = soup.find('table', {'id': 'btable'})

    records = []
    for row in table.find_all('tr', {'class': 'trow8'}):
        cells = row.find_all('td')
        if not cells:
            continue

        record = {
            'Rank': int(cells[0].text),         # final rank
            'Team': cells[1].text.strip(),      # team name
        }
        record.update((label, int(cells[pos].text)) for label, pos in int_cells)
        record['Season'] = year - 2000
        records.append(record)

    # pause between requests
    time.sleep(3)

    # build the season frame with a fixed column order
    final_table = pd.DataFrame(
        records,
        columns=['Season', 'Rank', 'Team', 'P', 'W', 'D', 'L', 'GF', 'GA', 'GD',
                 'HW', 'HGF', 'HGA', 'AW', 'AGF', 'AGA', 'PTS'],
    )

    # Features for the poisson model: each team's goals per 19 games divided by
    # the column total per 380 games.
    # NOTE(review): 19 / 380 presumably mean home-or-away games per team and
    # matches per season in a 20-team league -- confirm.
    for strength, goals in strength_sources:
        final_table[strength] = final_table[goals] / 19 / (final_table[goals].sum() / 380)

    # index by normalised team name, sorted alphabetically
    final_table = final_table.set_index('Team').rename(index=team_name).sort_index()

    # put all seasons into a list
    season_tables.append(final_table)

In [7]:
# Merge all season tables in one dataframe.
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the per-season
# frames row-wise in list order and keeps each frame's Team index.
# The check mirrors the six seasons the original indexed explicitly, so a
# scrape that stopped early still fails here instead of merging partial data.
assert len(season_tables) == 6, 'expected 6 season tables, got {}'.format(len(season_tables))
epl_season_table = pd.concat(season_tables)

### Step 3: Getting Clean Sheets

In [8]:
cs_tables = []
seasons = ['2012-13', '2013-14', '2014-15', '2015-16', '2016-17', '2017-18']
for year in seasons:
    # load data from sportsmole.co.uk by season ('2012-13' means 12/13)
    table_url = 'https://www.sportsmole.co.uk/football/premier-league/' + year + '/best-defence.html'
    res = requests.get(table_url, headers={'User-agent': 'slee'})

    # stop at the first failed request
    if res.status_code != 200:
        print('Status not 200', res.status_code)
        break

    soup = BeautifulSoup(res.content, 'lxml')

    # get a table
    table = soup.find('table', {'class':'leaguetable full'})
    tr = table.find_all('tr')
    cs_table = []

    # the first <tr> is skipped; rows without <td> cells are ignored
    for i in range(1, len(tr)):
        td = tr[i].find_all('td')

        if td:
            cs_table.append({
                'Team': td[1].text.strip(),     # team name
                'CS': int(td[4].text),          # value stored as CS (clean sheets)
                # Stored as an int (e.g. 13 for '2012-13') so it has the same
                # type as the Season column of the other scraped tables.
                'Season': int(year.split('-')[1]),
            })

    # pause between requests
    time.sleep(3)

    # put the row dictionaries into a pandas dataframe with column names
    cs_table = pd.DataFrame(cs_table, columns=(['Team', 'Season', 'CS']))
    # change index as a team names
    cs_table = cs_table.set_index('Team')
    cs_table.rename(index=team_name, inplace=True)
    cs_table.sort_index(inplace=True)
    # put all seasons into a list
    cs_tables.append(cs_table)

In [9]:
# Merge all seasons' clean sheet data in one dataframe.
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the per-season
# frames row-wise in list order and keeps each frame's Team index.
# The check mirrors the six seasons the original indexed explicitly, so a
# scrape that stopped early still fails here instead of merging partial data.
assert len(cs_tables) == 6, 'expected 6 clean-sheet tables, got {}'.format(len(cs_tables))
epl_cs_table = pd.concat(cs_tables)

### Step 4: Getting the Number of Cards

In [10]:
dis_tables = []

for year in range(2012, 2018):
    # load data from transfermarkt.co.uk by years (2012 means 12/13)
    table_url = 'https://www.transfermarkt.co.uk/premier-league/fairnesstabelle/wettbewerb/GB1/saison_id/' + str(year) + '/plus/1'
    res = requests.get(table_url, headers={'User-agent': 'slsl'})

    # Stop at the first failed request, as the other scraping loops do.
    # Previously the loop went on to parse the error page.
    if res.status_code != 200:
        print('Status not 200', res.status_code)
        break

    soup = BeautifulSoup(res.content, 'lxml')

    # get a table
    table = soup.find('table', {'class': 'items'})
    tbody = table.find('tbody')
    tr = tbody.find_all('tr')

    dis_table = []

    for i in range(0,(len(tr))):
        td = tr[i].find_all('td')

        # rows without <td> cells are ignored
        if td:
            dis_table.append({
                'Team': td[2].text.strip(),     # team name
                'YC': int(td[5].text),          # total number of yellow cards during the season
                'RC': int(td[8].text),          # red cards
                # the URL uses the season's starting year, so 2012 (12/13) is stored as 13
                'Season': year - 1999,
            })

    # pause between requests
    time.sleep(3)

    # put the row dictionaries into a pandas dataframe with column names
    dis_table = pd.DataFrame(dis_table, columns=(['Team', 'Season', 'YC', 'RC']))
    # change index as a team names
    dis_table = dis_table.set_index('Team')
    dis_table.rename(index=team_name, inplace=True)
    dis_table.sort_index(inplace=True)
    # put all seasons into a list
    dis_tables.append(dis_table)

In [11]:
# Merge all seasons' cards data in one dataframe.
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the per-season
# frames row-wise in list order and keeps each frame's Team index.
# The check mirrors the six seasons the original indexed explicitly, so a
# scrape that stopped early still fails here instead of merging partial data.
assert len(dis_tables) == 6, 'expected 6 card tables, got {}'.format(len(dis_tables))
epl_dis_table = pd.concat(dis_tables)

### Step 5: Concatenate All Data Frames

In [12]:
# Combine league table, clean sheets and cards into one frame.
# The three frames share a non-unique Team index (one row per team per season),
# so a column-wise concat only lines up while all three indexes are identical
# in content and order. Joining on (Team, Season) makes the alignment explicit.
merge_keys = ['Team', 'Season']

clean_sheets = (
    epl_cs_table.reset_index()[merge_keys + ['CS']]
    # the clean-sheet table may hold Season as text ('13'); match the int dtype of the others
    .assign(Season=lambda frame: frame['Season'].astype(int))
)
cards = epl_dis_table.reset_index()[merge_keys + ['YC', 'RC']]

epl_data = (
    epl_season_table.reset_index()
    # left joins keep the league table's row order; validate rejects duplicate keys
    .merge(clean_sheets, on=merge_keys, how='left', validate='one_to_one')
    .merge(cards, on=merge_keys, how='left', validate='one_to_one')
    .set_index('Team')
)

# Fail loudly when a team/season has no match (e.g. a club name missing from
# team_name) rather than writing NaN rows to disk.
unmatched = epl_data[epl_data[['CS', 'YC', 'RC']].isna().any(axis=1)]
if not unmatched.empty:
    raise ValueError(
        'No clean-sheet/card data for: {}'.format(
            sorted(set(zip(unmatched.index, unmatched['Season'])))
        )
    )

In [13]:
# Write the combined EPL table to disk as CSV (the Team index becomes the first column).
epl_data.to_csv(path_or_buf='./Data/epl_data.csv')

### Step 6: EPL Fixture (Result) Data
Saves all games during each season from 13/14 to 17/18, and 18/19 (present)

In [14]:
seasons = ['1314', '1415', '1516', '1617', '1718', '1819']

# columns kept from each results file
result_columns = ['HomeTeam', 'AwayTeam', 'FTR', 'FTHG', 'FTAG']

fixture = []
for season in seasons:
    # download one season of results; any cell value equal to a team_name key is replaced
    raw = pd.read_csv('http://www.football-data.co.uk/mmz4281/' + season + '/E0.csv')
    # tag each match with the season's closing year, e.g. '1314' -> 14
    fix = raw.replace(team_name)[result_columns].assign(Season=int(season[2:]))
    fixture.append(fix)

In [15]:
# Merge all seasons' fixture (match result) data in one dataframe and drop
# rows that contain missing values.
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the per-season
# frames row-wise in list order and keeps each frame's own row index.
# The check mirrors the six seasons the original indexed explicitly.
assert len(fixture) == 6, 'expected 6 fixture seasons, got {}'.format(len(fixture))
epl_fixture = pd.concat(fixture).dropna()

In [16]:
# Write the merged fixture table to disk as CSV (the row index becomes the first column).
epl_fixture.to_csv(path_or_buf='./Data/epl_fixture.csv')