In [27]:
# import libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import time
import matplotlib.pyplot as plt
from scipy.stats import poisson

### Step 1: Get EPL Team Overall Stats from web
Get EPL teams' stats from the database at www.pesmaster.com.
The resulting dictionary will hold six seasons of stats, from the 13/14 to the 18/19 EPL season.

In [2]:
# Scrape one team-ratings table per PES edition (pes-2014 ... pes-2019) from
# pesmaster.com and collect them, in fetch order, in `teams_seasons`.
teams_seasons = []
for year in range(2014, 2020):
    url = 'https://www.pesmaster.com/english-league/pes-' + str(year) + '/league/9/'
    res = requests.get(url, headers={'User-agent': 'slsl'})

    # Stop at the first failed request; tables fetched so far are kept.
    if res.status_code != 200:
        print('Status not 200', res.status_code)
        break

    soup = BeautifulSoup(res.content, 'lxml')
    table = soup.find("table", {"id" : "search-result-table"})

    teams = []
    # The first <tr> is skipped (presumably the header row); rows without
    # any <td> cell are ignored.
    for row in table.find_all('tr')[1:]:
        td = row.find_all('td')
        if td:
            teams.append({
                'Team': td[0].text,
                'Ovr': td[1].text,
                'Def': td[2].text,
                'Mid': td[3].text,
                'Fwd': td[4].text,
                'Phy': td[5].text,
                'Spd': td[6].text,
            })

    # wait 3 seconds before issuing the next request
    time.sleep(3)

    teams_df = pd.DataFrame(teams, columns=(['Team', 'Ovr', 'Def', 'Mid', 'Fwd', 'Phy', 'Spd']))
    teams_df = teams_df.set_index('Team')
    # put all seasons into a list
    teams_seasons.append(teams_df)

# Turn the list into a dictionary keyed by season ('2013', '2014', ..., '2018'):
# the first edition fetched (pes-2014) is stored under '2013'.
# Enumerating the list (instead of a hard-coded range of 5) keeps the sixth
# season that the loop fetches and does not raise IndexError when the loop
# stopped early after a failed request.
teams_dict = {str(2013 + i): season_df for i, season_df in enumerate(teams_seasons)}

In [3]:
# Preview the first rows of the ratings table stored under key '2013'.
teams_dict['2013'].head()

Unnamed: 0_level_0,Ovr,Def,Mid,Fwd,Phy,Spd
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Chelsea,86,87,85,87,81,78
Manchester City,86,85,86,92,82,79
Manchester United,85,83,85,89,79,78
Arsenal,84,84,85,82,78,80
Tottenham Hotspur,83,80,83,84,80,79


In [29]:
# Team-name lookup for the 18/19 player data: keys are the upper-case club
# labels that get replaced in the player table's index, values are the club
# names they are renamed to.
pes_team_dict = dict([
    ('ARSENAL', 'Arsenal'),
    ('LIVERPOOL', 'Liverpool'),
    ('SOUTH WALES', 'Cardiff City'),
    ('MAN BLUE', 'Manchester City'),
    ('EAST DORSETSHIRE', 'Bournemouth'),
    ('WEST LONDON WHITE', 'Fulham'),
    ('EAST MIDLANDS', 'Leicester City'),
    ('SOUTH NORWOOD', 'Crystal Palace'),
    ('TYNESIDE', 'Newcastle Utd'),
    ('NORTH EAST LONDON', 'Tottenham'),
    ('EAST LONDON', 'West Ham Utd'),
    ('HAMPSHIRE RED', 'Southampton'),
    ('LANCASHIRE CLARET', 'Burnley'),
    ('HERTFORDSHIRE', 'Watford'),
    ('MAN RED', 'Manchester Utd'),
    ('EAST SUSSEX', 'Brighton & Hove'),
    ('LONDON FC', 'Chelsea'),
    ('MERSEYSIDE BLUE', 'Everton'),
    ('WM GOLD', 'Wolverhampton'),
    ('WEST YORKSHIRE TOWN', 'Huddersfield'),
    ('WEST GLAMORGAN CITY', 'Swansea City'),
])

In [30]:
# Collect one record (name, club, rating) per player row from the 17 result
# pages of the pesdb.net PES 2019 listing.
players = []

for page_number in range(1, 18):
    page_url = 'http://pesdb.net/pes2019/?league=1&sort=club_team&order=a&page=' + str(page_number)
    response = requests.get(page_url, headers={'User-agent': 'slsl'})

    # Give up on the remaining pages as soon as one request fails.
    if response.status_code != 200:
        print('Status not 200', response.status_code)
        break

    page_soup = BeautifulSoup(response.content, 'lxml')
    player_table = page_soup.find("table", {"class": "players"})

    # The first <tr> is skipped; rows that contain no <td> are ignored.
    for row in player_table.find_all('tr')[1:]:
        cells = row.find_all('td')
        if not cells:
            continue
        players.append({
            'Name': cells[1].text,
            'Team': cells[2].text,
            'Rating': cells[9].text,
        })

    # wait 3 seconds before requesting the next page
    time.sleep(3)

players_df = pd.DataFrame(players, columns=['Name', 'Team', 'Rating'])
# Index by club and translate the club labels through pes_team_dict.
final_player = players_df.set_index('Team').rename(index=pes_team_dict)


In [31]:
# Preview the first rows of the player table (indexed by team).
final_player.head()

Unnamed: 0_level_0,Name,Rating
Team,Unnamed: 1_level_1,Unnamed: 2_level_1
Arsenal,P. AUBAMEYANG,88
Arsenal,M. ÖZIL,87
Arsenal,H. MKHITARYAN,85
Arsenal,A. LACAZETTE,85
Arsenal,B. LENO,85


### Step 2: Get EPL Tables of Last 5 Seasons (2013~2018)

In [24]:
# Build one league table per season from soccerstats.com and derive the
# attack/defence strength features used by the Poisson scoring model.
season_tables = []

# Column name -> position of the <td> cell it is read from, in read order.
stat_cells = [
    ('P', 2),       # number of games
    ('W', 3),       # wins
    ('D', 4),       # draws
    ('L', 5),       # losses
    ('HW', 13),     # home wins
    ('HGF', 16),    # home goals scored
    ('HGA', 17),    # home goals conceded
    ('AGF', 27),    # away goals scored
    ('AGA', 28),    # away goals conceded
    ('AW', 24),     # away wins
    ('GF', 6),      # goals for
    ('GA', 7),      # goals allowed
    ('PTS', 9),     # total points at the end of the season
]

for end_year in range(2014, 2019):
    # The year in the URL is one more than the dictionary key built below:
    # england_2014 ends up under '2013' (the 13/14 season).
    table_url = 'https://www.soccerstats.com/widetable.asp?league=england_' + str(end_year)
    response = requests.get(table_url, headers={'User-agent': 'slsl'})

    # Abandon the remaining seasons when a request fails.
    if response.status_code != 200:
        print('Status not 200', response.status_code)
        break

    page = BeautifulSoup(response.content, 'lxml')
    league_rows = page.find('table', {'id': 'btable'}).find_all('tr', {'class': 'trow8'})

    records = []
    for row in league_rows:
        cells = row.find_all('td')
        if not cells:
            continue
        record = {'Team': cells[1].text.strip()}
        for column, position in stat_cells:
            record[column] = int(cells[position].text)
        records.append(record)

    # wait 3 seconds before the next request
    time.sleep(3)

    final_table = pd.DataFrame(
        records,
        columns=['Team', 'P', 'W', 'D', 'L', 'GF', 'GA', 'HW', 'HGF', 'HGA', 'AW', 'AGF', 'AGA', 'PTS'],
    )

    # League-wide goals per match, with 380 matches per season hard-coded.
    home_goal_rate = final_table['HGF'].sum() / 380
    conceded_home_rate = final_table['HGA'].sum() / 380

    # Each team's per-game figure (19 home and 19 away games hard-coded)
    # divided by the matching league-wide rate.
    final_table = final_table.assign(
        H_Att=final_table['HGF'] / 19 / home_goal_rate,
        A_Att=final_table['AGF'] / 19 / conceded_home_rate,
        H_Def=final_table['HGA'] / 19 / conceded_home_rate,
        A_Def=final_table['AGA'] / 19 / home_goal_rate,
    ).set_index('Team')

    season_tables.append(final_table)

# Key the tables by season: '2013', '2014', ..., '2017'.
dict_seasons = {str(2013 + position): season_tables[position] for position in range(5)}

In [14]:
# Display the league table stored under key '2013'.
dict_seasons['2013'] # EPL Season 2013/2014

Unnamed: 0_level_0,P,W,D,L,GF,GA,HW,HGF,HGA,AW,AGF,AGA,PTS,H_Att,A_Att,H_Def,A_Def
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Manchester City,38,27,5,6,102,37,17,63,13,10,39,24,86,2.107023,1.718062,0.572687,0.802676
Liverpool,38,26,6,6,101,50,16,53,18,10,48,32,84,1.772575,2.114537,0.792952,1.070234
Chelsea,38,25,7,6,71,27,15,43,11,10,28,16,82,1.438127,1.23348,0.484581,0.535117
Arsenal,38,24,7,7,68,41,13,36,11,11,32,30,79,1.204013,1.409692,0.484581,1.003344
Everton,38,21,9,8,61,39,13,38,19,8,23,20,72,1.270903,1.013216,0.837004,0.668896
Tottenham,38,21,6,11,55,51,11,30,23,10,25,28,69,1.003344,1.101322,1.013216,0.936455
Manchester Utd,38,19,7,12,64,43,9,29,21,10,35,22,64,0.9699,1.54185,0.92511,0.735786
Southampton,38,15,11,12,54,46,8,32,23,7,22,23,56,1.070234,0.969163,1.013216,0.769231
Stoke City,38,13,11,14,45,52,10,27,17,3,18,35,50,0.90301,0.792952,0.748899,1.170569
Newcastle Utd,38,15,4,19,43,59,8,23,28,7,20,31,49,0.769231,0.881057,1.23348,1.036789


### Step 3: Getting Clean Sheets

In [15]:
# The data sources spell club names differently, so this hand-written lookup
# translates the long-form names (keys) into the short forms (values) that
# the clean-sheet and discipline tables are renamed to.
team_dict = dict([
    ('Arsenal FC', 'Arsenal'),
    ('Burnley FC', 'Burnley'),
    ('Liverpool FC', 'Liverpool'),
    ('Everton FC', 'Everton'),
    ('Fulham FC', 'Fulham'),
    ('Southampton FC', 'Southampton'),
    ('Chelsea FC', 'Chelsea'),
    ('West Bromwich Albion', 'West Bromwich'),
    ('Manchester United', 'Manchester Utd'),
    ('Manchester City', 'Manchester City'),
    ('Newcastle United', 'Newcastle Utd'),
    ('West Ham United', 'West Ham Utd'),
    ('Tottenham Hotspur', 'Tottenham'),
    ('Queens Park Rangers', 'QP Rangers'),
    ('Watford FC', 'Watford'),
    ('AFC Bournemouth', 'Bournemouth'),
    ('Sunderland AFC', 'Sunderland'),
    ('Middlesbrough FC', 'Middlesbrough'),
    ('Brighton & Hove Albion', 'Brighton & Hove'),
    ('Huddersfield Town', 'Huddersfield'),
])

In [16]:
# Scrape the "best defence" page of each season from sportsmole.co.uk and
# keep, per team, the value stored in the CS column.
cs_tables = []
seasons = ['2013-14', '2014-15', '2015-16', '2016-17', '2017-18']

for season_label in seasons:
    table_url = 'https://www.sportsmole.co.uk/football/premier-league/' + season_label + '/best-defence.html'
    response = requests.get(table_url, headers={'User-agent': 'slee'})

    # Abandon the remaining seasons when a request fails.
    if response.status_code != 200:
        print('Status not 200', response.status_code)
        break

    page = BeautifulSoup(response.content, 'lxml')
    defence_rows = page.find('table', {'class':'leaguetable full'}).find_all('tr')

    records = []
    # The first <tr> is skipped; rows without <td> cells are ignored.
    for row in defence_rows[1:]:
        cells = row.find_all('td')
        if cells:
            records.append({
                'Team': cells[1].text.strip(),   # team name
                'CS': int(cells[4].text),        # value of the CS column
            })

    # wait 3 seconds before the next request
    time.sleep(3)

    # Index by team and translate the names through team_dict.
    cs_table = (
        pd.DataFrame(records, columns=['Team', 'CS'])
        .set_index('Team')
        .rename(index=team_dict)
    )
    cs_tables.append(cs_table)

# Key the tables by season: '2013', '2014', ..., '2017'.
dict_cs = {str(2013 + position): cs_tables[position] for position in range(5)}

In [17]:
# Preview the first rows of the CS table stored under key '2013'.
dict_cs['2013'].head()

Unnamed: 0_level_0,CS
Team,Unnamed: 1_level_1
Chelsea,18
Manchester City,16
Everton,15
Arsenal,17
Manchester Utd,13


### Step 4: Getting 'Discipline' Points

In [18]:
# Scrape the fair-play table of each season from transfermarkt.co.uk and
# compute a per-game discipline score for every team.
dis_tables = []

for year in range(2013, 2018):
    # load data from transfermarkt.co.uk by years (2013 means 13/14)
    table_url = 'https://www.transfermarkt.co.uk/premier-league/fairnesstabelle/wettbewerb/GB1/saison_id/' + str(year) + '/plus/1'
    res = requests.get(table_url, headers={'User-agent': 'slsl'})

    # Stop at the first failed request instead of trying to parse an error
    # page, as the other scraping cells of this notebook do.
    if res.status_code != 200:
        print('Status not 200', res.status_code)
        break

    soup = BeautifulSoup(res.content, 'lxml')

    # get a table
    table = soup.find('table', {'class': 'items'})
    tbody = table.find('tbody')

    dis_table = []

    for row in tbody.find_all('tr'):
        td = row.find_all('td')
        if not td:
            continue

        # Parse each cell once and reuse the numbers for the score below.
        games = int(td[3].text)      # games played
        yellows = int(td[5].text)    # total number of yellow cards during the season
        reds = int(td[8].text)       # red cards

        dis_table.append({
            'Team': td[2].text.strip(),   # team name
            'P': games,
            'YC': yellows,
            'RC': reds,
            # DIS = (0.5 * YC + 2 * RC) / P
            # higher 'DIS', the team is more likely to have cards during a match
            'DIS': (yellows * 0.5 + reds * 2) / games,
        })

    # wait 3 seconds before the next request
    time.sleep(3)

    # put the records into a pandas dataframe with column names
    dis_table = pd.DataFrame(dis_table, columns=(['Team', 'P', 'YC', 'RC', 'DIS']))
    # use the team names as index and translate them through team_dict
    dis_table = dis_table.set_index('Team').rename(index=team_dict)
    # put all seasons into a list
    dis_tables.append(dis_table)

# Turn the list into a dictionary keyed by season ('2013', '2014', ..., '2017').
# Enumerating the list avoids an IndexError when the loop stopped early after
# a failed request; only the seasons actually fetched get a key.
dict_dis = {str(2013 + i): season_df for i, season_df in enumerate(dis_tables)}

In [19]:
# Preview the first rows of the discipline table stored under key '2013'.
dict_dis['2013'].head()

Unnamed: 0_level_0,P,YC,RC,DIS
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Cardiff City,38,49,1,0.697368
Liverpool,38,53,1,0.75
Everton,38,54,1,0.763158
Fulham,38,56,1,0.789474
Southampton,38,60,0,0.789474


### Step 4 (continued): Merge Standings, Discipline and Clean Sheets

In [20]:
# merge dict_seasons, dict_dis
temp_pd = []
for year in range(2013, 2018):
    temp_pd.append(pd.merge(dict_seasons[str(year)], dict_dis[str(year)][['YC', 'RC', 'DIS']], left_index=True, right_index=True, how='outer'))
dict_past_seasons = {str(2013 + i) : temp_pd[i] for i in range(0,5)}

new_temp = []

for year in range(2013, 2018):
    new_temp.append(pd.merge(dict_past_seasons[str(year)], dict_cs[str(year)][['CS']], left_index=True, right_index=True, how='outer'))

dict_final_seasons = {str(2013 + i) : new_temp[i] for i in range(0,5)}



In [21]:
# Preview the first rows of the merged table stored under key '2013'.
dict_final_seasons['2013'].head()

Unnamed: 0_level_0,P,W,D,L,GF,GA,HW,HGF,HGA,AW,...,AGA,PTS,H_Att,A_Att,H_Def,A_Def,YC,RC,DIS,CS
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Arsenal,38,24,7,7,68,41,13,36,11,11,...,30,79,1.204013,1.409692,0.484581,1.003344,51,4,0.881579,17
Aston Villa,38,10,8,20,39,61,6,22,29,4,...,32,38,0.735786,0.748899,1.277533,1.070234,78,0,1.026316,9
Cardiff City,38,7,9,22,32,74,5,20,35,2,...,39,30,0.668896,0.528634,1.54185,1.304348,49,1,0.697368,7
Chelsea,38,25,7,6,71,27,15,43,11,10,...,16,82,1.438127,1.23348,0.484581,0.535117,54,3,0.868421,18
Crystal Palace,38,13,6,19,33,48,8,18,23,5,...,25,45,0.602007,0.660793,1.013216,0.83612,59,2,0.881579,12


In [23]:
# Write each merged season table to ./Data, naming the file after the two
# two-digit years of the season (e.g. 2013 -> epl_table_1314.csv).
for season in range(2013, 2018):
    csv_path = './Data/epl_table_{}{}.csv'.format(season - 2000, season - 1999)
    dict_final_seasons[str(season)].to_csv(csv_path)

### Step 5: Create a Scoring Model (Poisson Distribution)

In [57]:
def score_percentage(dataframe, hometeam, awayteam, max_goals=5, total_matches=380):
    """Return the Poisson probability (in percent) of each goal count for a fixture.

    The expected goals of each side are the product of its attack strength,
    the opponent's defence strength and the league-wide goals per match:

        home = H_Att[hometeam] * A_Def[awayteam] * sum(HGF) / total_matches
        away = A_Att[awayteam] * H_Def[hometeam] * sum(HGA) / total_matches

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Season table indexed by team name with the columns 'HGF', 'HGA',
        'H_Att', 'A_Att', 'H_Def' and 'A_Def'.
    hometeam, awayteam : str
        Index labels of the home and the away side.
    max_goals : int, default 5
        Highest goal count to evaluate; rows 0..max_goals are produced.
    total_matches : int, default 380
        Number of matches the league-wide goal sums are divided by.

    Returns
    -------
    pandas.DataFrame
        One row per goal count (the row label is the number of goals) with
        the columns 'Home' and 'Away' holding the probability in percent.

    Raises
    ------
    ValueError
        If a team does not match exactly one row of `dataframe`.
    """
    def _strength(team, column):
        # Extract the single matching value explicitly; calling float() on a
        # one-row Series is deprecated in recent pandas versions.
        values = dataframe.loc[dataframe.index == team, column]
        if len(values) != 1:
            raise ValueError(
                'expected exactly one row for team %r, found %d' % (team, len(values))
            )
        return float(values.iloc[0])

    home_avg = dataframe['HGF'].sum() / total_matches
    away_avg = dataframe['HGA'].sum() / total_matches

    home_score = _strength(hometeam, 'H_Att') * _strength(awayteam, 'A_Def') * home_avg
    away_score = _strength(awayteam, 'A_Att') * _strength(hometeam, 'H_Def') * away_avg
    # Kept from the original implementation: echo the two expected-goal values.
    print(home_score, away_score)

    goal_counts = range(0, max_goals + 1)
    score = pd.DataFrame(
        {
            'Home': [poisson.pmf(goals, home_score) * 100 for goals in goal_counts],
            'Away': [poisson.pmf(goals, away_score) * 100 for goals in goal_counts],
        },
        columns=['Home', 'Away'],
    )
    # The original built this table and then discarded it; hand it back.
    return score

# Evaluate Arsenal at home against every other index entry of every season
# table, counting how many fixtures were evaluated.
count = 0
for season_df in dict_final_seasons.values():
    for opponent in season_df.index:
        if opponent == 'Arsenal':
            continue
        score_percentage(season_df, 'Arsenal', opponent)
        count += 1
print(count)

2.0278120049287094 0.43357291908184564
2.4713958810068646 0.3060514722930675
1.0139060024643547 0.7141201020171575
1.5842281288505544 0.38256434036633435
1.2673825030804433 0.5865986552283794
2.9783488822390423 0.40806862972409
2.0278120049287094 0.45907720843960115
2.0278120049287094 1.22420588917227
1.520859003696532 0.9946672849524695
1.394120753388488 0.8926501275214468
1.9644428797746871 0.5100857871551124
2.788241506776976 0.2805471829353119
1.4574898785425099 0.5610943658706238
2.2179193803907755 0.45907720843960115
2.0911811300827314 0.5100857871551124
1.7743355043126208 0.5355900765128682
1.7743355043126208 0.6376072339438906
2.0278120049287094 0.4845814977973569
1.5842281288505544 0.38256434036633435
2.4661654135338344 0.46163601775523144
2.4661654135338344 0.49714648065948
1.772556390977444 1.3138871274571973
1.8496240601503757 0.9232720355104629
2.234962406015038 0.74571972098922
2.080827067669173 0.49714648065948
2.543233082706767 0.6391883322764743
2.1578947368421053 0.78

In [55]:
# Show the current value of `count`.
# NOTE(review): this cell's execution number (55) is lower than that of the
# cell that sets `count` (57), so the displayed value may be stale - confirm
# with Restart & Run All.
count

1900

In [56]:
# Number of season tables held in dict_final_seasons.
len(dict_final_seasons)

5

In [None]:
5 * 