In [1]:
# import libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import time

### Step 1: Get EPL Team Overall Stats from web
Scrape each EPL team's overall stats from the database at www.pesmaster.com.
The resulting dictionary will hold six seasons of stats, from 13/14 through 18/19.

In [2]:
# Scrape the team ratings table from pesmaster.com, one page per PES edition.
# Each edition is stored under the starting year of its season
# (PES 2014 -> key '2013', ..., PES 2019 -> key '2018').
RATING_COLUMNS = ['Team', 'Ovr', 'Def', 'Mid', 'Fwd', 'Phy', 'Spd']

teams_seasons = []
for year in range(2014, 2020):
    url = 'https://www.pesmaster.com/english-league/pes-' + str(year) + '/league/9/'
    res = requests.get(url, headers={'User-agent': 'slsl'})

    # stop at the first failed request; seasons scraped so far are kept
    if res.status_code != 200:
        print('Status not 200', res.status_code)
        break

    soup = BeautifulSoup(res.content, 'lxml')
    table = soup.find("table", {"id" : "search-result-table"})
    tr = table.find_all('tr')

    teams = []

    # the first <tr> is skipped; rows without <td> cells are ignored
    for row in tr[1:]:
        td = row.find_all('td')
        if td:
            # the first seven cells line up with RATING_COLUMNS, in order
            teams.append({column: td[position].text
                          for position, column in enumerate(RATING_COLUMNS)})

    # pause between requests to avoid hammering the site
    time.sleep(3)

    teams_df = pd.DataFrame(teams, columns=RATING_COLUMNS)
    teams_df = teams_df.set_index('Team')
    # put all seasons into a list
    teams_seasons.append(teams_df)

# turn the list into a dictionary keyed by season start year ('2013', ..., '2018').
# Every scraped season is included (previously the sixth one was dropped), and a
# run cut short by the break above yields a shorter dict instead of an IndexError.
teams_dict = {str(2013 + offset): season_df
              for offset, season_df in enumerate(teams_seasons)}

In [3]:
# Preview the first rows of the team ratings table stored under key '2013'.
teams_dict['2013'].head()

Unnamed: 0_level_0,Ovr,Def,Mid,Fwd,Phy,Spd
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Chelsea,86,87,85,87,81,78
Manchester City,86,85,86,92,82,79
Manchester United,85,83,85,89,79,78
Arsenal,84,84,85,82,78,80
Tottenham Hotspur,83,80,83,84,80,79


In [4]:
# Lookup from the club labels found in the pesdb.net PES 2019 player listing
# to the team names used in the rest of this notebook.
# It is applied to the index of the scraped player table.
pes_team_dict = {
    'ARSENAL': 'Arsenal',
    'LIVERPOOL': 'Liverpool',
    'SOUTH WALES': 'Cardiff City',
    'MAN BLUE': 'Manchester City',
    'EAST DORSETSHIRE': 'Bournemouth',
    'WEST LONDON WHITE': 'Fulham',
    'EAST MIDLANDS': 'Leicester City',
    'SOUTH NORWOOD': 'Crystal Palace',
    'TYNESIDE': 'Newcastle Utd',
    'NORTH EAST LONDON': 'Tottenham',
    'EAST LONDON': 'West Ham Utd',
    'HAMPSHIRE RED': 'Southampton',
    'LANCASHIRE CLARET': 'Burnley',
    'HERTFORDSHIRE': 'Watford',
    'MAN RED': 'Manchester Utd',
    'EAST SUSSEX': 'Brighton & Hove',
    'LONDON FC': 'Chelsea',
    'MERSEYSIDE BLUE': 'Everton',
    'WM GOLD': 'Wolverhampton',
    'WEST YORKSHIRE TOWN': 'Huddersfield',
    'WEST GLAMORGAN CITY': 'Swansea City',
}

In [5]:
# Collect name, team and rating for every player in the pesdb.net PES 2019
# listing (league=1), walking result pages 1 through 17.
players = []

for page in range(1, 18):
    url = 'http://pesdb.net/pes2019/?league=1&sort=club_team&order=a&page=' + str(page)
    res = requests.get(url, headers={'User-agent': 'slsl'})

    # give up on the first failed page; players gathered so far are kept
    if res.status_code != 200:
        print('Status not 200', res.status_code)
        break

    soup = BeautifulSoup(res.content, 'lxml')
    table = soup.find("table", {"class" : "players"})

    # the first <tr> is skipped; rows without <td> cells are ignored
    for row in table.find_all('tr')[1:]:
        td = row.find_all('td')
        if not td:
            continue
        players.append({
            'Name': td[1].text,
            'Team': td[2].text,
            'Rating': td[9].text,
        })

    # pause between page requests
    time.sleep(3)

players_df = pd.DataFrame(players, columns=['Name', 'Team', 'Rating'])
# index by team, then translate the site's club labels via pes_team_dict
final_player = players_df.set_index('Team').rename(index=pes_team_dict)


In [6]:
# Preview the first rows of the player table (indexed by team name).
final_player.head()

Unnamed: 0_level_0,Name,Rating
Team,Unnamed: 1_level_1,Unnamed: 2_level_1
Arsenal,P. AUBAMEYANG,88
Arsenal,M. ÖZIL,87
Arsenal,H. MKHITARYAN,85
Arsenal,A. LACAZETTE,85
Arsenal,B. LENO,85


### Step 2: Get EPL Tables of Last 5 Seasons (2013~2018)

In [7]:
# Scrape one final league table per season from soccerstats.com.
# The year in the URL is paired with the key one year earlier
# (england_2014 ends up under '2013', i.e. season 13/14).
TABLE_COLUMNS = ['Team', 'P', 'W', 'D', 'L','HW', 'AW', 'GF', 'GA', 'PTS']

# (column name, index of the <td> cell it is read from), in extraction order
STAT_CELLS = (
    ('P', 2),      # number of games
    ('W', 3),      # wins
    ('D', 4),      # draws
    ('L', 5),      # loses
    ('HW', 13),    # home wins
    ('AW', 24),    # away wins
    ('GF', 6),     # goals for
    ('GA', 7),     # goals allowed
    ('PTS', 9),    # total points end of the season
)

season_tables = []

for year in range(2014, 2019):
    table_url = 'https://www.soccerstats.com/widetable.asp?league=england_' + str(year)
    res = requests.get(table_url, headers={'User-agent': 'slsl'})

    if res.status_code != 200:
        print('Status not 200', res.status_code)
        break

    soup = BeautifulSoup(res.content, 'lxml')

    # the standings live in the table with id 'btable'; team rows carry class 'trow8'
    table = soup.find('table', {'id': 'btable'})
    team_rows = table.find_all('tr', {'class': 'trow8'})

    records = []
    for row in team_rows:
        td = row.find_all('td')
        if not td:
            continue
        record = {'Team': td[1].text.strip()}     # team name
        for column, cell_index in STAT_CELLS:
            record[column] = int(td[cell_index].text)
        records.append(record)

    # pause between requests
    time.sleep(3)

    # build the season's frame and index it by team name
    final_table = pd.DataFrame(records, columns=TABLE_COLUMNS).set_index('Team')
    # put all seasons into a list
    season_tables.append(final_table)

# expose the five seasons as a dictionary keyed '2013' ... '2017'
dict_seasons = {str(2013 + offset): season_tables[offset] for offset in range(5)}

In [8]:
# Preview the first rows of the league table stored under key '2013'.
dict_seasons['2013'].head() # EPL Season 2013/2014

Unnamed: 0_level_0,P,W,D,L,HW,AW,GF,GA,PTS
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Manchester City,38,27,5,6,17,10,102,37,86
Liverpool,38,26,6,6,16,10,101,50,84
Chelsea,38,25,7,6,15,10,71,27,82
Arsenal,38,24,7,7,13,11,68,41,79
Everton,38,21,9,8,13,8,61,39,72


### Step 3: Getting Clean Sheets

In [9]:
# The data sources used in this notebook spell team names differently.
# This hand-written lookup maps the longer spellings to the short names
# used as the common index, so the per-source tables can be merged on team.
team_dict = {
    'Arsenal FC': 'Arsenal',
    'Burnley FC': 'Burnley',
    'Liverpool FC': 'Liverpool',
    'Everton FC': 'Everton',
    'Fulham FC': 'Fulham',
    'Southampton FC': 'Southampton',
    'Chelsea FC': 'Chelsea',
    'West Bromwich Albion': 'West Bromwich',
    'Manchester United': 'Manchester Utd',
    'Manchester City': 'Manchester City',
    'Newcastle United': 'Newcastle Utd',
    'West Ham United': 'West Ham Utd',
    'Tottenham Hotspur': 'Tottenham',
    'Queens Park Rangers': 'QP Rangers',
    'Watford FC': 'Watford',
    'AFC Bournemouth': 'Bournemouth',
    'Sunderland AFC': 'Sunderland',
    'Middlesbrough FC': 'Middlesbrough',
    'Brighton & Hove Albion': 'Brighton & Hove',
    'Huddersfield Town': 'Huddersfield',
}

In [10]:
# Scrape the per-season "best defence" table from sportsmole.co.uk and keep
# each team's value from cell 4, stored as 'CS' (presumably clean sheets,
# going by this section's heading).
cs_tables = []
seasons = ['2013-14', '2014-15', '2015-16', '2016-17', '2017-18']
for year in seasons:
    # here `year` is a season label such as '2013-14', as used in the site's URLs
    table_url = 'https://www.sportsmole.co.uk/football/premier-league/' + year + '/best-defence.html'
    res = requests.get(table_url, headers={'User-agent': 'slee'})

    if res.status_code != 200:
        print('Status not 200', res.status_code)
        break

    soup = BeautifulSoup(res.content, 'lxml')

    # locate the table; its first <tr> is skipped
    table = soup.find('table', {'class':'leaguetable full'})
    records = []
    for row in table.find_all('tr')[1:]:
        td = row.find_all('td')
        if td:
            records.append({
                'Team': td[1].text.strip(),    # team name
                'CS': int(td[4].text),         # value stored as 'CS'
            })

    # pause between requests
    time.sleep(3)

    # one frame per season, indexed by team and renamed to the shared short names
    cs_table = (
        pd.DataFrame(records, columns=['Team', 'CS'])
        .set_index('Team')
        .rename(index=team_dict)
    )
    # put all seasons into a list
    cs_tables.append(cs_table)

# expose the five seasons as a dictionary keyed '2013' ... '2017'
dict_cs = {str(2013 + offset): cs_tables[offset] for offset in range(5)}

In [11]:
# Preview the first rows of the 'CS' table stored under key '2013'.
dict_cs['2013'].head()

Unnamed: 0_level_0,CS
Team,Unnamed: 1_level_1
Chelsea,18
Manchester City,16
Everton,15
Arsenal,17
Manchester Utd,13


### Step 4: Getting 'Discipline' Points

In [12]:
# Scrape the fair-play table from transfermarkt.co.uk for each season and
# derive a per-game discipline score from yellow and red cards.
dis_tables = []

for year in range(2013, 2018):
    # load data from transfermarkt.co.uk by years (2013 means 13/14)
    table_url = 'https://www.transfermarkt.co.uk/premier-league/fairnesstabelle/wettbewerb/GB1/saison_id/' + str(year) + '/plus/1'
    res = requests.get(table_url, headers={'User-agent': 'slsl'})

    # stop on a failed request, as the other scraping cells do; without the
    # break an error page would be parsed and fail further down on a missing table
    if res.status_code != 200:
        print('Status not 200', res.status_code)
        break

    soup = BeautifulSoup(res.content, 'lxml')

    # get a table
    table = soup.find('table', {'class': 'items'})
    tbody = table.find('tbody')

    dis_table = []

    for row in tbody.find_all('tr'):
        td = row.find_all('td')
        if not td:
            continue

        # parse each numeric cell once and reuse it for the score below
        played = int(td[3].text)     # games played
        yellow = int(td[5].text)     # total number of yellow cards during the season
        red = int(td[8].text)        # red cards

        dis_table.append({
            'Team': td[2].text.strip(),     # team name
            'P': played,
            'YC': yellow,
            'RC': red,
            # DIS = (0.5 * YC + 2 * RC) / P
            # higher 'DIS', the team is more likely to have cards during a match
            'DIS': (yellow * 0.5 + red * 2) / played,
        })

    # pause between requests
    time.sleep(3)

    # put the rows into a pandas dataframe with column names
    dis_table = pd.DataFrame(dis_table, columns=(['Team', 'P', 'YC', 'RC', 'DIS']))
    # use team names as the index and map them to the shared short names
    dis_table = dis_table.set_index('Team').rename(index=team_dict)
    # put all seasons into a list
    dis_tables.append(dis_table)

# turn the list into a dictionary keyed by season start year ('2013', ..., '2017');
# built from whatever was scraped, so an early break gives a shorter dict
# rather than an IndexError
dict_dis = {str(2013 + offset): season_table
            for offset, season_table in enumerate(dis_tables)}

In [13]:
# Preview the first rows of the discipline table stored under key '2013'.
dict_dis['2013'].head()

Unnamed: 0_level_0,P,YC,RC,DIS
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Cardiff City,38,49,1,0.697368
Liverpool,38,53,1,0.75
Everton,38,54,1,0.763158
Fulham,38,56,1,0.789474
Southampton,38,60,0,0.789474


### Step 4: Merge Standings, Discipline, and Clean Sheets

In [14]:
# For every season, outer-join on the team index:
#   league table + card columns (YC, RC, DIS)  -> dict_past_seasons
#   that result  + the CS column               -> dict_final_seasons
temp_pd = []
new_temp = []

for year in range(2013, 2018):
    season_key = str(year)

    with_cards = dict_seasons[season_key].merge(
        dict_dis[season_key][['YC', 'RC', 'DIS']],
        left_index=True, right_index=True, how='outer',
    )
    temp_pd.append(with_cards)

    with_cs = with_cards.merge(
        dict_cs[season_key][['CS']],
        left_index=True, right_index=True, how='outer',
    )
    new_temp.append(with_cs)

dict_past_seasons = {str(2013 + offset): temp_pd[offset] for offset in range(5)}
dict_final_seasons = {str(2013 + offset): new_temp[offset] for offset in range(5)}



In [15]:
# Preview the first rows of the merged table stored under key '2016'.
dict_final_seasons['2016'].head()

Unnamed: 0_level_0,P,W,D,L,HW,AW,GF,GA,PTS,YC,RC,DIS,CS
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Arsenal,38,23,6,9,14,9,77,44,75,68,3,1.052632,13
Bournemouth,38,12,10,16,9,3,55,67,46,50,3,0.815789,10
Burnley,38,11,7,20,10,1,39,55,40,64,2,0.947368,10
Chelsea,38,30,3,5,17,13,85,33,93,72,0,0.947368,16
Crystal Palace,38,12,5,21,6,6,50,63,41,78,0,1.026316,7


In [16]:
# Write one CSV per season; the file name carries both years of the season
# (2013 -> './Data/epl_table_1314.csv').
for season in range(2013, 2018):
    # look the table up by the loop variable; the stale global `year` used
    # before wrote the same table into every file
    season_label = str(season - 2000) + str(season - 1999)
    dict_final_seasons[str(season)].to_csv('./Data/epl_table_' + season_label + '.csv')

In [17]:
# fixtures api
import http.client
import json
import os

# Read the API token from the environment instead of embedding it in the notebook.
# NOTE: the token that used to be hard-coded in this cell has been published
# with the notebook and should be revoked.
api_token = os.environ.get('FOOTBALL_DATA_API_TOKEN')
if not api_token:
    raise RuntimeError(
        'Set the FOOTBALL_DATA_API_TOKEN environment variable to your '
        'api.football-data.org token before running this cell.'
    )

connection = http.client.HTTPConnection('api.football-data.org')
headers = { 'X-Auth-Token': api_token }
try:
    # all Premier League matches for the season requested as 2017
    connection.request('GET', '/v2/competitions/PL/matches?season=2017', None, headers )
    response = json.loads(connection.getresponse().read().decode())
finally:
    # release the socket even if the request or JSON decoding fails
    connection.close()

print (response)

{'count': 380, 'filters': {}, 'competition': {'id': 2021, 'area': {'id': 2072, 'name': 'England'}, 'name': 'Premier League', 'code': 'PL', 'plan': 'TIER_ONE', 'lastUpdated': '2018-09-29T00:00:16Z'}, 'matches': [{'id': 205156, 'season': {'id': 23, 'startDate': '2017-08-11', 'endDate': '2018-05-13', 'currentMatchday': 38}, 'utcDate': '2017-08-11T18:45:00Z', 'status': 'FINISHED', 'matchday': 1, 'stage': 'REGULAR_SEASON', 'group': 'Regular Season', 'lastUpdated': '2018-06-22T10:03:56Z', 'homeTeam': {'id': 57, 'name': 'Arsenal FC'}, 'awayTeam': {'id': 338, 'name': 'Leicester City FC'}, 'score': {'winner': 'HOME_TEAM', 'duration': 'REGULAR', 'fullTime': {'homeTeam': 4, 'awayTeam': 3}, 'halfTime': {'homeTeam': 2, 'awayTeam': 2}, 'extraTime': {'homeTeam': None, 'awayTeam': None}, 'penalties': {'homeTeam': None, 'awayTeam': None}}, 'referees': [{'id': 11575, 'name': 'Mike Dean', 'nationality': None}, {'id': 11504, 'name': 'Simon Long', 'nationality': None}, {'id': 11576, 'name': 'Darren Cann', 

### Step 5: Create a Scoring Model (Poisson Distribution)

In [18]:
# importing the tools required for the Poisson regression model
import statsmodels.api as sm
import statsmodels.formula.api as smf

# NOTE(review): epl_1617 is not created in any cell shown above; it is assumed to
# be a match-level frame with HomeTeam, AwayTeam, HomeGoals and AwayGoals columns.
# Confirm where it is loaded.

# Reshape to one row per team per match: the home side's view (home=1) ...
home_rows = (
    epl_1617[['HomeTeam','AwayTeam','HomeGoals']]
    .assign(home=1)
    .rename(columns={'HomeTeam':'team', 'AwayTeam':'opponent','HomeGoals':'goals'})
)
# ... and the away side's view (home=0) of the same matches.
away_rows = (
    epl_1617[['AwayTeam','HomeTeam','AwayGoals']]
    .assign(home=0)
    .rename(columns={'AwayTeam':'team', 'HomeTeam':'opponent','AwayGoals':'goals'})
)
goal_model_data = pd.concat([home_rows, away_rows])

# Poisson GLM of goals on the home flag, the scoring team and the opponent
poisson_family = sm.families.Poisson()
poisson_model = smf.glm(
    formula="goals ~ home + team + opponent",
    data=goal_model_data,
    family=poisson_family,
).fit()
poisson_model.summary()

ModuleNotFoundError: No module named 'statsmodels'

In [None]:
# Create Match Simulating Function

def simulate_match(model, homeTeam, awayTeam, max_goals=10):
    """Return the score-probability matrix for a single fixture.

    Parameters
    ----------
    model : object
        Fitted model whose ``predict`` accepts a one-row DataFrame with the
        columns ``team``, ``opponent`` and ``home`` and returns an object
        exposing ``.values`` (the expected goals for ``team``).
    homeTeam, awayTeam : str
        Team names, spelled as the model expects them.
    max_goals : int, default 10
        Largest goal count considered for each side.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(max_goals + 1, max_goals + 1)``; entry ``[i, j]`` is
        the product of the Poisson probabilities of the home side scoring
        ``i`` goals and the away side scoring ``j`` goals.
    """
    # imported here so the function does not depend on names that the
    # notebook's import cell never brings in
    import numpy as np
    from scipy.stats import poisson

    def _expected_goals(team, opponent, home):
        # one-row frame in the layout the model's predict is called with
        fixture = pd.DataFrame(data={'team': team, 'opponent': opponent, 'home': home},
                               index=[1])
        # use the model passed in by the caller (the undefined global
        # `foot_model` was referenced here before)
        return model.predict(fixture).values[0]

    home_goals_avg = _expected_goals(homeTeam, awayTeam, 1)
    away_goals_avg = _expected_goals(awayTeam, homeTeam, 0)

    goal_counts = np.arange(max_goals + 1)
    home_probs = poisson.pmf(goal_counts, home_goals_avg)
    away_probs = poisson.pmf(goal_counts, away_goals_avg)
    # the outer product pairs every home goal count with every away goal count
    return np.outer(home_probs, away_probs)
# Score matrix for Chelsea (home) vs Sunderland (away), 0-3 goals per side;
# rows are home goals, columns are away goals. Uses the fitted poisson_model.
simulate_match(poisson_model, 'Chelsea', 'Sunderland', max_goals=3)

In [20]:
!pip install sklearn

Collecting sklearn
  Downloading https://files.pythonhosted.org/packages/1e/7a/dbb3be0ce9bd5c8b7e3d87328e79063f8b263b2b1bfa4774cb1147bfcd3f/sklearn-0.0.tar.gz
Collecting scikit-learn (from sklearn)
  Downloading https://files.pythonhosted.org/packages/ba/23/d66465912a1c7d84e5215903dcf89ade2a835f852f2941c66ed7fd176d58/scikit_learn-0.20.0-cp36-cp36m-win32.whl (4.3MB)
Collecting scipy>=0.13.3 (from scikit-learn->sklearn)
  Downloading https://files.pythonhosted.org/packages/30/2a/8bd20295c774e3f19b5f8b71d75ef7e802673852ca3ae2e1d231d0f1c7a2/scipy-1.1.0-cp36-none-win32.whl (26.3MB)
Installing collected packages: scipy, scikit-learn, sklearn
  Running setup.py install for sklearn: started
    Running setup.py install for sklearn: finished with status 'done'
Successfully installed scikit-learn-0.20.0 scipy-1.1.0 sklearn-0.0
