## Web Scraping

The following code scrapes data from the Premier League website. It collects a total of 42 stats (some of which may need to be discarded) for each team from season 2006-2007 until 2017-2018, for an approximate total of 504 web requests. These stats will serve as the features of a dataset that will be fed into an ANN and an RNN.

In [None]:
import os
import json
import requests
import progressbar
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

## Page

In [None]:
# Download the club stats page; `soup` is the parsed HTML that the stat links
# are selected from.
webpage = requests.get('https://www.premierleague.com/stats/top/clubs/wins?se=79', timeout=30)
# Fail loudly on an HTTP error rather than parsing an error page.
webpage.raise_for_status()
soup = BeautifulSoup(webpage.text, 'html.parser')

# Create each output directory on its own: checking only for 'files' would
# skip creating a missing sub-directory whenever 'files' already exists.
os.makedirs('files/stats', exist_ok=True)
os.makedirs('files/results', exist_ok=True)

## Links

**select** is used instead of find_all since it allows the use of CSS selectors <br>

------

**General** <br>
*index 0-5* <br>
wins, losses, goals, yellow cards, red cards, substitutions on

**Attack** <br>
*index 6-15* <br>
shots, shots on target, hit woodwork, goals from header, goals from penalty, goals from free kick, goals from inside box, goals from outside box, goals from counter attack, offsides

**Defence** <br>
*index 16-29* <br>
clean sheets, goals conceded, saves, blocks, interceptions, tackles, last man tackles, clearances, headed clearances, caught opponent offside, own goals, penalties conceded, goals conceded from penalty, fouls

**Team Play** <br>
*index 30-35* <br>
passes, through balls, long passes, backwards passes, crosses, corners taken

**Others** <br>
*index 36-42* <br>
*non-duplicate attributes from top, i.e. those that don't appear in more* <br>
touches, big chances missed, clearances off line, dispossessed, penalties saved, high claims, punches

In [None]:
def attributes(links):
    """Return, for each link, the text that follows its final '/'.

    A link without any '/' is returned unchanged; a link ending in '/'
    yields an empty string.
    """
    return [href.rsplit('/', 1)[-1] for href in links]

def uniques(links):
    """Return the items of `links` with duplicates removed.

    The first occurrence of each item is kept and the original order is
    preserved. Items must be hashable (the links handled here are strings).
    """
    # dict keys are unique and keep insertion order, so this de-duplicates in
    # a single pass instead of re-scanning a list for every item.
    return list(dict.fromkeys(links))

# hrefs of the anchors matched by each selector on the parsed page
top = [anchor['href'] for anchor in soup.select('a.topStatsLink')]
more = [anchor['href'] for anchor in soup.select('nav.moreStatsMenu a')]

# Stat names in scrape order: every name from `more` first, followed by the
# names from `top` that have not already appeared.
links = uniques(attributes(more + top))

## Dates

Data is collected starting from the 2006/2007 season, since detailed and consistent stats have been recorded from then onwards.

In [None]:
# Season label -> the id sent as the `compSeasons` request parameter.
# The labels are also used as the per-season CSV file names.
dates = {
    '2006-2007': 15,
    '2007-2008': 16,
    '2008-2009': 17,
    '2009-2010': 18,
    '2010-2011': 19,
    '2011-2012': 20,
    '2012-2013': 21,
    '2013-2014': 22,
    '2014-2015': 27,
    '2015-2016': 42,
    '2016-2017': 54,
    '2017-2018': 79,
    '2018-2019': 210,
}

## Data

### Stats

In [None]:
# For every season, request each stat in `links` from the stats API, collect
# the per-team values into one DataFrame (teams as rows, stats as columns)
# and write it to files/stats/<season>.csv.
for date, season_id in dates.items():

    df = pd.DataFrame()
    bar = progressbar.ProgressBar(maxval=len(links), widgets=[date + '\t', progressbar.Bar('-', '[', ']'), ' ', progressbar.Percentage()])
    bar.start()
    for i, attribute in enumerate(links, start=1):

        # setup
        api = 'https://footballapi.pulselive.com/football/stats/ranked/teams/' + attribute
        headers = {'Origin': 'https://www.premierleague.com'}
        params = {'page': '0', 'pageSize': '20', 'compSeasons': season_id, 'comps': '1', 'altIds': 'true'}

        # request
        # A timeout keeps one stalled request from hanging the whole scrape,
        # and raise_for_status surfaces HTTP errors before the body is parsed.
        response = requests.get(api, params=params, headers=headers, timeout=30)
        response.raise_for_status()
        data = json.loads(response.text)

        # parse
        content = data['stats']['content']
        teams = [team['owner']['name'] for team in content]
        values = [team['value'] for team in content]
        series = pd.Series(values, index=teams, dtype=float, name=attribute)
        if df.index.empty:
            # the first non-empty stat fixes the set of team rows
            df = series.to_frame()
        else:
            # join aligns on team name; teams missing from this stat get NaN
            df = df.join(series)

        # progress
        bar.update(i)

    bar.finish()
    # drop stats with no value for any team this season, then treat the
    # remaining gaps as 0
    df.dropna(axis=1, how='all', inplace=True)
    df.fillna(0, inplace=True)
    df.to_csv('files/stats/' + date + '.csv')

#### Validation

In [None]:
# Reload each season's stats CSV (in `dates` order) and report how many stat
# columns it ended up with.
datasets = []
for date in dates:
    dataset = pd.read_csv(f'files/stats/{date}.csv', index_col=0)
    print(f'{date}\t{dataset.shape[1]}')
    datasets.append(dataset)

The above shows the number of statistics collected for each team in each of the listed seasons. The counts only appear to be consistent from season 2010-2011 onwards, where the same number of stats is collected every season. To validate that, we'll check whether those seasons also contain exactly the same stats.

In [None]:
# Compare the column lists of every pair of datasets with indices 4..11 and
# record the index pairs whose columns differ (names or order).
# NOTE(review): the upper bound 12 leaves out index 12, the 13th entry of
# `dates` (2018-2019) - confirm that this exclusion is intended.
mismatches = []

for i in range(4, 12):
    reference = datasets[i].columns.tolist()
    for j in range(i + 1, 12):
        if reference != datasets[j].columns.tolist():
            mismatches.append((i, j))

print('Invalid Set' if mismatches else 'Valid Set')

### Results

In [None]:
def get_team_ids(date):
    """Return the ids of the teams in a season as one comma-separated string.

    Parameters
    ----------
    date : str
        Season label; must be a key of the module-level `dates` mapping.

    Returns
    -------
    str
        The integer team ids joined by ',' in the order the API lists them.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    # setup
    api = 'https://footballapi.pulselive.com/football/compseasons/' + str(dates[date]) + '/teams'
    headers = {'Origin': 'https://www.premierleague.com'}

    # request
    # The timeout stops a stalled connection from blocking forever, and
    # raise_for_status reports HTTP errors before the body is parsed.
    response = requests.get(api, headers=headers, timeout=30)
    response.raise_for_status()
    teams = json.loads(response.text)

    # parse
    # int() normalises each id before it is rendered into the string
    return ','.join(str(int(team['id'])) for team in teams)
    
def get_results(date, team_ids):
    """Fetch the completed fixtures of a season as a DataFrame.

    Parameters
    ----------
    date : str
        Season label; must be a key of the module-level `dates` mapping.
    team_ids : str
        Comma-separated team ids, sent as the `teams` request parameter.

    Returns
    -------
    pandas.DataFrame
        One row per fixture with the columns home_team, away_team,
        home_goals, away_goals and result. The first entry of a fixture's
        `teams` list is recorded as the home side, the second as the away
        side.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    # setup
    api = 'https://footballapi.pulselive.com/football/fixtures'
    headers = {'Origin': 'https://www.premierleague.com'}
    params = {'comps':'1', 'compSeasons':dates[date], 'teams':team_ids, 'page':'0', 'pageSize':'380', 'sort':'asc', 'statuses':'C', 'altIds':'true'}

    # request
    response = requests.get(api, params=params, headers=headers, timeout=30)
    response.raise_for_status()
    results = json.loads(response.text)

    # parse
    # Rows are gathered in a plain list and the frame is built once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    columns = ['home_team', 'away_team', 'home_goals', 'away_goals', 'result']
    rows = []
    for result in results['content']:
        home, away = result['teams'][0], result['teams'][1]
        rows.append([
            home['team']['name'],
            away['team']['name'],
            home['score'],
            away['score'],
            result['outcome'],
        ])

    return pd.DataFrame(rows, columns=columns)

# Download the results of every season and store them as
# files/results/<season>.csv, with one progress tick per season.
bar = progressbar.ProgressBar(
    maxval=len(dates),
    widgets=['', '\t', progressbar.Bar('-', '[', ']'), ' ', progressbar.Percentage()],
)
bar.start()
for done, date in enumerate(dates, start=1):
    # the first widget is the label; show the season being processed
    bar.widgets[0] = date
    results = get_results(date, get_team_ids(date))
    bar.update(done)
    results.to_csv('files/results/' + date + '.csv')
bar.finish()

### Concatenation

Concatenating the discrete sets for each season into one big set that contains all seasons - one for stats and another for results

In [None]:
# Per-season CSV file names; the part before '.csv' is the season label.
files = ['2006-2007.csv', '2007-2008.csv', '2008-2009.csv', '2009-2010.csv', '2010-2011.csv', '2011-2012.csv', '2012-2013.csv', '2013-2014.csv', '2014-2015.csv', '2015-2016.csv', '2016-2017.csv', '2017-2018.csv', '2018-2019.csv']

stats_frames = []
results_frames = []
for name in files:
    season = name[:-4]  # strip the '.csv' suffix

    # Stats
    stats_season = pd.read_csv('files/stats/' + name, index_col=False)
    columns = stats_season.columns.tolist()
    columns[0] = 'team'  # the first column holds the team names
    stats_season.columns = columns
    # A scalar assignment labels exactly the rows that exist. A fixed-length
    # Series would pad a shorter season with all-NaN rows and leave the extra
    # rows of a longer one without a season label.
    stats_season['season'] = season
    stats_frames.append(stats_season)

    # Results
    # index_col=0 consumes the saved row index, so it never becomes a data
    # column that has to be dropped later.
    results_season = pd.read_csv('files/results/' + name, index_col=0)
    results_season['season'] = season
    results_frames.append(results_season)

# Concatenate once at the end instead of growing the frames inside the loop.
stats_df = pd.concat(stats_frames)
# keep only the columns of the last season read, in that season's order
stats_df = stats_df[stats_season.columns.tolist()]
stats_df.to_csv('files/stats/stats.csv', index=False)

results_df = pd.concat(results_frames)
results_df.to_csv('files/results/results.csv', index=False)