In [242]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
import os
from datetime import date

In [243]:
# Root URL of the understat.com league pages; fetch_data appends
# '/<league>/<year>' to it.
base_url = 'https://understat.com/league'

# League identifiers as they are spelled in the understat URL path.
la_liga = 'La_liga'
epl = 'EPL'
bundesliga = 'Bundesliga'
serie_a = 'Serie_A'
ligue_1 = 'Ligue_1'

leagues = [la_liga, epl, bundesliga, serie_a, ligue_1]

# The directory in which all of the league json data is stored.
league_data_directory = 'league_data'
# makedirs(exist_ok=True) creates the directory only when it is missing and
# avoids the check-then-create race of os.path.exists() + os.mkdir().
os.makedirs(league_data_directory, exist_ok=True)

In [254]:
def fetch_data(var_name, league, year):
    """Fetch a JSON value embedded in a <script> tag of an understat league page.

    The page at ``base_url/<league>/<year>`` is downloaded, its <script> tags
    are searched for ``var_name``, and the text found between the first
    ``('`` after the variable name and the following ``')`` is un-escaped and
    parsed as JSON.

    Args:
        var_name: Name of the script variable to extract (e.g. 'teamsData').
        league: League identifier used in the URL path (e.g. 'EPL').
        year: Season year used in the URL path; converted with ``str``.

    Returns:
        The value produced by ``json.loads`` for the embedded data.

    Raises:
        requests.HTTPError: If the page request returns an error status.
        ValueError: If no <script> tag mentions ``var_name``, if the
            ``('...')`` literal cannot be located, or if the literal is not
            valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    url = base_url + '/' + league + '/' + str(year)
    # A timeout keeps a stalled connection from hanging the whole run.
    res = requests.get(url, timeout=30)
    # Fail on HTTP errors here instead of trying to parse an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.content, "lxml")

    # Data is stored in a variable inside a script tag.
    string_with_json_obj = None
    for el in soup.find_all('script'):
        script_text = str(el)
        if var_name in script_text:
            # No break: when several scripts mention var_name, the last wins.
            string_with_json_obj = script_text.strip()

    if string_with_json_obj is None:
        raise ValueError(
            "variable '" + var_name + "' not found in any <script> tag of " + url
        )

    # Anchor the search at the variable so that an earlier "('" in the same
    # script cannot be mistaken for the start of the data.
    var_pos = string_with_json_obj.index(var_name)
    ind_start = string_with_json_obj.index("('", var_pos) + 2
    ind_end = string_with_json_obj.index("')", ind_start)
    json_data = string_with_json_obj[ind_start:ind_end]
    # Presumably the literal is backslash-escaped in the page source;
    # unicode_escape turns those escapes back into plain characters.
    json_data = json_data.encode('utf8').decode('unicode_escape')
    return json.loads(json_data)

def fetch_season_data(league, year):
    """Fetch season data for the given league and year from understat.com.

    Returns a json object keyed by team id; each value is another json object
    holding the team id, the title (team name) and the history (a list of all
    of the team's games from that year, along with associated data).
    """
    teams_variable = 'teamsData'
    return fetch_data(teams_variable, league, year)

def get_team_name(teams, team_id):
    """Return the 'name' of the first entry in teams whose 'id' equals team_id.

    Returns None when no entry matches.
    """
    matching_names = (entry['name'] for entry in teams if entry['id'] == team_id)
    return next(matching_names, None)

def get_all_fixtures():
    """Download all fixtures from the Fantasy Premier League API.

    Two endpoints are queried: 'bootstrap-static' for the list of teams and
    'fixtures' for the fixture list. Team ids in each fixture are translated
    to team names with get_team_name.

    Returns:
        A list of dicts, one per fixture, with keys 'gw' (the fixture's
        'event' value), 'home_team' and 'away_team' (team names, or None when
        the id is not in the team list).

    Raises:
        requests.HTTPError: If either request returns an error status.
    """
    # Named distinctly so the module-level understat base_url is not shadowed.
    bootstrap_url = 'https://fantasy.premierleague.com/api/bootstrap-static/'
    response = requests.get(bootstrap_url, timeout=30)
    # Fail on HTTP errors here instead of on a confusing JSON/KeyError later.
    response.raise_for_status()
    teams = response.json()['teams']

    fixtures_url = 'https://fantasy.premierleague.com/api/fixtures/'
    response = requests.get(fixtures_url, timeout=30)
    response.raise_for_status()

    return [
        {
            'gw': fixture['event'],
            'home_team': get_team_name(teams, fixture['team_h']),
            'away_team': get_team_name(teams, fixture['team_a']),
        }
        for fixture in response.json()
    ]

def get_gw_fixtures(fixtures, gw):
    """Return the fixtures whose 'gw' value equals gw, in their original order."""
    selected = []
    for fixture in fixtures:
        if fixture['gw'] == gw:
            selected.append(fixture)
    return selected


def fetch_fixtures(league, year):
    """Fetch the 'datesData' variable for the given league and year via fetch_data."""
    dates_variable = 'datesData'
    return fetch_data(dates_variable, league, year)

def clean_fixtures(fixtures):
    """Replace abbreviated team names in fixtures with their long form, in place.

    Only the 'home_team' and 'away_team' values are touched, and only when
    they are one of the known abbreviations. Nothing is returned.
    """
    full_names = {
        'Man Utd': 'Manchester United',
        'Newcastle': 'Newcastle United',
        'Spurs': 'Tottenham',
        'Wolves': 'Wolverhampton Wanderers',
        'Man City': 'Manchester City',
    }
    for fixture in fixtures:
        for side in ('home_team', 'away_team'):
            replacement = full_names.get(fixture[side])
            if replacement is not None:
                fixture[side] = replacement

def fetch_pl_2021_fixtures():
    """Build the EPL 2021 fixture list annotated with xG and xP values.

    The fixture list from get_all_fixtures (with names normalised by
    clean_fixtures) is matched by home/away team name against the 'datesData'
    entries fetched for EPL 2021. Every matching fixture receives 'home_xG'
    and 'away_xG'. When the entry carries a 'forecast', 'home_xP' and
    'away_xP' are set to 3 * forecast['w'] + forecast['d'] and
    3 * forecast['l'] + forecast['d'] respectively (presumably win/draw/loss
    probabilities from the home side's view - confirm); otherwise both are
    None. Fixtures with no matching entry are returned without these keys.
    """
    dated_matches = fetch_data('datesData', epl, 2021)
    fpl_fixtures = get_all_fixtures()
    clean_fixtures(fpl_fixtures)

    for match in dated_matches:
        for fixture in fpl_fixtures:
            if (match['h']['title'] != fixture['home_team']
                    or match['a']['title'] != fixture['away_team']):
                continue

            fixture['home_xG'] = match['xG']['h']
            fixture['away_xG'] = match['xG']['a']

            if 'forecast' not in match:
                fixture['home_xP'] = None
                fixture['away_xP'] = None
                continue

            forecast = match['forecast']
            fixture['home_xP'] = float(forecast['w']) * 3 + float(forecast['d'])
            fixture['away_xP'] = float(forecast['l']) * 3 + float(forecast['d'])

    return fpl_fixtures

def get_league_directory(league_name, year):
    """Return the data directory for a league season, creating it if needed.

    The directory is '<league_data_directory>/<league_name>_<year>'.

    Args:
        league_name: League identifier (a string).
        year: Season year; converted with ``str``.

    Returns:
        The path of the (now existing) directory.
    """
    directory_name = league_name + '_' + str(year)
    directory_path = os.path.join(league_data_directory, directory_name)
    # makedirs also creates a missing parent directory, and exist_ok=True
    # removes the race between an existence check and the creation.
    os.makedirs(directory_path, exist_ok=True)
    return directory_path

def get_league_data_file(league_name, year):
    """Return the path of the season-data json file for a league season.

    The file is '<league_name>_data_<year>.json' inside the directory given
    by get_league_directory.
    """
    season_directory = get_league_directory(league_name, year)
    return os.path.join(
        season_directory,
        ''.join([league_name, '_data_', str(year), '.json']),
    )

def get_fixtures_file(league_name, year):
    """Return the path of the fixtures json file for a league season.

    The file is '<league_name>_fixtures_<year>.json' inside the directory
    given by get_league_directory.
    """
    season_directory = get_league_directory(league_name, year)
    return os.path.join(
        season_directory,
        ''.join([league_name, '_fixtures_', str(year), '.json']),
    )

def get_season_data(league_name, year):
    """Load a league season's data from its json file.

    Requires: the file exists and season data has been written to it.
    """
    data_path = get_league_data_file(league_name, year)
    with open(data_path) as data_file:
        season_data = json.load(data_file)
    return season_data

def get_fixtures(league_name, year):
    """Load a league season's fixtures from its json file.

    Requires: the file exists and fixtures have been written to it.
    """
    fixtures_path = get_fixtures_file(league_name, year)
    with open(fixtures_path) as fixtures_file:
        loaded_fixtures = json.load(fixtures_file)
    return loaded_fixtures

# team_info maps each team name to a dict holding the team's 'id' and its
# 'league_history' (a map from season year to the league played in that year).
team_info = {}


def update_team_info(season_data, league_name, year):
    """Record in the global team_info that every team in season_data played
    in league_name during year.

    A team seen for the first time gets a fresh entry with its id; a team
    already present only has its league history extended.
    """
    for team_id, team in season_data.items():
        # The title is the team name.
        name = team['title']
        entry = team_info.setdefault(name, {'id': team_id, 'league_history': {}})
        entry['league_history'][year] = league_name

def fetch_all_league_data():
    """Create the json files for every league's stats and fixtures for the
    seasons 2014/15 to 2021/22, if they do not already exist.

    Season data that is already on disk is loaded instead of fetched. Either
    way the team info is updated for each season.
    """
    for league_name in leagues:
        for year in range(2014, 2022):
            season_path = get_league_data_file(league_name, year)
            if os.path.exists(season_path):
                season_data = get_season_data(league_name, year)
            else:
                # File not found, so the data has to come from understat.
                print('fetching ' + league_name + ' data from ' + str(year), end=' ..... ')
                season_data = fetch_season_data(league_name, year)
                with open(season_path, 'w') as season_file:
                    json.dump(season_data, season_file)
                print('done')
            update_team_info(season_data, league_name, year)

            fixtures_path = get_fixtures_file(league_name, year)
            if os.path.exists(fixtures_path):
                continue
            print('fetching ' + league_name + ' fixtures from ' + str(year), end=' ..... ')
            season_fixtures = fetch_fixtures(league_name, year)
            with open(fixtures_path, 'w') as fixtures_file:
                json.dump(season_fixtures, fixtures_file)
            print('done')
                    
def update_fixture_data():
    """Re-download the current season's fixtures for every league.

    The season is the current calendar year, or the previous one when today
    falls in July or earlier. Each league's fixtures file is overwritten.
    """
    today = date.today()
    season = today.year - 1 if today.month <= 7 else today.year

    for league_name in leagues:
        target_path = get_fixtures_file(league_name, season)
        print('fetching ' + league_name + ' fixtures from ' + str(season), end=' ..... ')
        latest_fixtures = fetch_fixtures(league_name, season)
        with open(target_path, 'w') as target_file:
            json.dump(latest_fixtures, target_file)
        print('done')
    
    

# Make sure every league/season has its json files on disk, then re-download
# the fixtures of the current season.
fetch_all_league_data()
update_fixture_data()

# Overwrite the EPL 2021 fixtures file with the list returned by
# fetch_pl_2021_fixtures(), so later loads of that file get its format.
pl_fixtures_file = get_fixtures_file(epl, 2021)
fixtures = fetch_pl_2021_fixtures()
with open(pl_fixtures_file, 'w') as f:
    json.dump(fixtures, f)

In [256]:
# Load the EPL 2021 fixtures from their json file.
fixtures = get_fixtures(epl, 2021)
def get_x_fixtures_before_gw(fixtures, team_name, x, gw):
    """Collect a team's stats from the x gameweeks that precede gw.

    A fixture is considered when its 'gw' lies in the half-open range
    [gw - x, gw) and team_name is its 'home_team' or 'away_team'.

    Args:
        fixtures: Iterable of fixture dicts with 'gw', 'home_team',
            'away_team' and, when stats are available, 'home_xG', 'away_xG',
            'home_xP' and 'away_xP'.
        team_name: Team whose fixtures are collected.
        x: Number of gameweeks to look back.
        gw: Gameweek that closes the window (exclusive).

    Returns:
        A list of dicts, in the order of fixtures, each with float values
        under 'xG' (the team's own xG), 'xGA' (the other side's xG) and 'xP'.
        Fixtures that have no gameweek, or whose stats are missing or None,
        are skipped.
    """
    fixture_data = []
    for f in fixtures:
        fixture_gw = f['gw']
        # A fixture without a gameweek cannot be inside the window, and
        # comparing None with an int would raise TypeError.
        if fixture_gw is None or not (gw - x <= fixture_gw < gw):
            continue

        if f['home_team'] == team_name:
            stat_keys = ('home_xG', 'away_xG', 'home_xP')
        elif f['away_team'] == team_name:
            stat_keys = ('away_xG', 'home_xG', 'away_xP')
        else:
            continue

        raw_stats = [f.get(key) for key in stat_keys]
        # Skip fixtures whose stats are absent or None; float(None) would
        # raise and there is nothing to average for them.
        if any(value is None for value in raw_stats):
            continue

        xg, xga, xp = (float(value) for value in raw_stats)
        fixture_data.append({'xG': xg, 'xGA': xga, 'xP': xp})
    return fixture_data

def get_cs_score(fixtures, team_name, opponent, gw, num_fixtures=8):
    """Compute the CS score of team_name against opponent for gameweek gw.

    The score is the team's average xGA plus the opponent's average xG, both
    taken over the fixtures returned by get_x_fixtures_before_gw for the
    num_fixtures gameweeks before gw. ("CS" presumably stands for clean
    sheet - confirm.)

    Args:
        fixtures: Fixture dicts as accepted by get_x_fixtures_before_gw.
        team_name: Team the score is computed for.
        opponent: Team it faces.
        gw: Gameweek of the fixture; only earlier gameweeks are sampled.
        num_fixtures: Number of gameweeks to look back (default 8).

    Returns:
        The sum of the two averages, or NaN when either side has no fixtures
        in the window.
    """
    team_fixtures = get_x_fixtures_before_gw(fixtures, team_name, num_fixtures, gw)
    opponent_fixtures = get_x_fixtures_before_gw(fixtures, opponent, num_fixtures, gw)
    # np.mean([]) also yields NaN but emits a RuntimeWarning; returning NaN
    # explicitly gives the same result without the warning.
    if not team_fixtures or not opponent_fixtures:
        return np.nan
    team_xGA_average = np.mean([f['xGA'] for f in team_fixtures])
    opponent_xG_average = np.mean([f['xG'] for f in opponent_fixtures])
    return team_xGA_average + opponent_xG_average

def get_prem_gw_cs_scores(fixtures, gw):
    """Build a table of CS scores for every team playing in gameweek gw.

    Each fixture of the gameweek contributes two rows, one from the home
    side's view and one from the away side's. The returned DataFrame has the
    columns 'Team', 'Opposition' and 'CS Score', sorted ascending by
    'CS Score' with a fresh index.
    """
    rows = []
    for fixture in get_gw_fixtures(fixtures, gw):
        home, away = fixture['home_team'], fixture['away_team']
        home_score = get_cs_score(fixtures, home, away, gw)
        away_score = get_cs_score(fixtures, away, home, gw)
        rows.extend([
            [home, away, home_score],
            [away, home, away_score],
        ])
    table = pd.DataFrame(rows, columns=['Team', 'Opposition', 'CS Score'])
    return table.sort_values(by=['CS Score'], ignore_index=True)

# Display the CS score table for gameweek 2.
get_prem_gw_cs_scores(fixtures, 2)

Unnamed: 0,Team,Opposition,CS Score
0,Everton,Leeds,1.148943
1,Manchester United,Southampton,1.148943
2,Chelsea,Arsenal,1.345551
3,Brentford,Crystal Palace,1.345551
4,Wolverhampton Wanderers,Tottenham,1.728222
5,West Ham,Leicester,2.347342
6,Manchester City,Norwich,2.39344
7,Watford,Brighton,2.82248
8,Aston Villa,Newcastle United,3.02962
9,Crystal Palace,Brentford,3.07527
