In [176]:
import json
import os
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import requests
import xlsxwriter

pd.set_option('display.max_columns', 500)

# Run timestamp (local time, minute resolution); the export helpers prefix file names with it.
date = f"{datetime.now():%Y%m%d-%H%M}"
print(date)

20221107-1647


In [177]:
# create function to export to csv
def export_csv(df, filename):
    """Write ``df`` to ``../../output/<date>-<filename>.csv`` as UTF-8.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to export; its index is written as well (``to_csv`` default).
    filename : str
        Logical name of the export; the module-level ``date`` stamp is
        prepended and ``.csv`` appended.

    Prints ``Exported <filename>`` once the file has been written.
    """
    output_path = Path(f'../../output/{date}-{filename}.csv')
    # to_csv does not create missing folders, so make sure the target exists
    # instead of failing with an OSError on a fresh checkout.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, encoding='utf-8')
    print(f"Exported {filename}")

In [178]:
# create function to export to excel as a table
def export_excel(df, sheet_name):
    """Write ``df`` to ``../../output/<date>-output.xlsx`` as a formatted Excel table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to export. Its index is exported too, as the leading column(s)
        of the table.
    sheet_name : str
        Name of the worksheet that receives the table.

    Notes
    -----
    A new writer is opened on the same path for every call, so each call
    replaces the workbook rather than adding a sheet to it.
    """
    # Turn the index into ordinary columns so the table range and the header
    # list describe exactly the cells that are written (previously the index
    # occupied column 0 while the headers/range only accounted for df.columns).
    table = df.reset_index()
    if isinstance(table.columns, pd.MultiIndex):
        # Excel table headers are single strings: join the non-empty levels.
        table.columns = [
            ' '.join(str(level) for level in column if str(level) != '')
            for column in table.columns
        ]
    headers = [{'header': str(column)} for column in table.columns]

    # The context manager saves and closes the file; ExcelWriter.save() was
    # deprecated in pandas 1.5 and removed in pandas 2.0.
    with pd.ExcelWriter(f'../../output/{date}-output.xlsx', engine='xlsxwriter') as writer:
        table.to_excel(writer, sheet_name=sheet_name, index=False)
        worksheet = writer.sheets[sheet_name]
        # add_table needs at least one data row below the header row.
        last_row = max(len(table), 1)
        worksheet.add_table(0, 0, last_row, len(table.columns) - 1, {'columns': headers})


In [179]:
# Columns retained from the normalised match records.
# Fields deliberately left out for now (add them to the list to keep them):
#   season.id, season.currentMatchday,
#   homeTeam.id, homeTeam.name, awayTeam.id, awayTeam.name,
#   score.winner, score.halfTime.home, score.halfTime.away
keep_cols = [
    'id', 'utcDate', 'status', 'matchday',
    'homeTeam.shortName', 'awayTeam.shortName',
    'score.fullTime.home', 'score.fullTime.away',
]

In [180]:
# Finished Premier League matches from the football-data.org v4 API (status=FINISHED filter).
# Older v2 query kept for reference:
# uri = 'https://api.football-data.org/v2/competitions/PL/matches?dateFrom=2021-08-01'

# HTTPS so the auth token is not sent in clear text.
uri = 'https://api.football-data.org/v4/competitions/PL/matches?status=FINISHED'

# The API token is a credential: read it from the environment instead of
# committing it with the notebook.
# NOTE(review): the token that used to be hard-coded here has been exposed and should be revoked.
api_token = os.environ.get('FOOTBALL_DATA_API_TOKEN')
if not api_token:
    raise RuntimeError('Set the FOOTBALL_DATA_API_TOKEN environment variable before running this cell')
headers = { 'X-Auth-Token': api_token }

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(uri, headers=headers, timeout=30)
# Surface HTTP errors (bad token, rate limit, ...) here rather than as a
# confusing failure when the 'matches' key is missing below.
response.raise_for_status()
data = response.json()
df = pd.json_normalize(data, record_path = ['matches'])
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the full frame (SettingWithCopyWarning).
df = df[keep_cols].copy()
df['utcDate'] = pd.to_datetime(df['utcDate'])
# human-readable "home-away" full-time score, e.g. "2-1"
df['score.fullTime'] = df['score.fullTime.home'].astype(str) + '-' + df['score.fullTime.away'].astype(str)
print(df.shape)
df.head()


(136, 9)


Unnamed: 0,id,utcDate,status,matchday,homeTeam.shortName,awayTeam.shortName,score.fullTime.home,score.fullTime.away,score.fullTime
0,416384,2022-08-05 19:00:00+00:00,FINISHED,1,Crystal Palace,Arsenal,0,2,0-2
1,416383,2022-08-06 11:30:00+00:00,FINISHED,1,Fulham,Liverpool,2,2,2-2
2,416378,2022-08-06 14:00:00+00:00,FINISHED,1,Tottenham,Southampton,4,1,4-1
3,416379,2022-08-06 14:00:00+00:00,FINISHED,1,Newcastle,Nottingham,2,0,2-0
4,416381,2022-08-06 14:00:00+00:00,FINISHED,1,Leeds United,Wolverhampton,2,1,2-1


In [181]:
# Results grid: one row per home team, one column per away team, each cell the
# full-time score of that fixture ('.' where no finished match exists).
df_table = df.pivot_table(
    values=['score.fullTime'],
    index=['homeTeam.shortName'],
    columns=['awayTeam.shortName'],
    aggfunc='first',
    fill_value='.',
)
# Persist the grid as a dated CSV.
export_csv(df_table, "table")
# export_excel(df_table, "table")

df_table.head()

Exported table


Unnamed: 0_level_0,score.fullTime,score.fullTime,score.fullTime,score.fullTime,score.fullTime,score.fullTime,score.fullTime,score.fullTime,score.fullTime,score.fullTime,score.fullTime,score.fullTime,score.fullTime,score.fullTime,score.fullTime,score.fullTime,score.fullTime,score.fullTime,score.fullTime,score.fullTime
awayTeam.shortName,Arsenal,Aston Villa,Bournemouth,Brentford,Brighton Hove,Chelsea,Crystal Palace,Everton,Fulham,Leeds United,Leicester City,Liverpool,Man City,Man United,Newcastle,Nottingham,Southampton,Tottenham,West Ham,Wolverhampton
homeTeam.shortName,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
Arsenal,.,2-1,.,.,.,.,.,.,2-1,.,4-2,3-2,.,.,.,5-0,.,3-1,.,.
Aston Villa,.,.,.,4-0,.,0-2,.,2-1,.,.,.,.,1-1,3-1,.,.,1-0,.,0-1,.
Bournemouth,0-3,2-0,.,0-0,.,.,.,.,.,.,2-1,.,.,.,.,.,0-1,2-3,.,0-0
Brentford,0-3,.,.,.,2-0,0-0,.,1-1,.,5-2,.,.,.,4-0,.,.,.,.,.,1-1
Brighton Hove,.,.,.,.,.,4-1,.,.,.,1-0,5-2,.,.,.,0-0,0-0,.,0-1,.,.


## Create Matches DataFrame with a row for each team 

In [182]:
# League points awarded for a win / draw / loss.
points_map = dict(W=3, D=1, L=0)

def get_result(score, score_opp):
    """Classify a match from one team's point of view.

    Returns 'W' when ``score`` is greater than ``score_opp``, 'D' when the two
    are equal, and 'L' in every other case.
    """
    if score > score_opp:
        return 'W'
    if score == score_opp:
        return 'D'
    return 'L'

# match-level columns that are redundant once every row describes a single team
drop_cols = ['homeTeam.shortName','awayTeam.shortName', 'score.fullTime.home', 'score.fullTime.away']

# convert each match into two rows (one for each team)
# The H/A helper columns only exist on the temporary frame handed to melt, so
# the shared df is not modified by this cell.
df_matches = pd.melt(
    df.assign(H=df['homeTeam.shortName'], A=df['awayTeam.shortName']),
    id_vars=keep_cols,
    value_vars=["H", "A"],
    var_name="home_away",
    value_name="team",
)

# Use the explicit home/away flag for every per-team column instead of
# comparing team names.
is_home = df_matches['home_away'] == 'H'

df_matches['opponent'] = np.where(
    is_home,
    df_matches['awayTeam.shortName'],
    df_matches['homeTeam.shortName'],
)

# full time goals
df_matches["goals_scored"] = np.where(
    is_home,
    df_matches["score.fullTime.home"],
    df_matches["score.fullTime.away"],
)
df_matches["goals_against"] = np.where(
    is_home,
    df_matches["score.fullTime.away"],
    df_matches["score.fullTime.home"],
)
# otypes is given explicitly because np.vectorize cannot infer the output type
# from an empty input and would raise if no matches were returned.
df_matches["result"] = np.vectorize(get_result, otypes=[str])(
    df_matches["goals_scored"], df_matches["goals_against"]
)
df_matches["points"] = df_matches["result"].map(points_map)


# order chronologically (kick-off time, then match id) so that the last rows
# of each team are its most recent matches
df_matches = (df_matches
    .drop(drop_cols, axis=1)
    .sort_values(by=['utcDate', 'id'], ascending=True)
    .reset_index(drop=True)
    )

# export to csv using function
export_csv(df_matches, "matches")
# export_excel(df_matches, "matches")

df_matches.head()

Exported matches


Unnamed: 0,id,utcDate,status,matchday,home_away,team,opponent,goals_scored,goals_against,result,points
0,416384,2022-08-05 19:00:00+00:00,FINISHED,1,H,Crystal Palace,Arsenal,0,2,L,0
1,416384,2022-08-05 19:00:00+00:00,FINISHED,1,A,Arsenal,Crystal Palace,2,0,W,3
2,416383,2022-08-06 11:30:00+00:00,FINISHED,1,H,Fulham,Liverpool,2,2,D,1
3,416383,2022-08-06 11:30:00+00:00,FINISHED,1,A,Liverpool,Fulham,2,2,D,1
4,416378,2022-08-06 14:00:00+00:00,FINISHED,1,H,Tottenham,Southampton,4,1,W,3


## Create league table

In [187]:
def league(x, season_games=38, form_games=5, points_per_win=3):
    """Summarise one team's match rows into a single league-table row.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of one team with the columns 'result' ('W'/'D'/'L'),
        'goals_scored', 'goals_against' and 'points'. The form columns read
        the last rows, so the rows are expected in chronological order.
    season_games : int, default 38
        Matches each team plays in a full season.
    form_games : int, default 5
        Number of most recent matches that make up the form columns.
    points_per_win : int, default 3
        Points awarded for a win; used for the maximum attainable points.

    Returns
    -------
    pandas.Series
        Counts, goal and point totals, the form string, and the per-game and
        percentage figures (those are pre-formatted strings).
    """
    outcome = x['result']
    # Aggregate once and reuse, instead of recomputing the sums in every entry.
    played = outcome.count()
    wins = (outcome == 'W').sum()
    goals_for = x['goals_scored'].sum()
    goals_against = x['goals_against'].sum()
    goal_diff = goals_for - goals_against
    points = x['points'].sum()
    remaining = season_games - played
    # With fewer than form_games matches played, the best attainable form is
    # capped by the matches actually played; dividing by the full window
    # would understate early-season form.
    form_possible = points_per_win * min(played, form_games)

    results = {
        'played': played,
        'wins': wins,
        'draws': (outcome == 'D').sum(),
        'losses': (outcome == 'L').sum(),
        'goalsFor': goals_for,
        'goalsAgainst': goals_against,
        'goalDiff': goal_diff,
        'points': points,
        # most recent results, oldest first
        'form': ''.join(outcome.tail(form_games).tolist()),
        # win %
        'win%': '{:.0%}'.format(wins / played),
        # percentage of attainable points taken from the most recent games
        'form%': '{:.0%}'.format(x['points'].tail(form_games).sum() / form_possible),
        # goals per game
        'goalsScoredPg': '{:.2f}'.format(goals_for / played),
        # goals against per game
        'goalsAgainstPg': '{:.2f}'.format(goals_against / played),
        # goal difference per game
        'goalDifferencePg': '{:.2f}'.format(goal_diff / played),
        # points per game
        'pointsPg': '{:.2f}'.format(points / played),
        # maximum possible points: win every remaining game of the season
        'maxPoints': (remaining * points_per_win) + points,
        # expected points, extrapolating the current points per game
        'expectedPoints': '{:.0f}'.format((points / played) * remaining + points),
    }
    return pd.Series(results)

# One summary row per team, ordered best-first by points, then goal difference.
df_league = (
    df_matches
    .groupby('team')
    .apply(league)
    .sort_values(by=['points', 'goalDiff'], ascending=False)
)
# Position as an integer rank on points; method='first' breaks ties by row
# order, i.e. by the goal-difference ordering applied above.
df_league['position'] = (
    df_league['points']
    .rank(ascending=False, method='first')
    .astype(int)
)

# Previous week's league table
def leaguePrev(x):
    """Reduce one team's match rows to the two figures used to rank it.

    Returns a Series with 'pointsPrev' (sum of 'points') and 'goalDiffPrev'
    (sum of 'goals_scored' minus sum of 'goals_against').
    """
    total_points = x['points'].sum()
    goal_difference = x['goals_scored'].sum() - x['goals_against'].sum()
    return pd.Series(
        [total_points, goal_difference],
        index=['pointsPrev', 'goalDiffPrev'],
    )

# Matches that count towards the previous table: everything except the rows
# whose matchday equals the highest matchday present.
df_matchesPrev = df_matches[df_matches['matchday'] != df_matches['matchday'].max()]

# Previous table, built and ranked the same way as the current one
# (points first, goal difference as tie-break).
df_leaguePrev = (
    df_matchesPrev
    .groupby('team')
    .apply(leaguePrev)
    .sort_values(by=['pointsPrev', 'goalDiffPrev'], ascending=False)
)
df_leaguePrev['positionPrev'] = (
    df_leaguePrev['pointsPrev']
    .rank(ascending=False, method='first')
    .astype(int)
)

# Attach the previous figures to the current table (index = team).
df_league = df_league.join(df_leaguePrev, how='left')

# Movement marker: a larger position number than before means the team dropped.
df_league['positionChange'] = np.where(
    df_league['position'] > df_league['positionPrev'],
    '▼',
    np.where(df_league['position'] < df_league['positionPrev'], '▲', '='),
)

# The previous-table columns were only needed for the comparison.
df_league = df_league.drop(['pointsPrev', 'goalDiffPrev', 'positionPrev'], axis=1)

# export to csv using function
export_csv(df_league, "league")

# show the table (up to 20 rows)
df_league.head(20)


Exported league


Unnamed: 0_level_0,played,wins,draws,losses,goalsFor,goalsAgainst,goalDiff,points,form,win%,form%,goalsScoredPg,goalsAgainstPg,goalDifferencePg,pointsPg,maxPoints,expectedPoints,position,positionChange
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Arsenal,13,11,1,1,31,11,20,34,WWDWW,85%,87%,2.38,0.85,1.54,2.62,109,99,1,=
Man City,13,10,2,1,39,12,27,32,WLWWW,77%,80%,3.0,0.92,2.08,2.46,107,94,2,=
Newcastle,14,7,6,1,28,11,17,27,DWWWW,50%,87%,2.0,0.79,1.21,1.93,99,73,3,▲
Tottenham,14,8,2,4,27,18,9,26,WLLWL,57%,40%,1.93,1.29,0.64,1.86,98,71,4,▼
Man United,13,7,2,4,18,19,-1,23,DWDWL,54%,53%,1.38,1.46,-0.08,1.77,98,67,5,=
Brighton Hove,13,6,3,4,22,17,5,21,LDLWW,46%,47%,1.69,1.31,0.38,1.62,96,61,6,▲
Chelsea,13,6,3,4,17,16,1,21,WDDLL,46%,33%,1.31,1.23,0.08,1.62,96,61,7,▼
Liverpool,13,5,4,4,25,16,9,19,WWLLW,38%,60%,1.92,1.23,0.69,1.46,94,56,8,▲
Fulham,14,5,4,5,23,24,-1,19,DWWDL,36%,53%,1.64,1.71,-0.07,1.36,91,52,9,▼
Crystal Palace,13,5,4,4,15,17,-2,19,DWLWW,38%,67%,1.15,1.31,-0.15,1.46,94,56,10,=


In [None]:
# Pivot df_matches to show each team's result by matchday
# ('.' where the team has no finished match for that matchday).
df_matchday = df_matches.pivot_table(
    values=['result'],
    index=['team'],
    columns=['matchday'],
    aggfunc='first',
    fill_value='.',
)

df_matchday.head(20)

Unnamed: 0_level_0,result,result,result,result,result,result,result,result,result,result,result,result,result,result
matchday,1,2,3,4,5,6,8,9,10,11,12,13,14,15
team,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
Arsenal,W,W,W,W,W,L,W,W,W,W,.,D,W,W
Aston Villa,L,W,L,L,L,D,W,D,D,L,L,W,L,W
Bournemouth,W,L,L,L,D,W,D,D,W,D,L,L,L,L
Brentford,D,W,L,D,D,W,L,D,L,W,D,L,D,D
Brighton Hove,W,D,W,W,L,W,.,D,L,L,D,L,W,W
Chelsea,W,D,L,W,L,W,.,W,W,W,D,D,L,L
Crystal Palace,L,D,W,L,D,D,.,L,W,D,W,L,W,W
Everton,L,L,D,D,D,D,W,W,L,L,L,W,D,L
Fulham,D,D,W,L,W,L,W,L,L,D,W,W,D,L
Leeds United,W,D,W,L,D,L,.,D,L,L,L,L,W,W
