# API data for premier league current season

## Set environment

In [2]:
import os
from pathlib import Path
import numpy as np
import pandas as pd

pd.set_option("display.max_columns", 500)
from datetime import datetime

from dotenv import load_dotenv

import xlsxwriter

import requests
import json

# current date
date = datetime.now().strftime("%Y%m%d-%H%M")
print(date)


20230209-1119


In [3]:
#Set the current environment
# env = 'development'

# #Load the appropriate .env file
# if env == 'development':
load_dotenv('../../.env')
# elif env == 'production':
#     load_dotenv('.env.prod')

True

### CSV export function

In [4]:
# create function to export to csv
def export_csv(df, filename):
    df.to_csv(f'../../output/{date}-{filename}.csv', encoding='utf-8', index=False)
    print(f"Exported {filename}")

### Excel Export function

In [5]:
# create function to export to excel as a table
def export_excel(df, sheet_name):
    # Create a Pandas Excel writer using XlsxWriter as the engine.
    writer = pd.ExcelWriter(f'../../output/{date}-output.xlsx', engine='xlsxwriter')
    # Convert the dataframe to an XlsxWriter Excel object.
    df.to_excel(writer, sheet_name=sheet_name, index=False)
    # Get the xlsxwriter workbook and worksheet objects.
    workbook = writer.book
    worksheet = writer.sheets[sheet_name]
    # format as a table
    worksheet.add_table(0, 0, len(df), len(df.columns)-1, {'columns': [{'header': column} for column in df.columns]})
    writer.save()


In [6]:
# Connect to SQL database with SqlAlchemy  
# import pyodbc
# import sqlalchemy as sql
# from sqlalchemy import create_engine

# server = os.environ['SERVER']
# database = os.environ['DATABASE']
# driver = os.environ['DRIVER']

# # create connection string
# conn_str = f"mssql+pyodbc://{server}/{database}?driver={driver}?trusted_connection=yes"
# # create engine
# engine = sql.create_engine(conn_str)

# # create connection
# conn = engine.connect()


## API call

### Key columns to keep

In [7]:
# keep columns
keep_cols = [
    'id',
    'utcDate',
    'status',
    'matchday',
    # 'season.id',
    # 'season.currentMatchday',
    # 'homeTeam.id',
    # 'homeTeam.name',
    'homeTeam.shortName',
    # 'awayTeam.id',
    # 'awayTeam.name',
    'awayTeam.shortName',
    # 'score.winner',
    'score.fullTime.home',
    'score.fullTime.away',
    # 'score.halfTime.home',
    # 'score.halfTime.away'
]

### API data call

Add scoreline to dataframe

In [8]:
# All of premier league history since 1992-08-15
# uri = 'https://api.football-data.org/v2/competitions/PL/matches?dateFrom=2021-08-01'


In [9]:

api_key = os.environ['API_AUTH_KEY']


In [10]:

uri = 'http://api.football-data.org/v4/competitions/PL/matches?status=FINISHED&season=2022'
headers = { 'X-Auth-Token':  api_key}

response = requests.get(uri, headers=headers)
data = response.json()
df = pd.json_normalize(data, record_path = ['matches'])
df = df[keep_cols]
df['utcDate'] = pd.to_datetime(df['utcDate'])
df['score.fullTime'] = df['score.fullTime.home'].astype(str) + '-' + df['score.fullTime.away'].astype(str)
print(df.shape)
df.head()


(210, 9)


Unnamed: 0,id,utcDate,status,matchday,homeTeam.shortName,awayTeam.shortName,score.fullTime.home,score.fullTime.away,score.fullTime
0,416384,2022-08-05 19:00:00+00:00,FINISHED,1,Crystal Palace,Arsenal,0,2,0-2
1,416383,2022-08-06 11:30:00+00:00,FINISHED,1,Fulham,Liverpool,2,2,2-2
2,416378,2022-08-06 14:00:00+00:00,FINISHED,1,Tottenham,Southampton,4,1,4-1
3,416379,2022-08-06 14:00:00+00:00,FINISHED,1,Newcastle,Nottingham,2,0,2-0
4,416381,2022-08-06 14:00:00+00:00,FINISHED,1,Leeds United,Wolverhampton,2,1,2-1


## Pivot results 

In [11]:
# pivot
df_table = df.pivot_table(index=['homeTeam.shortName'], columns=['awayTeam.shortName'], fill_value='.', values=['score.fullTime'], aggfunc='first')

df_table.head(20)

Unnamed: 0_level_0,score.fullTime,score.fullTime,score.fullTime,score.fullTime,score.fullTime,score.fullTime,score.fullTime,score.fullTime,score.fullTime,score.fullTime,score.fullTime,score.fullTime,score.fullTime,score.fullTime,score.fullTime,score.fullTime,score.fullTime,score.fullTime,score.fullTime,score.fullTime
awayTeam.shortName,Arsenal,Aston Villa,Bournemouth,Brentford,Brighton Hove,Chelsea,Crystal Palace,Everton,Fulham,Leeds United,Leicester City,Liverpool,Man City,Man United,Newcastle,Nottingham,Southampton,Tottenham,West Ham,Wolverhampton
homeTeam.shortName,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
Arsenal,.,2-1,.,.,.,.,.,.,2-1,.,4-2,3-2,.,3-2,0-0,5-0,.,3-1,3-1,.
Aston Villa,.,.,.,4-0,.,0-2,.,2-1,.,2-1,2-4,1-3,1-1,3-1,.,.,1-0,.,0-1,1-1
Bournemouth,0-3,2-0,.,0-0,.,.,0-2,3-0,.,.,2-1,.,.,.,.,1-1,0-1,2-3,.,0-0
Brentford,0-3,.,2-0,.,2-0,0-0,.,1-1,.,5-2,.,3-1,.,4-0,.,.,3-0,2-2,.,1-1
Brighton Hove,2-4,1-2,1-0,.,.,4-1,.,.,.,1-0,5-2,3-0,.,.,0-0,0-0,.,0-1,.,.
Chelsea,0-1,.,2-0,.,.,.,1-0,.,0-0,.,2-1,.,0-1,1-1,.,.,.,2-2,2-1,3-0
Crystal Palace,0-2,3-1,.,1-1,.,1-2,.,.,0-3,2-1,.,.,.,1-1,0-0,.,1-0,0-4,.,2-1
Everton,1-0,.,.,.,1-4,0-1,3-0,.,.,.,0-2,0-0,.,1-2,.,1-1,1-2,.,1-0,1-2
Fulham,.,3-0,2-2,3-2,2-1,2-1,.,0-0,.,.,.,2-2,.,1-2,1-4,.,2-1,0-1,.,.
Leeds United,0-1,0-0,4-3,0-0,.,3-0,.,1-1,2-3,.,.,.,1-3,.,.,.,.,.,2-2,2-1


In [12]:
# rename columns awayTeam.shortName to null

# df_table.columns = df_table.columns.droplevel(0)
# df_table.columns.name = None

# reset index
df_table2 = df_table.reset_index()

# drop first row
df_table2.columns = df_table2.columns.droplevel(0)
# rename column 2 to home team
df_table2.rename(columns={ df_table2.columns[0]: "homeTeam"}, inplace = True)
# drop index column



# export to csv using function
export_csv(df_table2, "table")
# export_excel(df_table, "table")


df_table2.head()

Exported table


awayTeam.shortName,homeTeam,Arsenal,Aston Villa,Bournemouth,Brentford,Brighton Hove,Chelsea,Crystal Palace,Everton,Fulham,Leeds United,Leicester City,Liverpool,Man City,Man United,Newcastle,Nottingham,Southampton,Tottenham,West Ham,Wolverhampton
0,Arsenal,.,2-1,.,.,.,.,.,.,2-1,.,4-2,3-2,.,3-2,0-0,5-0,.,3-1,3-1,.
1,Aston Villa,.,.,.,4-0,.,0-2,.,2-1,.,2-1,2-4,1-3,1-1,3-1,.,.,1-0,.,0-1,1-1
2,Bournemouth,0-3,2-0,.,0-0,.,.,0-2,3-0,.,.,2-1,.,.,.,.,1-1,0-1,2-3,.,0-0
3,Brentford,0-3,.,2-0,.,2-0,0-0,.,1-1,.,5-2,.,3-1,.,4-0,.,.,3-0,2-2,.,1-1
4,Brighton Hove,2-4,1-2,1-0,.,.,4-1,.,.,.,1-0,5-2,3-0,.,.,0-0,0-0,.,0-1,.,.


In [13]:
print(df.shape)

(210, 9)


## Create Matches DataFrame with a row for each team 

run function 

In [14]:
points_map = {
    'W': 3,
    'D': 1,
    'L': 0
}

def get_result(score, score_opp):
    if score == score_opp:
        return 'D'
    elif score > score_opp:
        return 'W'
    else:
        return 'L'

drop_cols = ['homeTeam.shortName','awayTeam.shortName']

# convert each match into two rows (one for each team)

df['H'] = df['homeTeam.shortName']
df['A'] = df['awayTeam.shortName']

df_matches = pd.melt(
    df,
    id_vars=keep_cols,
    value_vars=["H", "A"],
    var_name="home_away",
    value_name="team",
)

print(df_matches.shape)
df_matches.head()


(420, 10)


Unnamed: 0,id,utcDate,status,matchday,homeTeam.shortName,awayTeam.shortName,score.fullTime.home,score.fullTime.away,home_away,team
0,416384,2022-08-05 19:00:00+00:00,FINISHED,1,Crystal Palace,Arsenal,0,2,H,Crystal Palace
1,416383,2022-08-06 11:30:00+00:00,FINISHED,1,Fulham,Liverpool,2,2,H,Fulham
2,416378,2022-08-06 14:00:00+00:00,FINISHED,1,Tottenham,Southampton,4,1,H,Tottenham
3,416379,2022-08-06 14:00:00+00:00,FINISHED,1,Newcastle,Nottingham,2,0,H,Newcastle
4,416381,2022-08-06 14:00:00+00:00,FINISHED,1,Leeds United,Wolverhampton,2,1,H,Leeds United


In [15]:
df_matches['opponent'] = np.where(
    df_matches['home_away'] == 'H', 
    df_matches['awayTeam.shortName'], 
    df_matches['homeTeam.shortName']
    )
# add column for full time score
df_matches['score.fullTime'] = df_matches['score.fullTime.home'].astype(str) + '-' + df_matches['score.fullTime.away'].astype(str)

# full time goals
df_matches["goals_scored"] = np.where(
    df_matches["team"] == df_matches["homeTeam.shortName"],
    df_matches["score.fullTime.home"],
    df_matches["score.fullTime.away"],
)
df_matches["goals_against"] = np.where(
    df_matches["team"] != df_matches["homeTeam.shortName"],
    df_matches["score.fullTime.home"],
    df_matches["score.fullTime.away"],
)
df_matches["result"] = np.vectorize(get_result)(
    df_matches["goals_scored"], df_matches["goals_against"]
)
df_matches["points"] = df_matches["result"].map(points_map)


# sort df_matches by id
df_matches = (df_matches
    .drop(drop_cols, axis=1)
    .sort_values(by=['utcDate', 'id'], ascending=True)
    .reset_index(drop=True)
    )

# export to csv using function
export_csv(df_matches, "matches")
# export_excel(df_matches, "matches")

df_matches.head()

Exported matches


Unnamed: 0,id,utcDate,status,matchday,score.fullTime.home,score.fullTime.away,home_away,team,opponent,score.fullTime,goals_scored,goals_against,result,points
0,416384,2022-08-05 19:00:00+00:00,FINISHED,1,0,2,H,Crystal Palace,Arsenal,0-2,0,2,L,0
1,416384,2022-08-05 19:00:00+00:00,FINISHED,1,0,2,A,Arsenal,Crystal Palace,0-2,2,0,W,3
2,416383,2022-08-06 11:30:00+00:00,FINISHED,1,2,2,H,Fulham,Liverpool,2-2,2,2,D,1
3,416383,2022-08-06 11:30:00+00:00,FINISHED,1,2,2,A,Liverpool,Fulham,2-2,2,2,D,1
4,416378,2022-08-06 14:00:00+00:00,FINISHED,1,4,1,H,Tottenham,Southampton,4-1,4,1,W,3


## Create league table

In [16]:
# create a dictionary of logos
logosDict = {
    'Arsenal': 'https://upload.wikimedia.org/wikipedia/en/5/53/Arsenal_FC.svg',
    'Aston Villa': 'https://upload.wikimedia.org/wikipedia/en/f/f9/Aston_Villa_FC_crest_%282016%29.svg',
    'Brighton': 'https://upload.wikimedia.org/wikipedia/en/f/fd/Brighton_%26_Hove_Albion_logo.svg',
    'Brentford': 'https://upload.wikimedia.org/wikipedia/en/2/2a/Brentford_FC_crest.svg',
    'Burnley': 'https://upload.wikimedia.org/wikipedia/en/0/02/Burnley_FC_badge.svg',
    'Bournemouth': 'https://upload.wikimedia.org/wikipedia/en/e/e5/AFC_Bournemouth_%282013%29.svg',
    'Chelsea': 'https://upload.wikimedia.org/wikipedia/en/c/cc/Chelsea_FC.svg',
    'Crystal Palace': 'https://upload.wikimedia.org/wikipedia/en/a/a2/Crystal_Palace_FC_logo_%282022%29.svg',
    'Everton': 'https://upload.wikimedia.org/wikipedia/en/7/7c/Everton_FC_logo.svg',
    'Fulham': 'https://upload.wikimedia.org/wikipedia/sco/e/eb/Fulham_FC_%28shield%29.svg',
    'Leeds': 'https://upload.wikimedia.org/wikipedia/en/5/54/Leeds_United_F.C._logo.svg',
    'Leicester': 'https://upload.wikimedia.org/wikipedia/en/2/2d/Leicester_City_crest.svg',
    'Liverpool': 'https://upload.wikimedia.org/wikipedia/en/0/0c/Liverpool_FC.svg',
    'Man City': 'https://upload.wikimedia.org/wikipedia/en/e/eb/Manchester_City_FC_badge.svg',
    'Man United': 'https://upload.wikimedia.org/wikipedia/sco/7/7a/Manchester_United_FC_crest.svg',
    'Newcastle': 'https://upload.wikimedia.org/wikipedia/en/5/56/Newcastle_United_Logo.svg',
    'Nottingham': 'https://upload.wikimedia.org/wikipedia/en/e/e5/Nottingham_Forest_F.C._logo.svg',
    # 'Sheffield United': 'https://upload.wikimedia.org/wikipedia/en/1/12/Sheffield_United_FC_logo.svg',
    'Southampton': 'https://upload.wikimedia.org/wikipedia/en/c/c9/FC_Southampton.svg',
    'Tottenham': 'https://upload.wikimedia.org/wikipedia/en/b/b4/Tottenham_Hotspur.svg',
    # 'West Brom': 'https://upload.wikimedia.org/wikipedia/en/8/8b/West_Bromwich_Albion.svg',
    'West Ham': 'https://upload.wikimedia.org/wikipedia/en/c/c2/West_Ham_United_FC_logo.svg',
    'Wolves': 'https://upload.wikimedia.org/wikipedia/en/f/fc/Wolverhampton_Wanderers.svg'
}

In [17]:
def league(x):
    results = {
        'played': x['result'].count(),
        'won': (x['result'] == 'W').sum(),
        'drawn': (x['result'] == 'D').sum(),
        'lost': (x['result'] == 'L').sum(),
        'goalsFor': x['goals_scored'].sum(),
        'goalsAgainst': x['goals_against'].sum(),
        'goalDiff': x['goals_scored'].sum() - x['goals_against'].sum(),
        'points': x['points'].sum(),
        'form': ''.join(x['result'].tail(5).tolist()),
        # ANALYSIS  COLUMNS

        # win %
        'win%': '{:.0%}'.format((x['result'] == 'W').sum() / x['result'].count()),        
        # percentage of points from last 5 games
        'form%': '{:.0%}'.format(x['points'].tail(5).sum() / 15),       
        # goals per game
        'goalsScoredPg': '{:.2f}'.format(x['goals_scored'].sum() / x['result'].count()),
        # goals against per game
        'goalsAgainstPg': '{:.2f}'.format(x['goals_against'].sum() / x['result'].count()),
        # goal difference per game
        'goalDifferencePg': '{:.2f}'.format((x['goals_scored'].sum() - x['goals_against'].sum()) / x['result'].count()),
        # clean sheets
        'cleanSheets': (x['goals_against'] == 0).sum(),
        # points per game
        'pointsPg': '{:.2f}'.format(x['points'].sum() / x['result'].count()),
        # Home points per game
        'homePointsPg': '{:.2f}'.format(x[x['home_away'] == 'H']['points'].sum() / x[x['home_away'] == 'H']['result'].count()),
        # Away points per game
        'awayPointsPg': '{:.2f}'.format(x[x['home_away'] == 'A']['points'].sum() / x[x['home_away'] == 'A']['result'].count()),
        # Score away against Newcastle
        'NewcastleHome': x[(x['opponent'] == 'Newcastle') & (x['home_away'] == 'A')]['score.fullTime'].tolist(),
        # Score at home against Newcastle
        'NewcastleAway': x[(x['opponent'] == 'Newcastle') & (x['home_away'] == 'H')]['score.fullTime'].tolist(),
        # maximum possible points remainig game for the seaon * 3
        'maxPoints': ((38 - x['result'].count()) * 3) + x['points'].sum(),
        # Expected points, based on points per game
        'expectedPoints': '{:.0f}'.format((x['points'].sum() / x['result'].count()) * (38 - x['result'].count()) + x['points'].sum()),


    }
    return pd.Series(results)

# apply league function
df_league = df_matches.groupby('team').apply(league)
# Sort by points and then goal difference
df_league = df_league.sort_values(by=['points', 'goalDiff'], ascending=False)

##### ADD IN POSITION COLUMN

# add column for position
df_league['position'] = df_league['points'].rank(ascending=False, method='first')
# reformat position to int
df_league['position'] = df_league['position'].astype(int)


####### ADD IN PREVIOUS WEEKS DATA
# To calculate position change

# Function for Previous weeks league table for points and goal difference only
def leaguePrev(x):
    results = {
        'pointsPrev': x['points'].sum(),
        'goalDiffPrev': x['goals_scored'].sum() - x['goals_against'].sum(),
    }
    return pd.Series(results)

# df_matchesPrev where matchday does not equal max
df_matchesPrev = df_matches[df_matches['matchday'] != df_matches['matchday'].max()]

# apply league_prev function
df_leaguePrev = df_matchesPrev.groupby('team').apply(leaguePrev)
# sort by points and then goal difference
df_leaguePrev = df_leaguePrev.sort_values(by=['pointsPrev', 'goalDiffPrev'], ascending=False)
# add column for position
df_leaguePrev['positionPrev'] = df_leaguePrev['pointsPrev'].rank(ascending=False, method='first')
# reformat position to int
df_leaguePrev['positionPrev'] = df_leaguePrev['positionPrev'].astype(int)

# join df_league with df_leaguePrev on index
df_league = df_league.join(df_leaguePrev, how='left')

# add in position change symbol
df_league['positionChange'] = np.where(
    df_league['position'] > df_league['positionPrev'],
    '▼',
    np.where(
        df_league['position'] < df_league['positionPrev'],
        '▲',
        '=',
    ))
# drop previous position
df_league = df_league.drop(['pointsPrev', 'goalDiffPrev', 'positionPrev'], axis=1)

# move team from index to column
df_league = df_league.reset_index()

##### ADD IN RESULTS VS NEWCASTLE

# df to show points taken by Newcastle against each opponent
df_ncl = df_matches[df_matches['team'] == 'Newcastle'] 
# group by opponent and sum points and games
df_ncl = df_ncl.groupby('opponent').agg({'points': 'sum', 'matchday': 'count'})
# calculate % of points taken and format as a percentage
df_ncl['%Taken'] = df_ncl['points'] / (df_ncl['matchday'] * 3)
df_ncl['%Taken'] = df_ncl['%Taken'].apply(lambda x: '{:.0%}'.format(x))
# Remaining points to be taken from each opponent
df_ncl['pointsRemaining'] = ((2 - df_ncl['matchday']) * 3)
# drop matchday column
df_ncl = df_ncl.drop(['matchday'], axis=1)
# rename points column to pointsTaken
df_ncl = df_ncl.rename(columns={'points': 'pointsTaken'})
# remove index
df_ncl = df_ncl.reset_index()

# Join with df_league
# Add df_ncl to df_league on left on team, right on opponent
df_league = df_league.merge(df_ncl, left_on='team', right_on='opponent', how='left')
# drop opponent column
df_league = df_league.drop(['opponent'], axis=1)

# add logos for each team

# add logo column from logoDict to df_league and limit to 30px
df_league['logo'] = df_league['team'].map(logosDict)
# render logo in html format with 30px height
df_league['logo'] = df_league['logo'].apply(lambda x: '<img src="{}" height="30" alt="logo">'.format(x))


##### FORMAT THE DATA

# remove [] from newcastleHome and NewcastleAway
df_league['NewcastleHome'] = df_league['NewcastleHome'].str[0]
df_league['NewcastleAway'] = df_league['NewcastleAway'].str[0]
# replace nan with -
df_league = df_league.fillna('-')

# Reorder columns
df_league = df_league[['position', 'positionChange', 'logo',  'team', 'played', 'won', 'drawn', 'lost', 'goalsFor', 'goalsAgainst', 'goalDiff',  'points', 'form',
'win%', 'form%', 'pointsPg', 'homePointsPg', 'awayPointsPg', 'goalDifferencePg', 'cleanSheets', 'NewcastleHome', 'NewcastleAway', 'pointsTaken', 'pointsRemaining', '%Taken', 'maxPoints', 'expectedPoints']]

df_league.to_html(f'../../output/{date}-table.html', escape=False, index=False)

# export to excel using function
export_excel(df_league, 'table')

df_league.head(20)


Unnamed: 0,position,positionChange,logo,team,played,won,drawn,lost,goalsFor,goalsAgainst,goalDiff,points,form,win%,form%,pointsPg,homePointsPg,awayPointsPg,goalDifferencePg,cleanSheets,NewcastleHome,NewcastleAway,pointsTaken,pointsRemaining,%Taken,maxPoints,expectedPoints
0,1,=,"<img src=""https://upload.wikimedia.org/wikiped...",Arsenal,20,16,2,2,45,17,28,50,WDWWL,80%,67%,2.5,2.78,2.27,1.4,9,-,0-0,1.0,3.0,33%,104,95
1,2,=,"<img src=""https://upload.wikimedia.org/wikiped...",Man City,21,14,3,4,53,21,32,45,WLWWL,67%,60%,2.14,2.55,1.7,1.52,8,3-3,-,1.0,3.0,33%,96,81
2,3,=,"<img src=""https://upload.wikimedia.org/wikiped...",Man United,22,13,4,5,36,28,8,43,WDLWD,59%,53%,1.95,2.36,1.55,0.36,8,-,0-0,1.0,3.0,33%,91,74
3,4,=,"<img src=""https://upload.wikimedia.org/wikiped...",Newcastle,21,10,10,1,34,12,22,40,DDWDD,48%,47%,1.9,2.09,1.7,1.05,12,-,-,-,-,-,91,72
4,5,=,"<img src=""https://upload.wikimedia.org/wikiped...",Tottenham,22,12,3,7,41,31,10,39,WLLWW,55%,60%,1.77,1.91,1.64,0.45,7,-,1-2,3.0,3.0,100%,87,67
5,6,=,"<img src=""nan"" height=""30"" alt=""logo"">",Brighton Hove,20,10,4,6,38,27,11,34,LWWDW,50%,67%,1.7,1.7,1.7,0.55,6,-,0-0,1.0,3.0,33%,88,65
6,7,▲,"<img src=""https://upload.wikimedia.org/wikiped...",Brentford,21,8,9,4,35,28,7,33,WWWDW,38%,87%,1.57,2.0,1.1,0.33,8,5-1,-,3.0,3.0,100%,84,60
7,8,▼,"<img src=""https://upload.wikimedia.org/wikiped...",Fulham,22,9,5,8,32,30,2,32,WWLLD,41%,47%,1.45,1.64,1.27,0.09,6,1-0,1-4,6.0,0.0,100%,80,55
8,9,▲,"<img src=""https://upload.wikimedia.org/wikiped...",Chelsea,21,8,6,7,22,21,1,30,LLWDD,38%,33%,1.43,1.8,1.09,0.05,8,1-0,-,3.0,3.0,100%,81,54
9,10,▼,"<img src=""https://upload.wikimedia.org/wikiped...",Liverpool,20,8,5,7,34,28,6,29,WLLDL,40%,27%,1.45,2.1,0.8,0.3,5,-,2-1,0.0,3.0,0%,83,55


In [18]:
# Add df_ncl to df_league on left on team, right on opponent
df_league = df_league.merge(df_ncl, left_on='team', right_on='opponent', how='left')
# drop opponent column
df_league = df_league.drop(['opponent'], axis=1)
df_league.head(20)

Unnamed: 0,position,positionChange,logo,team,played,won,drawn,lost,goalsFor,goalsAgainst,goalDiff,points,form,win%,form%,pointsPg,homePointsPg,awayPointsPg,goalDifferencePg,cleanSheets,NewcastleHome,NewcastleAway,pointsTaken_x,pointsRemaining_x,%Taken_x,maxPoints,expectedPoints,pointsTaken_y,%Taken_y,pointsRemaining_y
0,1,=,"<img src=""https://upload.wikimedia.org/wikiped...",Arsenal,20,16,2,2,45,17,28,50,WDWWL,80%,67%,2.5,2.78,2.27,1.4,9,-,0-0,1.0,3.0,33%,104,95,1.0,33%,3.0
1,2,=,"<img src=""https://upload.wikimedia.org/wikiped...",Man City,21,14,3,4,53,21,32,45,WLWWL,67%,60%,2.14,2.55,1.7,1.52,8,3-3,-,1.0,3.0,33%,96,81,1.0,33%,3.0
2,3,=,"<img src=""https://upload.wikimedia.org/wikiped...",Man United,22,13,4,5,36,28,8,43,WDLWD,59%,53%,1.95,2.36,1.55,0.36,8,-,0-0,1.0,3.0,33%,91,74,1.0,33%,3.0
3,4,=,"<img src=""https://upload.wikimedia.org/wikiped...",Newcastle,21,10,10,1,34,12,22,40,DDWDD,48%,47%,1.9,2.09,1.7,1.05,12,-,-,-,-,-,91,72,,,
4,5,=,"<img src=""https://upload.wikimedia.org/wikiped...",Tottenham,22,12,3,7,41,31,10,39,WLLWW,55%,60%,1.77,1.91,1.64,0.45,7,-,1-2,3.0,3.0,100%,87,67,3.0,100%,3.0
5,6,=,"<img src=""nan"" height=""30"" alt=""logo"">",Brighton Hove,20,10,4,6,38,27,11,34,LWWDW,50%,67%,1.7,1.7,1.7,0.55,6,-,0-0,1.0,3.0,33%,88,65,1.0,33%,3.0
6,7,▲,"<img src=""https://upload.wikimedia.org/wikiped...",Brentford,21,8,9,4,35,28,7,33,WWWDW,38%,87%,1.57,2.0,1.1,0.33,8,5-1,-,3.0,3.0,100%,84,60,3.0,100%,3.0
7,8,▼,"<img src=""https://upload.wikimedia.org/wikiped...",Fulham,22,9,5,8,32,30,2,32,WWLLD,41%,47%,1.45,1.64,1.27,0.09,6,1-0,1-4,6.0,0.0,100%,80,55,6.0,100%,0.0
8,9,▲,"<img src=""https://upload.wikimedia.org/wikiped...",Chelsea,21,8,6,7,22,21,1,30,LLWDD,38%,33%,1.43,1.8,1.09,0.05,8,1-0,-,3.0,3.0,100%,81,54,3.0,100%,3.0
9,10,▼,"<img src=""https://upload.wikimedia.org/wikiped...",Liverpool,20,8,5,7,34,28,6,29,WLLDL,40%,27%,1.45,2.1,0.8,0.3,5,-,2-1,0.0,3.0,0%,83,55,0.0,0%,3.0


## Form by match day

In [19]:
# pivot df_match to show team results be match day
df_matchday = df_matches.pivot_table(index=['team'], columns=['matchday'], fill_value='.', values=['result'], aggfunc='first')

df_matchday.head(20)


Unnamed: 0_level_0,result,result,result,result,result,result,result,result,result,result,result,result,result,result,result,result,result,result,result,result,result,result
matchday,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22
team,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
Arsenal,W,W,W,W,W,L,.,W,W,W,W,.,D,W,W,W,W,W,D,W,W,L
Aston Villa,L,W,L,L,L,D,.,W,D,D,L,L,W,L,W,W,L,W,D,W,W,L
Bournemouth,W,L,L,L,D,W,.,D,D,W,D,L,L,L,L,W,L,L,L,L,D,L
Brentford,D,W,L,D,D,W,.,L,D,L,W,D,L,D,D,W,D,W,W,W,D,W
Brighton Hove,W,D,W,W,L,W,.,.,D,L,L,D,L,W,W,L,W,L,W,W,D,W
Chelsea,W,D,L,W,L,W,L,.,W,W,W,D,D,L,L,L,W,D,L,W,D,D
Crystal Palace,L,D,W,L,D,D,D,.,L,W,D,W,L,W,W,L,L,W,L,L,D,L
Everton,L,L,D,D,D,D,.,W,W,L,L,L,W,D,L,L,L,D,L,L,L,W
Fulham,D,D,W,L,W,L,W,W,L,L,D,W,W,D,L,L,W,W,W,L,L,D
Leeds United,W,D,W,L,D,L,.,D,D,L,L,L,L,W,W,L,L,D,D,L,D,L


In [20]:
# add in longest unbeaten run

# function to calculate longest unbeaten consecutive results
def longestUnbeaten(x):
    # calculate length of longest unbeaten run
    results = {
        'longestUnbeaten': len(max(''.join(x['result']).split('L'), key=len)),
    }
    return pd.Series(results)

# apply longestUnbeaten function
df_streaks = df_matches.groupby('team').apply(longestUnbeaten)
# sort by longest unbeaten run
df_streaks = df_streaks.sort_values(by=['longestUnbeaten'], ascending=False)

df_streaks.head(20)

Unnamed: 0_level_0,longestUnbeaten
team,Unnamed: 1_level_1
Newcastle,16
Arsenal,13
Brentford,9
Man City,9
Tottenham,7
Bournemouth,6
Chelsea,6
Everton,6
Man United,6
Nottingham,5


In [21]:
# calculate number of games since last loss
def gamesSinceLoss(x):
    # calculate length of longest unbeaten run
    results = {
        'gamesSinceLoss': len(''.join(x['result']).split('L')[-1]),
    }
    return pd.Series(results)
# apply gamesSinceLoss function
df_streaks2 = df_matches.groupby('team').apply(gamesSinceLoss)
# sort by longest unbeaten run
df_streaks2 = df_streaks2.sort_values(by=['gamesSinceLoss'], ascending=False)
df_streaks2.head(20)

Unnamed: 0_level_0,gamesSinceLoss
team,Unnamed: 1_level_1
Newcastle,16
Brentford,9
Nottingham,5
Brighton Hove,4
Chelsea,3
Leicester City,2
Man United,2
West Ham,2
Tottenham,2
Wolverhampton,1
