# Scraping Premier League stats from FBref

---

### Import Python tools

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import geopandas as gpd
import jenkspy
import matplotlib.pyplot as plt

%matplotlib inline
import json
import numpy as np
import altair as alt
import altair_latimes as lat

alt.themes.register("latimes", lat.theme)
alt.themes.enable("latimes")
pd.options.display.max_columns = 50
pd.options.display.max_rows = 1000
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [3]:
today = pd.to_datetime("today")

### Premier League clubs

In [4]:
clubs = [
    {"name": "Burnley", "code": "BUR", "country": "England", "id": "943e8050"},
    {
        "name": "Manchester United",
        "code": "MUN",
        "country": "England",
        "id": "19538871",
    },
    {"name": "Manchester City", "code": "MCI", "country": "England", "id": "b8fd03ef"},
    {"name": "Aston Villa", "code": "AVL", "country": "England", "id": "8602292d"},
    {"name": "Fulham", "code": "FUL", "country": "England", "id": "fd962109"},
    {"name": "Arsenal", "code": "ARS", "country": "England", "id": "18bb7c10"},
    {"name": "Crystal Palace", "code": "CRY", "country": "England", "id": "47c64c55"},
    {"name": "Southampton", "code": "SOU", "country": "England", "id": "33c895d4"},
    {"name": "Liverpool", "code": "LIV", "country": "England", "id": "e87167c6"},
    {"name": "Leeds United", "code": "LEE", "country": "England", "id": "5bfb9659"},
    {"name": "West Ham United", "code": "WHU", "country": "England", "id": "52d65cea"},
    {"name": "Newcastle United", "code": "NEW", "country": "England", "id": "b2b47a98"},
    {
        "name": "West Bromwich Albion",
        "code": "WBA",
        "country": "England",
        "id": "60c6b05f",
    },
    {"name": "Leicester City", "code": "LEI", "country": "England", "id": "a2d435b3"},
    {
        "name": "Tottenham Hotspur",
        "code": "TOT",
        "country": "England",
        "id": "361ca564",
    },
    {"name": "Everton", "code": "EVE", "country": "England", "id": "c4989550"},
    {"name": "Sheffield United", "code": "SHU", "country": "England", "id": "1df6b87e"},
    {
        "name": "Wolverhampton Wanderers",
        "code": "WOL",
        "country": "England",
        "id": "8cec06e1",
    },
    {
        "name": "Brighton & Hove Albion",
        "code": "BHA",
        "country": "England",
        "id": "d07537b9",
    },
    {"name": "Chelsea", "code": "CHE", "country": "England", "id": "a6a4e67d"},
]

In [5]:
squad_base = "https://fbref.com/en/squads/"

In [6]:
clubs_df = pd.DataFrame(clubs)

In [7]:
clubs_df["url"] = squad_base + "/" + clubs_df["id"]

In [8]:
clubs_df.to_csv("output/clubs_list.csv", index=False)

In [9]:
clubs_df.head()

Unnamed: 0,name,code,country,id,url
0,Burnley,BUR,England,943e8050,https://fbref.com/en/squads//943e8050
1,Manchester United,MUN,England,19538871,https://fbref.com/en/squads//19538871
2,Manchester City,MCI,England,b8fd03ef,https://fbref.com/en/squads//b8fd03ef
3,Aston Villa,AVL,England,8602292d,https://fbref.com/en/squads//8602292d
4,Fulham,FUL,England,fd962109,https://fbref.com/en/squads//fd962109


---

## League standings overall

In [10]:
url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [11]:
tables = pd.read_html(url)

In [12]:
df_overall = tables[0]

In [13]:
df_overall.rename(
    columns={
        "Rk": "rank",
        "Squad": "name",
        "MP": "matches",
        "W": "wins",
        "D": "draws",
        "L": "losses",
        "GF": "goals_for",
        "GA": "goals_against",
        "GD": "goal_diff",
        "Pts": "points",
        "xG": "expected_goals_for",
        "xGA": "expected_goals_against",
        "xGD": "expected_goal_diff",
        "xGD/90": "expected_goals_diff_90mins",
        "Last 5": "last_five",
        "Attendance": "attendance",
        "Top Team Scorer": "top_scorer",
        "Goalkeeper": "goalkeeper",
        "Notes": "notes",
    },
    inplace=True,
)

In [14]:
df_overall.drop(["notes"], axis=1, inplace=True)

In [15]:
df_overall.head()

Unnamed: 0,rank,name,matches,wins,draws,losses,goals_for,goals_against,goal_diff,points,expected_goals_for,expected_goals_against,expected_goal_diff,expected_goals_diff_90mins,last_five,attendance,top_scorer,goalkeeper
0,1,Manchester City,30,22,5,3,64,21,43,71,58.1,23.6,34.5,1.15,W W L W W,,İlkay Gündoğan - 12,Ederson
1,2,Manchester Utd,29,16,9,4,56,32,24,57,45.5,34.1,11.4,0.39,W D D W W,,Bruno Fernandes - 16,David de Gea
2,3,Leicester City,29,17,5,7,53,32,21,56,44.0,34.5,9.5,0.33,W L D W W,,Jamie Vardy - 12,Kasper Schmeichel
3,4,Chelsea,29,14,9,6,44,25,19,51,46.0,24.5,21.5,0.74,D D W W D,143.0,"Tammy Abraham, Jorginho - 6",Edouard Mendy
4,5,West Ham,29,14,7,8,45,35,10,49,40.3,33.8,6.5,0.23,W L W L D,133.0,Tomáš Souček - 9,Łukasz Fabiański


In [16]:
df_overall["updated"] = today

In [17]:
df_overall.to_csv("output/league_standings.csv", index=False)

---

### Club performance

In [18]:
df_performance = tables[2]

In [19]:
df_performance.columns = [col[1] for col in df_performance.columns]

In [20]:
df_performance.head()

Unnamed: 0,Squad,# Pl,Age,Poss,MP,Starts,Min,90s,Gls,Ast,G-PK,PK,PKatt,CrdY,CrdR,Gls.1,Ast.1,G+A,G-PK.1,G+A-PK,xG,npxG,xA,npxG+xA,xG.1,xA.1,xG+xA,npxG.1,npxG+xA.1
0,Arsenal,29,26.8,53.1,29,319,2610,29.0,38,27,32,6,6,42,5,1.31,0.93,2.24,1.1,2.03,41.0,36.4,26.7,63.1,1.41,0.92,2.33,1.26,2.18
1,Aston Villa,21,25.9,49.0,28,308,2520,28.0,37,29,34,3,4,51,2,1.32,1.04,2.36,1.21,2.25,39.9,36.9,28.5,65.4,1.43,1.02,2.44,1.32,2.34
2,Brighton,26,26.4,51.9,29,319,2610,29.0,31,21,26,5,8,37,3,1.07,0.72,1.79,0.9,1.62,40.7,34.6,25.9,60.5,1.4,0.89,2.3,1.19,2.09
3,Burnley,24,29.0,41.0,29,319,2610,29.0,21,11,20,1,1,39,0,0.72,0.38,1.1,0.69,1.07,26.9,26.2,18.2,44.4,0.93,0.63,1.56,0.9,1.53
4,Chelsea,27,26.7,61.6,29,319,2610,29.0,42,28,35,7,9,35,1,1.45,0.97,2.41,1.21,2.17,46.0,39.1,29.6,68.8,1.59,1.02,2.61,1.35,2.37


In [21]:
df_performance["updated"] = today

In [22]:
df_performance.to_csv("output/club_performance.csv", index=False)

---

### Club standard stats

In [23]:
df_list = []

for i, n in zip(clubs_df["url"], clubs_df["name"]):
    df_list.append((pd.read_html(i)[0]).assign(clubname=n))

df = pd.concat(df_list)

In [24]:
df_clubstats = df.copy()

In [25]:
df_clubstats.columns = ["_".join(col).strip() for col in df_clubstats.columns.values]

In [26]:
df_clubstats.columns = (
    df_clubstats.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_")
    .str.replace("(", "")
    .str.replace(")", "")
    .str.replace("-", "_")
)

In [27]:
df_clubstats.rename(
    columns={
        "performance_ast": "assists",
        "performance_crdr": "red_cards",
        "performance_crdy": "yellow_cards",
        "performance_g_pk": "non_penalty_goals",
        "performance_gls": "goals",
        "performance_pk": "penalty_kicks",
        "performance_pkatt": "penalty_kick_attempts",
        "playing_time_90s": "playing_time_90",
        "playing_time_mp": "matches_played",
        "playing_time_min": "minutes",
        "playing_time_starts": "starts",
        "unnamed:_0_level_0_player": "player",
        "unnamed:_1_level_0_nation": "nation",
        "unnamed:_20_level_0_matches": "matches",
        "unnamed:_29_level_0_matches": "matches2",
        "unnamed:_2_level_0_pos": "position",
        "unnamed:_3_level_0_age": "age",
        "clubname_": "clubname",
    },
    inplace=True,
)

In [28]:
df_clubstats_slim = df_clubstats[
    [
        "player",
        "nation",
        "clubname",
        "age",
        "position",
        "starts",
        "minutes",
        "matches_played",
        "playing_time_90",
        "goals",
        "assists",
        "penalty_kick_attempts",
        "penalty_kicks",
        "non_penalty_goals",
        "yellow_cards",
        "red_cards",
    ]
].copy()

In [29]:
df_clubstats_slim["nation"] = df_clubstats_slim["nation"].str.split(
    " ", n=1, expand=True
)

In [30]:
df_clubstats_slim.head()

Unnamed: 0,player,nation,clubname,age,position,starts,minutes,matches_played,playing_time_90,goals,assists,penalty_kick_attempts,penalty_kicks,non_penalty_goals,yellow_cards,red_cards
0,Ashley Westwood,eng,Burnley,30-364,MF,29,2600.0,29,28.9,1.0,2.0,0.0,0.0,1.0,6.0,0.0
1,Nick Pope,eng,Burnley,28-346,GK,28,2520.0,28,28.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,James Tarkowski,eng,Burnley,28-132,DF,27,2430.0,27,27.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
3,Dwight McNeil,eng,Burnley,21-129,MF,25,2317.0,27,25.7,2.0,3.0,0.0,0.0,2.0,1.0,0.0
4,Matthew Lowton,eng,Burnley,31-295,DF,25,2250.0,25,25.0,1.0,0.0,0.0,0.0,1.0,4.0,0.0


In [31]:
df_clubstats_slim = df_clubstats_slim[
    (~df_clubstats_slim.player.str.contains("Squad Total"))
    & ~df_clubstats_slim.player.str.contains("Opponent Total")
]

In [32]:
df_clubstats_slim.fillna(0, inplace=True)

In [33]:
df_clubstats_slim.head()

Unnamed: 0,player,nation,clubname,age,position,starts,minutes,matches_played,playing_time_90,goals,assists,penalty_kick_attempts,penalty_kicks,non_penalty_goals,yellow_cards,red_cards
0,Ashley Westwood,eng,Burnley,30-364,MF,29,2600.0,29,28.9,1.0,2.0,0.0,0.0,1.0,6.0,0.0
1,Nick Pope,eng,Burnley,28-346,GK,28,2520.0,28,28.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,James Tarkowski,eng,Burnley,28-132,DF,27,2430.0,27,27.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
3,Dwight McNeil,eng,Burnley,21-129,MF,25,2317.0,27,25.7,2.0,3.0,0.0,0.0,2.0,1.0,0.0
4,Matthew Lowton,eng,Burnley,31-295,DF,25,2250.0,25,25.0,1.0,0.0,0.0,0.0,1.0,4.0,0.0


In [34]:
df_clubstats_slim.to_csv("output/club_standard_stats.csv", index=False)

In [37]:
df_clubstats_slim["updated"] = today

In [38]:
cards = (
    df_clubstats_slim.groupby(["clubname", "updated"])
    .agg({"red_cards": sum, "yellow_cards": sum})
    .reset_index()
)

In [51]:
cards = cards[["clubname", "yellow_cards", "red_cards", "updated"]]

In [52]:
cards.sort_values("yellow_cards", ascending=False)

Unnamed: 0,clubname,yellow_cards,red_cards,updated
10,Liverpool,85.0,6.0,2021-03-31 14:11:22.467188
14,Sheffield United,55.0,3.0,2021-03-31 14:11:22.467188
13,Newcastle United,54.0,2.0,2021-03-31 14:11:22.467188
7,Fulham,54.0,3.0,2021-03-31 14:11:22.467188
1,Aston Villa,51.0,2.0,2021-03-31 14:11:22.467188
9,Leicester City,50.0,0.0,2021-03-31 14:11:22.467188
12,Manchester United,48.0,1.0,2021-03-31 14:11:22.467188
8,Leeds United,45.0,0.0,2021-03-31 14:11:22.467188
16,Tottenham Hotspur,44.0,2.0,2021-03-31 14:11:22.467188
19,Wolverhampton Wanderers,44.0,1.0,2021-03-31 14:11:22.467188


In [53]:
cards.sort_values("yellow_cards", ascending=False).to_csv(
    "output/club_cards.csv", index=False
)

---

In [41]:
df_list = []

for i, n in zip(clubs_df["url"], clubs_df["name"]):
    df_list.append((pd.read_html(i)[1]).assign(clubname=n))

fixtures_df = pd.concat(df_list)

In [42]:
fixtures_df.columns = (
    fixtures_df.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_")
    .str.replace("(", "")
    .str.replace(")", "")
    .str.replace("-", "_")
)

In [43]:
fixtures_df.drop(["match_report", "notes"], axis=1, inplace=True)

In [44]:
fixtures_df = fixtures_df[~fixtures_df["result"].isnull()].copy()

In [45]:
fixtures_df["updated"] = today

In [46]:
premier_league_fixtures = fixtures_df[fixtures_df["comp"] == "Premier League"].copy()

In [47]:
premier_league_fixtures.to_csv("output/club_premier_league_fixtures.csv", index=False)

---