# Scraping Premier League stats from FBref

---

### Import Python tools

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import geopandas as gpd
import jenkspy
import matplotlib.pyplot as plt

import requests
from bs4 import BeautifulSoup as bs

%matplotlib inline
import json
import numpy as np
import altair as alt
import altair_latimes as lat

alt.themes.register("latimes", lat.theme)
alt.themes.enable("latimes")
pd.options.display.max_columns = 50
pd.options.display.max_rows = 1000
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [3]:
today = pd.to_datetime("today")

### Premier League clubs

In [4]:
clubs = [
    {"name": "Burnley", "code": "BUR", "country": "England", "id": "943e8050"},
    {
        "name": "Manchester United",
        "code": "MUN",
        "country": "England",
        "id": "19538871",
    },
    {"name": "Manchester City", "code": "MCI", "country": "England", "id": "b8fd03ef"},
    {"name": "Aston Villa", "code": "AVL", "country": "England", "id": "8602292d"},
    {"name": "Fulham", "code": "FUL", "country": "England", "id": "fd962109"},
    {"name": "Arsenal", "code": "ARS", "country": "England", "id": "18bb7c10"},
    {"name": "Crystal Palace", "code": "CRY", "country": "England", "id": "47c64c55"},
    {"name": "Southampton", "code": "SOU", "country": "England", "id": "33c895d4"},
    {"name": "Liverpool", "code": "LIV", "country": "England", "id": "e87167c6"},
    {"name": "Leeds United", "code": "LEE", "country": "England", "id": "5bfb9659"},
    {"name": "West Ham United", "code": "WHU", "country": "England", "id": "52d65cea"},
    {"name": "Newcastle United", "code": "NEW", "country": "England", "id": "b2b47a98"},
    {
        "name": "West Bromwich Albion",
        "code": "WBA",
        "country": "England",
        "id": "60c6b05f",
    },
    {"name": "Leicester City", "code": "LEI", "country": "England", "id": "a2d435b3"},
    {
        "name": "Tottenham Hotspur",
        "code": "TOT",
        "country": "England",
        "id": "361ca564",
    },
    {"name": "Everton", "code": "EVE", "country": "England", "id": "c4989550"},
    {"name": "Sheffield United", "code": "SHU", "country": "England", "id": "1df6b87e"},
    {
        "name": "Wolverhampton Wanderers",
        "code": "WOL",
        "country": "England",
        "id": "8cec06e1",
    },
    {
        "name": "Brighton & Hove Albion",
        "code": "BHA",
        "country": "England",
        "id": "d07537b9",
    },
    {"name": "Chelsea", "code": "CHE", "country": "England", "id": "a6a4e67d"},
]

In [5]:
squad_base = "https://fbref.com/en/squads/"

In [6]:
clubs_df = pd.DataFrame(clubs)

In [7]:
clubs_df["url"] = squad_base + "/" + clubs_df["id"]

In [8]:
clubs_df.to_csv("output/clubs_list.csv", index=False)

In [9]:
clubs_df.head()

Unnamed: 0,name,code,country,id,url
0,Burnley,BUR,England,943e8050,https://fbref.com/en/squads//943e8050
1,Manchester United,MUN,England,19538871,https://fbref.com/en/squads//19538871
2,Manchester City,MCI,England,b8fd03ef,https://fbref.com/en/squads//b8fd03ef
3,Aston Villa,AVL,England,8602292d,https://fbref.com/en/squads//8602292d
4,Fulham,FUL,England,fd962109,https://fbref.com/en/squads//fd962109


---

## League standings overall

In [10]:
url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [11]:
tables = pd.read_html(url)

In [12]:
df_overall = tables[0]

In [13]:
df_overall.rename(
    columns={
        "Rk": "rank",
        "Squad": "name",
        "MP": "matches",
        "W": "wins",
        "D": "draws",
        "L": "losses",
        "GF": "goals_for",
        "GA": "goals_against",
        "GD": "goal_diff",
        "Pts": "points",
        "xG": "expected_goals_for",
        "xGA": "expected_goals_against",
        "xGD": "expected_goal_diff",
        "xGD/90": "expected_goals_diff_90mins",
        "Last 5": "last_five",
        "Attendance": "attendance",
        "Top Team Scorer": "top_scorer",
        "Goalkeeper": "goalkeeper",
        "Notes": "notes",
    },
    inplace=True,
)

In [14]:
df_overall.drop(["notes"], axis=1, inplace=True)

In [15]:
df_overall.head()

Unnamed: 0,rank,name,matches,wins,draws,losses,goals_for,goals_against,goal_diff,points,expected_goals_for,expected_goals_against,expected_goal_diff,expected_goals_diff_90mins,last_five,attendance,top_scorer,goalkeeper
0,1,Manchester City,31,23,5,3,66,21,45,74,60.3,24.0,36.4,1.17,W L W W W,,İlkay Gündoğan - 12,Ederson
1,2,Manchester Utd,30,17,9,4,58,33,25,60,47.2,35.2,12.0,0.4,D D W W W,,Bruno Fernandes - 16,David de Gea
2,3,Leicester City,30,17,5,8,53,34,19,56,44.3,36.7,7.6,0.25,L D W W L,,Jamie Vardy - 12,Kasper Schmeichel
3,4,West Ham,30,15,7,8,48,37,11,52,41.6,36.3,5.3,0.18,L W L D W,133.0,Tomáš Souček - 9,Łukasz Fabiański
4,5,Chelsea,30,14,9,7,46,30,16,51,48.4,26.3,22.1,0.74,D W W D L,133.0,"Tammy Abraham, Mason Mount... - 6",Edouard Mendy


In [16]:
df_overall["updated"] = today

In [17]:
df_overall.to_csv("output/league_standings.csv", index=False)

---

### Club performance

In [18]:
df_performance = tables[2]

In [19]:
df_performance.columns = [col[1] for col in df_performance.columns]

In [20]:
df_performance.head()

Unnamed: 0,Squad,# Pl,Age,Poss,MP,Starts,Min,90s,Gls,Ast,G-PK,PK,PKatt,CrdY,CrdR,Gls.1,Ast.1,G+A,G-PK.1,G+A-PK,xG,npxG,xA,npxG+xA,xG.1,xA.1,xG+xA,npxG.1,npxG+xA.1
0,Arsenal,29,26.8,52.6,30,330,2700,30.0,38,27,32,6,6,43,5,1.27,0.9,2.17,1.07,1.97,41.3,36.8,27.0,63.8,1.38,0.9,2.28,1.23,2.13
1,Aston Villa,21,25.9,49.1,29,319,2610,29.0,40,32,37,3,4,52,2,1.38,1.1,2.48,1.28,2.38,41.5,38.6,30.0,68.5,1.43,1.03,2.47,1.33,2.36
2,Brighton,26,26.4,51.6,30,330,2700,30.0,32,21,27,5,8,39,3,1.07,0.7,1.77,0.9,1.6,41.8,35.7,26.4,62.1,1.39,0.88,2.27,1.19,2.07
3,Burnley,24,29.1,41.1,30,330,2700,30.0,23,12,21,2,2,39,0,0.77,0.4,1.17,0.7,1.1,28.7,27.2,19.2,46.4,0.96,0.64,1.6,0.91,1.55
4,Chelsea,27,26.8,61.7,30,330,2700,30.0,44,29,37,7,9,38,2,1.47,0.97,2.43,1.23,2.2,48.4,41.6,31.5,73.0,1.61,1.05,2.66,1.39,2.43


In [21]:
df_performance["updated"] = today

In [22]:
df_performance.to_csv("output/club_performance.csv", index=False)

---

### Club standard stats

In [23]:
df_list = []

for i, n in zip(clubs_df["url"], clubs_df["name"]):
    df_list.append((pd.read_html(i)[0]).assign(clubname=n))

df = pd.concat(df_list)

In [30]:
df_list[0]

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Playing Time,Playing Time,Playing Time,Playing Time,Performance,Performance,Performance,Performance,Performance,Performance,Performance,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Expected,Expected,Expected,Expected,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Unnamed: 29_level_0,clubname
Unnamed: 0_level_1,Player,Nation,Pos,Age,MP,Starts,Min,90s,Gls,Ast,G-PK,PK,PKatt,CrdY,CrdR,Gls,Ast,G+A,G-PK,G+A-PK,xG,npxG,xA,npxG+xA,xG,xA,xG+xA,npxG,npxG+xA,Matches,Unnamed: 31_level_1
0,Ashley Westwood,eng ENG,MF,31-008,30,30,2690.0,29.9,1.0,2.0,1.0,0.0,0.0,6.0,0.0,0.03,0.07,0.1,0.03,0.1,0.4,0.4,4.1,4.5,0.01,0.14,0.15,0.01,0.15,Matches,Burnley
1,Nick Pope,eng ENG,GK,28-355,29,29,2610.0,29.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4,0.4,0.0,0.01,0.01,0.0,0.01,Matches,Burnley
2,James Tarkowski,eng ENG,DF,28-141,28,28,2520.0,28.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,1.6,1.6,0.5,2.1,0.06,0.02,0.07,0.06,0.07,Matches,Burnley
3,Dwight McNeil,eng ENG,MF,21-138,28,26,2407.0,26.7,2.0,3.0,2.0,0.0,0.0,1.0,0.0,0.07,0.11,0.19,0.07,0.19,0.8,0.8,4.2,5.1,0.03,0.16,0.19,0.03,0.19,Matches,Burnley
4,Matthew Lowton,eng ENG,DF,31-304,26,26,2340.0,26.0,1.0,0.0,1.0,0.0,0.0,4.0,0.0,0.04,0.0,0.04,0.04,0.04,0.4,0.4,1.5,1.9,0.02,0.06,0.07,0.02,0.07,Matches,Burnley
5,Josh Brownhill,eng ENG,MF,25-111,25,24,2141.0,23.8,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.2,1.2,0.8,2.0,0.05,0.03,0.08,0.05,0.08,Matches,Burnley
6,Chris Wood,nz NZL,FW,29-123,25,24,2057.0,22.9,7.0,1.0,6.0,1.0,1.0,0.0,0.0,0.31,0.04,0.35,0.26,0.31,9.3,8.6,1.2,9.8,0.41,0.05,0.46,0.38,0.43,Matches,Burnley
7,Ben Mee,eng ENG,DF,31-198,23,23,2063.0,22.9,2.0,0.0,2.0,0.0,0.0,2.0,0.0,0.09,0.0,0.09,0.09,0.09,1.3,1.3,0.1,1.5,0.06,0.01,0.06,0.06,0.06,Matches,Burnley
8,Charlie Taylor,eng ENG,DF,27-203,22,21,1796.0,20.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.05,0.05,0.0,0.05,0.1,0.1,1.1,1.1,0.0,0.05,0.06,0.0,0.06,Matches,Burnley
9,Ashley Barnes,eng ENG,FW,31-161,18,15,1293.0,14.4,3.0,0.0,2.0,1.0,1.0,4.0,0.0,0.21,0.0,0.21,0.14,0.14,4.9,4.1,0.7,4.8,0.34,0.05,0.39,0.29,0.33,Matches,Burnley


In [None]:
df_clubstats = df.copy()

In [None]:
df_clubstats.columns = ["_".join(col).strip() for col in df_clubstats.columns.values]

In [None]:
df_clubstats.columns = (
    df_clubstats.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_")
    .str.replace("(", "")
    .str.replace(")", "")
    .str.replace("-", "_")
)

In [None]:
df_clubstats.rename(
    columns={
        "performance_ast": "assists",
        "performance_crdr": "red_cards",
        "performance_crdy": "yellow_cards",
        "performance_g_pk": "non_penalty_goals",
        "performance_gls": "goals",
        "performance_pk": "penalty_kicks",
        "performance_pkatt": "penalty_kick_attempts",
        "playing_time_90s": "playing_time_90",
        "playing_time_mp": "matches_played",
        "playing_time_min": "minutes",
        "playing_time_starts": "starts",
        "unnamed:_0_level_0_player": "player",
        "unnamed:_1_level_0_nation": "nation",
        "unnamed:_20_level_0_matches": "matches",
        "unnamed:_29_level_0_matches": "matches2",
        "unnamed:_2_level_0_pos": "position",
        "unnamed:_3_level_0_age": "age",
        "clubname_": "clubname",
    },
    inplace=True,
)

In [None]:
df_clubstats_slim = df_clubstats[
    [
        "player",
        "nation",
        "clubname",
        "age",
        "position",
        "starts",
        "minutes",
        "matches_played",
        "playing_time_90",
        "goals",
        "assists",
        "penalty_kick_attempts",
        "penalty_kicks",
        "non_penalty_goals",
        "yellow_cards",
        "red_cards",
    ]
].copy()

In [None]:
df_clubstats_slim["nation"] = df_clubstats_slim["nation"].str.split(
    " ", n=1, expand=True
)

In [None]:
df_clubstats_slim.head()

In [None]:
df_clubstats_slim = df_clubstats_slim[
    (~df_clubstats_slim.player.str.contains("Squad Total"))
    & ~df_clubstats_slim.player.str.contains("Opponent Total")
]

In [None]:
df_clubstats_slim.fillna(0, inplace=True)

In [None]:
df_clubstats_slim.head()

In [None]:
df_clubstats_slim.to_csv("output/club_standard_stats.csv", index=False)

In [None]:
df_clubstats_slim["updated"] = today

In [None]:
cards = (
    df_clubstats_slim.groupby(["clubname", "updated"])
    .agg({"red_cards": sum, "yellow_cards": sum})
    .reset_index()
)

In [None]:
cards = cards[["clubname", "yellow_cards", "red_cards", "updated"]]

In [None]:
cards.sort_values("yellow_cards", ascending=False)

In [None]:
cards.sort_values("yellow_cards", ascending=False).to_csv(
    "output/club_cards.csv", index=False
)

---

In [None]:
df_list = []

for i, n in zip(clubs_df["url"], clubs_df["name"]):
    df_list.append((pd.read_html(i)[1]).assign(clubname=n))

fixtures_df = pd.concat(df_list)

In [None]:
fixtures_df.columns = (
    fixtures_df.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_")
    .str.replace("(", "")
    .str.replace(")", "")
    .str.replace("-", "_")
)

In [None]:
fixtures_df.drop(["match_report", "notes"], axis=1, inplace=True)

In [None]:
fixtures_df = fixtures_df[~fixtures_df["result"].isnull()].copy()

In [None]:
fixtures_df["updated"] = today

In [None]:
premier_league_fixtures = fixtures_df[fixtures_df["comp"] == "Premier League"].copy()

In [None]:
premier_league_fixtures.to_csv("output/club_premier_league_fixtures.csv", index=False)

---

### Seasons

In [None]:
seasons_url = "https://fbref.com/en/comps/9/history/Premier-League-Seasons"

In [None]:
seasons_df = pd.read_html(seasons_url)[0]

In [None]:
seasons_df.columns = (
    seasons_df.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_")
    .str.replace("(", "")
    .str.replace(")", "")
    .str.replace("-", "_")
)

In [None]:
seasons_df[["top_scorer", "top_scorer_goals"]] = seasons_df["top_scorer"].str.split(
    " - ", 1, expand=True
)

In [None]:
seasons_df["updated"] = today

In [None]:
r = requests.get("https://fbref.com/en/comps/9/history/Premier-League-Seasons")
soup = bs(r.content, "html")

In [None]:
links = pd.DataFrame(
    [
        (i.text, "https://fbref.com/en/comps/9" + i["href"])
        for i in soup.select("tbody th:nth-of-type(1) a")
    ],
    columns=["season", "link"],
)

In [None]:
seasons_w_link = pd.merge(seasons_df, links, on="season")

In [None]:
seasons_w_link.to_csv("output/premier_league_winners_history.csv", index=False)