# Scraping Premier League stats from FBref

---

### Import Python tools

In [1]:
%load_ext lab_black

In [2]:
import json
from pathlib import Path
from urllib.parse import urljoin

import altair as alt
import altair_latimes as lat
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

%matplotlib inline

# Register the `latimes` theme from altair_latimes and make it the active Altair theme.
alt.themes.register("latimes", lat.theme)
alt.themes.enable("latimes")

# Let pandas show up to 50 columns and 1000 rows before truncating a displayed frame.
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 1000)

# Lift Altair's cap on the number of rows a chart dataset may hold.
alt.data_transformers.disable_max_rows()



DataTransformerRegistry.enable('default')

In [3]:
# Timestamp of this run; written into the "updated" column of the exported tables below.
today = pd.to_datetime("today")

### Premier League clubs

In [4]:
# Lookup table of the clubs to scrape.
#   name    - full club name, attached to every scraped row
#   code    - three-letter club abbreviation
#   country - constant "England" for every entry
#   id      - FBref squad id, appended to the squad base URL to build each club's page URL
# Brentford, Norwich City and Watford previously had empty codes; they are filled in here.
clubs = [
    {"name": name, "code": code, "country": "England", "id": fbref_id}
    for name, code, fbref_id in [
        ("Burnley", "BUR", "943e8050"),
        ("Manchester United", "MUN", "19538871"),
        ("Manchester City", "MCI", "b8fd03ef"),
        ("Aston Villa", "AVL", "8602292d"),
        ("Arsenal", "ARS", "18bb7c10"),
        ("Crystal Palace", "CRY", "47c64c55"),
        ("Southampton", "SOU", "33c895d4"),
        ("Liverpool", "LIV", "e87167c6"),
        ("Leeds United", "LEE", "5bfb9659"),
        ("West Ham United", "WHU", "52d65cea"),
        ("Newcastle United", "NEW", "b2b47a98"),
        ("Leicester City", "LEI", "a2d435b3"),
        ("Tottenham Hotspur", "TOT", "361ca564"),
        ("Everton", "EVE", "c4989550"),
        ("Wolverhampton Wanderers", "WOL", "8cec06e1"),
        ("Brighton & Hove Albion", "BHA", "d07537b9"),
        ("Chelsea", "CHE", "a6a4e67d"),
        ("Brentford", "BRE", "cd051869"),
        ("Norwich City", "NOR", "1c781004"),
        ("Watford", "WAT", "2abfe087"),
    ]
]

In [5]:
# Common prefix of every FBref squad page; a club's id is appended to form its URL.
squad_base = "https://fbref.com/en/squads"

In [6]:
# One row per club, with columns taken from the dict keys (name, code, country, id).
clubs_df = pd.DataFrame(clubs)

In [7]:
# Build each club's squad-page URL by prefixing its FBref id with "<squad_base>/".
clubs_df["url"] = clubs_df["id"].radd(f"{squad_base}/")

In [8]:
# Create the export folder on first run so this and the later `to_csv` calls do not
# fail on a fresh checkout, then save the club lookup table.
Path("output").mkdir(parents=True, exist_ok=True)
clubs_df.to_csv("output/clubs_list.csv", index=False)

In [24]:
# Display the club table to check the generated URLs.
clubs_df

Unnamed: 0,name,code,country,id,url
0,Burnley,BUR,England,943e8050,https://fbref.com/en/squads/943e8050
1,Manchester United,MUN,England,19538871,https://fbref.com/en/squads/19538871
2,Manchester City,MCI,England,b8fd03ef,https://fbref.com/en/squads/b8fd03ef
3,Aston Villa,AVL,England,8602292d,https://fbref.com/en/squads/8602292d
4,Arsenal,ARS,England,18bb7c10,https://fbref.com/en/squads/18bb7c10
5,Crystal Palace,CRY,England,47c64c55,https://fbref.com/en/squads/47c64c55
6,Southampton,SOU,England,33c895d4,https://fbref.com/en/squads/33c895d4
7,Liverpool,LIV,England,e87167c6,https://fbref.com/en/squads/e87167c6
8,Leeds United,LEE,England,5bfb9659,https://fbref.com/en/squads/5bfb9659
9,West Ham United,WHU,England,52d65cea,https://fbref.com/en/squads/52d65cea


---

## League standings overall

In [10]:
# League overview page; its tables are read in the next cell.
url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [11]:
# Parse every HTML table on the page into a list of DataFrames.
# Index 0 is used below for the standings and index 2 for club performance.
tables = pd.read_html(url)

In [12]:
# Work on a copy so the in-place rename/drop in the following cells never alter the
# cached `tables[0]`; re-running from this cell then always starts from the original columns.
df_overall = tables[0].copy()

In [13]:
# Map the abbreviated standings headers to descriptive snake_case names.
standings_column_names = {
    "Rk": "rank",
    "Squad": "name",
    "MP": "matches",
    "W": "wins",
    "D": "draws",
    "L": "losses",
    "GF": "goals_for",
    "GA": "goals_against",
    "GD": "goal_diff",
    "Pts": "points",
    "xG": "expected_goals_for",
    "xGA": "expected_goals_against",
    "xGD": "expected_goal_diff",
    "xGD/90": "expected_goals_diff_90mins",
    "Last 5": "last_five",
    "Attendance": "attendance",
    "Top Team Scorer": "top_scorer",
    "Goalkeeper": "goalkeeper",
    "Notes": "notes",
}

# Headers missing from the table are simply left untouched by `rename`.
df_overall.rename(columns=standings_column_names, inplace=True)

In [14]:
# Remove the notes column. `errors="ignore"` keeps the cell re-runnable once the
# column is already gone, where the in-place drop raised KeyError.
df_overall = df_overall.drop(columns=["notes"], errors="ignore")

In [15]:
# Preview the renamed standings table.
df_overall.head()

Unnamed: 0,rank,name,matches,wins,draws,losses,goals_for,goals_against,goal_diff,points,expected_goals_for,expected_goals_against,expected_goal_diff,expected_goals_diff_90mins,last_five,attendance,top_scorer,goalkeeper
0,1,West Ham,3,2,1,0,10,5,5,7,6.0,2.9,3.1,1.03,W W D,59901,Michail Antonio - 4,Łukasz Fabiański
1,2,Chelsea,3,2,1,0,6,1,5,7,4.6,3.3,1.3,0.45,W W D,38965,"Trevoh Chalobah, Reece James... - 1",Edouard Mendy
2,3,Liverpool,3,2,1,0,6,1,5,7,7.6,2.9,4.7,1.56,W W D,52591,"Mohamed Salah, Diogo Jota - 2",Alisson
3,4,Everton,3,2,1,0,7,3,4,7,6.1,3.2,2.9,0.97,W D W,38487,Dominic Calvert-Lewin - 3,Jordan Pickford
4,5,Manchester City,3,2,0,1,10,1,9,6,8.4,1.5,6.9,2.3,L W W,51857,Ferrán Torres - 2,Ederson


In [16]:
# Stamp every standings row with the time of this run.
df_overall["updated"] = today

In [17]:
# Save the standings, without the DataFrame index.
df_overall.to_csv("output/league_standings.csv", index=False)

---

### Club performance

In [18]:
# Copy the third table from the league page so the column flattening and the added
# "updated" column below do not modify the cached `tables[2]`.
df_performance = tables[2].copy()

In [19]:
# The table's column labels are tuples; keep only the second element (the stat name).
# Guarded so that re-running the cell on already-flattened string labels does not
# reduce each name to its second character.
if isinstance(df_performance.columns, pd.MultiIndex):
    df_performance.columns = list(df_performance.columns.get_level_values(1))

In [20]:
# Preview the flattened club performance table.
df_performance.head()

Unnamed: 0,Squad,# Pl,Age,Poss,MP,Starts,Min,90s,Gls,Ast,G-PK,PK,PKatt,CrdY,CrdR,Gls.1,Ast.1,G+A,G-PK.1,G+A-PK,xG,npxG,xA,npxG+xA,xG.1,xA.1,xG+xA,npxG.1,npxG+xA.1
0,Arsenal,22,25.5,39.7,3,33,270,3.0,0,0,0,0,0,5,1,0.0,0.0,0.0,0.0,0.0,1.9,1.9,1.8,3.8,0.65,0.62,1.26,0.65,1.26
1,Aston Villa,21,26.5,52.0,3,33,270,3.0,5,3,3,2,2,7,0,1.67,1.0,2.67,1.0,2.0,2.8,1.4,1.0,2.4,0.93,0.35,1.27,0.46,0.81
2,Brentford,18,25.5,45.0,3,33,270,3.0,3,2,3,0,0,4,0,1.0,0.67,1.67,1.0,1.67,3.4,3.4,2.6,6.0,1.12,0.88,2.0,1.12,2.0
3,Brighton,18,26.9,62.3,3,33,270,3.0,4,4,4,0,0,6,0,1.33,1.33,2.67,1.33,2.67,3.7,3.7,2.5,6.1,1.23,0.82,2.05,1.23,2.05
4,Burnley,15,29.6,34.5,2,22,180,2.0,1,1,1,0,0,2,0,0.5,0.5,1.0,0.5,1.0,1.9,1.9,1.4,3.2,0.93,0.68,1.61,0.93,1.61


In [21]:
# Stamp every club performance row with the time of this run.
df_performance["updated"] = today

In [22]:
# Save the club performance table, without the DataFrame index.
df_performance.to_csv("output/club_performance.csv", index=False)

---

### Club standard stats

In [31]:
# Read the first table from each club's squad page and tag every row with its club.
df_list = []

for squad_url, club_name in zip(clubs_df["url"], clubs_df["name"]):
    # The tag column must be called `clubname`: once the header levels are joined it
    # becomes "clubname_", which the rename and column-selection cells below expect.
    # (It was previously assigned as `name`, so those cells raised KeyError.)
    df_list.append(pd.read_html(squad_url)[0].assign(clubname=club_name))

# Stack the per-club tables into one frame.
df = pd.concat(df_list)

TypeError: read_html() got an unexpected keyword argument 'skipfooter'

In [26]:
# Inspect the raw per-club tables collected by the loop above.
df_list

[         Unnamed: 0_level_0 Unnamed: 1_level_0 Unnamed: 2_level_0  \
                      Player             Nation                Pos   
 0            Matthew Lowton            eng ENG                 DF   
 1                   Ben Mee            eng ENG                 DF   
 2                 Nick Pope            eng ENG                 GK   
 3           James Tarkowski            eng ENG                 DF   
 4            Charlie Taylor            eng ENG                 DF   
 5                 Jack Cork            eng ENG                 MF   
 6   Jóhann Berg Guðmundsson             is ISL                 MF   
 7             Dwight McNeil            eng ENG                 MF   
 8                Chris Wood             nz NZL                 FW   
 9             Ashley Barnes            eng ENG                 FW   
 10           Josh Brownhill            eng ENG                 MF   
 11          Ashley Westwood            eng ENG                 MF   
 12            Jay R

In [None]:
# Inspect the table scraped for the first club in `clubs_df`.
df_list[0]

In [None]:
# Clean a copy so the concatenated raw frame `df` stays untouched.
df_clubstats = df.copy()

In [None]:
# Collapse each multi-level column label into one string joined by "_".
# Guarded so that re-running the cell does not join the individual characters of
# already-flattened string labels.
if isinstance(df_clubstats.columns, pd.MultiIndex):
    df_clubstats.columns = ["_".join(levels).strip() for levels in df_clubstats.columns]

In [None]:
# Normalise column names to snake_case: trim, lower-case, spaces and hyphens to "_",
# parentheses removed. `regex=False` makes every replacement a literal one, so "(" and
# ")" are never parsed as regular expressions, whatever the installed pandas default is.
df_clubstats.columns = (
    df_clubstats.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace("(", "", regex=False)
    .str.replace(")", "", regex=False)
    .str.replace("-", "_", regex=False)
)

In [None]:
# Friendly names for the flattened player-stat columns.
# The "unnamed:_<n>_level_0_*" keys come from header cells with no top-level label.
clubstats_column_names = {
    "performance_ast": "assists",
    "performance_crdr": "red_cards",
    "performance_crdy": "yellow_cards",
    "performance_g_pk": "non_penalty_goals",
    "performance_gls": "goals",
    "performance_pk": "penalty_kicks",
    "performance_pkatt": "penalty_kick_attempts",
    "playing_time_90s": "playing_time_90",
    "playing_time_mp": "matches_played",
    "playing_time_min": "minutes",
    "playing_time_starts": "starts",
    "unnamed:_0_level_0_player": "player",
    "unnamed:_1_level_0_nation": "nation",
    "unnamed:_20_level_0_matches": "matches",
    "unnamed:_29_level_0_matches": "matches2",
    "unnamed:_2_level_0_pos": "position",
    "unnamed:_3_level_0_age": "age",
    "clubname_": "clubname",
}

# Keys that are absent from the frame are ignored by `rename`.
df_clubstats.rename(columns=clubstats_column_names, inplace=True)

In [None]:
# Columns kept for the slim player table, in export order.
slim_columns = [
    "player",
    "nation",
    "clubname",
    "age",
    "position",
    "starts",
    "minutes",
    "matches_played",
    "playing_time_90",
    "goals",
    "assists",
    "penalty_kick_attempts",
    "penalty_kicks",
    "non_penalty_goals",
    "yellow_cards",
    "red_cards",
]

# Take an independent copy so later edits do not touch `df_clubstats`.
df_clubstats_slim = df_clubstats.loc[:, slim_columns].copy()

In [None]:
# Nation values contain two space-separated tokens (e.g. "eng ENG"); keep the first.
# The previous `expand=True` form produced a two-column frame and assigned it to a
# single column, which current pandas rejects.
# NOTE(review): use `.str[-1]` instead if the upper-case code is the one wanted.
df_clubstats_slim["nation"] = (
    df_clubstats_slim["nation"].str.split(" ", n=1).str[0]
)

In [None]:
# Preview the slim table after the nation split.
df_clubstats_slim.head()

In [None]:
# Drop the "Squad Total" / "Opponent Total" summary rows so only players remain.
# `na=False` treats a missing player name as "not a total row" instead of breaking the
# negation, and `.copy()` makes the result an independent frame so the later fill and
# column assignment do not trigger SettingWithCopyWarning.
is_total_row = df_clubstats_slim["player"].str.contains(
    "Squad Total|Opponent Total", na=False
)
df_clubstats_slim = df_clubstats_slim[~is_total_row].copy()

In [None]:
# Replace missing values with 0. Reassigning (rather than filling in place) avoids
# modifying a filtered slice and keeps the cell safe to re-run.
df_clubstats_slim = df_clubstats_slim.fillna(0)

In [None]:
# Preview the cleaned player table.
df_clubstats_slim.head()

In [None]:
# Save the cleaned player-level stats, without the DataFrame index.
# NOTE(review): this runs before the "updated" column is added in the next cell, so
# this CSV carries no timestamp column — confirm that is intended.
df_clubstats_slim.to_csv("output/club_standard_stats.csv", index=False)

In [None]:
# Stamp each player row with the time of this run; used as a grouping key for the card totals.
df_clubstats_slim["updated"] = today

In [None]:
# Total red and yellow cards per club (and run timestamp).
# The aggregation is named with the string "sum" rather than the builtin `sum`:
# passing the builtin to `agg` is deprecated in recent pandas.
cards = (
    df_clubstats_slim.groupby(["clubname", "updated"])
    .agg({"red_cards": "sum", "yellow_cards": "sum"})
    .reset_index()
)

In [None]:
# Put the columns in export order: club, yellows, reds, timestamp.
cards_column_order = ["clubname", "yellow_cards", "red_cards", "updated"]
cards = cards.loc[:, cards_column_order]

In [None]:
# Show clubs ordered from most to fewest yellow cards (display only; `cards` is unchanged).
cards.sort_values("yellow_cards", ascending=False)

In [None]:
# Save the card totals ordered from most to fewest yellow cards.
cards_by_yellows = cards.sort_values("yellow_cards", ascending=False)
cards_by_yellows.to_csv("output/club_cards.csv", index=False)

---

### Club fixtures

In [None]:
# Read the second table from each club's squad page and tag its rows with the club name.
df_list = [
    pd.read_html(squad_url)[1].assign(clubname=club_name)
    for squad_url, club_name in zip(clubs_df["url"], clubs_df["name"])
]

# Stack the per-club tables into one frame.
fixtures_df = pd.concat(df_list)

In [None]:
# Normalise column names to snake_case: trim, lower-case, spaces and hyphens to "_",
# parentheses removed. `regex=False` makes every replacement a literal one, so "(" and
# ")" are never parsed as regular expressions, whatever the installed pandas default is.
fixtures_df.columns = (
    fixtures_df.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace("(", "", regex=False)
    .str.replace(")", "", regex=False)
    .str.replace("-", "_", regex=False)
)

In [None]:
# Remove the match-report and notes columns. `errors="ignore"` keeps the cell
# re-runnable once they are already gone, where the in-place drop raised KeyError.
fixtures_df = fixtures_df.drop(columns=["match_report", "notes"], errors="ignore")

In [None]:
# Keep only rows that have a result recorded.
has_result = fixtures_df["result"].notna()
fixtures_df = fixtures_df.loc[has_result].copy()

In [None]:
# Stamp every fixture row with the time of this run.
fixtures_df["updated"] = today

In [None]:
# Keep only the rows whose competition is the Premier League.
is_premier_league = fixtures_df["comp"].eq("Premier League")
premier_league_fixtures = fixtures_df.loc[is_premier_league].copy()

In [None]:
# Save the Premier League fixtures, without the DataFrame index.
premier_league_fixtures.to_csv("output/club_premier_league_fixtures.csv", index=False)

---

### Seasons

In [None]:
# Page holding the season-by-season history table, read in the next cell.
seasons_url = "https://fbref.com/en/comps/9/history/Premier-League-Seasons"

In [None]:
# The first table on the history page is taken as the seasons table.
seasons_df = pd.read_html(seasons_url)[0]

In [None]:
# Normalise column names to snake_case: trim, lower-case, spaces and hyphens to "_",
# parentheses removed. `regex=False` makes every replacement a literal one, so "(" and
# ")" are never parsed as regular expressions, whatever the installed pandas default is.
seasons_df.columns = (
    seasons_df.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace("(", "", regex=False)
    .str.replace(")", "", regex=False)
    .str.replace("-", "_", regex=False)
)

In [None]:
# Split "<player(s)> - <goals>" at the first " - " into the scorer and the goal count.
# `n` is passed by keyword: pandas 2 no longer accepts it positionally.
seasons_df[["top_scorer", "top_scorer_goals"]] = seasons_df["top_scorer"].str.split(
    " - ", n=1, expand=True
)

In [None]:
# Stamp every season row with the time of this run.
seasons_df["updated"] = today

In [None]:
# Fetch the seasons page again as raw HTML so the season links can be extracted.
# Reuses `seasons_url` instead of repeating the literal, bounds the wait with a timeout,
# and fails on an HTTP error status rather than parsing an error page.
r = requests.get(seasons_url, timeout=30)
r.raise_for_status()

# Name the parser explicitly: the generic "html" feature resolves to whichever HTML
# parser happens to be installed.
soup = bs(r.content, "html.parser")

In [None]:
# One (season label, absolute URL) pair per link found in the first header cell of
# each body row.
# Each href is resolved against the page URL with `urljoin` instead of being appended
# to "https://fbref.com/en/comps/9".
# NOTE(review): this assumes the hrefs are root-relative ("/en/comps/9/..."), in which
# case the old prefixing duplicated the path — confirm against the live page.
links = pd.DataFrame(
    [
        (anchor.text, urljoin(seasons_url, anchor["href"]))
        for anchor in soup.select("tbody th:nth-of-type(1) a")
    ],
    columns=["season", "link"],
)

In [None]:
# Attach each season's link by matching on the "season" label; with the default inner
# join, seasons that have no matching link are dropped.
seasons_w_link = seasons_df.merge(links, on="season")

In [None]:
# Save the season history with links, without the DataFrame index.
seasons_w_link.to_csv("output/premier_league_winners_history.csv", index=False)