# Scraping Premier League stats from FBref

---

### Import Python tools

In [1]:
%load_ext lab_black

In [3]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

import requests
from bs4 import BeautifulSoup as bs

%matplotlib inline
import json
import numpy as np
import altair as alt
import altair_latimes as lat

alt.themes.register("latimes", lat.theme)
alt.themes.enable("latimes")
pd.options.display.max_columns = 50
pd.options.display.max_rows = 1000
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [4]:
today = pd.to_datetime("today")

### Premier League clubs

In [5]:
clubs = [
    {"name": "Burnley", "code": "BUR", "country": "England", "id": "943e8050"},
    {
        "name": "Manchester United",
        "code": "MUN",
        "country": "England",
        "id": "19538871",
    },
    {"name": "Manchester City", "code": "MCI", "country": "England", "id": "b8fd03ef"},
    {"name": "Aston Villa", "code": "AVL", "country": "England", "id": "8602292d"},
    {"name": "Fulham", "code": "FUL", "country": "England", "id": "fd962109"},
    {"name": "Arsenal", "code": "ARS", "country": "England", "id": "18bb7c10"},
    {"name": "Crystal Palace", "code": "CRY", "country": "England", "id": "47c64c55"},
    {"name": "Southampton", "code": "SOU", "country": "England", "id": "33c895d4"},
    {"name": "Liverpool", "code": "LIV", "country": "England", "id": "e87167c6"},
    {"name": "Leeds United", "code": "LEE", "country": "England", "id": "5bfb9659"},
    {"name": "West Ham United", "code": "WHU", "country": "England", "id": "52d65cea"},
    {"name": "Newcastle United", "code": "NEW", "country": "England", "id": "b2b47a98"},
    {
        "name": "West Bromwich Albion",
        "code": "WBA",
        "country": "England",
        "id": "60c6b05f",
    },
    {"name": "Leicester City", "code": "LEI", "country": "England", "id": "a2d435b3"},
    {
        "name": "Tottenham Hotspur",
        "code": "TOT",
        "country": "England",
        "id": "361ca564",
    },
    {"name": "Everton", "code": "EVE", "country": "England", "id": "c4989550"},
    {"name": "Sheffield United", "code": "SHU", "country": "England", "id": "1df6b87e"},
    {
        "name": "Wolverhampton Wanderers",
        "code": "WOL",
        "country": "England",
        "id": "8cec06e1",
    },
    {
        "name": "Brighton & Hove Albion",
        "code": "BHA",
        "country": "England",
        "id": "d07537b9",
    },
    {"name": "Chelsea", "code": "CHE", "country": "England", "id": "a6a4e67d"},
]

In [6]:
squad_base = "https://fbref.com/en/squads/"

In [7]:
clubs_df = pd.DataFrame(clubs)

In [8]:
clubs_df["url"] = squad_base + "/" + clubs_df["id"]

In [9]:
clubs_df.to_csv("output/clubs_list.csv", index=False)

In [10]:
clubs_df.head()

Unnamed: 0,name,code,country,id,url
0,Burnley,BUR,England,943e8050,https://fbref.com/en/squads//943e8050
1,Manchester United,MUN,England,19538871,https://fbref.com/en/squads//19538871
2,Manchester City,MCI,England,b8fd03ef,https://fbref.com/en/squads//b8fd03ef
3,Aston Villa,AVL,England,8602292d,https://fbref.com/en/squads//8602292d
4,Fulham,FUL,England,fd962109,https://fbref.com/en/squads//fd962109


---

## League standings overall

In [11]:
url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [12]:
tables = pd.read_html(url)

In [13]:
df_overall = tables[0]

In [14]:
df_overall.rename(
    columns={
        "Rk": "rank",
        "Squad": "name",
        "MP": "matches",
        "W": "wins",
        "D": "draws",
        "L": "losses",
        "GF": "goals_for",
        "GA": "goals_against",
        "GD": "goal_diff",
        "Pts": "points",
        "xG": "expected_goals_for",
        "xGA": "expected_goals_against",
        "xGD": "expected_goal_diff",
        "xGD/90": "expected_goals_diff_90mins",
        "Last 5": "last_five",
        "Attendance": "attendance",
        "Top Team Scorer": "top_scorer",
        "Goalkeeper": "goalkeeper",
        "Notes": "notes",
    },
    inplace=True,
)

In [15]:
df_overall.drop(["notes"], axis=1, inplace=True)

In [16]:
df_overall.head()

Unnamed: 0,rank,name,matches,wins,draws,losses,goals_for,goals_against,goal_diff,points,expected_goals_for,expected_goals_against,expected_goal_diff,expected_goals_diff_90mins,attendance,top_scorer,goalkeeper
0,1,Manchester City,38,27,5,6,83,32,51,86,73.3,31.3,42.0,1.11,526,İlkay Gündoğan - 13,Ederson
1,2,Manchester Utd,38,21,11,6,73,44,29,74,60.2,42.2,18.0,0.47,526,Bruno Fernandes - 18,David de Gea
2,3,Liverpool,38,20,9,9,68,42,26,69,72.6,45.4,27.3,0.72,837,Mohamed Salah - 22,Alisson
3,4,Chelsea,38,19,10,9,58,36,22,67,64.0,32.8,31.2,0.82,526,Jorginho - 7,Edouard Mendy
4,5,Leicester City,38,20,6,12,68,50,18,66,56.0,47.7,8.3,0.22,421,Jamie Vardy - 15,Kasper Schmeichel


In [17]:
df_overall["updated"] = today

In [18]:
df_overall.to_csv("output/league_standings.csv", index=False)

---

### Club performance

In [19]:
df_performance = tables[2]

In [20]:
df_performance.columns = [col[1] for col in df_performance.columns]

In [21]:
df_performance.head()

Unnamed: 0,Squad,# Pl,Age,Poss,MP,Starts,Min,90s,Gls,Ast,G-PK,PK,PKatt,CrdY,CrdR,Gls.1,Ast.1,G+A,G-PK.1,G+A-PK,xG,npxG,xA,npxG+xA,xG.1,xA.1,xG+xA,npxG.1,npxG+xA.1
0,Arsenal,29,25.9,53.8,38,418,3420,38.0,53,38,47,6,6,49,5,1.39,1.0,2.39,1.24,2.24,53.5,49.0,36.6,85.5,1.41,0.96,2.37,1.29,2.25
1,Aston Villa,24,25.2,48.1,38,418,3420,38.0,52,38,47,5,6,71,4,1.37,1.0,2.37,1.24,2.24,53.0,48.6,37.2,85.7,1.4,0.98,2.37,1.28,2.26
2,Brighton,27,25.8,51.3,38,418,3420,38.0,39,24,33,6,9,49,6,1.03,0.63,1.66,0.87,1.5,51.6,44.8,33.0,77.8,1.36,0.87,2.23,1.18,2.05
3,Burnley,25,28.3,41.7,38,418,3420,38.0,32,20,29,3,3,48,0,0.84,0.53,1.37,0.76,1.29,39.6,37.3,26.8,64.2,1.04,0.71,1.75,0.98,1.69
4,Chelsea,27,26.0,61.4,38,418,3420,38.0,56,38,48,8,10,51,3,1.47,1.0,2.47,1.26,2.26,64.0,56.4,42.4,98.8,1.68,1.12,2.8,1.48,2.6


In [22]:
df_performance["updated"] = today

In [23]:
df_performance.to_csv("output/club_performance.csv", index=False)

---

### Club standard stats

In [24]:
df_list = []

for i, n in zip(clubs_df["url"], clubs_df["name"]):
    df_list.append((pd.read_html(i)[0]).assign(clubname=n))

df = pd.concat(df_list)

  uniq_tuples = lib.fast_unique_multiple([self._values, other._values], sort=sort)


AttributeError: 'NoneType' object has no attribute 'is_extension'

In [None]:
df_list[0]

In [None]:
df_clubstats = df.copy()

In [None]:
df_clubstats.columns = ["_".join(col).strip() for col in df_clubstats.columns.values]

In [None]:
df_clubstats.columns = (
    df_clubstats.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_")
    .str.replace("(", "")
    .str.replace(")", "")
    .str.replace("-", "_")
)

In [None]:
df_clubstats.rename(
    columns={
        "performance_ast": "assists",
        "performance_crdr": "red_cards",
        "performance_crdy": "yellow_cards",
        "performance_g_pk": "non_penalty_goals",
        "performance_gls": "goals",
        "performance_pk": "penalty_kicks",
        "performance_pkatt": "penalty_kick_attempts",
        "playing_time_90s": "playing_time_90",
        "playing_time_mp": "matches_played",
        "playing_time_min": "minutes",
        "playing_time_starts": "starts",
        "unnamed:_0_level_0_player": "player",
        "unnamed:_1_level_0_nation": "nation",
        "unnamed:_20_level_0_matches": "matches",
        "unnamed:_29_level_0_matches": "matches2",
        "unnamed:_2_level_0_pos": "position",
        "unnamed:_3_level_0_age": "age",
        "clubname_": "clubname",
    },
    inplace=True,
)

In [None]:
df_clubstats_slim = df_clubstats[
    [
        "player",
        "nation",
        "clubname",
        "age",
        "position",
        "starts",
        "minutes",
        "matches_played",
        "playing_time_90",
        "goals",
        "assists",
        "penalty_kick_attempts",
        "penalty_kicks",
        "non_penalty_goals",
        "yellow_cards",
        "red_cards",
    ]
].copy()

In [None]:
df_clubstats_slim["nation"] = df_clubstats_slim["nation"].str.split(
    " ", n=1, expand=True
)

In [None]:
df_clubstats_slim.head()

In [None]:
df_clubstats_slim = df_clubstats_slim[
    (~df_clubstats_slim.player.str.contains("Squad Total"))
    & ~df_clubstats_slim.player.str.contains("Opponent Total")
]

In [None]:
df_clubstats_slim.fillna(0, inplace=True)

In [None]:
df_clubstats_slim.head()

In [None]:
df_clubstats_slim.to_csv("output/club_standard_stats.csv", index=False)

In [None]:
df_clubstats_slim["updated"] = today

In [None]:
cards = (
    df_clubstats_slim.groupby(["clubname", "updated"])
    .agg({"red_cards": sum, "yellow_cards": sum})
    .reset_index()
)

In [None]:
cards = cards[["clubname", "yellow_cards", "red_cards", "updated"]]

In [None]:
cards.sort_values("yellow_cards", ascending=False)

In [None]:
cards.sort_values("yellow_cards", ascending=False).to_csv(
    "output/club_cards.csv", index=False
)

---

In [None]:
df_list = []

for i, n in zip(clubs_df["url"], clubs_df["name"]):
    df_list.append((pd.read_html(i)[1]).assign(clubname=n))

fixtures_df = pd.concat(df_list)

In [None]:
fixtures_df.columns = (
    fixtures_df.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_")
    .str.replace("(", "")
    .str.replace(")", "")
    .str.replace("-", "_")
)

In [None]:
fixtures_df.drop(["match_report", "notes"], axis=1, inplace=True)

In [None]:
fixtures_df = fixtures_df[~fixtures_df["result"].isnull()].copy()

In [None]:
fixtures_df["updated"] = today

In [None]:
premier_league_fixtures = fixtures_df[fixtures_df["comp"] == "Premier League"].copy()

In [None]:
premier_league_fixtures.to_csv("output/club_premier_league_fixtures.csv", index=False)

---

### Seasons

In [None]:
seasons_url = "https://fbref.com/en/comps/9/history/Premier-League-Seasons"

In [None]:
seasons_df = pd.read_html(seasons_url)[0]

In [None]:
seasons_df.columns = (
    seasons_df.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_")
    .str.replace("(", "")
    .str.replace(")", "")
    .str.replace("-", "_")
)

In [None]:
seasons_df[["top_scorer", "top_scorer_goals"]] = seasons_df["top_scorer"].str.split(
    " - ", 1, expand=True
)

In [None]:
seasons_df["updated"] = today

In [None]:
r = requests.get("https://fbref.com/en/comps/9/history/Premier-League-Seasons")
soup = bs(r.content, "html")

In [None]:
links = pd.DataFrame(
    [
        (i.text, "https://fbref.com/en/comps/9" + i["href"])
        for i in soup.select("tbody th:nth-of-type(1) a")
    ],
    columns=["season", "link"],
)

In [None]:
seasons_w_link = pd.merge(seasons_df, links, on="season")

In [None]:
seasons_w_link.to_csv("output/premier_league_winners_history.csv", index=False)