# Leaderboard Scraper

## Environment Setup


**Installing cloudscraper**

In [1]:
#!pip install cloudscraper

**Import libraries**

In [2]:
import re
import pandas as pd
import cloudscraper
import warnings

**Suppress warnings (all categories, including FutureWarning)**

In [None]:
# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")
# FutureWarning is already covered by the blanket filter above; it is
# registered explicitly as well so the intent stays visible.
warnings.filterwarnings("ignore", category=FutureWarning)

## Functions for Scraping and Saving Team-Level Leaderboard Tables

This section contains the functions used to download and clean team-level performance tables from FBref. These tables include season-wide attacking, passing, defensive, and overall metrics for all Premier League clubs. These functions ensure that all team-level data is consistently formatted across seasons and can be integrated smoothly with the fixture data.

In [3]:
def scrape_fbref_leaderboard(url, table_id, *, timeout=30):
    """Download a page and return one of its HTML tables as a DataFrame.

    The page is fetched through a cloudscraper session, the table whose HTML
    ``id`` attribute equals ``table_id`` is parsed with pandas, its column
    labels are flattened and normalised, and a fixed set of columns that are
    not needed downstream is dropped when present.

    Args:
        url: Address of the page that contains the table.
        table_id: Value of the ``id`` attribute of the wanted ``<table>``.
        timeout: Seconds to wait for the server before giving up; forwarded
            to the HTTP request.

    Returns:
        A pandas DataFrame with single-level, whitespace-normalised column
        names.

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error
            status (raised by ``raise_for_status``).
        ValueError: If no table with ``table_id`` can be parsed from the page.
    """
    # Imported locally so the function does not depend on another cell
    # having imported it first.
    from io import StringIO

    scraper = cloudscraper.create_scraper()
    response = scraper.get(url, timeout=timeout)
    # Fail fast on error pages (e.g. rate limiting) instead of trying to
    # parse them as if they contained the table.
    response.raise_for_status()

    not_found = f"Table with id '{table_id}' wasn't found on the page {url}"
    try:
        # Passing literal HTML to read_html is deprecated since pandas 2.1;
        # a file-like wrapper is the supported form.
        tables = pd.read_html(StringIO(response.text), attrs={"id": table_id})
    except ValueError as err:
        # read_html raises ValueError itself when nothing matches, so the
        # page-specific message has to be attached here.
        raise ValueError(not_found) from err
    if not tables:
        raise ValueError(not_found)

    df = tables[0]

    # Collapse multi-level headers into one string per column, skipping
    # empty levels.
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = [
            " ".join(str(level) for level in col if str(level) != "").strip()
            for col in df.columns
        ]
    else:
        df.columns = [str(c).strip() for c in df.columns]

    def clean_col(col):
        """Normalise one column label to single-spaced plain text."""
        col = str(col)
        col = col.replace("\xa0", " ")
        # Underscores and the characters 'Â' / '·' become spaces.
        # NOTE(review): 'Â·' looks like a mis-decoded middle dot; the
        # pattern is kept unchanged on purpose.
        col = re.sub(r"[_Â·]+", " ", col)
        col = re.sub(r"\s+", " ", col)
        return col.strip()

    df.columns = [clean_col(c) for c in df.columns]

    # Remove features that are not needed; columns missing from this table
    # are skipped silently.
    cols_to_drop = ["Attendance", "Top Team Scorer", "Goalkeeper", "Notes"]
    return df.drop(columns=cols_to_drop, errors="ignore")


def save_to_csv(df, filename, index=False):
    """Write ``df`` to ``filename`` as CSV.

    Args:
        df: DataFrame to export.
        filename: Destination passed straight to ``DataFrame.to_csv``; the
            callers in this file pass a relative path that already contains
            the target directory.
        index: Whether the row index is written as the first column
            (off by default).
    """
    csv_options = {"index": index}
    df.to_csv(filename, **csv_options)

## Scraping and Saving Team-Level Leaderboard Tables

For every Premier League season in our dataset, we download the full team standings table from FBref.

In [4]:
# 2025-2026 season: fetch the overall standings table and store it as CSV.
# NOTE(review): unlike the other seasons, this URL has no season segment
# while the table id is pinned to 2025-2026 - presumably the page follows
# the current season, so confirm the id still matches when re-running later.
standings25_26 = scrape_fbref_leaderboard(
    url="https://fbref.com/en/comps/9/Premier-League-Stats",
    table_id="results2025-202691_overall",
)
save_to_csv(
    df=standings25_26,
    filename="../CSV_files/Season_2025-2026/2025_2026_Team_Leaderboard.csv",
)

In [10]:
# 2024-2025 season: fetch the overall standings table and store it as CSV.
standings24_25 = scrape_fbref_leaderboard(
    url="https://fbref.com/en/comps/9/2024-2025/2024-2025-Premier-League-Stats",
    table_id="results2024-202591_overall",
)
save_to_csv(
    df=standings24_25,
    filename="../CSV_files/Season_2024-2025/2024_2025_Team_Leaderboard.csv",
)

In [14]:
# 2023-2024 season: fetch the overall standings table and store it as CSV.
standings23_24 = scrape_fbref_leaderboard(
    url="https://fbref.com/en/comps/9/2023-2024/2023-2024-Premier-League-Stats",
    table_id="results2023-202491_overall",
)
save_to_csv(
    df=standings23_24,
    filename="../CSV_files/Season_2023-2024/2023_2024_Team_Leaderboard.csv",
)

In [16]:
# 2022-2023 season: fetch the overall standings table and store it as CSV.
standings22_23 = scrape_fbref_leaderboard(
    url="https://fbref.com/en/comps/9/2022-2023/2022-2023-Premier-League-Stats",
    table_id="results2022-202391_overall",
)
save_to_csv(
    df=standings22_23,
    filename="../CSV_files/Season_2022-2023/2022_2023_Team_Leaderboard.csv",
)

In [21]:
# 2021-2022 season: fetch the overall standings table and store it as CSV.
standings21_22 = scrape_fbref_leaderboard(
    url="https://fbref.com/en/comps/9/2021-2022/2021-2022-Premier-League-Stats",
    table_id="results2021-202291_overall",
)
save_to_csv(
    df=standings21_22,
    filename="../CSV_files/Season_2021-2022/2021_2022_Team_Leaderboard.csv",
)

In [23]:
# 2020-2021 season: fetch the overall standings table and store it as CSV.
standings20_21 = scrape_fbref_leaderboard(
    url="https://fbref.com/en/comps/9/2020-2021/2020-2021-Premier-League-Stats",
    table_id="results2020-202191_overall",
)
save_to_csv(
    df=standings20_21,
    filename="../CSV_files/Season_2020-2021/2020_2021_Team_Leaderboard.csv",
)