In [1]:
# IGNORE FUTURE WARNINGS
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
from collections import defaultdict
from io import StringIO
from pathlib import Path

import cloudscraper
import pandas as pd

In [3]:
# SCRAPER FOR EACH TABLE
def scrape_team_stats(url, attrs):
    """Download a page and return the first HTML table matching ``attrs``.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    attrs : dict
        HTML attributes identifying the wanted ``<table>`` (passed straight
        to ``pandas.read_html``), e.g. ``{"id": "stats_squads_shooting_for"}``.

    Returns
    -------
    pandas.DataFrame
        The first matching table, with row index 1 used as the header and
        every column whose label starts with ``"Unnamed"`` removed.

    Raises
    ------
    Exception
        Whatever ``response.raise_for_status()`` raises for a 4xx/5xx reply.
    ValueError
        Raised by ``pandas.read_html`` when no table matches ``attrs``.
    """
    scraper = cloudscraper.create_scraper()
    response = scraper.get(url)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()

    # Passing a literal HTML string to read_html is deprecated; wrap it in a
    # file-like object instead.
    tables = pd.read_html(StringIO(response.text), attrs=attrs, header=1)
    df = tables[0]

    # Cast labels to str so the .str accessor works even for non-string labels.
    unnamed = df.columns.astype(str).str.startswith("Unnamed")
    return df.loc[:, ~unnamed]

In [4]:
# SEASONS TO SCRAPE, WRITTEN AS "<start year>-<end year>"
# Covers 2020-2021 up to and including 2025-2026.
seasons = [f"{start}-{start + 1}" for start in range(2020, 2026)]

# Scraped tables, filled in by the cells below as alldata[season][table_name].
alldata = dict()

In [5]:
# MERGER FOR EVERY SEASON
def merge_squad_player_stats_per_season(data):
    """Outer-join all tables of each season on the ``Squad`` column.

    Parameters
    ----------
    data : dict
        Mapping ``season -> {table_name: DataFrame}``. Every DataFrame must
        have a ``Squad`` column.

    Returns
    -------
    dict
        Mapping ``season -> merged DataFrame``. Tables are joined in the
        iteration order of the inner dict. A season whose inner dict is empty
        is left out of the result.
    """
    merged_seasons = {}
    for season, tables in data.items():
        merged = None
        for table in tables.values():
            if merged is None:
                # Copy so the result never aliases the caller's DataFrame.
                merged = table.copy()
            else:
                merged = merged.merge(table, on="Squad", how="outer")

        # Nothing to store for a season without tables.
        if merged is not None:
            merged_seasons[season] = merged

    return merged_seasons

In [6]:
# CLEAN POSSESSION STATS
def clean_and_save_possession_stats(df):
    """Reduce a squad possession table to the columns used downstream.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``Squad`` column and, optionally, ``Poss``, ``PrgR`` and
        ``90s``. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        ``Squad`` plus, when available, ``Possession`` (from ``Poss``) and
        ``Progressive_Passes_Received_Per90`` (``PrgR / 90s`` rounded to two
        decimals). If any squad name starts with ``"vs "`` the table is
        treated as an opponent table: the prefix is removed and every stat
        column gets an ``_Against`` suffix.

    Raises
    ------
    KeyError
        If ``df`` has no ``Squad`` column.
    """
    # Work on a copy of the selected columns so the caller's frame is untouched
    # and later column assignments do not hit a slice.
    keep_cols = [c for c in ("Squad", "Poss", "PrgR_per90") if c in df.columns]
    cleaned = df[keep_cols].copy()

    if {"PrgR", "90s"}.issubset(df.columns):
        cleaned["PrgR_per90"] = (df["PrgR"] / df["90s"]).round(2)

    # Only a leading "vs " marks an opponent table; a name that merely
    # contains "vs " somewhere else must be left alone.
    is_against = cleaned["Squad"].astype(str).str.startswith("vs ").any()
    if is_against:
        cleaned["Squad"] = cleaned["Squad"].str.replace(r"^vs ", "", regex=True)

    cleaned = cleaned.rename(
        columns={
            "Poss": "Possession",
            "PrgR_per90": "Progressive_Passes_Received_Per90",
        }
    )

    if is_against:
        cleaned = cleaned.rename(
            columns={col: col + "_Against" for col in cleaned.columns if col != "Squad"}
        )

    return cleaned

In [7]:
# POSSESSION FOR
# Scrape and clean each season's squad possession table (team's own numbers).
base_url = "https://fbref.com/en/comps/9/{season}/possession/Premier-League-Stats"
attrs = {"id": "stats_squads_possession_for"}
for season in seasons:
    raw_table = scrape_team_stats(base_url.format(season=season), attrs)
    # setdefault creates the per-season dict the first time a season is seen.
    alldata.setdefault(season, {})["possession_for"] = clean_and_save_possession_stats(raw_table)

In [8]:
# POSSESSION AGAINST
# Same page as the "for" table; only the table id differs (opponent numbers).
base_url_against = "https://fbref.com/en/comps/9/{season}/possession/Premier-League-Stats"
attrs_against = {"id": "stats_squads_possession_against"}
for season in seasons:
    raw_table = scrape_team_stats(base_url_against.format(season=season), attrs_against)
    # setdefault creates the per-season dict the first time a season is seen.
    alldata.setdefault(season, {})["possession_against"] = clean_and_save_possession_stats(raw_table)

In [9]:
# CLEAN SHOOTING STATS
def clean_and_save_shooting_stats(df):
    """Reduce a squad shooting table to its per-90 shot columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing at least ``Squad``, ``Sh/90`` and ``SoT/90``.
        The input is not modified.

    Returns
    -------
    pandas.DataFrame
        ``Squad``, ``Shots_Per_90`` and ``Shots_On_Target_Per_90``. If any
        squad name starts with ``"vs "`` the table is treated as an opponent
        table: the prefix is removed and both stat columns get an
        ``_Against`` suffix (``Shots_Per_90_Against``, ...).

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Copy the selection so the Squad assignment below never touches a slice
    # of the caller's frame.
    cleaned = df[["Squad", "Sh/90", "SoT/90"]].copy()

    # Apply the descriptive names BEFORE adding the suffix; doing it the other
    # way round leaves the raw "Sh/90_Against" labels because the rename map
    # no longer matches.
    cleaned = cleaned.rename(
        columns={
            "Sh/90": "Shots_Per_90",
            "SoT/90": "Shots_On_Target_Per_90",
        }
    )

    # Only a leading "vs " marks an opponent table.
    is_against = cleaned["Squad"].astype(str).str.startswith("vs ").any()
    if is_against:
        cleaned["Squad"] = cleaned["Squad"].str.replace(r"^vs ", "", regex=True)
        cleaned = cleaned.rename(
            columns={c: c + "_Against" for c in cleaned.columns if c != "Squad"}
        )

    return cleaned

In [10]:
# SHOOTING FOR
# Scrape and clean each season's squad shooting table (team's own numbers).
base_url_shoot_for = "https://fbref.com/en/comps/9/{season}/shooting/Premier-League-Stats"
attrs_shoot_for = {"id": "stats_squads_shooting_for"}

for season in seasons:
    raw_table = scrape_team_stats(base_url_shoot_for.format(season=season), attrs_shoot_for)
    # setdefault creates the per-season dict the first time a season is seen.
    alldata.setdefault(season, {})["shooting_for"] = clean_and_save_shooting_stats(raw_table)

In [11]:
# SHOOTING AGAINST
# Same page as the "for" table; only the table id differs (opponent numbers).
base_url_shoot_against = "https://fbref.com/en/comps/9/{season}/shooting/Premier-League-Stats"
attrs_shoot_against = {"id": "stats_squads_shooting_against"}

for season in seasons:
    raw_table = scrape_team_stats(base_url_shoot_against.format(season=season), attrs_shoot_against)
    # setdefault creates the per-season dict the first time a season is seen.
    alldata.setdefault(season, {})["shooting_against"] = clean_and_save_shooting_stats(raw_table)

In [12]:
# CLEAN ATTACKING STATS
def clean_and_save_attacking_stats(df):
    """Reduce a squad goal/shot-creation table to its per-90 columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing at least ``Squad``, ``SCA90`` and ``GCA90``.
        The input is not modified.

    Returns
    -------
    pandas.DataFrame
        ``Squad``, ``Shot_Creating_Actions_Per90`` and
        ``Goal_Creating_Actions_Per90``. If any squad name starts with
        ``"vs "`` the table is treated as an opponent table: the prefix is
        removed and both stat columns get an ``_Against`` suffix.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Copy the selection so the Squad assignment below never touches a slice
    # of the caller's frame.
    cleaned = df[["Squad", "SCA90", "GCA90"]].copy()

    # Apply the descriptive names BEFORE adding the suffix; doing it the other
    # way round leaves the raw "SCA90_Against" labels because the rename map
    # no longer matches.
    cleaned = cleaned.rename(
        columns={
            "SCA90": "Shot_Creating_Actions_Per90",
            "GCA90": "Goal_Creating_Actions_Per90",
        }
    )

    # Only a leading "vs " marks an opponent table.
    is_against = cleaned["Squad"].astype(str).str.startswith("vs ").any()
    if is_against:
        cleaned["Squad"] = cleaned["Squad"].str.replace(r"^vs ", "", regex=True)
        cleaned = cleaned.rename(
            columns={c: c + "_Against" for c in cleaned.columns if c != "Squad"}
        )

    return cleaned

In [13]:
# ATTACKING FOR
# Scrape and clean each season's squad goal/shot-creation table (own numbers).
base_url_att_for = "https://fbref.com/en/comps/9/{season}/gca/Premier-League-Stats"
attrs_attacking_for = {"id": "stats_squads_gca_for"}

for season in seasons:
    raw_table = scrape_team_stats(base_url_att_for.format(season=season), attrs_attacking_for)
    # setdefault creates the per-season dict the first time a season is seen.
    alldata.setdefault(season, {})["attacking_for"] = clean_and_save_attacking_stats(raw_table)

In [14]:
# ATTACKING AGAINST
# Same page as the "for" table; only the table id differs (opponent numbers).
base_url_att_against = "https://fbref.com/en/comps/9/{season}/gca/Premier-League-Stats"
attrs_attacking_against = {"id": "stats_squads_gca_against"}

for season in seasons:
    raw_table = scrape_team_stats(base_url_att_against.format(season=season), attrs_attacking_against)
    # setdefault creates the per-season dict the first time a season is seen.
    alldata.setdefault(season, {})["attacking_against"] = clean_and_save_attacking_stats(raw_table)

In [15]:
# FINALLY MAKE A SQUAD STATS TABLE FOR EACH SEASON
# Merge every scraped table per season and write one CSV per season.
merge = merge_squad_player_stats_per_season(alldata)
for season, df in merge.items():
    path = f"../CSV_files/Season_{season}/squad_player_stats.csv"
    # to_csv does not create missing directories, so make sure the season
    # folder exists before writing.
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(path, index=False)