In [1]:
# One-time setup: uncomment to install the scraper into this kernel's environment.
# %pip install cloudscraper

In [2]:
# IGNORE FUTURE WARNINGS
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
import cloudscraper
import pandas as pd

In [4]:
# SCRAPE A SEASON'S FIXTURES TABLE
def scrape_fixtures_table(url, attrs):
    """Download a page and return the first HTML table matching ``attrs``.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    attrs : dict
        HTML attributes identifying the wanted table; forwarded unchanged to
        ``pandas.read_html`` (the caller in this notebook passes
        ``{"id": <table id>}``).

    Returns
    -------
    pandas.DataFrame
        The first table on the page that matches ``attrs``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    ValueError
        Raised by ``pandas.read_html`` when no table matches ``attrs``.
    """
    from io import StringIO

    scraper = cloudscraper.create_scraper()
    response = scraper.get(url)

    # Fail here on an error/blocked response instead of letting the parser
    # report a misleading "no tables found" on the error page's HTML.
    response.raise_for_status()

    # Recent pandas versions deprecate passing literal HTML as a string;
    # a file-like wrapper is accepted by every version.
    tables = pd.read_html(StringIO(response.text), attrs=attrs)

    return tables[0]

In [5]:
# CLEAN AND SAVE A SEASON'S FIXTURES TABLE
def clean_and_save_fixtures(df, output_path):
    """Clean a scraped fixtures table, write it to CSV and return it.

    Steps, in order:

    1. Parse ``Date`` (unparseable values become ``NaT``) and keep only rows
       dated today or earlier; ``NaT`` rows are dropped by the comparison.
    2. If a ``Score`` column exists, split it into nullable-integer
       ``Home_Goals`` / ``Away_Goals`` columns. Rows without a parseable
       score get ``<NA>``.
    3. Drop the bookkeeping columns that are present (``Wk``, ``Day``,
       ``Time``, ``Score``, ``Attendance``, ``Venue``, ``Referee``,
       ``Match Report``, ``Notes``).
    4. Rename ``xG``/``xG.1``/``Home``/``Away`` to
       ``Home_xG``/``Away_xG``/``Home_Team``/``Away_team``.
    5. Create the parent directory of ``output_path`` if needed and write
       the result as CSV without the index.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw fixtures table; must contain a ``Date`` column. It is not
        modified.
    output_path : str or os.PathLike
        Destination CSV file.

    Returns
    -------
    pandas.DataFrame
        The cleaned table that was written to ``output_path``.

    Raises
    ------
    KeyError
        If ``df`` has no ``Date`` column.
    """
    import os
    from pathlib import Path

    # ``assign`` returns a new frame, so the caller's ``df`` keeps its
    # original ``Date`` column.
    played = df.assign(Date=pd.to_datetime(df["Date"], errors="coerce"))

    today = pd.Timestamp.today().normalize()
    # Comparisons against NaT are False, so rows with an unparseable date
    # are removed together with future fixtures.
    played = played[played["Date"] <= today].copy()

    if "Score" in played.columns:
        # "â€“" looks like a mis-decoded en dash; it is still normalised in
        # case the page text arrives with the wrong encoding.
        score = played["Score"].astype(str).str.replace("â€“", "-", regex=False)

        # Accept a plain hyphen as well as figure/en/em dashes and the minus
        # sign, so correctly decoded scores such as "2–1" are parsed too.
        goals = score.str.extract(
            r"(\d+)\s*[-\u2012\u2013\u2014\u2212]\s*(\d+)", expand=True
        )

        # Int64 (nullable) keeps goals integral while allowing <NA>.
        played["Home_Goals"] = pd.to_numeric(goals[0], errors="coerce").astype("Int64")
        played["Away_Goals"] = pd.to_numeric(goals[1], errors="coerce").astype("Int64")

    drop_cols = ["Wk", "Day", "Time", "Score", "Attendance", "Venue",
                 "Referee", "Match Report", "Notes"]
    played = played.drop(columns=[c for c in drop_cols if c in played.columns])

    # NOTE(review): "Away_team" is cased differently from "Home_Team"; kept
    # as-is because the saved CSV header may be relied on elsewhere.
    rename_cols = {"xG": "Home_xG", "xG.1": "Away_xG",
                   "Home": "Home_Team", "Away": "Away_team"}
    played = played.rename(columns=rename_cols)

    # Create the target folder on demand; skipped for non-path targets
    # (e.g. an open buffer), which ``to_csv`` also accepts.
    if isinstance(output_path, (str, os.PathLike)):
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    played.to_csv(output_path, index=False)
    return played

In [6]:
# SEASONS TO SCRAPE
# The schedule-page URL and the HTML id of the schedule table both follow a
# fixed pattern in the season label, so they are derived from the label
# instead of being copied by hand for every season.
SEASONS = [
    "2020-2021",
    "2021-2022",
    "2022-2023",
    "2023-2024",
    "2024-2025",
    "2025-2026",
]
SCHEDULE_URL = (
    "https://fbref.com/en/comps/9/{season}/schedule/"
    "{season}-Premier-League-Scores-and-Fixtures"
)
SCHEDULE_TABLE_ID = "sched_{season}_9_1"

# One (season, schedule URL, table id) tuple per season.
fixtures = [
    (
        season,
        SCHEDULE_URL.format(season=season),
        SCHEDULE_TABLE_ID.format(season=season),
    )
    for season in SEASONS
]


# SCRAPE, CLEAN AND SAVE EVERY SEASON
for season, url, table_id in fixtures:
    attrs = {"id": table_id}
    df = scrape_fixtures_table(url, attrs)
    output_path = f"../CSV_files/Season_{season}/{season.replace('-', '')}_fixtures.csv"

    clean_and_save_fixtures(df, output_path)
    print("Saved:", output_path)

Saved: ../CSV_files/Season_2020-2021/20202021_fixtures.csv
Saved: ../CSV_files/Season_2021-2022/20212022_fixtures.csv
Saved: ../CSV_files/Season_2022-2023/20222023_fixtures.csv
Saved: ../CSV_files/Season_2023-2024/20232024_fixtures.csv
Saved: ../CSV_files/Season_2024-2025/20242025_fixtures.csv
Saved: ../CSV_files/Season_2025-2026/20252026_fixtures.csv
