In [128]:
from bs4 import BeautifulSoup
import pandas as pd
import requests

In [130]:
# Wikipedia article whose results matrix is parsed further down.
url = 'https://en.wikipedia.org/wiki/2024%E2%80%9325_Premier_League'
# requests applies no timeout by default; without one a stalled connection
# would block the kernel indefinitely.
# NOTE(review): Wikimedia's User-Agent policy asks automated clients to
# identify themselves — consider passing headers={"User-Agent": ...}.
response = requests.get(url, timeout=30)

In [140]:
def parse_results_table_flat(table, *, numeric=False, strip_fc=False):
    """
    Flatten a home/away results matrix (a <table> Tag) into a list of matches.

    The first header row names the away teams (one <th> per column, after the
    leading corner cell). Each body row starts with a <th> naming the home
    team, followed by one <td> per away column. Team names are taken from the
    ``title`` attribute of the cell's first <a>, falling back to its text.

    Args:
        table: BeautifulSoup-style Tag whose ``name`` is ``"table"``.
        numeric: If True, emit integer ``home_goals`` / ``away_goals`` instead
            of a ``score`` string; cells that are not exactly
            ``<digits>-<digits>`` are skipped.
        strip_fc: If True, remove the substring ``" F.C."`` from team names.

    Returns:
        A flat list of dicts: ``{home, away, score}``, or
        ``{home, away, home_goals, away_goals}`` when ``numeric`` is True.
        Empty cells and dash-only (diagonal) cells are skipped. An empty list
        is returned when the table contains no rows.

    Raises:
        AssertionError: if ``table`` is not a <table> Tag.
    """
    assert getattr(table, "name", None) == "table", "Pass a <table> Tag"

    # En dash, em dash and the Unicode minus sign are all normalised to "-".
    dash_map = str.maketrans({"–": "-", "—": "-", "−": "-"})

    def clean_team(name: str) -> str:
        """Collapse whitespace runs and optionally drop the ' F.C.' suffix."""
        name = " ".join(name.split())
        if strip_fc:
            name = name.replace(" F.C.", "")
        return name

    def team_from(cell):
        """Return the team named by the cell's first link, or None if it has no link."""
        link = cell.find("a")
        if not link:
            return None
        return clean_team(link.get("title") or link.get_text(strip=True))

    # Header (away teams)
    thead = table.find("thead")
    header_tr = thead.find("tr") if thead else table.find("tr")
    if header_tr is None:
        return []

    # Keep one entry per header column (None for link-less cells) so that the
    # positional pairing with each row's <td> cells below stays aligned.
    away_teams = [team_from(th) for th in header_tr.find_all("th")[1:]]  # skip 'Home \\ Away'

    # Body rows
    tbody = table.find("tbody") or table
    rows = tbody.find_all("tr")
    if rows and rows[0] is header_tr:
        rows = rows[1:]

    matches = []
    for tr in rows:
        th = tr.find("th")
        home = team_from(th) if th else None
        if home is None:
            continue

        for away, td in zip(away_teams, tr.find_all("td")):
            if away is None:  # column whose header had no team link
                continue
            raw = td.get_text(" ", strip=True)
            if not raw:
                continue
            score_txt = raw.translate(dash_map)
            if score_txt == "-":  # diagonal blank
                continue

            if numeric:
                parts = [p.strip() for p in score_txt.split("-")]
                # isdecimal() (not isdigit()) so that only text int() can
                # actually parse gets through; e.g. "²" is a digit but not a
                # decimal and would make int() raise.
                if len(parts) != 2 or not all(p.isdecimal() for p in parts):
                    continue
                matches.append({
                    "home": home, "away": away,
                    "home_goals": int(parts[0]), "away_goals": int(parts[1])
                })
            else:
                matches.append({"home": home, "away": away, "score": score_txt})

    return matches


In [146]:
# Fail loudly on a bad response: otherwise `matches` would be undefined (or,
# worse, a stale value left in the kernel by an earlier run) further down.
if response.status_code != 200:
    raise RuntimeError(
        "Request for {} failed with HTTP {}".format(response.url, response.status_code)
    )

soup = BeautifulSoup(response.content, "html.parser")

# The results matrix is picked by position: it is the sixth <table> on the page.
# NOTE(review): this index depends on the current page layout — re-check it
# if the parsed output looks wrong.
RESULTS_TABLE_INDEX = 5
tables = soup.find_all("table")
if len(tables) <= RESULTS_TABLE_INDEX:
    raise RuntimeError(
        "Expected at least {} tables on the page, found {}".format(
            RESULTS_TABLE_INDEX + 1, len(tables)
        )
    )
result_table = tables[RESULTS_TABLE_INDEX]
matches = parse_results_table_flat(result_table)

df = pd.DataFrame(matches)
print(df.head())
df.to_csv("premier_league_results.csv", index=False, encoding="utf-8")



        

           home                         away score
0  Arsenal F.C.             Aston Villa F.C.   2-2
1  Arsenal F.C.              AFC Bournemouth   1-2
2  Arsenal F.C.               Brentford F.C.   1-1
3  Arsenal F.C.  Brighton & Hove Albion F.C.   1-1
4  Arsenal F.C.                 Chelsea F.C.   1-0
