In [None]:
import time
from io import StringIO
from random import uniform

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
# Page the scrape starts from; each season's standings page links back to the previous one.
standings_url = "https://fbref.com" + "/en/comps/9/Premier-League-Stats"

In [None]:
# Season labels in scrape order, newest first: 2025 down to 2021.
years = list(reversed(range(2021, 2026)))
# Collects one DataFrame per scraped team and season.
all_matches = list()

In [None]:
# Loop through each season, newest first. A local cursor is walked back one
# season per iteration so the module-level `standings_url` is never
# overwritten: re-running this cell starts from the same page again rather
# than from wherever the previous run stopped (which would pair older seasons
# with the wrong `year` label). `all_matches` is still appended to, so reset
# it before a re-run.
season_url = standings_url
for year in years:
    # A timeout keeps a stalled connection from hanging the whole scrape.
    data = requests.get(season_url, timeout=30)
    data.raise_for_status()  # stop on an HTTP error status instead of parsing an error page
    soup = BeautifulSoup(data.text, 'html.parser')

    # Extract links to each team's page from the first stats table on the page
    standings_table = soup.select('table.stats_table')[0]
    hrefs = [anchor.get("href") for anchor in standings_table.find_all('a')]
    # Anchors without an href yield None, which must be skipped before the substring test
    team_urls = [f"https://fbref.com{href}" for href in hrefs if href and '/squads/' in href]

    # Find link to the previous season; the next iteration starts from it
    previous_season = soup.select("a.prev")[0].get("href")
    season_url = f"https://fbref.com{previous_season}"

    # Loop through each team
    for team_url in team_urls:
        # Last URL segment with the "-Stats" marker removed and hyphens turned into spaces
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
        data = requests.get(team_url, timeout=30)
        data.raise_for_status()
        # Wrap the markup in StringIO: passing a literal HTML string to read_html is deprecated
        matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")[0]
        soup = BeautifulSoup(data.text, 'html.parser')

        # Find relevant links for Shooting, Goalkeeping, Passing, etc.
        links = [anchor.get("href") for anchor in soup.find_all('a')]
        # Path fragment that identifies each stats page among the team page's links
        stat_fragments = {
            "Shooting": 'all_comps/shooting/',
            "Goalkeeping": 'all_comps/keeper/',
            "Passing": 'all_comps/passing/',
            "Goal and Shot Creation": 'all_comps/gca/',
            "Defensive Actions": 'all_comps/defense/',
        }
        links_dict = {
            stat_label: [href for href in links if href and fragment in href]
            for stat_label, fragment in stat_fragments.items()
        }

        # Create a function to load and merge data
        def load_stats_data(url, match):
            """Fetch one stats table and merge it into ``match`` on "Date".

            Columns already present in ``match`` keep their names; same-named
            columns coming from the stats table get a per-table suffix. With
            pandas' default ``_x``/``_y`` suffixes the existing columns are
            renamed instead, so a column such as "Comp" can no longer be looked
            up by name after the merge, and repeated merges collide on the
            suffixed names.

            Parameters
            ----------
            url : list of str
                Site-relative hrefs of the stats page. Only the first entry is
                requested; an empty (falsy) value means there is nothing to fetch.
            match : pandas.DataFrame
                Frame to extend. It is merged on its "Date" column.

            Returns
            -------
            pandas.DataFrame
                ``match`` inner-joined with the stats table, or ``match``
                unchanged when ``url`` is empty or the merge raises ``ValueError``.

            Raises
            ------
            requests.HTTPError
                If the stats page answers with an HTTP error status.

            Notes
            -----
            Column names that repeat inside one stats table once the top header
            level is dropped are left as they are.
            """
            if not url:
                return match

            href = url[0]
            # A timeout keeps a stalled connection from hanging the whole scrape.
            data = requests.get(f"https://fbref.com{href}", timeout=30)
            data.raise_for_status()
            # Wrap the markup in StringIO: passing a literal HTML string to read_html is deprecated
            stat_table = pd.read_html(StringIO(data.text))[0]

            # droplevel() raises ValueError on a single-level header, so only
            # strip the top level when there is one to strip.
            if stat_table.columns.nlevels > 1:
                stat_table.columns = stat_table.columns.droplevel()  # Drop multi-level column headers

            # Name the suffix after the path segment that follows "all_comps"
            # (e.g. "shooting") so every merged table gets its own suffix.
            segments = [part for part in href.split("/") if part]
            if "all_comps" in segments[:-1]:
                stat_name = segments[segments.index("all_comps") + 1]
            else:
                stat_name = "stats"

            try:
                return match.merge(stat_table, on="Date", suffixes=("", f"_{stat_name}"))
            except ValueError:
                # Best effort, as before: keep the data gathered so far if the merge is rejected.
                return match

        # Merge the Shooting, Goalkeeping, Passing, Goal and Shot Creation and
        # Defensive Actions tables, in that order
        for stat_label in (
            "Shooting",
            "Goalkeeping",
            "Passing",
            "Goal and Shot Creation",
            "Defensive Actions",
        ):
            matches = load_stats_data(links_dict[stat_label], matches)

        # Filter for Premier League matches only and add the season and team
        # columns. assign() returns a new frame, so no column is set on a
        # filtered slice of `matches` (which triggers SettingWithCopyWarning).
        team_data = matches[matches["Comp"] == "Premier League"].assign(
            Season=year,
            Team=team_name,
        )

        # Append the team's data to the list
        all_matches.append(team_data)

        # Pause to avoid overwhelming the server
        time.sleep(uniform(1, 180))  # Sleep for a random time between 1 and 180 seconds

# Stack every collected team frame row-wise into one DataFrame with a fresh index
final_df = pd.concat(objs=all_matches, axis=0, ignore_index=True)

In [None]:
# Write the combined data to disk without the row index column
final_df.to_csv(path_or_buf='dataset.csv', index=False)