In [3]:
import pandas as pd
match_df = pd.concat(all_matches) #Merge all data frames into a single data frame
match_df.columns = [c.lower() for c in match_df.columns] #make column names lowercase
match_df.to_csv('matches.csv')

In [1]:
import time
import random
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Example variables; adjust for your use case
years = [2021, 2020, 2019]  # or whatever years you need
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"  # starting URL
all_matches = []  # list to accumulate all match data

headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
}

for year in years:
    try:
        print(f"Requesting URL: {standings_url}")

        # Make the request without a proxy
        response = requests.get(standings_url, headers=headers)
        print(f"Status Code: {response.status_code}")

        # Handle rate limiting explicitly
        if response.status_code == 429:
            print("Rate-limited. Retrying after 5 minutes...")
            time.sleep(300)  # Wait 5 minutes before retrying
            continue

        # If not 200, skip this iteration
        if response.status_code != 200:
            print(f"Request failed with status {response.status_code}. Skipping year {year}.")
            continue

        # Parse the HTML
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the table(s)
        tables = soup.select('table.stats_table')
        if not tables:
            print("No standings table found! Skipping...")
            continue

        # Now it's safe to index [0]
        standings_table = tables[0]

        # -----------------------------------------
        # Process the standings table...
        # -----------------------------------------

        # Wait randomly between 10 and 20 seconds
        time.sleep(random.randint(10, 20))

        # Collect teams from the table
        links = [l.get("href") for l in standings_table.find_all('a')]
        links = [l for l in links if '/squads/' in l]
        team_urls = [f"https://fbref.com{l}" for l in links]

        # Find the 'previous season' URL
        # Make sure this link actually exists before accessing [0]
        prev_season_links = soup.select("a.prev")
        if not prev_season_links:
            print("No previous season link found. (Reached the earliest season?)")
            break

        previous_season = prev_season_links[0].get("href")
        standings_url = f"https://fbref.com{previous_season}"

        # For each team, grab the matches and shooting data
        for team_url in team_urls:
            team_name = team_url.split("/")[-1].replace("-Stats", "")
            print(f"  -> Fetching data for {team_name}...")

            # Request the team page
            try:
                data_response = requests.get(team_url, headers=headers)
                if data_response.status_code != 200:
                    print(f"    Request failed for {team_name}: {data_response.status_code}")
                    continue

                # Parse matches
                matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]

                # Now parse shooting page
                soup_team = BeautifulSoup(data_response.text, 'html.parser')
                shooting_links = [l.get("href") for l in soup_team.find_all('a') if l.get("href")]
                shooting_links = [l for l in shooting_links if 'all_comps/shooting/' in l]

                if not shooting_links:
                    print(f"    No shooting link found for {team_name}. Skipping.")
                    continue

                shooting_url = f"https://fbref.com{shooting_links[0]}"
                shooting_response = requests.get(shooting_url, headers=headers)
                if shooting_response.status_code != 200:
                    print(f"    Failed to get shooting data for {team_name}")
                    continue

                # Parse shooting table
                shooting = pd.read_html(shooting_response.text, match="Shooting")[0]
                # Drop the multi-level column if it exists
                shooting.columns = shooting.columns.droplevel()

                # Merge matches with shooting data on Date
                try:
                    team_data = matches.merge(
                        shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]],
                        on="Date"
                    )
                except ValueError:
                    print(f"    Merge failed for {team_name} due to mismatched columns. Skipping.")
                    continue

                # Filter to the competition you want
                team_data = team_data[team_data["Comp"] == "Premier League"]
                team_data["Season"] = year
                team_data["Team"] = team_name

                # Append to your master list
                all_matches.append(team_data)

                # Random delay between each team
                time.sleep(random.randint(10, 20))

            except Exception as e:
                print(f"  -> An error occurred fetching data for {team_name}: {e}")

    except Exception as e:
        print(f"An error occurred: {e}")
        continue

# After the loop, `all_matches` should contain all the data you collected.
if all_matches:
    final_df = pd.concat(all_matches, ignore_index=True)
    print("Final data shape:", final_df.shape)
    # final_df.to_csv("premier_league_data.csv", index=False)
else:
    print("No data collected.")

Requesting URL: https://fbref.com/en/comps/9/Premier-League-Stats
Status Code: 200
  -> Fetching data for Liverpool...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Arsenal...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Nottingham-Forest...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Chelsea...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Newcastle-United...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Manchester-City...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Bournemouth...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Aston-Villa...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Fulham...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Brighton-and-Hove-Albion...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Brentford...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Tottenham-Hotspur...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for West-Ham-United...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Manchester-United...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Crystal-Palace...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Everton...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Wolverhampton-Wanderers...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Ipswich-Town...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Leicester-City...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Southampton...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


Requesting URL: https://fbref.com/en/comps/9/2023-2024/2023-2024-Premier-League-Stats
Status Code: 200
  -> Fetching data for Manchester-City...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Arsenal...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Liverpool...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Aston-Villa...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Tottenham-Hotspur...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Chelsea...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Newcastle-United...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Manchester-United...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for West-Ham-United...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Crystal-Palace...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Brighton-and-Hove-Albion...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Bournemouth...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Fulham...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Wolverhampton-Wanderers...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Everton...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Brentford...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Nottingham-Forest...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Luton-Town...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Burnley...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Sheffield-United...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


Requesting URL: https://fbref.com/en/comps/9/2022-2023/2022-2023-Premier-League-Stats
Status Code: 200
  -> Fetching data for Manchester-City...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Arsenal...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Manchester-United...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Newcastle-United...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Liverpool...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Brighton-and-Hove-Albion...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Aston-Villa...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Tottenham-Hotspur...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Brentford...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Fulham...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Crystal-Palace...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Chelsea...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Wolverhampton-Wanderers...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for West-Ham-United...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Bournemouth...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Nottingham-Forest...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Everton...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Leicester-City...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Leeds-United...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


  -> Fetching data for Southampton...


  matches = pd.read_html(data_response.text, match="Scores & Fixtures")[0]
  shooting = pd.read_html(shooting_response.text, match="Shooting")[0]


Final data shape: (1912, 28)
