In this notebook, I use the [Sports Reference](https://www.sports-reference.com) network, a collection of websites that provide statistical data, analysis, and historical insights across a range of sports. Specifically, I retrieve football data from FBref (fbref.com), the network's football site, while complying with the rate-limiting policy outlined on their [bot-traffic page](https://www.sports-reference.com/bot-traffic.html). The scraper below therefore carefully manages its request frequency to respect those guidelines while extracting statistical information for analysis.

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from typing import Union, List
import time
import logging

In [None]:
# Clear global logging handlers so records are not also emitted by the root logger
logging.getLogger().handlers.clear()
# Configure a logger for the main script
main_logger = logging.getLogger(__name__)
main_logger.setLevel(logging.INFO)
# logging.getLogger returns the same object on every call, so re-running this
# cell would otherwise stack one more StreamHandler per run and print every
# message several times. Start from a clean slate instead.
main_logger.handlers.clear()
handler = logging.StreamHandler()
formatter = logging.Formatter('[%(levelname)s] (%(name)s) : %(message)s')
handler.setFormatter(formatter)
main_logger.addHandler(handler)

# Create a class for scraping data
class Scraper:
    """
    Rate-limited scraper for fbref.com pages.

    Only one instance is meant to exist at a time so that every request goes
    through the same call counter; pass ``force_new_instance=True`` to
    deliberately create another one (e.g. after re-running a notebook cell).
    """
    instance_created = False

    # Age (in seconds) after which the call counter is reset. Slightly under a
    # minute, mirroring the original threshold.
    _RESET_AFTER_SECONDS = 58
    # Window (in seconds) waited out, measured from the last reset, once the
    # per-minute limit has been reached. Slightly over a minute for safety.
    _WAIT_WINDOW_SECONDS = 62

    def __init__(self, force_new_instance=False, *, max_calls_per_minute=9, request_timeout=30.0):
        """
        Create the scraper and initialise its rate-limiting state.

        Parameters:
        - force_new_instance (bool): Allow creation even if a Scraper already exists.
        - max_calls_per_minute (int): Maximum number of requests sent per counting window.
        - request_timeout (float): Seconds to wait for the server before giving up on a request.

        Raises:
        - RuntimeError: If an instance already exists and `force_new_instance` is False.
        - ValueError: If `max_calls_per_minute` is smaller than 1.
        """
        if Scraper.instance_created and not force_new_instance:
            raise RuntimeError("An instance of Scraper has already been created.")
        if max_calls_per_minute < 1:
            raise ValueError("max_calls_per_minute must be at least 1.")
        Scraper.instance_created = True

        self.start_time = time.time()  # Start time of the class instance
        self.call_count = 0  # Counter for the number of URL calls
        self.max_calls_per_minute = max_calls_per_minute  # Limit of calls allowed per minute
        self.request_timeout = request_timeout  # Per-request timeout in seconds
        self.last_reset_time = time.time()  # Last time the call counter was reset
        self.configure_logging()  # Configure logging
        self.logger.info("Scraper initialized.")


    def configure_logging(self):
        """Configures the logger for the Scraper class."""
        self.logger = logging.getLogger("Scrape")
        self.logger.setLevel(logging.INFO)
        # Only attach a handler when none is reachable yet (hasHandlers also
        # looks at ancestor loggers), to avoid duplicated output.
        if not self.logger.hasHandlers():
            handler = logging.StreamHandler()
            formatter = logging.Formatter('[%(levelname)s] (%(name)s.%(funcName)s) : %(message)s')
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)


    def _reset_call_count_if_needed(self):
        """Resets the call count if (almost) a minute has passed since the last reset."""
        current_time = time.time()
        if current_time - self.last_reset_time > self._RESET_AFTER_SECONDS:
            self.call_count = 0
            self.last_reset_time = current_time
            self.logger.info("Resetting call count.")


    def fetch_html_content(self, url: str) -> Union[bytes, None]:
        """
        Fetch HTML content from a given URL while respecting the per-minute call limit.

        If the limit has been reached, the call blocks until the current
        window has elapsed before sending the request.

        Parameters:
        - url (str): The URL to request.

        Returns:
        - bytes: Raw HTML content of the response.

        Raises:
        - SystemExit: If the server answers with an HTTP error status. A 429
          status gets a dedicated message; no retry is attempted.
        - requests.exceptions.RequestException: Connection problems and
          timeouts are not handled here and propagate to the caller.
        """
        self._reset_call_count_if_needed()

        # Wait if the maximum calls per minute limit is reached
        if self.call_count >= self.max_calls_per_minute:
            elapsed = time.time() - self.last_reset_time
            wait_time = max(0.0, self._WAIT_WINDOW_SECONDS - elapsed)
            self.logger.warning(f"Maximum call limit reached. Waiting for {wait_time:.2f} seconds.")
            time.sleep(wait_time)  # Wait until the limit resets
            self.call_count = 0  # Reset after waiting
            self.last_reset_time = time.time()

        # Count the attempt before sending it: a request that ends in an error
        # has still hit the server and must count towards the limit.
        self.call_count += 1
        self.logger.info(f"Fetching HTML content from {url}. Call count: {self.call_count}.")
        try:
            # A timeout prevents a stalled connection from hanging the whole scrape.
            response = requests.get(url, timeout=self.request_timeout)
            response.raise_for_status()  # Raises an error if the request fails
        except requests.exceptions.HTTPError as e:
            status_code = e.response.status_code if e.response is not None else None
            if status_code == 429:  # Too Many Requests error
                raise SystemExit("*** Request failed with 429 error -> session in jail for up to a day. ***") from e
            raise SystemExit(f"*** Request failed: {e} ***") from e
        return response.content


    def get_team_urls(self, response_content: bytes) -> List[str]:
        """
        Retrieves the URLs of the team pages linked from a season standings page.

        Parameters:
        - response_content (bytes): The HTML content of the season standings page.

        Returns:
        - list of str: Full URLs (prefixed with 'https://fbref.com') of every link
          containing '/squads/' in the first table with class 'stats_table',
          or an empty list if no such table exists.
        """
        soup = BeautifulSoup(response_content, 'html.parser')
        standings_table = soup.find('table', class_='stats_table')
        if not standings_table:
            self.logger.warning("Standings table not found on the page.")
            return []

        # Extract links related to teams
        hrefs = (a['href'] for a in standings_table.find_all('a', href=True))
        return [f"https://fbref.com{href}" for href in hrefs if '/squads/' in href]


    def extract_table_from_html(self, html_content: bytes, table_match: str) -> pd.DataFrame:
        """
        Extracts a table from HTML content based on a specific match string.

        Parameters:
        - html_content (bytes): HTML content.
        - table_match (str): The string to match the desired table.

        Returns:
        - pd.DataFrame: The first table matching `table_match`, or an empty
          DataFrame if the content is missing/empty or no table matches.
        """
        # Nothing to parse: behave like "table not found" instead of failing
        # inside pandas on None or empty input.
        if not html_content:
            self.logger.warning(f"No HTML content provided; table '{table_match}' cannot be extracted.")
            return pd.DataFrame()

        try:
            table = pd.read_html(html_content, match=table_match)[0]
        except ValueError:
            # pandas raises ValueError when no table matches
            self.logger.warning(f"Table matching '{table_match}' not found.")
            return pd.DataFrame()  # Return an empty DataFrame if no table matches
        self.logger.info(f"Found table '{table_match}'.")
        return table


    def get_shooting_data(self, response_content: bytes) -> pd.DataFrame:
        """
        Extracts shooting data for a team from the page linked on the team's page.

        Looks for the first link containing 'all_comps/shooting/' in the given
        HTML, downloads that page (one additional rate-limited request) and
        extracts its "Shooting" table.

        Parameters:
        - response_content (bytes): The HTML content of the team's page.

        Returns:
        - pd.DataFrame: The shooting table, or an empty DataFrame if the link
          or the table could not be found.
        """
        soup = BeautifulSoup(response_content, 'html.parser')
        # Extract shooting link related to the team
        shooting_links = [
            f"https://fbref.com{a['href']}"
            for a in soup.find_all('a', href=True)
            if 'all_comps/shooting/' in a['href']
        ]

        if not shooting_links:
            self.logger.warning("Shooting data link not found.")
            return pd.DataFrame()

        # Fetch and extract shooting data
        shooting_html = self.fetch_html_content(shooting_links[0])
        if not shooting_html:
            return pd.DataFrame()
        return self.extract_table_from_html(shooting_html, "Shooting")


In [None]:
# Single rate-limited scraper shared by the rest of the notebook
scraper = Scraper()

# Current Premier League standings page -> squad page URLs
standings_url = 'https://fbref.com/en/comps/9/Premier-League-Stats'
standings_html = scraper.fetch_html_content(standings_url)
team_urls = scraper.get_team_urls(standings_html)

# Use the first listed team as a worked example
team_url = team_urls[0]
team_html = scraper.fetch_html_content(team_url)
match_data = scraper.extract_table_from_html(team_html, "Scores & Fixtures")
shooting_data = scraper.get_shooting_data(team_html)

# Show the shape and the first rows of both tables
for heading, frame in (
    ("\n'match_data' shape:", match_data),
    ("\n'shooting_data' shape:", shooting_data),
):
    print(heading, frame.shape)
    display(frame.head(3))

[INFO] (Scrape.__init__) : Scraper initialized.
[INFO] (Scrape.fetch_html_content) : Fetching HTML content from https://fbref.com/en/comps/9/Premier-League-Stats. Call count: 1.
[INFO] (Scrape.fetch_html_content) : Fetching HTML content from https://fbref.com/en/squads/b8fd03ef/Manchester-City-Stats. Call count: 2.
[INFO] (Scrape.extract_table_from_html) : Found table 'Scores & Fixtures'.
[INFO] (Scrape.fetch_html_content) : Fetching HTML content from https://fbref.com/en/squads/b8fd03ef/2024-2025/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions. Call count: 3.
[INFO] (Scrape.extract_table_from_html) : Found table 'Shooting'.



'match_data' shape: (49, 20)


Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Opp Formation,Referee,Match Report,Notes
0,2024-08-10,15:00,FA Community Shield,FA Community Shield,Sat,Home,D,1 (7),1 (6),Manchester Utd,,,56.0,78146.0,Rúben Dias,4-3-3,4-2-3-1,Jarred Gillett,Match Report,Manchester City won on penalty kicks following...
1,2024-08-18,16:30,Premier League,Matchweek 1,Sun,Away,W,2,0,Chelsea,0.8,1.0,52.0,39818.0,Kevin De Bruyne,3-2-4-1,4-2-3-1,Anthony Taylor,Match Report,
2,2024-08-24,15:00,Premier League,Matchweek 2,Sat,Home,W,4,1,Ipswich Town,3.3,0.3,75.0,53147.0,Kevin De Bruyne,3-2-4-1,5-4-1,Samuel Allison,Match Report,



'shooting_data' shape: (15, 26)


Unnamed: 0_level_0,For Manchester City,For Manchester City,For Manchester City,For Manchester City,For Manchester City,For Manchester City,For Manchester City,For Manchester City,For Manchester City,For Manchester City,Standard,Standard,Standard,Standard,Standard,Standard,Standard,Standard,Standard,Standard,Expected,Expected,Expected,Expected,Expected,Unnamed: 25_level_0
Unnamed: 0_level_1,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,Gls,Sh,SoT,SoT%,G/Sh,G/SoT,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG,Match Report
0,2024-08-10,15:00,FA Community Shield,FA Community Shield,Sat,Home,D,1 (7),1 (6),Manchester Utd,1,9,1,11.1,0.11,1.0,,,0,0,,,,,,Match Report
1,2024-08-18,16:30,Premier League,Matchweek 1,Sun,Away,W,2,0,Chelsea,2,11,5,45.5,0.18,0.4,19.1,0.0,0,0,0.8,0.8,0.07,1.2,1.2,Match Report
2,2024-08-24,15:00,Premier League,Matchweek 2,Sat,Home,W,4,1,Ipswich Town,4,13,4,30.8,0.23,0.75,17.8,1.0,1,1,3.3,2.6,0.2,0.7,0.4,Match Report


We can observe two things from this dataset:


1.   `shooting_data` has two levels of column names; the outer level is not needed and can be removed.
2.   The shapes differ: `shooting_data` has fewer rows than `match_data` (15 vs. 49), so it doesn't include all matches.



In [None]:
# Remove the outer level of the column names.
# Guarded so the cell can be re-run: calling droplevel() on columns that are
# already single-level raises a ValueError.
if shooting_data.columns.nlevels > 1:
    shooting_data.columns = shooting_data.columns.droplevel()

shooting_data.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,Gls,Sh,SoT,SoT%,G/Sh,G/SoT,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG,Match Report
0,2024-08-10,15:00,FA Community Shield,FA Community Shield,Sat,Home,D,1 (7),1 (6),Manchester Utd,1,9,1,11.1,0.11,1.0,,,0,0,,,,,,Match Report
1,2024-08-18,16:30,Premier League,Matchweek 1,Sun,Away,W,2,0,Chelsea,2,11,5,45.5,0.18,0.4,19.1,0.0,0,0,0.8,0.8,0.07,1.2,1.2,Match Report
2,2024-08-24,15:00,Premier League,Matchweek 2,Sat,Home,W,4,1,Ipswich Town,4,13,4,30.8,0.23,0.75,17.8,1.0,1,1,3.3,2.6,0.2,0.7,0.4,Match Report
3,2024-08-31,17:30,Premier League,Matchweek 3,Sat,Away,W,3,1,West Ham,3,23,8,34.8,0.13,0.38,15.0,1.0,0,0,3.0,3.0,0.13,0.0,0.0,Match Report
4,2024-09-14,15:00,Premier League,Matchweek 4,Sat,Home,W,2,1,Brentford,2,18,7,38.9,0.11,0.29,17.2,0.0,0,0,2.1,2.1,0.12,-0.1,-0.1,Match Report


In [None]:
# Combine 'match_data' with 'shooting_data', maintaining only specific shooting stats.
# 'Date' is the join key; the remaining columns are the stats added to each match.
shooting_columns = [
    'Date', 'Sh', 'SoT', 'SoT%', 'G/Sh', 'G/SoT', 'Dist', 'FK', 'PK',
    'PKatt', 'npxG', 'npxG/Sh', 'G-xG', 'np:G-xG',
]
shooting_subset = shooting_data[shooting_columns]
team_data = match_data.merge(shooting_subset, on='Date')

print("'team_data' shape:", team_data.shape)

team_data.head()

'team_data' shape: (14, 33)


Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Opp Formation,Referee,Match Report,Notes,Sh,SoT,SoT%,G/Sh,G/SoT,Dist,FK,PK,PKatt,npxG,npxG/Sh,G-xG,np:G-xG
0,2024-08-10,15:00,FA Community Shield,FA Community Shield,Sat,Home,D,1 (7),1 (6),Manchester Utd,,,56.0,78146.0,Rúben Dias,4-3-3,4-2-3-1,Jarred Gillett,Match Report,Manchester City won on penalty kicks following...,9,1,11.1,0.11,1.0,,,0,0,,,,
1,2024-08-18,16:30,Premier League,Matchweek 1,Sun,Away,W,2,0,Chelsea,0.8,1.0,52.0,39818.0,Kevin De Bruyne,3-2-4-1,4-2-3-1,Anthony Taylor,Match Report,,11,5,45.5,0.18,0.4,19.1,0.0,0,0,0.8,0.07,1.2,1.2
2,2024-08-24,15:00,Premier League,Matchweek 2,Sat,Home,W,4,1,Ipswich Town,3.3,0.3,75.0,53147.0,Kevin De Bruyne,3-2-4-1,5-4-1,Samuel Allison,Match Report,,13,4,30.8,0.23,0.75,17.8,1.0,1,1,2.6,0.2,0.7,0.4
3,2024-08-31,17:30,Premier League,Matchweek 3,Sat,Away,W,3,1,West Ham,3.0,0.7,67.0,62469.0,Kevin De Bruyne,3-2-4-1,4-2-3-1,Michael Oliver,Match Report,,23,8,34.8,0.13,0.38,15.0,1.0,0,0,3.0,0.13,0.0,0.0
4,2024-09-14,15:00,Premier League,Matchweek 4,Sat,Home,W,2,1,Brentford,2.1,1.0,54.0,52148.0,Kyle Walker,4-2-3-1,5-3-2,Darren Bond,Match Report,,18,7,38.9,0.11,0.29,17.2,0.0,0,0,2.1,0.12,-0.1,-0.1


In [None]:
# Scrape match + shooting data for every team over several seasons, starting
# from the current standings page and following the "previous season" link.
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

years = list(range(2024, 2019, -1))
all_matches = []

# 'Date' is the join key; the remaining columns are the shooting stats kept
shooting_columns = ['Date', 'Sh', 'SoT', 'SoT%', 'G/Sh', 'G/SoT', 'Dist', 'FK', 'PK',
                    'PKatt', 'npxG', 'npxG/Sh', 'G-xG', 'np:G-xG']

for year in years:
    if standings_url is None:
        # The previous page had no usable "previous season" link
        main_logger.error(f"No previous-season link found; stopping before season {year}.")
        break

    main_logger.info(f"--- Season {year} ---")
    standings_html = scraper.fetch_html_content(standings_url)

    # Update standings_url with the previous season's link (None if the page has none)
    soup = BeautifulSoup(standings_html, 'html.parser')
    previous_link = soup.find("a", attrs={"class": "button2 prev"})
    previous_season = previous_link.get("href") if previous_link is not None else None
    standings_url = f"https://fbref.com{previous_season}" if previous_season else None

    # Find team URLs for the season
    team_urls = scraper.get_team_urls(standings_html)

    # Extract stats from each team
    for team_url in team_urls:
        # Save the team name from the URL
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
        main_logger.info(f"-- {team_name} --")

        # Get match stats of the team
        team_html = scraper.fetch_html_content(team_url)
        match_data = scraper.extract_table_from_html(team_html, "Scores & Fixtures")

        # Get shooting stats of the team
        shooting_data = scraper.get_shooting_data(team_html)

        # For some teams there aren't shooting stats. This check must come
        # before droplevel(): an empty frame has single-level columns and
        # droplevel() would raise a ValueError on it.
        if shooting_data.empty:
            main_logger.warning(f"{team_name} doesn't have shooting data in {year} season")
            continue

        # Remove the outer level of the column names when there is one
        if shooting_data.columns.nlevels > 1:
            shooting_data.columns = shooting_data.columns.droplevel()

        try:
            # Merge match data with shooting data
            team_data = match_data.merge(shooting_data[shooting_columns], on='Date')
        except Exception as e:
            # Best effort: a team whose tables cannot be combined (e.g. missing
            # columns) is logged and skipped instead of aborting the whole run
            main_logger.error(f"Error combining {team_name} 'match_data' and 'shooting_data' for {year} season: {e}")
            continue
        # Add team and season information to the data
        team_data["team"] = team_name
        team_data["season"] = year
        all_matches.append(team_data)  # Append the team data to the list
        main_logger.info(f"Extracted data for {team_name} during season {year}.")

    # Save checkpoint (pd.concat raises on an empty list, so skip it in that case)
    if all_matches:
        checkpoint_df = pd.concat(all_matches)
        checkpoint_df.to_csv(f"checkpoint_season{year}.csv", index=False)
    else:
        main_logger.warning(f"No data collected up to season {year}; checkpoint not written.")

# Fall back to an empty frame so 'match_df' is always defined
match_df = pd.concat(all_matches) if all_matches else pd.DataFrame()

match_df.to_csv(f"Matches_{max(years)}-{min(years)}.csv", index=False)

[INFO] (__main__) : --- Season 2024 ---
[INFO] (Scrape.fetch_html_content) : Fetching HTML content from https://fbref.com/en/comps/9/Premier-League-Stats. Call count: 4.
[INFO] (__main__) : -- Manchester City --
[INFO] (Scrape.fetch_html_content) : Fetching HTML content from https://fbref.com/en/squads/b8fd03ef/Manchester-City-Stats. Call count: 5.
[INFO] (Scrape.extract_table_from_html) : Found table 'Scores & Fixtures'.
[INFO] (Scrape.fetch_html_content) : Fetching HTML content from https://fbref.com/en/squads/b8fd03ef/2024-2025/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions. Call count: 6.
[INFO] (Scrape.extract_table_from_html) : Found table 'Shooting'.
[INFO] (__main__) : Extracted data for Manchester City during season 2024.
[INFO] (__main__) : -- Liverpool --
[INFO] (Scrape.fetch_html_content) : Fetching HTML content from https://fbref.com/en/squads/822bd0ba/Liverpool-Stats. Call count: 7.
[INFO] (Scrape.extract_table_from_html) : Found table 'Scores & 