# Automating new match downloads

In [32]:
import pandas as pd
from bs4 import BeautifulSoup, Tag
import requests
from io import StringIO
import time
import re
import os
import numpy as np

### Obtaining match data from the website

In [33]:
class Match_data:
    """
    Scrape a season schedule page and its match reports into one DataFrame.

    The schedule page at ``url`` provides one gameweek value and one match
    report link per row; every linked match page is then visited to fill in
    the per-match columns listed in ``MATCH_COLUMNS``. The usual entry point
    is ``run()``.
    """

    # Browser-like headers sent with every HTTP request made by this class.
    REQUEST_HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",  # Preferred language
        "Accept-Encoding": "gzip, deflate, br",  # Compressed responses are accepted
        "Connection": "keep-alive",  # Keep the connection open between requests
        "Upgrade-Insecure-Requests": "1",  # The client prefers HTTPS
        "DNT": "1"  # Do-not-track (optional)
    }

    # Seconds to pause after every HTTP request (successful or not).
    REQUEST_DELAY = 6

    # Seconds to wait for the server before a request is abandoned.
    REQUEST_TIMEOUT = 30

    # Per-match columns filled in by get_statistics(), in output order.
    MATCH_COLUMNS = (
        'id', 'date_of_match', 'hour_of_the_match', 'home_team_name',
        'away_team_name', 'home_trainer', 'away_trainer', 'stadium',
        'attendance', 'referee', 'var', 'home_team_lineup',
        'away_team_lineup', 'home_possession', 'away_possession',
    )

    def __init__(self, url):
        """
        Initialize the class with the given URL.

        Args:
            url (str): The URL to scrape match data from.
        """
        self.url = url
        self.links = []  # Match report links found by the last get_links() call
        self.gameweeks = []  # Gameweek values found by the last get_gameweek() call

    def _fetch_soup(self, url):
        """
        Download ``url`` and return the parsed HTML.

        The pause happens once per request, whether it succeeded or failed,
        so error paths are throttled as well.

        Raises:
            requests.RequestException: On connection problems, timeouts or an
                HTTP error status.
        """
        try:
            response = requests.get(url, headers=self.REQUEST_HEADERS, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
        finally:
            time.sleep(self.REQUEST_DELAY)
        return BeautifulSoup(response.text, 'html.parser')

    @staticmethod
    def _follow(tag, *names):
        """Chain ``find_next(name)`` calls from ``tag``; return None as soon as one hop fails."""
        for name in names:
            if tag is None:
                return None
            tag = tag.find_next(name)
        return tag

    def get_links(self):
        """
        Get the match report links from the schedule page.

        ``self.links`` is rebuilt on every call, so calling this method more
        than once does not duplicate entries.

        Returns:
            pd.DataFrame: One-column DataFrame (``link``) with absolute URLs.
        """
        soup = self._fetch_soup(self.url)

        found = []
        for cell in soup.find_all('td', {'data-stat': 'match_report'}):
            # Cells without an anchor (or with an anchor lacking href) carry no link
            anchor = cell.find('a', href=True)
            if anchor:
                found.append(f"https://fbref.com{anchor['href']}")

        self.links = found
        return pd.DataFrame(self.links, columns=['link'])

    def get_gameweek(self):
        """
        Get the gameweek values from the schedule page.

        Header cells (text ``'Sem.'``) and empty cells are ignored.
        ``self.gameweeks`` is rebuilt on every call.

        Returns:
            pd.DataFrame: One-column DataFrame (``gameweek``).
        """
        soup = self._fetch_soup(self.url)

        found = []
        for cell in soup.find_all('th', {'data-stat': 'gameweek'}):
            value = cell.text.strip()
            if value in ('Sem.', ''):
                continue
            found.append(value)

        self.gameweeks = found
        return pd.DataFrame(self.gameweeks, columns=['gameweek'])

    def create_matches_csv(self):
        """
        Build the skeleton DataFrame that get_statistics() fills in.

        Despite its name this method writes no file. It fetches the links and
        gameweeks, then lays out the columns ``gameweek``, every entry of
        ``MATCH_COLUMNS`` (empty for now) and ``link``. The result is stored
        in ``self.df_final``.

        Returns:
            pd.DataFrame: The skeleton DataFrame, one row per schedule entry.
        """
        links = self.get_links()
        gameweeks = self.get_gameweek()

        # Rows are paired purely by position, so a length mismatch means the
        # gameweek of some rows may not belong to the link next to it.
        if len(links) != len(gameweeks):
            print(
                f"Warning: found {len(gameweeks)} gameweek rows but {len(links)} match links; "
                "rows may be misaligned."
            )

        # The placeholder columns must be created after the scrape so that they
        # cover every row that the two scraped frames contain.
        row_index = gameweeks.index.union(links.index)
        details = pd.DataFrame(index=row_index, columns=list(self.MATCH_COLUMNS), dtype=object)
        details['date_of_match'] = pd.to_datetime(
            details['date_of_match'], format='%Y-%m-%d', errors='coerce'
        )

        self.df_final = pd.concat([gameweeks, details, links], axis=1)
        return self.df_final

    def _parse_match_page(self, soup):
        """
        Extract the per-match fields from one parsed match report page.

        Every field defaults to ``"Unknown"``; a field that cannot be located
        keeps that default and never prevents the other fields from being read.

        Returns:
            dict: Column name -> value for every MATCH_COLUMNS entry except ``id``.
        """
        stats = {column: "Unknown" for column in self.MATCH_COLUMNS if column != 'id'}

        # Date and kick-off time are attributes of the 'venuetime' span
        venue_time = soup.find('span', {'class': 'venuetime'})
        if venue_time:
            stats['date_of_match'] = venue_time.get('data-venue-date', "Unknown")
            stats['hour_of_the_match'] = venue_time.get('data-venue-time', "Unknown")

        # Team names: the first 'teamandlogo' span is taken as home, the second as away
        teams = soup.find_all('span', class_='teamandlogo')
        if len(teams) >= 2:
            stats['home_team_name'] = teams[0].text.strip()
            stats['away_team_name'] = teams[1].text.strip()

        # Trainers: first matching datapoint is home, second is away, the rest are ignored
        trainers = [
            datapoint.text.split(':')[-1].strip()
            for datapoint in soup.find_all('div', class_='datapoint')
            if 'Director Técnico' in datapoint.text
        ]
        if len(trainers) >= 1:
            stats['home_trainer'] = trainers[0]
        if len(trainers) >= 2:
            stats['away_trainer'] = trainers[1]

        # Stadium, attendance and officials all live in the 'scorebox_meta' block
        meta = soup.find('div', class_='scorebox_meta')
        if meta:
            venue = self._follow(meta.find('strong', string="Sedes"), 'small', 'small')
            if venue is not None:
                stats['stadium'] = venue.text.strip()

            crowd = self._follow(meta.find('strong', string="Asistencia"), 'small', 'small')
            if crowd is not None:
                try:
                    # Thousands separators ('.' or ',') are removed before conversion
                    stats['attendance'] = int(crowd.text.strip().replace(',', '').replace('.', ''))
                except ValueError:
                    stats['attendance'] = None

            officials = self._follow(
                meta.find_next('strong', string="Autoridades"), 'small', 'small'
            )
            if officials is not None:
                referee_span = officials.find('span', style="display:inline-block")
                if referee_span is not None:
                    stats['referee'] = referee_span.text.strip()

                # The VAR official is read from the fifth <span> after the officials element
                var_span = self._follow(officials, 'span', 'span', 'span', 'span', 'span')
                if var_span is not None:
                    stats['var'] = var_span.text.strip()

        # Lineups: the text inside parentheses of the first two matching <th> cells
        formation_headers = soup.find_all(
            'th', string=lambda text: text and '(' in text and ')' in text
        )
        for column, header in zip(('home_team_lineup', 'away_team_lineup'), formation_headers):
            found = re.search(r'\((.*?)\)', header.text)
            if found:
                stats[column] = found.group(1)

        # Possession: the two <strong> values in the row after the possession header
        possession_header = soup.find('th', string="Posesión del balón")
        if possession_header:
            values_row = possession_header.find_next('tr')
            values = values_row.find_all('strong') if values_row is not None else []
            if len(values) == 2:
                stats['home_possession'] = values[0].text.strip().strip('%')
                stats['away_possession'] = values[1].text.strip().strip('%')
            else:
                print("Possession values not found.")
        else:
            print("Header 'Posesión del balón' not found.")

        return stats

    def get_statistics(self):
        """
        Visit every match link in ``self.df_final`` and fill in its columns.

        Requires ``create_matches_csv()`` to have been called first. A failure
        on one link is printed and that row is left unfilled; processing then
        continues with the next link. Rows without a link are skipped. The
        match date is stored as a timestamp (``NaT`` when unavailable) so the
        ``date_of_match`` column keeps a single type.

        Returns:
            pd.DataFrame: The filled DataFrame, also stored in ``self.df_matches``.
        """
        # Snapshot labels and links first; the frame is modified inside the loop
        pending = list(self.df_final['link'].items())

        match_id = 0

        for position, (row_label, link) in enumerate(pending, start=1):
            if not isinstance(link, str):
                print(f"Skipping row {position}: no match link.")
                continue

            try:
                print(f"Processing link {position}: {link}")
                soup = self._fetch_soup(link)
                stats = self._parse_match_page(soup)

                # IDs number the successfully downloaded matches, starting at 1
                match_id += 1
                stats['id'] = match_id
                stats['date_of_match'] = pd.to_datetime(
                    stats['date_of_match'], format='%Y-%m-%d', errors='coerce'
                )

                for column, value in stats.items():
                    self.df_final.at[row_label, column] = value

            except Exception as e:
                # Best effort: report the failure and move on to the next match
                print(f"Error processing link {link}: {e}")
                continue

        # Remove rows that are completely empty
        self.df_final = self.df_final.dropna(how='all')

        self.df_matches = self.df_final
        return self.df_matches

    def save_to_csv(self, season):
        """
        Save the processed data into a CSV file, with the season included in the filename.

        The file is written to the current working directory as
        ``matches_<season>.csv``. Requires ``get_statistics()`` to have run.

        Args:
            season (str): The season to be included in the filename.
        """
        filename = f'matches_{season}.csv'

        self.df_matches.to_csv(filename, index=False)

        print(f"File saved as {filename}")

    def run(self, url=None):
        """
        Execute the full process: build the skeleton, fill in statistics, save the CSV.

        Args:
            url (str, optional): Text to read the season (``YYYY-YYYY``) from.
                Defaults to ``self.url``. Scraping itself always uses ``self.url``.
        """
        print("Starting collecting matches data...")

        # Step 1: Extract the season from the URL
        season_source = self.url if url is None else url
        season_match = re.search(r'(\d{4})-(\d{4})', season_source)
        season = season_match.group(0) if season_match else 'unknown_season'

        # Steps 2-4: Fetch links and gameweeks and build the skeleton DataFrame
        self.create_matches_csv()

        # Step 5: Visit every match page and fill in its columns
        self.get_statistics()

        # Step 6: Save the processed data into a CSV file with the season name in the filename
        self.save_to_csv(season)

        print("Collecting matches data process completed successfully!")


### Obtaining players and keeper data from the website

In [34]:
class Players_data:
    def __init__(self, url, gameweek):
        """
        Initializes the MultiTableExtractor with a URL and gameweek.
        - url: URL of the page to scrape.
        - gameweek: The gameweek number to associate with the data.
        """
        self.url = url  # Store the URL to scrape
        self.gameweek = gameweek  # Store the gameweek number
        self.soup = None  # Initialize BeautifulSoup object as None
        self.teams_data = {}  # Dictionary to store team information

    def fetch_page(self):
        """
        Download ``self.url`` and store the parsed document in ``self.soup``.

        The method pauses for six seconds after a successful download.

        Raises:
            Exception: If the response status code is anything other than 200.
        """
        # Browser-like headers sent with the GET request
        browser_headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",  # Preferred language
            "Accept-Encoding": "gzip, deflate, br",  # Compressed responses are accepted
            "Connection": "keep-alive",  # Keep the connection open
            "Upgrade-Insecure-Requests": "1",  # The client prefers HTTPS
            "DNT": "1"  # Do-not-track (optional)
        }
        page = requests.get(self.url, headers=browser_headers)

        # Only a plain 200 counts as success
        status = page.status_code
        if status != 200:
            raise Exception(f"Error accessing the page: {status}")

        # Parse the body bytes with the built-in HTML parser
        self.soup = BeautifulSoup(page.content, "html.parser")

        # Pause so the server is not overloaded by consecutive requests
        time.sleep(6)

    def extract_teams_ids(self):
        """
        Extracts the IDs and names of the home and away teams using team logos.
        Raises an exception if less than two team logos are found.
        """
        self.fetch_page()  # Fetch the web page content

        # Find all team logos in the page (identified by the 'teamlogo' class)
        team_imgs = self.soup.find_all('img', class_='teamlogo', src=True)

        # Check if at least two team logos are found
        if len(team_imgs) >= 2:
            # Extract team IDs and names from the logos
            self.teams_data = {
                "home": {
                    "id": team_imgs[0]['src'].split('/')[-1].split('.')[0],  # Extract ID from the image URL
                    "name": team_imgs[0]['alt'].replace(" Club Crest", "").replace(" ", "_"),  # Extract team name from the alt attribute
                },
                "away": {
                    "id": team_imgs[1]['src'].split('/')[-1].split('.')[0],
                    "name": team_imgs[1]['alt'].replace(" Club Crest", "").replace(" ", "_"),  
                },
            }
        else:
            # Raise an error if less than two team logos are found
            raise Exception("Not enough team logos found.")

    def extract_players_table(self, team_type, table_type, header_offset, columns_to_drop):
        """
        Extracts a specific player statistics table for the given team and table type.
        - team_type: "home" or "away".
        - table_type: Type of the table (e.g., "summary", "passing").
        - header_offset: Number of header columns to skip.
        - columns_to_drop: List of columns to drop from the table.
        """
        # Get the team ID based on the team type (home or away)
        team_id = self.teams_data[team_type]["id"]

        # Construct the CSS selector for the specific table
        players_table_selector = f"#div_stats_{team_id}_{table_type}"

        # Select the table element using the constructed selector
        table = self.soup.select_one(players_table_selector)

        # Check if the table exists
        if not table:
            raise Exception(f"Payers table {table_type} not found for team {team_type}.")

        # Extract headers from the table, skipping the specified number of columns
        headers = [th.text.strip() for th in table.find("thead").find_all("th")][header_offset:]

        # Extract rows of data from the table body
        rows = [
            [cell.text.strip() for cell in row.find_all(["td", "th"])]
            for row in table.find("tbody").find_all("tr")
        ]

        # Create a DataFrame from the extracted rows and headers
        df = pd.DataFrame(rows, columns=headers)

        # Drop unnecessary columns specified in the columns_to_drop list
        df = df.loc[:, ~df.columns.isin(columns_to_drop)]

        return df

        # Add a delay to avoid overloading the server
        time.sleep(6)

    def process_players_data(self, team_type):
        """
        Processes all player statistics tables for a specific team (home or away).
        Combines data from multiple table types into a dictionary of DataFrames.
        """
        # Define columns to drop for each table type
        columns_to_drop = {
            "summary": ['Ass', 'TP', 'TPint', 'TA', 'TR', 'Toques', 'Tkl', 'Int', 'Bloqueos', 'xG', 'npxG', 'xAG', 'ACT', 'ACG', 'Cmp', 'Int.', '% Cmp', 'PrgP', 'Transportes', 'PrgC', 'Att', 'Succ'],
            "passing": ['Jugador', 'núm.', 'País', 'Posc', 'Edad', 'Mín'],
            "passing_types": ['Jugador', 'núm.', 'País', 'Posc', 'Edad', 'Mín', 'Int.', 'Cmp'],
            "defense": ['Jugador', 'núm.', 'País', 'Posc', 'Edad', 'Mín'],
            "possession": ['Jugador', 'núm.', 'País', 'Posc', 'Edad', 'Mín', 'Tkld', 'Tkld%'],
            "misc": ['Jugador', 'núm.', 'País', 'Posc', 'Edad', 'Mín', 'Pcz', 'PA', 'Int', 'TklG', 'GC'],
        }

        # Initialize an empty dictionary to store DataFrames for each table type
        tables = {}

        # Loop through each table type and extract its data
        for table_type, header_offset in [
            ("summary", 7), ("passing", 9), ("passing_types", 4),
            ("defense", 5), ("possession", 5), ("misc", 3)
        ]:
            tables[table_type] = self.extract_players_table(
                team_type, table_type, header_offset, columns_to_drop.get(table_type, [])
            )

        return tables

    def save_players_tables(self, match, season):
        """
        Processes and saves player statistics tables for both home and away teams.
        Combines data from all table types and writes the final table to a CSV file.
        """
        # Extract IDs and names of the teams
        self.extract_teams_ids()

        # Process data for both home and away teams
        for team_type in ["home", "away"]:
            # Get the team name
            team_name = self.teams_data[team_type]["name"]

            # Extract and process all player statistics tables for the team
            team_tables = self.process_players_data(team_type)

            # Combine all extracted tables into a single DataFrame
            final_table = pd.concat(team_tables.values(), axis=1)

            # Define the new column names for the dataset with the team_type prefix
            new_columns = [
                f"{team_type}_Players", f"{team_type}_Number", f"{team_type}_Nationality", f"{team_type}_Position", 
                f"{team_type}_PlayersAge", f"{team_type}_PlayersMinutes", f"{team_type}_PlayersGoals", 
                f"{team_type}_PlayersShots", f"{team_type}_PlayersShotsOnTarget", f"{team_type}_PlayersCompletedPasses", 
                f"{team_type}_PlayersAttemptedPasses", f"{team_type}_Players%CompletedPasses", 
                f"{team_type}_PlayersDistancePasses", f"{team_type}_PlayersDistanceProgression", 
                f"{team_type}_PlayersShortPasses", f"{team_type}_PlayersAttemptedShortPasses", 
                f"{team_type}_Players%ShortCompletedPasses", f"{team_type}_PlayersMediumPasses", 
                f"{team_type}_PlayersAttemptedMediumPasses", f"{team_type}_Players%MediumCompletedPasses", 
                f"{team_type}_PlayersLongPasses", f"{team_type}_PlayersAttemptedLongPasses", 
                f"{team_type}_Players%LongCompletedPasses", f"{team_type}_PlayersAssistance", 
                f"{team_type}_PlayersExpectedGoalsAssistance", f"{team_type}_PlayersExpectedAssistance", 
                f"{team_type}_PlayersKeyPasses", f"{team_type}_PlayersLast1/3Passes", f"{team_type}_PlayersGoalAreaPasses", 
                f"{team_type}_PlayersGoalAreaCrosses", f"{team_type}_PlayersGoalPasses", f"{team_type}_PlayersLiveBallPasses", 
                f"{team_type}_PlayersDeadBallPasses", f"{team_type}_PlayersFreeKick", f"{team_type}_PlayersLongPasses", 
                f"{team_type}_PlayersSidePasses", f"{team_type}_PlayersCrosses", f"{team_type}_PlayersStrongcrosses", 
                f"{team_type}_PlayersCorner", f"{team_type}_PlayersCornerIn", f"{team_type}_PlayersCornerOut", 
                f"{team_type}_PlayersCornerRect", f"{team_type}_PlayersOffsidePasses", f"{team_type}_PlayersPassesBlocked", 
                f"{team_type}_PlayersTackles", f"{team_type}_PlayersSuccessfulTackles", f"{team_type}_PlayersTacklesInDefense", 
                f"{team_type}_PlayersTacklesInMedium", f"{team_type}_PlayersTacklesInAttack", f"{team_type}_PlayersDribblerTackles", 
                f"{team_type}_PlayersAttemptedDribblerTackles", f"{team_type}_Players%DribblerTacklesCompleted", 
                f"{team_type}_PlayersDribblerTacklesNonCompleted", f"{team_type}_PlayersBallsBlocked", 
                f"{team_type}_PlayersShotsBlocked", f"{team_type}_PlayersPassesBlocked", f"{team_type}_PlayersInterceptions", 
                f"{team_type}_PlayersTackles+Interceptions", f"{team_type}_PlayersClearances", f"{team_type}_PlayersMistakesRivalShots", 
                f"{team_type}_PlayersTouches", f"{team_type}_PlayersOwnPenaltyAreaTouches", f"{team_type}_PlayersTouchesInDefense", 
                f"{team_type}_PlayersTouchesInMedium", f"{team_type}_PlayersTouchesInAttack", 
                f"{team_type}_PlayersAwayPenaltyAreaTouches", f"{team_type}_PlayersLiveBallTouches", 
                f"{team_type}_PlayersAttemptedDribbles", f"{team_type}_PlayersDribblesCompleted", 
                f"{team_type}_Players%DribblesCompleted", f"{team_type}_PlayersBallCarries", f"{team_type}_PlayersDistanceCarried", 
                f"{team_type}_PlayersForwardDistanceCarried", f"{team_type}_PlayersForwardCarries", 
                f"{team_type}_PlayersCarriesInAttack", f"{team_type}_PlayersAwayPenaltyAreaCarries", 
                f"{team_type}_PlayersLostControlCarries", f"{team_type}_PlayersLostCarries", f"{team_type}_PlayersPassesReception", 
                f"{team_type}_PlayersAttackPassesReception", f"{team_type}_PlayersYellowCards", f"{team_type}_PlayersRedCards", 
                f"{team_type}_PlayersSecondYellowCards", f"{team_type}_PlayersFouls", f"{team_type}_PlayersOffside", 
                f"{team_type}_PlayersPenalties", f"{team_type}_PlayersPenaltiesConceded", f"{team_type}_PlayersLostBallRecoveries", 
                f"{team_type}_PlayersAerialsWon", f"{team_type}_PlayersAerialsLost", f"{team_type}_Players%AerialsWon"
            ]

            # Rename the columns of the DataFrame
            final_table.columns = new_columns

            # Convert the 'Age' column to integer by extracting the first two characters
            final_table[f'{team_type}_PlayersAge'] = final_table[f'{team_type}_PlayersAge'].apply(
                lambda x: int(x[:2]) if isinstance(x, str) else 0
            )

            # Define columns to calculate the mean
            columns_to_mean = [
                f"{team_type}_PlayersAge", f"{team_type}_Players%CompletedPasses", 
                f"{team_type}_Players%ShortCompletedPasses", f"{team_type}_Players%MediumCompletedPasses", 
                f"{team_type}_Players%LongCompletedPasses", f"{team_type}_Players%DribblerTacklesCompleted", 
                f"{team_type}_Players%DribblesCompleted", f"{team_type}_Players%AerialsWon"
            ]

            # Define columns to calculate the sum
            columns_to_sum = [
                f"{team_type}_PlayersMinutes", f"{team_type}_PlayersGoals", f"{team_type}_PlayersShots", 
                f"{team_type}_PlayersShotsOnTarget", f"{team_type}_PlayersCompletedPasses", 
                f"{team_type}_PlayersAttemptedPasses", f"{team_type}_PlayersDistancePasses", 
                f"{team_type}_PlayersDistanceProgression", f"{team_type}_PlayersShortPasses", 
                f"{team_type}_PlayersAttemptedShortPasses", f"{team_type}_PlayersMediumPasses", 
                f"{team_type}_PlayersAttemptedMediumPasses", f"{team_type}_PlayersLongPasses", 
                f"{team_type}_PlayersAttemptedLongPasses", f"{team_type}_PlayersAssistance", 
                f"{team_type}_PlayersExpectedGoalsAssistance", f"{team_type}_PlayersExpectedAssistance", 
                f"{team_type}_PlayersKeyPasses", f"{team_type}_PlayersLast1/3Passes", f"{team_type}_PlayersGoalAreaPasses", 
                f"{team_type}_PlayersGoalAreaCrosses", f"{team_type}_PlayersGoalPasses", f"{team_type}_PlayersLiveBallPasses", 
                f"{team_type}_PlayersDeadBallPasses", f"{team_type}_PlayersFreeKick", f"{team_type}_PlayersLongPasses", 
                f"{team_type}_PlayersSidePasses", f"{team_type}_PlayersCrosses", f"{team_type}_PlayersStrongcrosses", 
                f"{team_type}_PlayersCorner", f"{team_type}_PlayersCornerIn", f"{team_type}_PlayersCornerOut", 
                f"{team_type}_PlayersCornerRect", f"{team_type}_PlayersOffsidePasses", f"{team_type}_PlayersPassesBlocked", 
                f"{team_type}_PlayersTackles", f"{team_type}_PlayersSuccessfulTackles", f"{team_type}_PlayersTacklesInDefense", 
                f"{team_type}_PlayersTacklesInMedium", f"{team_type}_PlayersTacklesInAttack", f"{team_type}_PlayersDribblerTackles", 
                f"{team_type}_PlayersAttemptedDribblerTackles", f"{team_type}_PlayersDribblerTacklesNonCompleted", 
                f"{team_type}_PlayersBallsBlocked", f"{team_type}_PlayersShotsBlocked", f"{team_type}_PlayersPassesBlocked", 
                f"{team_type}_PlayersInterceptions", f"{team_type}_PlayersTackles+Interceptions", f"{team_type}_PlayersClearances", 
                f"{team_type}_PlayersMistakesRivalShots", f"{team_type}_PlayersTouches", f"{team_type}_PlayersOwnPenaltyAreaTouches", 
                f"{team_type}_PlayersTouchesInDefense", f"{team_type}_PlayersTouchesInMedium", f"{team_type}_PlayersTouchesInAttack", 
                f"{team_type}_PlayersAwayPenaltyAreaTouches", f"{team_type}_PlayersLiveBallTouches", f"{team_type}_PlayersAttemptedDribbles", 
                f"{team_type}_PlayersDribblesCompleted", f"{team_type}_PlayersBallCarries", f"{team_type}_PlayersDistanceCarried", 
                f"{team_type}_PlayersForwardDistanceCarried", f"{team_type}_PlayersForwardCarries", f"{team_type}_PlayersCarriesInAttack", 
                f"{team_type}_PlayersAwayPenaltyAreaCarries", f"{team_type}_PlayersLostControlCarries", f"{team_type}_PlayersLostCarries", 
                f"{team_type}_PlayersPassesReception", f"{team_type}_PlayersAttackPassesReception", f"{team_type}_PlayersYellowCards", 
                f"{team_type}_PlayersRedCards", f"{team_type}_PlayersSecondYellowCards", f"{team_type}_PlayersFouls", 
                f"{team_type}_PlayersOffside", f"{team_type}_PlayersPenalties", f"{team_type}_PlayersPenaltiesConceded", 
                f"{team_type}_PlayersLostBallRecoveries", f"{team_type}_PlayersAerialsWon", f"{team_type}_PlayersAerialsLost"
            ]

            # Convert the mean columns to numeric
            for col in columns_to_mean:
                final_table[col] = final_table[col].apply(pd.to_numeric, errors='coerce')

            # Convert the sum columns to numeric
            for col in columns_to_sum:
                final_table[col] = final_table[col].apply(pd.to_numeric, errors='coerce')

            # Calculate the mean and sum for specified columns
            mean_values = final_table[columns_to_mean].mean()
            sum_values = final_table[columns_to_sum].sum()

            # Create a new row for totals with placeholder values
            total_row = {col: '-' for col in final_table.columns}

            # Populate the total row with mean values
            for col, mean in mean_values.items():
                total_row[col] = mean

            # Populate the total row with sum values
            for col, total in sum_values.items():
                total_row[col] = total

            # Add the number of rows (lines) to the first column of the total row
            num_lines = len(final_table)
            total_row[final_table.columns[0]] = num_lines

            # Check if the 'id' column exists, if not, create it with NaN values
            if 'id' not in final_table.columns:
                final_table['id'] = np.nan  # Create the column with NaN values

            # Add the match ID to the total row
            total_row['id'] = match

            # Append the total row to the DataFrame
            final_table.loc[len(final_table)] = total_row

            # Save the combined table to a CSV file
            #output_filename = f"{self.gameweek}_{match}_{team_name}_{team_type}_players_table.csv"
            #final_table.to_csv(output_filename, index=False)

            # Define the columns to append to the existing CSV
            columns_to_append = [
                f"{team_type}_Players", 
                f"{team_type}_PlayersAge", f"{team_type}_PlayersMinutes", f"{team_type}_PlayersGoals", 
                f"{team_type}_PlayersShots", f"{team_type}_PlayersShotsOnTarget", f"{team_type}_PlayersCompletedPasses", 
                f"{team_type}_PlayersAttemptedPasses", f"{team_type}_Players%CompletedPasses", 
                f"{team_type}_PlayersDistancePasses", f"{team_type}_PlayersDistanceProgression", 
                f"{team_type}_PlayersShortPasses", f"{team_type}_PlayersAttemptedShortPasses", 
                f"{team_type}_Players%ShortCompletedPasses", f"{team_type}_PlayersMediumPasses", 
                f"{team_type}_PlayersAttemptedMediumPasses", f"{team_type}_Players%MediumCompletedPasses", 
                f"{team_type}_PlayersLongPasses", f"{team_type}_PlayersAttemptedLongPasses", 
                f"{team_type}_Players%LongCompletedPasses", f"{team_type}_PlayersAssistance", 
                f"{team_type}_PlayersExpectedGoalsAssistance", f"{team_type}_PlayersExpectedAssistance", 
                f"{team_type}_PlayersKeyPasses", f"{team_type}_PlayersLast1/3Passes", f"{team_type}_PlayersGoalAreaPasses", 
                f"{team_type}_PlayersGoalAreaCrosses", f"{team_type}_PlayersGoalPasses", f"{team_type}_PlayersLiveBallPasses", 
                f"{team_type}_PlayersDeadBallPasses", f"{team_type}_PlayersFreeKick", f"{team_type}_PlayersLongPasses", 
                f"{team_type}_PlayersSidePasses", f"{team_type}_PlayersCrosses", f"{team_type}_PlayersStrongcrosses", 
                f"{team_type}_PlayersCorner", f"{team_type}_PlayersCornerIn", f"{team_type}_PlayersCornerOut", 
                f"{team_type}_PlayersCornerRect", f"{team_type}_PlayersOffsidePasses", f"{team_type}_PlayersPassesBlocked", 
                f"{team_type}_PlayersTackles", f"{team_type}_PlayersSuccessfulTackles", f"{team_type}_PlayersTacklesInDefense", 
                f"{team_type}_PlayersTacklesInMedium", f"{team_type}_PlayersTacklesInAttack", f"{team_type}_PlayersDribblerTackles", 
                f"{team_type}_PlayersAttemptedDribblerTackles", f"{team_type}_Players%DribblerTacklesCompleted", 
                f"{team_type}_PlayersDribblerTacklesNonCompleted", f"{team_type}_PlayersBallsBlocked", 
                f"{team_type}_PlayersShotsBlocked", f"{team_type}_PlayersPassesBlocked", f"{team_type}_PlayersInterceptions", 
                f"{team_type}_PlayersTackles+Interceptions", f"{team_type}_PlayersClearances", f"{team_type}_PlayersMistakesRivalShots", 
                f"{team_type}_PlayersTouches", f"{team_type}_PlayersOwnPenaltyAreaTouches", f"{team_type}_PlayersTouchesInDefense", 
                f"{team_type}_PlayersTouchesInMedium", f"{team_type}_PlayersTouchesInAttack", 
                f"{team_type}_PlayersAwayPenaltyAreaTouches", f"{team_type}_PlayersLiveBallTouches", 
                f"{team_type}_PlayersAttemptedDribbles", f"{team_type}_PlayersDribblesCompleted", 
                f"{team_type}_Players%DribblesCompleted", f"{team_type}_PlayersBallCarries", f"{team_type}_PlayersDistanceCarried", 
                f"{team_type}_PlayersForwardDistanceCarried", f"{team_type}_PlayersForwardCarries", 
                f"{team_type}_PlayersCarriesInAttack", f"{team_type}_PlayersAwayPenaltyAreaCarries", 
                f"{team_type}_PlayersLostControlCarries", f"{team_type}_PlayersLostCarries", f"{team_type}_PlayersPassesReception", 
                f"{team_type}_PlayersAttackPassesReception", f"{team_type}_PlayersYellowCards", f"{team_type}_PlayersRedCards", 
                f"{team_type}_PlayersSecondYellowCards", f"{team_type}_PlayersFouls", f"{team_type}_PlayersOffside", 
                f"{team_type}_PlayersPenalties", f"{team_type}_PlayersPenaltiesConceded", f"{team_type}_PlayersLostBallRecoveries", 
                f"{team_type}_PlayersAerialsWon", f"{team_type}_PlayersAerialsLost", f"{team_type}_Players%AerialsWon"
            ]

            # Extract the last row from final_table (this contains the sums and means)
            last_row = final_table.iloc[-1][columns_to_append]

            #Define the filename for the current season
            filename = f"matches_{season}.csv"

            # Load the existing CSV file from the parent directory
            existing_df = pd.read_csv(filename)

            # Get the ID from the last row of final_table
            last_row_id = final_table.iloc[-1]['id']

            # Find the row in the existing CSV based on the ID
            row_index = existing_df[existing_df['id'] == last_row_id].index

            # Check if the row exists
            if row_index.empty:
                raise ValueError(f"Error: No row with ID {last_row_id} found in {filename}")

            # Check if the columns exist in the existing CSV, if not, create them after the existing columns
            for column in columns_to_append:
                if column not in existing_df.columns:
                    existing_df[column] = pd.NA  # Create the column with missing values

            # Ensure the new columns are placed after the existing columns
            existing_columns = existing_df.columns.tolist()
            new_columns = [column for column in columns_to_append if column not in existing_columns]
            existing_df = existing_df[existing_columns + new_columns]

            # Update the row with the new data from last_row
            for column, value in last_row.items():
                existing_df.at[row_index[0], column] = value

            # Save the updated DataFrame back to the CSV
            existing_df.to_csv(filename, index=False)

            time.sleep(6)  # Sleep to avoid making too many requests in a short time
    
    def extract_keeper_table(self, team_type, header_offset):
        """
        Extract the keeper statistics table of one team as a DataFrame.

        The table is looked up in the already parsed page (``self.soup``) with
        the selector ``#keeper_stats_<team id>``, where the team id is read from
        ``self.teams_data``. Only ``self.soup`` is queried here; this method does
        not issue an HTTP request itself.

        Args:
            team_type (str): "home" or "away"; key into ``self.teams_data``.
            header_offset (int): Number of leading ``th`` cells of the table
                head to drop before the remaining ones are used as column names.

        Returns:
            pd.DataFrame: One row per ``tr`` of the table body, every cell as a
            whitespace-stripped string.

        Raises:
            Exception: If the keeper table, its head or its body cannot be found.
            ValueError: Raised by pandas when the number of remaining headers
                does not match the number of cells per row.
        """
        # Get the team ID based on the team type (home or away)
        team_id = self.teams_data[team_type]["id"]

        # Select the table element through its id-based CSS selector
        table = self.soup.select_one(f"#keeper_stats_{team_id}")
        if table is None:
            raise Exception(f"Keeper table not found for team {team_type}.")

        thead = table.find("thead")
        tbody = table.find("tbody")
        if thead is None or tbody is None:
            # Fail with a readable message instead of an AttributeError on None
            raise Exception(f"Keeper table for team {team_type} has no header or body.")

        # All `th` cells of the head are flattened in document order and the
        # first `header_offset` of them are dropped (presumably the cells of a
        # grouping header row -- confirm against the page layout).
        headers = [th.text.strip() for th in thead.find_all("th")][header_offset:]

        # Each body row contributes its `td` and `th` cells, in document order
        rows = [
            [cell.text.strip() for cell in row.find_all(["td", "th"])]
            for row in tbody.find_all("tr")
        ]

        return pd.DataFrame(rows, columns=headers)

    def process_keeper_data(self, team_type):
        """
        Return the keeper statistics table of one team (home or away).

        Thin wrapper around ``extract_keeper_table``: the table is requested
        with a fixed header offset of 7 and handed back unchanged, i.e. a
        single DataFrame is returned.
        """
        # Number of leading header cells skipped for the keeper table
        keeper_header_offset = 7
        return self.extract_keeper_table(team_type, header_offset=keeper_header_offset)

    def save_keeper_tables(self, match, season):
        """
        Aggregate the keeper statistics of both teams into the season CSV.

        For the home and then the away team, the keeper table is extracted and
        its columns are renamed to ``<team_type>_Keepers<Stat>``. A totals row is
        appended to that table: the mean of the rate/distance columns, the sum
        of the count columns, and the number of keeper rows in the first column.
        The totals (every column except nationality and age) are then written
        into the row of ``matches_<season>.csv`` whose ``id`` equals ``match``.
        Columns missing from the CSV are appended to it. The CSV is read and
        written once per team, so the home totals are persisted even if the
        away team fails afterwards.

        Args:
            match: Match identifier, looked up in the ``id`` column of the CSV.
            season (str): Season label used to build the CSV filename.

        Raises:
            ValueError: If the CSV has no row with the given id, or (raised by
                pandas) if the extracted table does not have 24 columns.
        """
        # Column suffixes in the order in which they appear in the keeper table
        table_suffixes = (
            "Keepers", "Nationality", "Age", "Minutes", "ShotsOnTargetAgainst",
            "GoalsAgainst", "Saved", "%Saved", "xG", "PassesLaunched",
            "AttemptedPassesLaunched", "%CompletedPassesLaunched", "Passes",
            "AttemptedPasses", "%CompletedPasses", "PassesDistance",
            "AttemptedKicks", "%Kicks", "KicksDistance", "Crosses",
            "CrossesStopped", "%CrossesStopped", "ActionsOutsideArea",
            "DistanceActionsArea",
        )
        # Columns aggregated with the mean
        mean_suffixes = (
            "Age", "%Saved", "%CompletedPassesLaunched", "%CompletedPasses",
            "PassesDistance", "%Kicks", "KicksDistance", "%CrossesStopped",
            "DistanceActionsArea",
        )
        # Columns aggregated with the sum. "Keepers" holds names: it is coerced
        # to NaN below and its total is replaced by the number of keeper rows.
        sum_suffixes = (
            "Keepers", "Minutes", "ShotsOnTargetAgainst", "GoalsAgainst", "Saved",
            "xG", "PassesLaunched", "AttemptedPassesLaunched", "Passes",
            "AttemptedPasses", "AttemptedKicks", "Crosses", "CrossesStopped",
            "ActionsOutsideArea",
        )
        # Nationality and age are not exported to the season CSV
        not_exported = ("Nationality", "Age")

        # Extract IDs and names of the teams
        self.extract_teams_ids()

        # Season file, resolved against the current working directory
        filename = f"matches_{season}.csv"

        for team_type in ["home", "away"]:
            prefix = f"{team_type}_Keepers"
            columns_to_mean = [prefix + suffix for suffix in mean_suffixes]
            columns_to_sum = [prefix + suffix for suffix in sum_suffixes]
            columns_to_append = [
                prefix + suffix for suffix in table_suffixes if suffix not in not_exported
            ]

            # Extract the keeper table and give its columns their dataset names
            final_table = self.process_keeper_data(team_type)
            final_table.columns = [prefix + suffix for suffix in table_suffixes]

            # Keep the first two characters of the age as an integer. Anything
            # that is not a string starting with two decimal digits becomes 0
            # instead of raising ValueError (e.g. an empty age cell).
            age_column = f"{prefix}Age"
            final_table[age_column] = final_table[age_column].apply(
                lambda x: int(x[:2]) if isinstance(x, str) and x[:2].isdecimal() else 0
            )

            # Convert every aggregated column to numeric; unparsable cells become NaN
            for col in columns_to_mean + columns_to_sum:
                final_table[col] = pd.to_numeric(final_table[col], errors='coerce')

            # Calculate the mean and sum for the specified columns
            mean_values = final_table[columns_to_mean].mean()
            sum_values = final_table[columns_to_sum].sum()

            # Build the totals row; '-' remains only in non-aggregated columns
            total_row = {col: '-' for col in final_table.columns}
            total_row.update(mean_values.items())
            total_row.update(sum_values.items())

            # The first column of the totals row carries the number of keeper rows
            total_row[final_table.columns[0]] = len(final_table)

            # The renamed table never has an 'id' column: create it empty and
            # store the match id in the totals row only.
            final_table['id'] = np.nan
            total_row['id'] = match

            # Append the totals row to the DataFrame
            final_table.loc[len(final_table)] = total_row

            # Read the totals back from the table (sums, means and match id)
            totals = final_table.iloc[-1]
            last_row = totals[columns_to_append]
            last_row_id = totals['id']

            # Load the season CSV and locate the row of this match
            existing_df = pd.read_csv(filename)
            row_index = existing_df[existing_df['id'] == last_row_id].index
            if row_index.empty:
                raise ValueError(f"Error: No row with ID {last_row_id} found in {filename}")

            # Columns missing from the CSV are created (at the end) with missing values
            for column in columns_to_append:
                if column not in existing_df.columns:
                    existing_df[column] = pd.NA

            # Write the totals into the first row matching the id
            for column, value in last_row.items():
                existing_df.at[row_index[0], column] = value

            # Save the updated DataFrame back to the CSV
            existing_df.to_csv(filename, index=False)

            # Pause between teams (presumably rate limiting for the scraped site)
            time.sleep(6)

    def run(self, url, links_file, gameweeks_file):
        """
        Collect player and keeper statistics for every match listed in the input files.

        Matches are numbered from 1 in file order. For each link a new
        ``Players_data`` extractor is created and asked to save the players
        tables and then the keeper tables, followed by a 6 second pause.

        Args:
            url (str): League URL; only used to derive the season label, i.e.
                the first ``YYYY-YYYY`` pattern found in it ('unknown_season'
                when there is none).
            links_file: CSV file with a 'link' column (one match page per row).
            gameweeks_file: CSV file with a 'gameweek' column, row-aligned with
                ``links_file``.
        """
        print("Starting collecting players data...")

        # Extract the season from the URL, with a fallback label
        season_match = re.search(r'(\d{4})-(\d{4})', url)
        season = season_match.group(0) if season_match else 'unknown_season'

        # Read the links and gameweeks from CSV files
        links_df = pd.read_csv(links_file)
        gameweeks_df = pd.read_csv(gameweeks_file)

        # Walk the links in file order; the match id is the 1-based position
        for index, link in enumerate(links_df['link']):
            match = index + 1
            gameweek = gameweeks_df.iloc[index]['gameweek']

            print(f"Processing link {match}: {link}")

            # Create an extractor for the current URL and gameweek
            extractor = Players_data(link, gameweek)

            # Save the players tables, then the keepers tables, of this match
            extractor.save_players_tables(match, season)
            extractor.save_keeper_tables(match, season)

            # Pause before moving on to the next match page
            time.sleep(6)

        print("Collecting players data process completed successfully!")

### Obtaining time data from the match

In [35]:
class Match_events:
    """
    Scrape the events listed on a match page, separately for each team.

    Typical use is ``fetch_html()`` -> ``parse_events()`` -> ``save_to_csv()``;
    ``run()`` performs these steps for every link of a links file.
    Every parsed event is a dict with the keys 'Minute', 'EventType',
    'Player' and 'Team'.
    """

    # Seconds to wait for the server before a request is abandoned
    REQUEST_TIMEOUT = 30

    # Browser-like request headers sent with every page request
    REQUEST_HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",  # Preferred language
        "Accept-Encoding": "gzip, deflate, br",  # Compressed responses are accepted
        "Connection": "keep-alive",  # Keep the connection open for efficiency
        "Upgrade-Insecure-Requests": "1",  # The client prefers HTTPS
        "DNT": "1"  # Do-not-track request (optional)
    }

    def __init__(self, url):
        """
        Initialize the extractor with the URL of the match page.

        Args:
            url (str): URL of the match page.
        """
        self.url = url  # URL of the match page
        self.soup = None  # Parsed HTML content, set by fetch_html()
        self.events_team_a = []  # Events of Team A, set by parse_events()
        self.events_team_b = []  # Events of Team B, set by parse_events()

    def fetch_html(self):
        """
        Download the match page and store its parsed HTML in ``self.soup``.

        Raises:
            Exception: If the response status code is not 200.
            requests.exceptions.RequestException: On connection errors or when
                the server does not answer within ``REQUEST_TIMEOUT`` seconds.
        """
        response = requests.get(
            self.url, headers=self.REQUEST_HEADERS, timeout=self.REQUEST_TIMEOUT
        )
        if response.status_code != 200:
            raise Exception(f"Failed to fetch HTML content. Status code: {response.status_code}")
        self.soup = BeautifulSoup(response.text, 'html.parser')  # Parse the HTML content

    @staticmethod
    def _parse_event(event):
        """Build the event dict (minute, type, player, team) for one event element."""
        minute_tag = event.find('small')
        event_icon = event.find('div', class_='event_icon')
        player_tag = event.find('a')
        team_logo = event.find('img', class_='teamlogo')

        # The event type is the second CSS class of the icon; an icon with a
        # single class yields no type instead of an IndexError.
        icon_classes = event_icon['class'] if event_icon is not None else []
        event_type = icon_classes[1] if len(icon_classes) > 1 else None

        # The team name is the logo's alt text without the " Club Crest"
        # suffix and with spaces replaced by underscores.
        team = None
        if team_logo is not None:
            team = team_logo['alt'].replace(" Club Crest", "").replace(" ", "_")

        return {
            'Minute': minute_tag.text.strip() if minute_tag is not None else None,
            'EventType': event_type,
            'Player': player_tag.text.strip() if player_tag is not None else None,
            'Team': team
        }

    def parse_events(self):
        """
        Extract the events of both teams from the parsed HTML.

        Events are read from the ``div`` with id 'events_wrap': elements with
        class 'event a' belong to Team A, those with class 'event b' to Team B.
        The event lists are rebuilt on every call, so calling this method twice
        does not duplicate events.

        Raises:
            Exception: If the HTML has not been loaded or the event container
                is missing.
        """
        # Ensure that the HTML content has been loaded
        if self.soup is None:
            raise Exception("HTML content not loaded. Call 'fetch_html()' first.")

        # Locate the main container that holds all the events
        events_wrap = self.soup.find('div', id='events_wrap')
        if not events_wrap:
            raise Exception("Event container not found in the HTML.")

        self.events_team_a = [
            self._parse_event(event)
            for event in events_wrap.find_all('div', class_='event a')
        ]
        self.events_team_b = [
            self._parse_event(event)
            for event in events_wrap.find_all('div', class_='event b')
        ]

    def save_to_csv(self, match, gameweek, output_dir="."):
        """
        Save the parsed events of each team to its own CSV file.

        The files are named ``<gameweek>_<match>_<team>_events.csv``, where the
        team name is taken from the first event of the respective team.

        Args:
            match: Match identifier used in the filenames.
            gameweek: Gameweek identifier used in the filenames.
            output_dir (str): Directory the files are written to; defaults to
                the current working directory.

        Raises:
            Exception: If either team has no events (e.g. ``parse_events()``
                has not been called yet).
        """
        if not self.events_team_a:
            raise Exception("No events for Team A to save. Make sure to call 'parse_events()' first.")
        if not self.events_team_b:
            raise Exception("No events for Team B to save. Make sure to call 'parse_events()' first.")

        for events in (self.events_team_a, self.events_team_b):
            team_name = events[0]['Team']
            output_filename = os.path.join(
                output_dir, f"{gameweek}_{match}_{team_name}_events.csv"
            )
            pd.DataFrame(events).to_csv(output_filename, index=False)

    def run(self, links_file, gameweeks_file):
        """
        Fetch, parse and save the events of every match listed in the input files.

        Matches are numbered from 1 in file order; a new extractor is created
        per link and a 6 second pause follows every match.

        Args:
            links_file: CSV file with a 'link' column (one match page per row).
            gameweeks_file: CSV file with a 'gameweek' column, row-aligned with
                ``links_file``.
        """
        print("Starting collecting events data...")

        # Read the links and gameweeks from CSV files
        links_df = pd.read_csv(links_file)
        gameweeks_df = pd.read_csv(gameweeks_file)

        # Walk the links in file order; the match id is the 1-based position
        for index, link in enumerate(links_df['link']):
            match = index + 1
            gameweek = gameweeks_df.iloc[index]['gameweek']

            print(f"Processing link {match}: {link}")

            # Fetch, parse and save the events of the current match page
            extractor = Match_events(link)
            extractor.fetch_html()
            extractor.parse_events()
            extractor.save_to_csv(match, gameweek)

            # Pause before requesting the next match page
            time.sleep(6)

        print("Collecting events data process completed successfully!")

### Full process

In [36]:
def run_full_process(
    league_url='https://fbref.com/es/comps/12/2022-2023/horario/Marcadores-y-partidos-de-2022-2023-La-Liga',
):
    """
    Runs the full data extraction process for a given league URL.

    Steps:
        1. Run ``Match_data`` on the league URL.
        2. Read ``matches_<season>.csv`` from the current working directory, where
           ``<season>`` is the first ``YYYY-YYYY`` pattern found in the URL
           (``unknown_season`` when there is none).
        3. Write the 'link' and 'gameweek' columns to two temporary CSV files and
           run ``Players_data`` on them.
        4. Remove the temporary files, even when step 3 fails.

    The match-events step is currently disabled (kept as a comment below).

    Args:
        league_url (str): Schedule page of the league season to process. Defaults to
            the La Liga 2022-2023 schedule page.
    """
    # First class: extract match data from the league URL
    match_data_extractor = Match_data(league_url)
    match_data_extractor.run(league_url)

    # Second class: extract player data for each match.
    # The season (e.g. '2022-2023') is taken from the URL to build the matches filename.
    season_match = re.search(r'(\d{4})-(\d{4})', league_url)
    season = season_match.group(0) if season_match else 'unknown_season'

    # Matches file for the current season, resolved against the current working directory
    # (presumably produced by Match_data.run above - confirm)
    filename = f"matches_{season}.csv"
    data = pd.read_csv(filename)

    # Temporary CSV files handed to the extractors
    links_file = 'links_temp.csv'
    gameweeks_file = 'gameweeks_temp.csv'

    try:
        # Save only the 'link' column from the match data to the links file
        data[['link']].to_csv(links_file, index=False)

        # Save only the 'gameweek' column from the match data to the gameweeks file
        data[['gameweek']].to_csv(gameweeks_file, index=False)

        # Extract player data using the links and gameweeks files
        players_data_extractor = Players_data(league_url, None)
        players_data_extractor.run(league_url, links_file, gameweeks_file)

        # Third class: extract match events data (currently disabled)
        # match_events_extractor = Match_events(league_url)
        # match_events_extractor.run(links_file, filename)
    finally:
        # Always clean up the temporary files, including when extraction raised;
        # a file may be missing if the failure happened before it was written.
        for temp_file in (links_file, gameweeks_file):
            if os.path.exists(temp_file):
                os.remove(temp_file)

In [None]:
# Run the whole extraction pipeline using run_full_process's built-in league URL.
run_full_process()

Starting collecting players data...
Processing link 1: https://fbref.com/es/partidos/1edcbf7a/Osasuna-Sevilla-Agosto-12-2022-La-Liga
Processing link 2: https://fbref.com/es/partidos/73b529f2/Celta-Vigo-Espanyol-Agosto-13-2022-La-Liga
Processing link 3: https://fbref.com/es/partidos/be7e4222/Valladolid-Villarreal-Agosto-13-2022-La-Liga
Processing link 4: https://fbref.com/es/partidos/cbe0a303/Barcelona-Rayo-Vallecano-Agosto-13-2022-La-Liga
Processing link 5: https://fbref.com/es/partidos/7fbde755/Cadiz-Real-Sociedad-Agosto-14-2022-La-Liga
Processing link 6: https://fbref.com/es/partidos/aaab7ab9/Valencia-Girona-Agosto-14-2022-La-Liga
Processing link 7: https://fbref.com/es/partidos/c1e42359/Almeria-Real-Madrid-Agosto-14-2022-La-Liga
Processing link 8: https://fbref.com/es/partidos/ab058eb9/Athletic-Club-Mallorca-Agosto-15-2022-La-Liga
Processing link 9: https://fbref.com/es/partidos/9993b01a/Getafe-Atletico-Madrid-Agosto-15-2022-La-Liga
Processing link 10: https://fbref.com/es/partidos/

#### Code to check the website's response status

In [None]:
# Disabled diagnostic snippet: it is wrapped in a string literal, so nothing below is executed.
# When enabled, it requests a single fbref match page up to `max_retries` times, sleeping
# `wait_time` seconds after each HTTP 403 and stopping at the first non-403 response.
'''# URL que estás intentando acceder
url = "https://fbref.com/es/partidos/33737218/Almeria-Real-Madrid-Agosto-19-2023-La-Liga"

# Número máximo de intentos
max_retries = 5

# Tiempo de espera entre intentos (en segundos)
wait_time = 10

# Intentar acceder a la página
for attempt in range(max_retries):
    response = requests.get(url)
    
    # Si la respuesta es 403, esperar antes de reintentar
    if response.status_code == 403:
        print(f"Acceso prohibido. Esperando {wait_time} segundos antes de reintentar...")
        time.sleep(wait_time)
    else:
        print(f"Respuesta recibida: {response.status_code}")
        break'''

Respuesta recibida: 200


#### Code to manually patch match data in the local CSV

In [27]:
# Disabled manual-patch snippet: it is wrapped in a string literal, so nothing below is executed.
# When enabled, it loads matches_2023-2024.csv, looks up the row whose 'id' equals target_id,
# writes new_values into columns_to_modify pairwise (zip), and saves the CSV back in place.
# NOTE(review): zip() stops at the shorter list, so a length mismatch between the two lists
# would leave trailing columns untouched without any error - confirm both lists stay aligned.
'''# Leer el archivo CSV
df = pd.read_csv('matches_2023-2024.csv')

# Especifica el valor de la columna 'id' que quieres buscar
target_id = 351

# Encuentra la fila correspondiente al valor de 'id'
row_index = df[df['id'] == target_id].index

if not row_index.empty:
    # Toma el índice de la fila encontrada
    row_index = row_index[0]

    # Especifica las columnas que quieres modificar y los nuevos valores
    columns_to_modify = [
        'home_Players', 'home_PlayersAge', 'home_PlayersMinutes', 'home_PlayersGoals', 'home_PlayersShots', 
        'home_PlayersShotsOnTarget', 'home_PlayersCompletedPasses', 'home_PlayersAttemptedPasses', 
        'home_Players%CompletedPasses', 'home_PlayersDistancePasses', 'home_PlayersDistanceProgression', 
        'home_PlayersShortPasses', 'home_PlayersAttemptedShortPasses', 'home_Players%ShortCompletedPasses', 
        'home_PlayersMediumPasses', 'home_PlayersAttemptedMediumPasses', 'home_Players%MediumCompletedPasses', 
        'home_PlayersLongPasses', 'home_PlayersAttemptedLongPasses', 'home_Players%LongCompletedPasses', 
        'home_PlayersAssistance', 'home_PlayersExpectedGoalsAssistance', 'home_PlayersExpectedAssistance', 
        'home_PlayersKeyPasses', 'home_PlayersLast1/3Passes', 'home_PlayersGoalAreaPasses', 'home_PlayersGoalAreaCrosses', 
        'home_PlayersGoalPasses', 'home_PlayersLiveBallPasses', 'home_PlayersDeadBallPasses', 'home_PlayersFreeKick', 
        'home_PlayersSidePasses', 'home_PlayersCrosses', 'home_PlayersStrongcrosses', 'home_PlayersCorner', 
        'home_PlayersCornerIn', 'home_PlayersCornerOut', 'home_PlayersCornerRect', 'home_PlayersOffsidePasses', 
        'home_PlayersPassesBlocked', 'home_PlayersTackles', 'home_PlayersSuccessfulTackles', 'home_PlayersTacklesInDefense', 
        'home_PlayersTacklesInMedium', 'home_PlayersTacklesInAttack', 'home_PlayersDribblerTackles', 
        'home_PlayersAttemptedDribblerTackles', 'home_Players%DribblerTacklesCompleted', 'home_PlayersDribblerTacklesNonCompleted', 
        'home_PlayersBallsBlocked', 'home_PlayersShotsBlocked', 'home_PlayersInterceptions', 'home_PlayersTackles+Interceptions', 
        'home_PlayersClearances', 'home_PlayersMistakesRivalShots', 'home_PlayersTouches', 'home_PlayersOwnPenaltyAreaTouches', 
        'home_PlayersTouchesInDefense', 'home_PlayersTouchesInMedium', 'home_PlayersTouchesInAttack', 
        'home_PlayersAwayPenaltyAreaTouches', 'home_PlayersLiveBallTouches', 'home_PlayersAttemptedDribbles', 
        'home_PlayersDribblesCompleted', 'home_Players%DribblesCompleted', 'home_PlayersBallCarries', 
        'home_PlayersDistanceCarried', 'home_PlayersForwardDistanceCarried', 'home_PlayersForwardCarries', 
        'home_PlayersCarriesInAttack', 'home_PlayersAwayPenaltyAreaCarries', 'home_PlayersLostControlCarries', 
        'home_PlayersLostCarries', 'home_PlayersPassesReception', 'home_PlayersAttackPassesReception', 
        'home_PlayersYellowCards', 'home_PlayersRedCards', 'home_PlayersSecondYellowCards', 'home_PlayersFouls', 
        'home_PlayersOffside', 'home_PlayersPenalties', 'home_PlayersPenaltiesConceded', 'home_PlayersLostBallRecoveries', 
        'home_PlayersAerialsWon', 'home_PlayersAerialsLost', 'home_Players%AerialsWon', 'away_Players', 'away_PlayersAge', 
        'away_PlayersMinutes', 'away_PlayersGoals', 'away_PlayersShots', 'away_PlayersShotsOnTarget', 
        'away_PlayersCompletedPasses', 'away_PlayersAttemptedPasses', 'away_Players%CompletedPasses', 
        'away_PlayersDistancePasses', 'away_PlayersDistanceProgression', 'away_PlayersShortPasses', 
        'away_PlayersAttemptedShortPasses', 'away_Players%ShortCompletedPasses', 'away_PlayersMediumPasses', 
        'away_PlayersAttemptedMediumPasses', 'away_Players%MediumCompletedPasses', 'away_PlayersLongPasses', 
        'away_PlayersAttemptedLongPasses', 'away_Players%LongCompletedPasses', 'away_PlayersAssistance', 
        'away_PlayersExpectedGoalsAssistance', 'away_PlayersExpectedAssistance', 'away_PlayersKeyPasses', 
        'away_PlayersLast1/3Passes', 'away_PlayersGoalAreaPasses', 'away_PlayersGoalAreaCrosses', 'away_PlayersGoalPasses', 
        'away_PlayersLiveBallPasses', 'away_PlayersDeadBallPasses', 'away_PlayersFreeKick', 'away_PlayersSidePasses', 
        'away_PlayersCrosses', 'away_PlayersStrongcrosses', 'away_PlayersCorner', 'away_PlayersCornerIn', 
        'away_PlayersCornerOut', 'away_PlayersCornerRect', 'away_PlayersOffsidePasses', 'away_PlayersPassesBlocked', 
        'away_PlayersTackles', 'away_PlayersSuccessfulTackles', 'away_PlayersTacklesInDefense', 'away_PlayersTacklesInMedium', 
        'away_PlayersTacklesInAttack', 'away_PlayersDribblerTackles', 'away_PlayersAttemptedDribblerTackles', 
        'away_Players%DribblerTacklesCompleted', 'away_PlayersDribblerTacklesNonCompleted', 'away_PlayersBallsBlocked', 
        'away_PlayersShotsBlocked', 'away_PlayersInterceptions', 'away_PlayersTackles+Interceptions', 'away_PlayersClearances', 
        'away_PlayersMistakesRivalShots', 'away_PlayersTouches', 'away_PlayersOwnPenaltyAreaTouches', 
        'away_PlayersTouchesInDefense', 'away_PlayersTouchesInMedium', 'away_PlayersTouchesInAttack', 
        'away_PlayersAwayPenaltyAreaTouches', 'away_PlayersLiveBallTouches', 'away_PlayersAttemptedDribbles', 
        'away_PlayersDribblesCompleted', 'away_Players%DribblesCompleted', 'away_PlayersBallCarries', 
        'away_PlayersDistanceCarried', 'away_PlayersForwardDistanceCarried', 'away_PlayersForwardCarries', 
        'away_PlayersCarriesInAttack', 'away_PlayersAwayPenaltyAreaCarries', 'away_PlayersLostControlCarries', 
        'away_PlayersLostCarries', 'away_PlayersPassesReception', 'away_PlayersAttackPassesReception', 'away_PlayersYellowCards', 
        'away_PlayersRedCards', 'away_PlayersSecondYellowCards', 'away_PlayersFouls', 'away_PlayersOffside', 'away_PlayersPenalties', 
        'away_PlayersPenaltiesConceded', 'away_PlayersLostBallRecoveries', 'away_PlayersAerialsWon', 'away_PlayersAerialsLost', 
        'away_Players%AerialsWon'
    ]

    new_values = [
        '16.0','27.188','990.0','1.0','10.0','4.0','371.0','506.0','73.3','6341.0','2310.0','187.0','217.0','86.2','141.0','181.0','77.9','36.0','80.0','45.0','0.0','0.2','0.5','4.0','24.0','6.0','1.0','27.0','445.0','60.0','13.0','5.0','14.0','37.0','2.0','1.0','0.0','0.0','1.0','6.0','16.0','9.0','9.0','4.0','3.0','7.0','11.0','63.6','4.0','8.0','2.0','6.0','24.0','16.0','0.0','606.0','41.0','149.0','301.0','158.0','18.0','606.0','10.0','4.0','40.0','369.0','1829.0','683.0','14.0','13.0','2.0','13.0','14.0','362.0','27.0','0.0','0.0','0.0','12.0','9.0','0.0','0.0','46.0','22.0','17.0','56.4',
        '16.0','29.125','990.0','1.0','10.0','4.0','337.0','458.0','73.6','5876.0','2229.0','163.0','189.0','86.2','137.0','166.0','82.5','30.0','83.0','36.1','0.0','0.3','0.3','5.0','18.0','6.0','3.0','33.0','405.0','50.0','12.0','1.0','16.0','25.0','4.0','2.0','2.0','0.0','3.0','6.0','19.0','11.0','9.0','9.0','1.0','5.0','9.0','55.6','4.0','6.0','1.0','5.0','28.0','39.0','0.0','584.0','87.0','234.0','254.0','97.0','16.0','584.0','12.0','4.0','33.33','314.0','1666.0','650.0','8.0','10.0','0.0','16.0','9.0','337.0','13.0','1.0','0.0','0.0','11.0','11.0','0.0','0.0','42.0','17.0','22.0','43.6'

    ]  # Nuevos valores

    # Modificar los valores en la fila especificada
    for col, new_val in zip(columns_to_modify, new_values):
        df.at[row_index, col] = new_val

    # Guardar los cambios en el archivo CSV
    df.to_csv('matches_2023-2024.csv', index=False)

    print(f"Updated row with id {target_id} in columns {columns_to_modify} with values {new_values}")
else:
    print(f"No row found with id {target_id}")'''

Updated row with id 351 in columns ['home_Players', 'home_PlayersAge', 'home_PlayersMinutes', 'home_PlayersGoals', 'home_PlayersShots', 'home_PlayersShotsOnTarget', 'home_PlayersCompletedPasses', 'home_PlayersAttemptedPasses', 'home_Players%CompletedPasses', 'home_PlayersDistancePasses', 'home_PlayersDistanceProgression', 'home_PlayersShortPasses', 'home_PlayersAttemptedShortPasses', 'home_Players%ShortCompletedPasses', 'home_PlayersMediumPasses', 'home_PlayersAttemptedMediumPasses', 'home_Players%MediumCompletedPasses', 'home_PlayersLongPasses', 'home_PlayersAttemptedLongPasses', 'home_Players%LongCompletedPasses', 'home_PlayersAssistance', 'home_PlayersExpectedGoalsAssistance', 'home_PlayersExpectedAssistance', 'home_PlayersKeyPasses', 'home_PlayersLast1/3Passes', 'home_PlayersGoalAreaPasses', 'home_PlayersGoalAreaCrosses', 'home_PlayersGoalPasses', 'home_PlayersLiveBallPasses', 'home_PlayersDeadBallPasses', 'home_PlayersFreeKick', 'home_PlayersSidePasses', 'home_PlayersCrosses', 'ho

  df.at[row_index, col] = new_val
  df.at[row_index, col] = new_val
  df.at[row_index, col] = new_val
  df.at[row_index, col] = new_val
  df.at[row_index, col] = new_val
  df.at[row_index, col] = new_val
  df.at[row_index, col] = new_val
  df.at[row_index, col] = new_val
  df.at[row_index, col] = new_val
  df.at[row_index, col] = new_val
  df.at[row_index, col] = new_val
  df.at[row_index, col] = new_val
  df.at[row_index, col] = new_val
  df.at[row_index, col] = new_val
  df.at[row_index, col] = new_val
  df.at[row_index, col] = new_val
  df.at[row_index, col] = new_val
  df.at[row_index, col] = new_val
  df.at[row_index, col] = new_val
  df.at[row_index, col] = new_val
  df.at[row_index, col] = new_val
  df.at[row_index, col] = new_val
  df.at[row_index, col] = new_val
  df.at[row_index, col] = new_val
  df.at[row_index, col] = new_val
  df.at[row_index, col] = new_val
  df.at[row_index, col] = new_val
  df.at[row_index, col] = new_val
  df.at[row_index, col] = new_val
  df.at[row_in