# Automating new match downloads

In [104]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import time
import re
import os
import numpy as np
from unidecode import unidecode

### Obtaining match data from the website

In [105]:
class Match_data:
    def __init__(self, url, max):
        """
        Initialize the class with the given URL and optional limits for the number of links, gameweeks, and scores.

        Args:
            url (str): The URL to scrape match data from.
            max (int or None): Maximum number of match links, gameweeks, and scores to scrape. If None, no limit is applied.
        """
        self.url = url  # Store the URL as an attribute of the class
        self.max_links = max  # Limit for match links
        self.max_gameweeks = max  # Limit for gameweeks
        self.max_scores = max  # Limit for scores
        self.links = []  # List to store match links
        self.gameweeks = []  # List to store gameweek data
        self.scores = []  # List to store match scores (home and away goals)
        self.result = []  # List of results
        self.previous_gameweek = None  # Variable to store the last valid gameweek

    def get_match_data(self):
        """
        Scrape the fixtures page for match report links, gameweeks, scores and results.

        The page at ``self.url`` is downloaded once and parsed. Every value found is appended to
        ``self.links``, ``self.gameweeks``, ``self.scores`` and ``self.result``; the lists are not
        cleared first, so repeated calls accumulate. Each of the links, gameweeks and scores
        collections stops growing once it reaches its limit; a limit of None means no limit.

        Returns:
            tuple[list, list, list, list]: ``(links, gameweeks, scores, result)`` where links are
            absolute fbref.com URLs, gameweeks are ints (-1 when the cell is not numeric), scores
            are ``(home_goals, away_goals)`` tuples and result is 1 (home win), -1 (away win)
            or 0 (draw).
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",  # Indicates preferred language
            "Accept-Encoding": "gzip, deflate, br",  # Indicates that compressed responses are accepted
            "Connection": "keep-alive",  # Keeps the connection open for better efficiency
            "Upgrade-Insecure-Requests": "1",  # Indicates that the client prefers HTTPS
            "DNT": "1",  # Indicates that tracking is not desired (optional)
        }

        def limit_reached(collected, limit):
            # None means "no limit"; comparing len() against None would raise TypeError.
            return limit is not None and len(collected) >= limit

        # Single HTTP request; the timeout keeps a stalled connection from hanging the run.
        response = requests.get(self.url, headers=headers, timeout=30)
        soup = BeautifulSoup(response.text, "html.parser")  # Parse the HTML content

        # NOTE: the loops below only walk the already-downloaded document, so no
        # rate-limiting sleep is needed between cells.

        # Get match report links
        for cell in soup.find_all("td", {"data-stat": "match_report"}):
            if limit_reached(self.links, self.max_links):
                break  # Stop once we reach the limit
            anchor = cell.find("a")  # Find the <a> tag in the cell
            if anchor:
                # Hrefs on the page are site-relative; store the absolute URL
                self.links.append(f"https://fbref.com{anchor['href']}")

        # Get gameweek data
        for cell in soup.find_all("th", {"data-stat": "gameweek"}):
            if limit_reached(self.gameweeks, self.max_gameweeks):
                break  # Stop once we reach the limit
            gameweek_value = cell.text.strip()
            if gameweek_value in ["Sem.", ""]:  # Skip the column header and empty spacer rows
                continue

            if gameweek_value == "N/A" and self.previous_gameweek:
                # No gameweek given for this row: reuse the last one that was stored
                gameweek_value = self.previous_gameweek

            try:
                gameweek_value = int(gameweek_value)
            except ValueError:
                gameweek_value = -1  # Use -1 if the gameweek is not a valid integer

            self.gameweeks.append(gameweek_value)
            self.previous_gameweek = gameweek_value  # Update the previous gameweek

        # Get match scores (home and away goals)
        for cell in soup.find_all("td", {"data-stat": "score"}):
            if limit_reached(self.scores, self.max_scores):
                break  # Stop once we reach the limit
            score_parts = cell.text.strip().split("–")
            if len(score_parts) != 2:
                continue  # Empty cell or not a "home–away" score

            try:
                home_goals = int(score_parts[0].strip())
                away_goals = int(score_parts[1].strip())
            except ValueError:
                # Unparseable score: fall back to (0, 0). A result is still appended
                # below so that scores and result stay the same length.
                home_goals = away_goals = 0

            self.scores.append((home_goals, away_goals))

            # Determine the result: 1 home win, -1 away win, 0 draw
            if home_goals > away_goals:
                outcome = 1
            elif home_goals < away_goals:
                outcome = -1
            else:
                outcome = 0
            self.result.append(outcome)

        # Return match report links, gameweek data, scores, and results
        return self.links, self.gameweeks, self.scores, self.result

    def create_matches_csv(self):
        """
        Create a CSV file with match data, combining gameweek, match information, and goals.

        This function combines match details (e.g., teams, date, time), gameweek data, and goals into a single DataFrame.

        Returns:
            pd.DataFrame: DataFrame containing match details, gameweek data, and goals.
        """
        # Create a DataFrame with empty values for the match data
        df = pd.DataFrame(
            {
                "id": pd.Series(
                    [None] * len(self.links)
                ),  # Create a new column with unique IDs for each match
                "date_of_match": pd.Series([None] * len(self.links)),
                "hour_of_the_match": pd.Series([None] * len(self.links)),
                "day_of_week": pd.Series([None] * len(self.links)),
                "day_of_year": pd.Series([None] * len(self.links)),
                "hour_of_day": pd.Series([None] * len(self.links)),
                "home_team_name": pd.Series([None] * len(self.links), dtype="str"),
                "away_team_name": pd.Series([None] * len(self.links), dtype="str"),
                "home_trainer": pd.Series([None] * len(self.links), dtype="str"),
                "away_trainer": pd.Series([None] * len(self.links), dtype="str"),
                "stadium": pd.Series([None] * len(self.links), dtype="str"),
                "attendance": pd.Series([None] * len(self.links), dtype="str"),
                "capacity": pd.Series([None] * len(self.links), dtype="int"),
                "attendance%": pd.Series([None] * len(self.links), dtype="float"),
                "referee": pd.Series([None] * len(self.links), dtype="str"),
                "var": pd.Series([None] * len(self.links), dtype="str"),
                "home_team_lineup": pd.Series([None] * len(self.links), dtype="str"),
                "away_team_lineup": pd.Series([None] * len(self.links), dtype="str"),
                "home_possession": pd.Series([None] * len(self.links), dtype="float"),
                "away_possession": pd.Series([None] * len(self.links), dtype="float"),
            }
        )

        # Ensure that the 'date_of_match' column is converted to datetime format
        df["date_of_match"] = pd.to_datetime(
            df["date_of_match"], format="%Y-%m-%d", errors="coerce"
        )

        # Ensure that the 'hour_of_the_match' column is converted to datetime format
        df["hour_of_the_match"] = pd.to_datetime(
            df["hour_of_the_match"], format="%H:%M", errors="coerce"
        ).dt.time

        # Get the match data (links, gameweeks, and goals)
        links, gameweeks, scores, result = self.get_match_data()

        # Create DataFrames from the gameweek, links, and scores data
        df_gameweeks = pd.DataFrame(gameweeks, columns=["gameweek"])
        df_links = pd.DataFrame(links, columns=["link"])
        df_scores = pd.DataFrame(scores, columns=["home_team_goals", "away_team_goals"])
        df_result = pd.DataFrame(result, columns=["result"])

        # Combine all DataFrames into one final DataFrame
        df_final = pd.concat([df_gameweeks, df], axis=1)
        df_final = pd.concat([df_final, df_links], axis=1)
        df_final = pd.concat([df_final, df_scores], axis=1)
        df_final = pd.concat([df_final, df_result], axis=1)

        # Assign the final DataFrame to the class attribute with the new name
        self.df_final = df_final

        return self.df_final  # Return the DataFrame to be used later

    def get_statistics(self):
        """
        Get match statistics from the links and save them to a CSV file.

        This function extracts detailed statistics (e.g., team lineups, referee, attendance) from the match links.

        Returns:
            pd.DataFrame: DataFrame containing match statistics.
        """
        links = self.df_final["link"].tolist()
        id = 0

        for idx, link in enumerate(links):
            try:
                print(f"Processing link {idx + 1}: {link}")
                headers = {
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0",
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
                    "Accept-Language": "en-US,en;q=0.5",
                    "Accept-Encoding": "gzip, deflate, br",
                    "Connection": "keep-alive",
                    "Upgrade-Insecure-Requests": "1",
                    "DNT": "1",
                }
                response = requests.get(link, headers=headers)
                soup = BeautifulSoup(response.text, "html.parser")

                # Assign a unique ID for each match
                id += 1
                self.df_final.at[idx, "id"] = id

                # Extract match date and time
                date_element = soup.find("span", {"class": "venuetime"})
                if date_element:
                    match_date = date_element.get("data-venue-date")
                    match_date = pd.to_datetime(match_date, errors="coerce")  # Convert to datetime
                    day_of_week = match_date.dayofweek  # Day of the week
                    day_of_year = match_date.dayofyear  # Day of the year
                    match_time = date_element.get("data-venue-time")
                    # Convert match_time to datetime and extract hour and minute
                    match_time = pd.to_datetime(
                        match_time, format="%H:%M", errors="coerce"
                    )
                    hour_of_day = (
                        match_time.hour + match_time.minute / 60
                    )  # Convert to decimal (e.g., 19:30 -> 19.5)
                    # Store the values in the DataFrame
                    self.df_final.at[idx, "date_of_match"] = match_date
                    self.df_final.at[idx, "day_of_week"] = day_of_week
                    self.df_final.at[idx, "day_of_year"] = day_of_year
                    self.df_final.at[idx, "hour_of_the_match"] = match_time.strftime(
                        "%H:%M"
                    )  # Store as time
                    self.df_final.at[idx, "hour_of_day"] = (
                        hour_of_day  # Store as decimal
                    )

                # Extract team names
                teams_elements = soup.find_all("span", class_="teamandlogo")
                if len(teams_elements) >= 2:
                    home_team_name = teams_elements[0].text.strip()
                    away_team_name = teams_elements[1].text.strip()
                else:
                    print(f"Team names not found for link {link}. Assigning NaN.")
                    home_team_name = away_team_name = "NaN"
                # Remove accents and replace spaces with underscores
                home_team_name = unidecode(home_team_name).replace(" ", "_")
                away_team_name = unidecode(away_team_name).replace(" ", "_")
                # Assign the processed names to the DataFrame
                self.df_final.at[idx, "home_team_name"] = home_team_name
                self.df_final.at[idx, "away_team_name"] = away_team_name

                # Extract coaches (trainers) names
                trainers_elements = soup.find_all("div", class_="datapoint")
                trainer_count = 0
                for trainer_element in trainers_elements:
                    if "Director Técnico" in trainer_element.text:
                        trainer_name = trainer_element.text.split(":")[-1].strip()
                        if trainer_count == 0:
                            home_trainer = trainer_name
                            trainer_count += 1
                        elif trainer_count == 1:
                            away_trainer = trainer_name
                            trainer_count += 1
                if "home_trainer" not in locals():
                    print(f"Home coach not found for link {link}. Assigning NaN.")
                    home_trainer = "NaN"
                if "away_trainer" not in locals():
                    print(f"Away coach not found for link {link}. Assigning NaN.")
                    away_trainer = "NaN"
                # Remove accents and replace spaces with underscores
                home_trainer = unidecode(home_trainer).replace(" ", "_")
                away_trainer = unidecode(away_trainer).replace(" ", "_")
                # Assign the processed names to the DataFrame
                self.df_final.at[idx, "home_trainer"] = home_trainer
                self.df_final.at[idx, "away_trainer"] = away_trainer

                # List of stadiums without accents
                stadiums = [
                    "San_Mames",
                    "Estadio_de_Balaidos",
                    "Estadio_de_Mestalla",
                    "Iberostar_Estadi",
                    "Estadio_Municipal_de_Butarque",
                    "Estadio_de_la_Ceramica",
                    "Estadio_de_Mendizorroza",
                    "RCDE_Stadium",
                    "Estadio_Benito_Villamarin",
                    "Estadio_Wanda_Metropolitano",
                    "Estadio_Nuevo_Los_Carmenes",
                    "Estadio_Ciudad_de_Valencia",
                    "Estadio_El_Sadar",
                    "Estadio_Santiago_Bernabeu",
                    "Coliseum_Alfonso_Perez",
                    "Camp_Nou",
                    "Estadio_Ramon_Sanchez_Pizjuan",
                    "Estadio_Municipal_de_Anoeta",
                    "Estadio_Municipal_de_Ipurua",
                    "Estadio_Municipal_Jose_Zorrilla",
                    "Estadio_Alfredo_Di_Stefano",
                    "Estadio_Ramon_de_Carranza",
                    "Estadio_El_Alcoraz",
                    "Estadio_Manuel_Martinez_Valero",
                    "Estadio_Nuevo_Mirandilla",
                    "Estadio_del_Rayo_Vallecano",
                    "Power_Horse_Stadium",
                    "Estadio_Civitas_Metropolitano",
                    "Reale_Arena",
                    "Estadi_Municipal_de_Montilivi",
                    "Estadio_de_Gran_Canaria",
                    "Estadio_Abanca_Balaidos",
                    "Estadi_Mallorca_Son_Moix",
                    "Estadi_Olimpic_Lluis_Companys",
                    "Estadio_Municipal_de_Riazor",
                    "Estadio_La_Rosaleda"
                ]
                # List of corresponding stadium capacities
                capacities = [
                    53289,
                    24870,
                    49430,
                    26020,
                    12454,
                    23500,
                    19840,
                    40500,
                    60721,
                    68456,
                    19189,
                    26354,
                    23576,
                    78297,
                    16500,
                    99354,
                    43883,
                    40000,
                    8164,
                    27618,
                    5600,
                    20724,
                    9100,
                    31388,
                    20724,
                    14708,
                    18331,
                    68456,
                    40000,
                    14624,
                    32400,
                    24870,
                    26020,
                    55926,
                    32490,
                    30044
                ]
                # Create a dictionary of stadiums and their corresponding capacities
                stadium_capacity_dict = dict(zip(stadiums, capacities))
                # Extract stadium information
                stadium_element = soup.find("div", class_="scorebox_meta")
                if stadium_element:
                    stadium_info = stadium_element.find("strong", string="Sedes")
                    if stadium_info:
                        stadium = (
                            stadium_info.find_next("small")
                            .find_next("small")
                            .text.strip()
                        )
                    else:
                        print(f"Stadium not found for link {link}. Assigning NaN.")
                        stadium = "NaN"
                else:
                    print(f"Stadium not found for link {link}. Assigning NaN.")
                    stadium = "NaN"
                # Remove everything after the comma (including the comma itself)
                if "," in stadium:
                    stadium = stadium.split(",")[0].strip()
                # Remove accents and replace spaces with underscores
                stadium = unidecode(stadium).replace(" ", "_")
                # Assign the processed stadium name to the DataFrame
                self.df_final.at[idx, "stadium"] = stadium
                # Check if the stadium exists in the dictionary and assign the corresponding capacity
                if stadium in stadium_capacity_dict:
                    # Assign the corresponding capacity to the 'capacity' column
                    self.df_final.at[idx, "capacity"] = stadium_capacity_dict[stadium]
                else:
                    # If the stadium is not found in the dictionary, assign NaN
                    print(
                        f"Stadium '{stadium}' not found in dictionary. Assigning NaN."
                    )
                    self.df_final.at[idx, "capacity"] = np.nan

                # Extract attendance information
                attendance_element = soup.find("div", class_="scorebox_meta")
                if attendance_element:
                    attendance_info = attendance_element.find(
                        "strong", string="Asistencia"
                    )
                    if attendance_info:
                        attendance = (
                            attendance_info.find_next("small")
                            .find_next("small")
                            .text.strip()
                        )
                        try:
                            attendance = int(
                                attendance.replace(",", "").replace(".", "")
                            )
                        except ValueError:
                            attendance = 0
                else:
                    print(f"Attendance not found for link {link}. Assigning 0.")
                    attendance = 0

                # Assign attendance value to the DataFrame
                self.df_final.at[idx, "attendance"] = attendance

                # Calculate and assign attendance percentage (ensure capacity is not NaN)
                if (
                    pd.notna(self.df_final.at[idx, "capacity"])
                    and self.df_final.at[idx, "capacity"] > 0
                ):
                    attendance_percentage = (
                        attendance / self.df_final.at[idx, "capacity"]
                    )
                    # Ensure attendance% does not exceed 1
                    if attendance_percentage > 1:
                        attendance_percentage = 1
                    self.df_final.at[idx, "attendance%"] = attendance_percentage
                else:
                    self.df_final.at[idx, "attendance%"] = (
                        np.nan
                    )  # If capacity is NaN or 0, set attendance% as NaN

                # Extract referee information
                referee_element = soup.find("div", class_="scorebox_meta")
                if referee_element:
                    referee_info = referee_element.find_next(
                        "strong", string="Autoridades"
                    )
                    if referee_info:
                        referee_span = (
                            referee_info.find_next("small")
                            .find_next("small")
                            .find("span", style="display:inline-block")
                        )
                        if referee_span:
                            referee = referee_span.text.strip()
                            # Remove accents, replace spaces with underscores, and remove " (Arbitro)"
                            referee = (
                                unidecode(referee)
                                .replace(" ", "_")
                                .replace("_(Arbitro)", "")
                            )
                else:
                    print(f"Referee not found for link {link}. Assigning NaN.")
                    referee = "NaN"
                self.df_final.at[idx, "referee"] = referee

                # Extract VAR information
                var_element = soup.find("div", class_="scorebox_meta")
                if var_element:
                    var_info = var_element.find_next("strong", string="Autoridades")
                    if var_info:
                        var_span = (
                            var_info.find_next("small")
                            .find_next("small")
                            .find_next("span")
                            .find_next("span")
                            .find_next("span")
                            .find_next("span")
                            .find_next("span")
                        )
                        if var_span:
                            var = var_span.text.strip()
                            # Remove accents, replace spaces with underscores, and remove " (VAR)"
                            var = unidecode(var).replace(" ", "_").replace("_(VAR)", "")
                else:
                    print(f"VAR not found for link {link}. Assigning NaN.")
                    var = "NaN"
                self.df_final.at[idx, "var"] = var

                # Extract team lineups
                lineup_elements = soup.find_all(
                    "th", string=lambda text: text and "(" in text and ")" in text
                )
                if len(lineup_elements) >= 1:
                    home_match = re.search(r"\((.*?)\)", lineup_elements[0].text)
                    if home_match:
                        home_team_lineup = home_match.group(1)
                else:
                    print(f"Home lineup not found for link {link}. Assigning NaN.")
                    home_team_lineup = "NaN"

                if len(lineup_elements) >= 2:
                    away_match = re.search(r"\((.*?)\)", lineup_elements[1].text)
                    if away_match:
                        away_team_lineup = away_match.group(1)
                else:
                    print(f"Away lineup not found for link {link}. Assigning NaN.")
                    away_team_lineup = "NaN"

                self.df_final.at[idx, "home_team_lineup"] = home_team_lineup
                self.df_final.at[idx, "away_team_lineup"] = away_team_lineup

                # Extract team possession
                possession_header = soup.find("th", string="Posesión del balón")

                if possession_header:
                    possession_row = possession_header.find_parent(
                        "tr"
                    ).find_next_sibling("tr")

                    if possession_row:
                        possession_values = possession_row.find_all("td")

                        if len(possession_values) == 2:
                            home_possession = (
                                possession_values[0].text.strip().strip("%")
                            )
                            away_possession = (
                                possession_values[1].text.strip().strip("%")
                            )

                            self.df_final.at[idx, "home_possession"] = (
                                float(home_possession) if home_possession else None
                            )
                            self.df_final.at[idx, "away_possession"] = (
                                float(away_possession) if away_possession else None
                            )
                        else:
                            print(
                                f"Possession values not found for link {link}. Assigning NaN."
                            )
                    else:
                        print(
                            f"Possession row not found for link {link}. Assigning NaN."
                        )
                else:
                    print(
                        f"Possession header not found for link {link}. Assigning NaN."
                    )

                # Remove rows that are completely empty
                self.df_final = self.df_final.dropna(how="all")

                time.sleep(6)  # Sleep to avoid making too many requests in a short time

            except Exception as e:
                print(f"Error processing link {link}: {e}")
                continue

        self.df_matches = self.df_final
        return self.df_matches

    def save_to_csv(self, season):
        """
        Save the processed data into a CSV file, with the season included in the filename.

        Args:
            season (str): The season to be included in the filename.
        """
        # Define the filename with the season
        filename = f"matches_{season}.csv"

        try:
            # Save the DataFrame to a CSV file in the parent directory
            self.df_matches.to_csv(filename, index=False)

            # Print confirmation message with the file path
            print(f"File saved successfully as {filename}")

        except Exception as e:
            print(f"Error saving file {filename}: {e}")

    def run(self):
        """
        Execute the full process: get links, get gameweeks, get statistics, and save to CSV files for both teams.
        """
        print(f"Starting to collect matches data...")

        # Step 1: Extract the season from the URL
        season_match = re.search(
            r"(\d{4})-(\d{4})", self.url
        )  # Use self.url instead of passing it as a parameter
        if season_match:
            season = season_match.group(0)
            print(f"Season extracted: {season}")
        else:
            season = "unknown_season"  # Default value if the season cannot be extracted
            print("Season not found in the URL. Using default value 'unknown_season'.")

        # Step 2: Get all the links to the match pages from the provided URL

        # Step 3: Get the gameweek data from the provided URL

        # Step 4: Create a CSV file with match details, such as teams, dates, and other match-related information
        self.create_matches_csv()

        # Step 5: Retrieve statistics for each match, such as goals, assists, and other relevant data
        df_matches = self.get_statistics()

        # Step 6: Save the processed data into a CSV file with the season name in the filename
        self.save_to_csv(season)

        print(f"Collecting matches data process completed successfully!")

### Obtaining players and keeper data from the website

In [106]:
class Players_data:
    def __init__(self, filename, links_file, gameweeks_file):
        """
        Initializes the Players_data with links and gameweeks files.
        - links_file: Path to the CSV file containing match links.
        - gameweeks_file: Path to the CSV file containing gameweek data.
        """
        self.filename = filename
        self.links_file = links_file  # Store the path to the links file
        self.gameweeks_file = gameweeks_file  # Store the path to the gameweeks file
        self.soup = None  # Initialize BeautifulSoup object as None
        self.teams_data = {}  # Dictionary to store team information

    def fetch_page(self):
        """
        Download ``self.link`` and parse the response into ``self.soup``.

        The request is sent with browser-like headers and an explicit timeout,
        so a stalled connection raises instead of blocking the scraper forever.
        After the page has been parsed the method sleeps for 6 seconds to avoid
        overloading the server.

        Raises:
            Exception: If the response status code is not 200.
            requests.exceptions.RequestException: If the request itself fails
                (connection error, timeout, ...).
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "DNT": "1",
        }

        # (connect, read) timeouts in seconds; requests waits indefinitely
        # when no timeout is given.
        response = requests.get(self.link, headers=headers, timeout=(10, 30))

        if response.status_code != 200:
            raise Exception(f"Error accessing the page: {response.status_code}")

        self.soup = BeautifulSoup(response.content, "html.parser")

        # Throttle consecutive requests
        time.sleep(6)

    def extract_teams_ids(self):
        """
        Extracts the IDs and names of the home and away teams using team logos.
        """
        # Verifica que el link esté definido antes de intentar acceder a la página
        if not hasattr(self, "link") or self.link is None:
            raise Exception("Link not defined for the match.")

        self.fetch_page()  # Fetch the web page content for the provided URL

        # Buscar las imágenes de los equipos
        team_imgs = self.soup.find_all("img", class_="teamlogo", src=True)

        # Verifica si se encontraron al menos dos imágenes de los equipos
        if len(team_imgs) >= 2:
            self.teams_data = {
                "home": {
                    "id": team_imgs[0]["src"].split("/")[-1].split(".")[0],
                    "name": team_imgs[0]["alt"]
                    .replace(" Club Crest", "")
                    .replace(" ", "_"),
                },
                "away": {
                    "id": team_imgs[1]["src"].split("/")[-1].split(".")[0],
                    "name": team_imgs[1]["alt"]
                    .replace(" Club Crest", "")
                    .replace(" ", "_"),
                },
            }
        else:
            raise Exception(
                "Not enough team logos found. Expected at least two team logos."
            )

    def extract_players_table(
        self, team_type, table_type, header_offset, columns_to_drop
    ):
        """
        Extracts a specific player statistics table for the given team and table type.
        - team_type: "home" or "away".
        - table_type: Type of the table (e.g., "summary", "passing").
        - header_offset: Number of header columns to skip.
        - columns_to_drop: List of columns to drop from the table.
        """
        # Get the team ID based on the team type (home or away)
        team_id = self.teams_data[team_type]["id"]

        # Construct the CSS selector for the specific table
        players_table_selector = f"#div_stats_{team_id}_{table_type}"

        # Select the table element using the constructed selector
        table = self.soup.select_one(players_table_selector)

        # Check if the table exists
        if not table:
            raise Exception(
                f"Payers table {table_type} not found for team {team_type}."
            )

        # Extract headers from the table, skipping the specified number of columns
        headers = [th.text.strip() for th in table.find("thead").find_all("th")][
            header_offset:
        ]

        # Extract rows of data from the table body
        rows = [
            [cell.text.strip() for cell in row.find_all(["td", "th"])]
            for row in table.find("tbody").find_all("tr")
        ]

        # Create a DataFrame from the extracted rows and headers
        df = pd.DataFrame(rows, columns=headers)

        # Drop unnecessary columns specified in the columns_to_drop list
        df = df.loc[:, ~df.columns.isin(columns_to_drop)]

        return df

        # Add a delay to avoid overloading the server
        time.sleep(6)

    def process_players_data(self, team_type):
        """
        Processes all player statistics tables for a specific team (home or away).
        Combines data from multiple table types into a dictionary of DataFrames.
        """
        # Define columns to drop for each table type
        columns_to_drop = {
            "summary": [
                "Gls",
                "Ass",
                "TP",
                "TPint",
                "TA",
                "TR",
                "Toques",
                "Tkl",
                "Int",
                "Bloqueos",
                "xG",
                "npxG",
                "xAG",
                "ACT",
                "ACG",
                "Cmp",
                "Int.",
                "% Cmp",
                "PrgP",
                "Transportes",
                "PrgC",
                "Att",
                "Succ",
            ],
            "passing": ["Jugador", "núm.", "País", "Posc", "Edad", "Mín"],
            "passing_types": [
                "Jugador",
                "núm.",
                "País",
                "Posc",
                "Edad",
                "Mín",
                "Int.",
                "Cmp",
            ],
            "defense": ["Jugador", "núm.", "País", "Posc", "Edad", "Mín"],
            "possession": [
                "Jugador",
                "núm.",
                "País",
                "Posc",
                "Edad",
                "Mín",
                "Tkld",
                "Tkld%",
            ],
            "misc": [
                "Jugador",
                "núm.",
                "País",
                "Posc",
                "Edad",
                "Mín",
                "Pcz",
                "PA",
                "Int",
                "TklG",
                "GC",
            ],
        }

        # Initialize an empty dictionary to store DataFrames for each table type
        tables = {}

        # Loop through each table type and extract its data
        for table_type, header_offset in [
            ("summary", 7),
            ("passing", 9),
            ("passing_types", 4),
            ("defense", 5),
            ("possession", 5),
            ("misc", 3),
        ]:
            tables[table_type] = self.extract_players_table(
                team_type,
                table_type,
                header_offset,
                columns_to_drop.get(table_type, []),
            )

        return tables

    def save_players_tables(self, match, season):
        """
        Aggregate the player statistics of both teams and store the team totals
        in the season's matches CSV.

        The match page is fetched through ``extract_teams_ids``. For each team
        ("home", then "away") all player tables are concatenated side by side,
        renamed with a ``<team_type>_`` prefix, converted to numbers, and reduced
        to a single totals row (means for age and percentage columns, sums for
        the counting columns, number of table rows in the first column). The
        totals are then written into the row of ``matches_<season>.csv`` whose
        ``id`` equals ``match``; missing columns are created in that file.

        Args:
            match: Match identifier; compared against the ``id`` column of the CSV.
            season: Season label used to build the CSV filename.

        Raises:
            ValueError: If the CSV contains no row whose ``id`` equals ``match``.
        """
        # Extract IDs and names of the teams (this also fetches the match page)
        self.extract_teams_ids()

        # Process data for both home and away teams
        for team_type in ["home", "away"]:
            # Get the team name
            # NOTE(review): team_name is not used anywhere below in this method.
            team_name = self.teams_data[team_type]["name"]

            # Extract and process all player statistics tables for the team
            team_tables = self.process_players_data(team_type)

            # Combine all extracted tables into a single DataFrame (side by side)
            final_table = pd.concat(team_tables.values(), axis=1)

            # Define the new column names for the dataset with the team_type prefix.
            # The names are assigned by position, so this list has to match the
            # concatenated columns one-to-one.
            # NOTE(review): "<team_type>_PlayersLongPasses" and
            # "<team_type>_PlayersPassesBlocked" each appear twice in this list,
            # which produces duplicate column labels; when the totals are copied
            # into the CSV below, the later value presumably overwrites the
            # earlier one -- confirm the intended names for the second entries.
            new_columns = [
                f"{team_type}_Players",
                f"{team_type}_Number",
                f"{team_type}_Nationality",
                f"{team_type}_Position",
                f"{team_type}_PlayersAge",
                f"{team_type}_PlayersMinutes",
                f"{team_type}_PlayersGoals",
                f"{team_type}_PlayersShots",
                f"{team_type}_PlayersShotsOnTarget",
                f"{team_type}_PlayersCompletedPasses",
                f"{team_type}_PlayersAttemptedPasses",
                f"{team_type}_Players%CompletedPasses",
                f"{team_type}_PlayersDistancePasses",
                f"{team_type}_PlayersDistanceProgression",
                f"{team_type}_PlayersShortPasses",
                f"{team_type}_PlayersAttemptedShortPasses",
                f"{team_type}_Players%ShortCompletedPasses",
                f"{team_type}_PlayersMediumPasses",
                f"{team_type}_PlayersAttemptedMediumPasses",
                f"{team_type}_Players%MediumCompletedPasses",
                f"{team_type}_PlayersLongPasses",
                f"{team_type}_PlayersAttemptedLongPasses",
                f"{team_type}_Players%LongCompletedPasses",
                f"{team_type}_PlayersAssistance",
                f"{team_type}_PlayersExpectedGoalsAssistance",
                f"{team_type}_PlayersExpectedAssistance",
                f"{team_type}_PlayersKeyPasses",
                f"{team_type}_PlayersLast1/3Passes",
                f"{team_type}_PlayersGoalAreaPasses",
                f"{team_type}_PlayersGoalAreaCrosses",
                f"{team_type}_PlayersGoalPasses",
                f"{team_type}_PlayersLiveBallPasses",
                f"{team_type}_PlayersDeadBallPasses",
                f"{team_type}_PlayersFreeKick",
                f"{team_type}_PlayersLongPasses",
                f"{team_type}_PlayersSidePasses",
                f"{team_type}_PlayersCrosses",
                f"{team_type}_PlayersStrongcrosses",
                f"{team_type}_PlayersCorner",
                f"{team_type}_PlayersCornerIn",
                f"{team_type}_PlayersCornerOut",
                f"{team_type}_PlayersCornerRect",
                f"{team_type}_PlayersOffsidePasses",
                f"{team_type}_PlayersPassesBlocked",
                f"{team_type}_PlayersTackles",
                f"{team_type}_PlayersSuccessfulTackles",
                f"{team_type}_PlayersTacklesInDefense",
                f"{team_type}_PlayersTacklesInMedium",
                f"{team_type}_PlayersTacklesInAttack",
                f"{team_type}_PlayersDribblerTackles",
                f"{team_type}_PlayersAttemptedDribblerTackles",
                f"{team_type}_Players%DribblerTacklesCompleted",
                f"{team_type}_PlayersDribblerTacklesNonCompleted",
                f"{team_type}_PlayersBallsBlocked",
                f"{team_type}_PlayersShotsBlocked",
                f"{team_type}_PlayersPassesBlocked",
                f"{team_type}_PlayersInterceptions",
                f"{team_type}_PlayersTackles+Interceptions",
                f"{team_type}_PlayersClearances",
                f"{team_type}_PlayersMistakesRivalShots",
                f"{team_type}_PlayersTouches",
                f"{team_type}_PlayersOwnPenaltyAreaTouches",
                f"{team_type}_PlayersTouchesInDefense",
                f"{team_type}_PlayersTouchesInMedium",
                f"{team_type}_PlayersTouchesInAttack",
                f"{team_type}_PlayersAwayPenaltyAreaTouches",
                f"{team_type}_PlayersLiveBallTouches",
                f"{team_type}_PlayersAttemptedDribbles",
                f"{team_type}_PlayersDribblesCompleted",
                f"{team_type}_Players%DribblesCompleted",
                f"{team_type}_PlayersBallCarries",
                f"{team_type}_PlayersDistanceCarried",
                f"{team_type}_PlayersForwardDistanceCarried",
                f"{team_type}_PlayersForwardCarries",
                f"{team_type}_PlayersCarriesInAttack",
                f"{team_type}_PlayersAwayPenaltyAreaCarries",
                f"{team_type}_PlayersLostControlCarries",
                f"{team_type}_PlayersLostCarries",
                f"{team_type}_PlayersPassesReception",
                f"{team_type}_PlayersAttackPassesReception",
                f"{team_type}_PlayersYellowCards",
                f"{team_type}_PlayersRedCards",
                f"{team_type}_PlayersSecondYellowCards",
                f"{team_type}_PlayersFouls",
                f"{team_type}_PlayersFoulsReceived",
                f"{team_type}_PlayersPenalties",
                f"{team_type}_PlayersPenaltiesConceded",
                f"{team_type}_PlayersLostBallRecoveries",
                f"{team_type}_PlayersAerialsWon",
                f"{team_type}_PlayersAerialsLost",
                f"{team_type}_Players%AerialsWon",
            ]

            # Rename the columns of the DataFrame
            final_table.columns = new_columns

            # Convert the 'Age' column to integer by extracting the first two characters;
            # values that are not strings become 0
            final_table[f"{team_type}_PlayersAge"] = final_table[
                f"{team_type}_PlayersAge"
            ].apply(lambda x: int(x[:2]) if isinstance(x, str) else 0)

            # Define columns to calculate the mean (age and percentage columns)
            columns_to_mean = [
                f"{team_type}_PlayersAge",
                f"{team_type}_Players%CompletedPasses",
                f"{team_type}_Players%ShortCompletedPasses",
                f"{team_type}_Players%MediumCompletedPasses",
                f"{team_type}_Players%LongCompletedPasses",
                f"{team_type}_Players%DribblerTacklesCompleted",
                f"{team_type}_Players%DribblesCompleted",
                f"{team_type}_Players%AerialsWon",
            ]

            # Define columns to calculate the sum
            columns_to_sum = [
                f"{team_type}_PlayersMinutes",
                f"{team_type}_PlayersGoals",
                f"{team_type}_PlayersShots",
                f"{team_type}_PlayersShotsOnTarget",
                f"{team_type}_PlayersCompletedPasses",
                f"{team_type}_PlayersAttemptedPasses",
                f"{team_type}_PlayersDistancePasses",
                f"{team_type}_PlayersDistanceProgression",
                f"{team_type}_PlayersShortPasses",
                f"{team_type}_PlayersAttemptedShortPasses",
                f"{team_type}_PlayersMediumPasses",
                f"{team_type}_PlayersAttemptedMediumPasses",
                f"{team_type}_PlayersLongPasses",
                f"{team_type}_PlayersAttemptedLongPasses",
                f"{team_type}_PlayersAssistance",
                f"{team_type}_PlayersExpectedGoalsAssistance",
                f"{team_type}_PlayersExpectedAssistance",
                f"{team_type}_PlayersKeyPasses",
                f"{team_type}_PlayersLast1/3Passes",
                f"{team_type}_PlayersGoalAreaPasses",
                f"{team_type}_PlayersGoalAreaCrosses",
                f"{team_type}_PlayersGoalPasses",
                f"{team_type}_PlayersLiveBallPasses",
                f"{team_type}_PlayersDeadBallPasses",
                f"{team_type}_PlayersFreeKick",
                f"{team_type}_PlayersLongPasses",
                f"{team_type}_PlayersSidePasses",
                f"{team_type}_PlayersCrosses",
                f"{team_type}_PlayersStrongcrosses",
                f"{team_type}_PlayersCorner",
                f"{team_type}_PlayersCornerIn",
                f"{team_type}_PlayersCornerOut",
                f"{team_type}_PlayersCornerRect",
                f"{team_type}_PlayersOffsidePasses",
                f"{team_type}_PlayersPassesBlocked",
                f"{team_type}_PlayersTackles",
                f"{team_type}_PlayersSuccessfulTackles",
                f"{team_type}_PlayersTacklesInDefense",
                f"{team_type}_PlayersTacklesInMedium",
                f"{team_type}_PlayersTacklesInAttack",
                f"{team_type}_PlayersDribblerTackles",
                f"{team_type}_PlayersAttemptedDribblerTackles",
                f"{team_type}_PlayersDribblerTacklesNonCompleted",
                f"{team_type}_PlayersBallsBlocked",
                f"{team_type}_PlayersShotsBlocked",
                f"{team_type}_PlayersPassesBlocked",
                f"{team_type}_PlayersInterceptions",
                f"{team_type}_PlayersTackles+Interceptions",
                f"{team_type}_PlayersClearances",
                f"{team_type}_PlayersMistakesRivalShots",
                f"{team_type}_PlayersTouches",
                f"{team_type}_PlayersOwnPenaltyAreaTouches",
                f"{team_type}_PlayersTouchesInDefense",
                f"{team_type}_PlayersTouchesInMedium",
                f"{team_type}_PlayersTouchesInAttack",
                f"{team_type}_PlayersAwayPenaltyAreaTouches",
                f"{team_type}_PlayersLiveBallTouches",
                f"{team_type}_PlayersAttemptedDribbles",
                f"{team_type}_PlayersDribblesCompleted",
                f"{team_type}_PlayersBallCarries",
                f"{team_type}_PlayersDistanceCarried",
                f"{team_type}_PlayersForwardDistanceCarried",
                f"{team_type}_PlayersForwardCarries",
                f"{team_type}_PlayersCarriesInAttack",
                f"{team_type}_PlayersAwayPenaltyAreaCarries",
                f"{team_type}_PlayersLostControlCarries",
                f"{team_type}_PlayersLostCarries",
                f"{team_type}_PlayersPassesReception",
                f"{team_type}_PlayersAttackPassesReception",
                f"{team_type}_PlayersYellowCards",
                f"{team_type}_PlayersRedCards",
                f"{team_type}_PlayersSecondYellowCards",
                f"{team_type}_PlayersFouls",
                f"{team_type}_PlayersFoulsReceived",
                f"{team_type}_PlayersPenalties",
                f"{team_type}_PlayersPenaltiesConceded",
                f"{team_type}_PlayersLostBallRecoveries",
                f"{team_type}_PlayersAerialsWon",
                f"{team_type}_PlayersAerialsLost",
            ]

            # Convert the mean columns to numeric; values that cannot be parsed become NaN
            for col in columns_to_mean:
                final_table[col] = final_table[col].apply(
                    pd.to_numeric, errors="coerce"
                )

            # Convert the sum columns to numeric; values that cannot be parsed become NaN
            for col in columns_to_sum:
                final_table[col] = final_table[col].apply(
                    pd.to_numeric, errors="coerce"
                )

            # Calculate the mean and sum for specified columns
            mean_values = final_table[columns_to_mean].mean()
            sum_values = final_table[columns_to_sum].sum()

            # Create a new row for totals with placeholder values
            total_row = {col: "-" for col in final_table.columns}

            # Populate the total row with mean values
            for col, mean in mean_values.items():
                total_row[col] = mean

            # Populate the total row with sum values
            for col, total in sum_values.items():
                total_row[col] = total

            # Add the number of rows (lines) to the first column of the total row
            # (counted before the totals row itself is appended)
            num_lines = len(final_table)
            total_row[final_table.columns[0]] = num_lines

            # Check if the 'id' column exists, if not, create it with NaN values
            if "id" not in final_table.columns:
                final_table["id"] = np.nan  # Create the column with NaN values

            # Add the match ID to the total row
            total_row["id"] = match

            # Append the total row to the DataFrame
            final_table.loc[len(final_table)] = total_row

            # Define the columns to append to the existing CSV: the renamed columns
            # without Number, Nationality, Position and PlayersGoals
            columns_to_append = [
                f"{team_type}_Players",
                f"{team_type}_PlayersAge",
                f"{team_type}_PlayersMinutes",
                f"{team_type}_PlayersShots",
                f"{team_type}_PlayersShotsOnTarget",
                f"{team_type}_PlayersCompletedPasses",
                f"{team_type}_PlayersAttemptedPasses",
                f"{team_type}_Players%CompletedPasses",
                f"{team_type}_PlayersDistancePasses",
                f"{team_type}_PlayersDistanceProgression",
                f"{team_type}_PlayersShortPasses",
                f"{team_type}_PlayersAttemptedShortPasses",
                f"{team_type}_Players%ShortCompletedPasses",
                f"{team_type}_PlayersMediumPasses",
                f"{team_type}_PlayersAttemptedMediumPasses",
                f"{team_type}_Players%MediumCompletedPasses",
                f"{team_type}_PlayersLongPasses",
                f"{team_type}_PlayersAttemptedLongPasses",
                f"{team_type}_Players%LongCompletedPasses",
                f"{team_type}_PlayersAssistance",
                f"{team_type}_PlayersExpectedGoalsAssistance",
                f"{team_type}_PlayersExpectedAssistance",
                f"{team_type}_PlayersKeyPasses",
                f"{team_type}_PlayersLast1/3Passes",
                f"{team_type}_PlayersGoalAreaPasses",
                f"{team_type}_PlayersGoalAreaCrosses",
                f"{team_type}_PlayersGoalPasses",
                f"{team_type}_PlayersLiveBallPasses",
                f"{team_type}_PlayersDeadBallPasses",
                f"{team_type}_PlayersFreeKick",
                f"{team_type}_PlayersLongPasses",
                f"{team_type}_PlayersSidePasses",
                f"{team_type}_PlayersCrosses",
                f"{team_type}_PlayersStrongcrosses",
                f"{team_type}_PlayersCorner",
                f"{team_type}_PlayersCornerIn",
                f"{team_type}_PlayersCornerOut",
                f"{team_type}_PlayersCornerRect",
                f"{team_type}_PlayersOffsidePasses",
                f"{team_type}_PlayersPassesBlocked",
                f"{team_type}_PlayersTackles",
                f"{team_type}_PlayersSuccessfulTackles",
                f"{team_type}_PlayersTacklesInDefense",
                f"{team_type}_PlayersTacklesInMedium",
                f"{team_type}_PlayersTacklesInAttack",
                f"{team_type}_PlayersDribblerTackles",
                f"{team_type}_PlayersAttemptedDribblerTackles",
                f"{team_type}_Players%DribblerTacklesCompleted",
                f"{team_type}_PlayersDribblerTacklesNonCompleted",
                f"{team_type}_PlayersBallsBlocked",
                f"{team_type}_PlayersShotsBlocked",
                f"{team_type}_PlayersPassesBlocked",
                f"{team_type}_PlayersInterceptions",
                f"{team_type}_PlayersTackles+Interceptions",
                f"{team_type}_PlayersClearances",
                f"{team_type}_PlayersMistakesRivalShots",
                f"{team_type}_PlayersTouches",
                f"{team_type}_PlayersOwnPenaltyAreaTouches",
                f"{team_type}_PlayersTouchesInDefense",
                f"{team_type}_PlayersTouchesInMedium",
                f"{team_type}_PlayersTouchesInAttack",
                f"{team_type}_PlayersAwayPenaltyAreaTouches",
                f"{team_type}_PlayersLiveBallTouches",
                f"{team_type}_PlayersAttemptedDribbles",
                f"{team_type}_PlayersDribblesCompleted",
                f"{team_type}_Players%DribblesCompleted",
                f"{team_type}_PlayersBallCarries",
                f"{team_type}_PlayersDistanceCarried",
                f"{team_type}_PlayersForwardDistanceCarried",
                f"{team_type}_PlayersForwardCarries",
                f"{team_type}_PlayersCarriesInAttack",
                f"{team_type}_PlayersAwayPenaltyAreaCarries",
                f"{team_type}_PlayersLostControlCarries",
                f"{team_type}_PlayersLostCarries",
                f"{team_type}_PlayersPassesReception",
                f"{team_type}_PlayersAttackPassesReception",
                f"{team_type}_PlayersYellowCards",
                f"{team_type}_PlayersRedCards",
                f"{team_type}_PlayersSecondYellowCards",
                f"{team_type}_PlayersFouls",
                f"{team_type}_PlayersFoulsReceived",
                f"{team_type}_PlayersPenalties",
                f"{team_type}_PlayersPenaltiesConceded",
                f"{team_type}_PlayersLostBallRecoveries",
                f"{team_type}_PlayersAerialsWon",
                f"{team_type}_PlayersAerialsLost",
                f"{team_type}_Players%AerialsWon",
            ]

            # Extract the last row from final_table (this contains the sums and means)
            last_row = final_table.iloc[-1][columns_to_append]

            # Define the filename for the current season
            filename = f"matches_{season}.csv"

            # Load the existing CSV file; the name has no directory component, so it
            # is resolved against the current working directory
            existing_df = pd.read_csv(filename)

            # Get the ID from the last row of final_table
            last_row_id = final_table.iloc[-1]["id"]

            # Find the row in the existing CSV based on the ID
            row_index = existing_df[existing_df["id"] == last_row_id].index

            # Check if the row exists
            if row_index.empty:
                raise ValueError(
                    f"Error: No row with ID {last_row_id} found in {filename}"
                )

            # Check if the columns exist in the existing CSV, if not, create them after the existing columns
            for column in columns_to_append:
                if column not in existing_df.columns:
                    existing_df[column] = pd.NA  # Create the column with missing values

            # Ensure the new columns are placed after the existing columns.
            # The loop above has already added every missing column, so the list
            # built here is empty and the reindexing keeps the current column order.
            existing_columns = existing_df.columns.tolist()
            new_columns = [
                column for column in columns_to_append if column not in existing_columns
            ]
            existing_df = existing_df[existing_columns + new_columns]

            # Update the row with the new data from last_row (first matching row only)
            for column, value in last_row.items():
                existing_df.at[row_index[0], column] = value

            # Save the updated DataFrame back to the CSV
            existing_df.to_csv(filename, index=False)

            time.sleep(6)  # Sleep to avoid making too many requests in a short time

    def extract_keeper_table(self, team_type, header_offset):
        """
        Extracts a specific keeper statistics table for the given team.
        - team_type: "home" or "away".
        - header_offset: Number of header columns to skip.
        """
        # Get the team ID based on the team type (home or away)
        team_id = self.teams_data[team_type]["id"]

        # Construct the CSS selector for the specific table
        keeper_table_selector = f"#keeper_stats_{team_id}"

        # Select the table element using the constructed selector
        table = self.soup.select_one(keeper_table_selector)

        # Check if the table exists
        if not table:
            raise Exception(f"Keeper table not found for team {team_type}.")

        # Extract headers from the table, skipping the specified number of columns
        headers = [th.text.strip() for th in table.find("thead").find_all("th")][
            header_offset:
        ]

        # Extract rows of data from the table body
        rows = [
            [cell.text.strip() for cell in row.find_all(["td", "th"])]
            for row in table.find("tbody").find_all("tr")
        ]

        # Create a DataFrame from the extracted rows and headers
        df = pd.DataFrame(rows, columns=headers)

        return df

        # Add a delay to avoid overloading the server
        time.sleep(6)

    def process_keeper_data(self, team_type):
        """
        Processes all keeper statistics tables for a specific team (home or away).
        Combines data into a dictionary of DataFrames.
        """
        # Extract data from the keeper statistics table
        table = self.extract_keeper_table(team_type, header_offset=7)

        return table

    def save_keeper_tables(self, match, season):
        """
        Processes and saves keeper statistics table for both home and away teams.
        Writes the final table to a CSV file.
        """
        # Extract IDs and names of the teams
        self.extract_teams_ids()

        # Process data for both home and away teams
        for team_type in ["home", "away"]:
            # Get the team name
            team_name = self.teams_data[team_type]["name"]

            # Extract and process all keeper statistics tables for the team
            final_table = self.process_keeper_data(team_type)

            # Define the new column names for the dataset
            new_columns = [
                f"{team_type}_KeepersKeepers",
                f"{team_type}_KeepersNationality",
                f"{team_type}_KeepersAge",
                f"{team_type}_KeepersMinutes",
                f"{team_type}_KeepersShotsOnTargetAgainst",
                f"{team_type}_KeepersGoalsAgainst",
                f"{team_type}_KeepersSaved",
                f"{team_type}_Keepers%Saved",
                f"{team_type}_KeepersxG",
                f"{team_type}_KeepersPassesLaunched",
                f"{team_type}_KeepersAttemptedPassesLaunched",
                f"{team_type}_Keepers%CompletedPassesLaunched",
                f"{team_type}_KeepersPasses",
                f"{team_type}_KeepersAttemptedPasses",
                f"{team_type}_Keepers%CompletedPasses",
                f"{team_type}_KeepersPassesDistance",
                f"{team_type}_KeepersAttemptedKicks",
                f"{team_type}_Keepers%Kicks",
                f"{team_type}_KeepersKicksDistance",
                f"{team_type}_KeepersCrosses",
                f"{team_type}_KeepersCrossesStopped",
                f"{team_type}_Keepers%CrossesStopped",
                f"{team_type}_KeepersActionsOutsideArea",
                f"{team_type}_KeepersDistanceActionsArea",
            ]

            # Rename the columns of the DataFrame
            final_table.columns = new_columns

            # Convert the 'Age' column to integer by extracting the first two characters
            final_table[f"{team_type}_KeepersAge"] = final_table[
                f"{team_type}_KeepersAge"
            ].apply(lambda x: int(x[:2]) if isinstance(x, str) else 0)

            # Define columns to calculate the mean
            columns_to_mean = [
                f"{team_type}_KeepersAge",
                f"{team_type}_Keepers%Saved",
                f"{team_type}_Keepers%CompletedPassesLaunched",
                f"{team_type}_Keepers%CompletedPasses",
                f"{team_type}_KeepersPassesDistance",
                f"{team_type}_Keepers%Kicks",
                f"{team_type}_KeepersKicksDistance",
                f"{team_type}_Keepers%CrossesStopped",
                f"{team_type}_KeepersDistanceActionsArea",
            ]

            # Define columns to calculate the sum
            columns_to_sum = [
                f"{team_type}_KeepersKeepers",
                f"{team_type}_KeepersMinutes",
                f"{team_type}_KeepersShotsOnTargetAgainst",
                f"{team_type}_KeepersGoalsAgainst",
                f"{team_type}_KeepersSaved",
                f"{team_type}_KeepersxG",
                f"{team_type}_KeepersPassesLaunched",
                f"{team_type}_KeepersAttemptedPassesLaunched",
                f"{team_type}_KeepersPasses",
                f"{team_type}_KeepersAttemptedPasses",
                f"{team_type}_KeepersAttemptedKicks",
                f"{team_type}_KeepersCrosses",
                f"{team_type}_KeepersCrossesStopped",
                f"{team_type}_KeepersActionsOutsideArea",
            ]

            # Convert the mean columns to numeric
            for col in columns_to_mean:
                final_table[col] = final_table[col].apply(
                    pd.to_numeric, errors="coerce"
                )

            # Convert the sum columns to numeric
            for col in columns_to_sum:
                final_table[col] = final_table[col].apply(
                    pd.to_numeric, errors="coerce"
                )

            # Calculate the mean and sum for specified columns
            mean_values = final_table[columns_to_mean].mean()
            sum_values = final_table[columns_to_sum].sum()

            # Create a new row for totals with placeholder values
            total_row = {col: "-" for col in final_table.columns}

            # Populate the total row with mean values
            for col, mean in mean_values.items():
                total_row[col] = mean

            # Populate the total row with sum values
            for col, total in sum_values.items():
                total_row[col] = total

            # Add the number of rows (lines) to the first column of the total row
            num_lines = len(final_table)
            total_row[final_table.columns[0]] = num_lines

            # Check if the 'id' column exists, if not, create it with NaN values
            if "id" not in final_table.columns:
                final_table["id"] = np.nan  # Create the column with NaN values

            # Add the match ID to the total row
            total_row["id"] = match

            # Append the total row to the DataFrame
            final_table.loc[len(final_table)] = total_row

            # Define the columns to append to the existing CSV
            columns_to_append = [
                f"{team_type}_KeepersKeepers",
                f"{team_type}_KeepersMinutes",
                f"{team_type}_KeepersShotsOnTargetAgainst",
                f"{team_type}_KeepersGoalsAgainst",
                f"{team_type}_KeepersSaved",
                f"{team_type}_Keepers%Saved",
                f"{team_type}_KeepersxG",
                f"{team_type}_KeepersPassesLaunched",
                f"{team_type}_KeepersAttemptedPassesLaunched",
                f"{team_type}_Keepers%CompletedPassesLaunched",
                f"{team_type}_KeepersPasses",
                f"{team_type}_KeepersAttemptedPasses",
                f"{team_type}_Keepers%CompletedPasses",
                f"{team_type}_KeepersPassesDistance",
                f"{team_type}_KeepersAttemptedKicks",
                f"{team_type}_Keepers%Kicks",
                f"{team_type}_KeepersKicksDistance",
                f"{team_type}_KeepersCrosses",
                f"{team_type}_KeepersCrossesStopped",
                f"{team_type}_Keepers%CrossesStopped",
                f"{team_type}_KeepersActionsOutsideArea",
                f"{team_type}_KeepersDistanceActionsArea",
            ]

            # Extract the last row from final_table (this contains the sums and means)
            last_row = final_table.iloc[-1][columns_to_append]

            # Define the filename for the current season
            filename = f"matches_{season}.csv"

            # Load the existing CSV file from the parent directory
            existing_df = pd.read_csv(filename)

            # Get the ID from the last row of final_table
            last_row_id = final_table.iloc[-1]["id"]

            # Find the row in the existing CSV based on the ID
            row_index = existing_df[existing_df["id"] == last_row_id].index

            # Check if the row exists
            if row_index.empty:
                raise ValueError(
                    f"Error: No row with ID {last_row_id} found in {filename}"
                )

            # Check if the columns exist in the existing CSV, if not, create them after the existing columns
            for column in columns_to_append:
                if column not in existing_df.columns:
                    existing_df[column] = pd.NA  # Create the column with missing values

            # Ensure the new columns are placed after the existing columns
            existing_columns = existing_df.columns.tolist()
            new_columns = [
                column for column in columns_to_append if column not in existing_columns
            ]
            existing_df = existing_df[existing_columns + new_columns]

            # Update the row with the new data from last_row
            for column, value in last_row.items():
                existing_df.at[row_index[0], column] = value

            # Save the updated DataFrame back to the CSV
            existing_df.to_csv(filename, index=False)

            time.sleep(6)  # Sleep to avoid making too many requests in a short time

    def run(self):
        """
        Collect player and keeper statistics for every match listed in the links file.

        The links CSV (``self.links_file``) and the gameweeks CSV
        (``self.gameweeks_file``) are read positionally: row ``i`` of one is paired
        with row ``i`` of the other. For each pair the instance attributes
        ``self.link`` and ``self.gameweek`` are set, and then
        ``extract_teams_ids``, ``save_players_tables`` and ``save_keeper_tables``
        are invoked with a 1-based match number and the season string.
        """
        print("Starting collecting players data...")

        # Load the match links and their gameweeks
        links_df = pd.read_csv(self.links_file)
        gameweeks_df = pd.read_csv(self.gameweeks_file)

        # The season is the first "YYYY-YYYY" pattern in the filename, if any
        season_found = re.search(r"(\d{4})-(\d{4})", self.filename)
        season = season_found.group(0) if season_found else "unknown_season"

        for position in range(len(links_df)):
            # Matches are numbered from 1, following the row order of the links file
            match = position + 1

            current_link = links_df.iloc[position]["link"]
            current_gameweek = gameweeks_df.iloc[position]["gameweek"]

            print(f"Processing match {match}: {current_link}")

            # Expose the current match on the instance for the methods called below
            self.link = current_link
            self.gameweek = current_gameweek

            # Team data first, then the player table, then the keeper table
            self.extract_teams_ids()
            self.save_players_tables(match, season)
            self.save_keeper_tables(match, season)

            # Pause between matches to avoid making too many requests in a short time
            time.sleep(6)

        print("Collecting players data process completed successfully!")

### Making the ranking

In [107]:
class FootballRanking:
    def __init__(self, input_file, output_file):
        self.input_file = input_file
        self.output_file = output_file
        self.df = pd.read_csv(input_file)  # Read the CSV file
        self.team_stats = {}  # Stores points, goals for, goals against, matches played, and card counts

    def update_team_stats(self, home_team, away_team, home_goals, away_goals, 
                      home_yellow_cards, home_red_cards, home_second_yellow_cards,
                      away_yellow_cards, away_red_cards, away_second_yellow_cards):
        """Update the stats for both teams after a match."""
        
        # Initialize stats if they don't exist for home team
        if home_team not in self.team_stats:
            self.team_stats[home_team] = {
                "points": 0, "goals_for": 0, "goals_against": 0, 
                "matches_played": 0, "yellow_cards": 0, "red_cards": 0, 
                "second_yellow_cards": 0
            }

        # Initialize stats if they don't exist for away team
        if away_team not in self.team_stats:
            self.team_stats[away_team] = {
                "points": 0, "goals_for": 0, "goals_against": 0, 
                "matches_played": 0, "yellow_cards": 0, "red_cards": 0, 
                "second_yellow_cards": 0
            }
        
        # Calculate points based on the match result
        if home_goals > away_goals:  # Home team wins
            home_points = 3
            away_points = 0
        elif home_goals < away_goals:  # Away team wins
            home_points = 0
            away_points = 3
        else:  # Draw
            home_points = 1
            away_points = 1
        
        # Update stats for home team
        self.team_stats[home_team]["points"] += home_points
        self.team_stats[home_team]["goals_for"] += home_goals
        self.team_stats[home_team]["goals_against"] += away_goals
        self.team_stats[home_team]["yellow_cards"] += home_yellow_cards
        self.team_stats[home_team]["red_cards"] += home_red_cards
        self.team_stats[home_team]["second_yellow_cards"] += home_second_yellow_cards
        self.team_stats[home_team]["matches_played"] += 1
        
        # Update stats for away team
        self.team_stats[away_team]["points"] += away_points
        self.team_stats[away_team]["goals_for"] += away_goals
        self.team_stats[away_team]["goals_against"] += home_goals
        self.team_stats[away_team]["yellow_cards"] += away_yellow_cards
        self.team_stats[away_team]["red_cards"] += away_red_cards
        self.team_stats[away_team]["second_yellow_cards"] += away_second_yellow_cards
        self.team_stats[away_team]["matches_played"] += 1

    def reorder_teams_by_new_order(self, sorted_teams, new_order):
        """
        Rearrange a subset of the ranking according to ``new_order``.

        The teams named in ``new_order`` are gathered into one block, ordered as in
        ``new_order``, and that block is placed where the first of those teams
        appears in ``sorted_teams``. All other teams keep their relative order.

        Parameters:
        - sorted_teams: List of dictionaries, each one holding a team's data under at least the 'team' key.
        - new_order: List of tuples whose first item is the team name (e.g., [('Team A', 0), ('Team B', 0)]).

        Returns:
        - A new list with the same team dictionaries, rearranged as described above.
        """
        # Only the names matter; the value attached to each name is ignored
        names_in_order = [entry[0] for entry in new_order]
        names_to_move = set(names_in_order)

        # Index the ranking entries by team name for direct lookup
        entry_by_name = {entry['team']: entry for entry in sorted_teams}

        # The block to insert, skipping names that are not part of the ranking
        moved_block = [entry_by_name[name] for name in names_in_order if name in entry_by_name]

        result = []
        block_placed = False
        for entry in sorted_teams:
            if entry['team'] not in names_to_move:
                # Teams outside the new order stay where they are
                result.append(entry)
            elif not block_placed:
                # First reordered team found: the whole block goes here, once
                result.extend(moved_block)
                block_placed = True

        return result

    def check_tiebreaker_type(self, sorted_teams, current_gameweek, index):
        # 1. Agrupar equipos por puntos
        points_groups = {}
        for team_stats in sorted_teams:
            points = team_stats["points"]
            if points not in points_groups:
                points_groups[points] = []
            points_groups[points].append(team_stats)  # Guardar en formato {equipo: estadísticas}

        # 2. Filtrar solo los grupos con más de un equipo (es decir, los empates)
        tied_groups = {points: teams for points, teams in points_groups.items() if len(teams) > 1}

        # 3. Reordenar los grupos de equipos empatados según criterios de desempate
        for points, teams_list in tied_groups.items():
            if len(teams_list) == 2:
                # Desempatar entre dos equipos
                sorted_teams = self.apply_tiebreaker_two_teams(sorted_teams, current_gameweek, index, teams_list, points)
            elif len(teams_list) >= 3:
                # Desempatar entre tres o más equipos
                sorted_teams = self.apply_tiebreaker_multiple_teams(sorted_teams, current_gameweek, index, teams_list, points)

        return sorted_teams

    def apply_tiebreaker_two_teams(self, sorted_teams, gameweek, match_id, teams_list, points):
        if len(teams_list) != 2:
            raise ValueError("apply_tiebreaker_two_teams must receive exactly two teams.")

        team1 = teams_list[0]["team"]
        team2 = teams_list[1]["team"]

        print(f"\nChecking tiebreaker for teams: {team1} vs {team2} (Points: {points}. Gameweek: {gameweek}. Match id: {match_id})")

        # Buscar partidos entre team1 y team2 hasta el partido actual (match_id)
        total_goals_team1 = 0
        total_goals_team2 = 0

        # Filtrar el CSV para obtener los partidos entre los dos equipos previos a este match_id
        for index, row in self.df.loc[:match_id-1].iterrows():  # Iterar desde el inicio hasta el id anterior
            home_team = row["home_team_name"]
            away_team = row["away_team_name"]
            home_goals = row["home_team_goals"]
            away_goals = row["away_team_goals"]

            # Verificar si el partido es entre los dos equipos
            if (home_team == team1 and away_team == team2) or (home_team == team2 and away_team == team1):
                print(f"Found match: {home_team} vs {away_team} (Home goals: {home_goals}, Away goals: {away_goals})")

                if home_team == team1:  # Si team1 es el equipo local
                    total_goals_team1 += home_goals
                    total_goals_team2 += away_goals
                else:  # Si team2 es el equipo local
                    total_goals_team1 += away_goals
                    total_goals_team2 += home_goals

        # Mostrar los goles totales marcados en los enfrentamientos directos
        print(f"Total goals in direct encounters: {team1}: {total_goals_team1}, {team2}: {total_goals_team2}")

        # Si no están empatados en goles en enfrentamientos directos, aplicar el desempate basado en los goles marcados en enfrentamientos directos
        if total_goals_team1 > total_goals_team2:
            print(f"{team1} has more goals in direct encounters. Moving {team1} up.")
            # Verificar si están en el orden correcto
            rank_team1 = sorted_teams.index(next(team for team in sorted_teams if team['team'] == team1))
            rank_team2 = sorted_teams.index(next(team for team in sorted_teams if team['team'] == team2))

            if rank_team1 > rank_team2:
                # Ajustar las posiciones
                sorted_teams[rank_team1], sorted_teams[rank_team2] = sorted_teams[rank_team2], sorted_teams[rank_team1]
        elif total_goals_team2 > total_goals_team1:
            print(f"{team2} has more goals in direct encounters. Moving {team2} up.")
            # Verificar si están en el orden correcto
            rank_team1 = sorted_teams.index(next(team for team in sorted_teams if team['team'] == team1))
            rank_team2 = sorted_teams.index(next(team for team in sorted_teams if team['team'] == team2))

            if rank_team2 > rank_team1:
                # Ajustar las posiciones
                sorted_teams[rank_team1], sorted_teams[rank_team2] = sorted_teams[rank_team2], sorted_teams[rank_team1]
        # Si los goles en enfrentamientos directos están empatados, aplicar el segundo criterio: diferencia de goles en todos los partidos
        else:
            total_goals_for_team1 = 0
            total_goals_against_team1 = 0
            total_goals_for_team2 = 0
            total_goals_against_team2 = 0

            # Filtrar el CSV para obtener los partidos de todos los equipos hasta el match_id
            for index, row in self.df.loc[:match_id-1].iterrows():  # Iterar desde el inicio hasta el id anterior
                home_team = row["home_team_name"]
                away_team = row["away_team_name"]
                home_goals = row["home_team_goals"]
                away_goals = row["away_team_goals"]

                # Para team1
                if home_team == team1:
                    total_goals_for_team1 += home_goals
                    total_goals_against_team1 += away_goals
                elif away_team == team1:
                    total_goals_for_team1 += away_goals
                    total_goals_against_team1 += home_goals

                # Para team2
                if home_team == team2:
                    total_goals_for_team2 += home_goals
                    total_goals_against_team2 += away_goals
                elif away_team == team2:
                    total_goals_for_team2 += away_goals
                    total_goals_against_team2 += home_goals

            # Calcular la diferencia de goles para cada equipo
            diff_goals_team1 = total_goals_for_team1 - total_goals_against_team1
            diff_goals_team2 = total_goals_for_team2 - total_goals_against_team2

            print(f"Goal difference in all encounters: {team1}: {diff_goals_team1}, {team2}: {diff_goals_team2}")

            # Aplicar el desempate basado en la diferencia de goles
            if diff_goals_team1 > diff_goals_team2:
                print(f"{team1} has a better goal difference in all encounters. Moving {team1} up.")
                # Verificar si están en el orden correcto
                rank_team1 = sorted_teams.index(next(team for team in sorted_teams if team['team'] == team1))
                rank_team2 = sorted_teams.index(next(team for team in sorted_teams if team['team'] == team2))

                if rank_team1 > rank_team2:
                    # Ajustar las posiciones
                    sorted_teams[rank_team1], sorted_teams[rank_team2] = sorted_teams[rank_team2], sorted_teams[rank_team1]
            elif diff_goals_team2 > diff_goals_team1:
                print(f"{team2} has a better goal difference in all encounters. Moving {team2} up.")
                # Verificar si están en el orden correcto
                rank_team1 = sorted_teams.index(next(team for team in sorted_teams if team['team'] == team1))
                rank_team2 = sorted_teams.index(next(team for team in sorted_teams if team['team'] == team2))

                if rank_team2 > rank_team1:
                    # Ajustar las posiciones
                    sorted_teams[rank_team1], sorted_teams[rank_team2] = sorted_teams[rank_team2], sorted_teams[rank_team1]
            else:
                # Buscar los diccionarios para team1 y team2
                team1_data = next(team for team in sorted_teams if team["team"] == team1)
                total_goals_team1 = team1_data["goals_for"]

                team2_data = next(team for team in sorted_teams if team["team"] == team2)
                total_goals_team2 = team2_data["goals_for"]
                
                if total_goals_team1 > total_goals_team2:
                    print(f"{team1} has scored more total goals. Keeping {team1} up.")
                elif total_goals_team1 < total_goals_team2:
                    print(f"{team2} has scored more total goals. Moving {team2} up.")
                    # Verificar si están en el orden correcto
                    rank_team1 = sorted_teams.index(next(team for team in sorted_teams if team['team'] == team1))
                    rank_team2 = sorted_teams.index(next(team for team in sorted_teams if team['team'] == team2))

                    if rank_team2 > rank_team1:
                        # Ajustar las posiciones
                        sorted_teams[rank_team1], sorted_teams[rank_team2] = sorted_teams[rank_team2], sorted_teams[rank_team1]
                else:
                    # Si los goles son iguales, comparar el fairplay
                    total_fairplay_team1 = team1_data["fairplay"]
                    total_fairplay_team2 = team2_data["fairplay"]

                    if total_fairplay_team1 < total_fairplay_team2:
                        print(f"{team1} has better fairplay (fewer penalties). Keeping {team1} up.")
                    elif total_fairplay_team1 > total_fairplay_team2:
                        print(f"{team2} has better fairplay (fewer penalties). Moving {team2} up.")
                        # Mover el equipo 2 hacia arriba (intercambiar posiciones)
                        rank_team1 = sorted_teams.index(next(team for team in sorted_teams if team['team'] == team1))
                        rank_team2 = sorted_teams.index(next(team for team in sorted_teams if team['team'] == team2))
                        
                        if rank_team2 > rank_team1:
                            # Ajustar las posiciones
                            sorted_teams[rank_team1], sorted_teams[rank_team2] = sorted_teams[rank_team2], sorted_teams[rank_team1]
                    else:
                        # Si aún están empatados, comparar por orden alfabético
                        winner = min(team1, team2)
                        print(f"{winner} has better alphabetical order. Moving {winner} up.")
                        if winner == team2:
                            rank_team1 = sorted_teams.index(next(team for team in sorted_teams if team['team'] == team1))
                            rank_team2 = sorted_teams.index(next(team for team in sorted_teams if team['team'] == team2))
        
                            # Ajustar las posiciones según el orden alfabético
                            if rank_team1 > rank_team2:
                                sorted_teams[rank_team1], sorted_teams[rank_team2] = sorted_teams[rank_team2], sorted_teams[rank_team1]

        return sorted_teams

    def apply_tiebreaker_multiple_teams(self, sorted_teams, gameweek, match_id, teams_list, points):
        """Apply tiebreaker logic between multiple teams, considering only their matches against each other."""

        print("\n0 - Ranking before tiebreaker:")
        print(", ".join([team_data["team"] for team_data in sorted_teams]))
        print(f"\nGroup with {points} points: {', '.join([team['team'] for team in teams_list])} at gameweek: {gameweek})")

        total_goals = {team["team"]: 0 for team in teams_list}
        total_points = {team["team"]: 0 for team in teams_list}

        for _, row in self.df.iterrows():
            if row["id"] <= match_id and (row["home_team_name"] in [team["team"] for team in teams_list]) and (row["away_team_name"] in [team["team"] for team in teams_list]):
                home_team = row["home_team_name"]
                away_team = row["away_team_name"]
                home_goals = row["home_team_goals"]
                away_goals = row["away_team_goals"]

                total_goals[home_team] += home_goals
                total_goals[away_team] += away_goals

                if home_goals > away_goals:
                    total_points[home_team] += 3
                elif home_goals < away_goals:
                    total_points[away_team] += 3
                else:
                    total_points[home_team] += 1
                    total_points[away_team] += 1

        points_sorted_teams = sorted(total_points.items(), key=lambda x: x[1], reverse=True)
        
        sorted_teams = self.reorder_teams_by_new_order(sorted_teams, points_sorted_teams)

        print("\n1 - Ranking after points between teams:")
        print(", ".join([team_data["team"] for team_data in sorted_teams]))

        points_groups = {}
        for team, pts in points_sorted_teams:
            if pts not in points_groups:
                points_groups[pts] = []
            points_groups[pts].append(team)

        filtered_points_groups = {pts: teams for pts, teams in points_groups.items() if len(teams) >= 2}

        for pts, teams in sorted(filtered_points_groups.items(), reverse=True):
            if len(teams) == 2:
                team1_name, team2_name = teams
                
                team1_data = next(team for team in sorted_teams if team["team"] == team1_name)
                team2_data = next(team for team in sorted_teams if team["team"] == team2_name)

                teams_list_for_tiebreaker = [team1_data, team2_data]
                
                sorted_teams = self.apply_tiebreaker_two_teams(sorted_teams, gameweek, match_id, teams_list_for_tiebreaker, points)
            else:
                for points, teams_in_group in points_groups.items():
                    print(f"\nGroup with {pts} points between teams: {', '.join(teams_in_group)} at gameweek: {gameweek}):")
                    
                    if len(teams_in_group) == 2:
                        team1_name, team2_name = teams_in_group
                        
                        team1_data = next(team for team in sorted_teams if team["team"] == team1_name)
                        team2_data = next(team for team in sorted_teams if team["team"] == team2_name)

                        teams_list_for_tiebreaker = [team1_data, team2_data]
                        
                        sorted_teams = self.apply_tiebreaker_two_teams(sorted_teams, gameweek, match_id, teams_list_for_tiebreaker, points)
                    elif len(teams_in_group) > 2:
                        total_goal_difference = {team: total_goals[team] - sum([total_goals[other_team] for other_team in teams_in_group if other_team != team]) for team in teams_in_group}

                        sorted_teams_by_goal_difference = sorted(total_goal_difference.items(), key=lambda x: x[1], reverse=True)
                        
                        sorted_teams = self.reorder_teams_by_new_order(sorted_teams, sorted_teams_by_goal_difference)

                        print("\n2 - Ranking after difference goals between teams:")
                        print(", ".join([team_data["team"] for team_data in sorted_teams]))

                        tied_teams = {}
                        current_tie_group = []
                        last_goal_diff = sorted_teams_by_goal_difference[0][1]

                        for team, goal_diff in sorted_teams_by_goal_difference:
                            if goal_diff == last_goal_diff:
                                current_tie_group.append(team)
                            else:
                                if len(current_tie_group) >= 2:
                                    tied_teams[len(current_tie_group)] = current_tie_group
                                current_tie_group = [team]
                                last_goal_diff = goal_diff

                        if len(current_tie_group) >= 2:
                            tied_teams[len(current_tie_group)] = current_tie_group

                        for tie_size, tie_group in tied_teams.items():
                            print(f"\nGroup with {goal_diff} goals difference between teams: {tie_group} at gameweek: {gameweek}):")
                            if tie_size == 2:
                                team1_name, team2_name = tie_group
                                
                                team1_data = next(team for team in sorted_teams if team["team"] == team1_name)
                                team2_data = next(team for team in sorted_teams if team["team"] == team2_name)

                                teams_list_for_tiebreaker = [team1_data, team2_data]
                                
                                sorted_teams = self.apply_tiebreaker_two_teams(sorted_teams, gameweek, match_id, teams_list_for_tiebreaker, points)
                            elif tie_size >= 3:
                                goal_difference_total = {team: 0 for team in tie_group}

                                for _, row in self.df.iterrows():
                                    if row["id"] <= match_id and ((row["home_team_name"] in tie_group) or (row["away_team_name"] in tie_group)):
                                        home_team = row["home_team_name"]
                                        away_team = row["away_team_name"]
                                        home_goals = row["home_team_goals"]
                                        away_goals = row["away_team_goals"]

                                        if home_team in tie_group:
                                            goal_difference_total[home_team] += home_goals - away_goals
                                        if away_team in tie_group:
                                            goal_difference_total[away_team] += away_goals - home_goals

                                goal_diff_sorted_teams = sorted(goal_difference_total.items(), key=lambda x: x[1], reverse=True)

                                sorted_teams = self.reorder_teams_by_new_order(sorted_teams, goal_diff_sorted_teams)

                                print("\n3 - Ranking after difference goals global:")
                                print(", ".join([team_data["team"] for team_data in sorted_teams]))

                                tie_group = [team for team, _ in goal_diff_sorted_teams]

                                grouped_ties = {}
                                for team, goal_diff in goal_diff_sorted_teams:
                                    if goal_diff not in grouped_ties:
                                        grouped_ties[goal_diff] = []
                                    grouped_ties[goal_diff].append(team)

                                grouped_ties_with_multiple_teams = {goal_diff: teams for goal_diff, teams in grouped_ties.items() if len(teams) >= 2}

                                sorted_groups = sorted(grouped_ties_with_multiple_teams.items(), key=lambda x: x[0], reverse=True)

                                for goal_diff, teams in sorted_groups:
                                    print(f"\nGroup with {goal_diff} goals difference goals global: {teams} at gameweek: {gameweek}):")
                                    if len(teams) == 2:
                                        team1_name, team2_name = teams
                                        
                                        team1_data = next(team for team in sorted_teams if team["team"] == team1_name)
                                        team2_data = next(team for team in sorted_teams if team["team"] == team2_name)

                                        teams_list_for_tiebreaker = [team1_data, team2_data]
                                        
                                        sorted_teams = self.apply_tiebreaker_two_teams(sorted_teams, gameweek, match_id, teams_list_for_tiebreaker, points)
                                    elif len(teams) > 2:
                                        goals_for_total = {team: 0 for team in teams}

                                        for _, row in self.df.iterrows():
                                            if row["gameweek"] <= gameweek and ((row["home_team_name"] in teams) or (row["away_team_name"] in teams)):
                                                home_team = row["home_team_name"]
                                                away_team = row["away_team_name"]
                                                home_goals = row["home_team_goals"]
                                                away_goals = row["away_team_goals"]

                                                if home_team in teams:
                                                    goals_for_total[home_team] += home_goals
                                                if away_team in teams:
                                                    goals_for_total[away_team] += away_goals

                                        global_diff_sorted_teams = sorted(goals_for_total.items(), key=lambda x: x[1], reverse=True)
                                        
                                        sorted_teams = self.reorder_teams_by_new_order(sorted_teams, global_diff_sorted_teams)
                                        
                                        print("\n4 - Ranking after goals global:")
                                        print(", ".join([team_data["team"] for team_data in sorted_teams]))

                                        grouped_ties = {}
                                        for team_name, goals_for in global_diff_sorted_teams:
                                            if goals_for not in grouped_ties:
                                                grouped_ties[goals_for] = []
                                            grouped_ties[goals_for].append((team_name, goals_for))  

                                        grouped_ties_list = [teams for teams in grouped_ties.values() if len(teams) >= 2]

                                        for group in grouped_ties_list:
                                            print(f"\nGroup: {[team[0] for team in group]} at gameweek: {gameweek})")

                                            if len(group) == 2:
                                                team1_name, team2_name = [team[0] for team in group]

                                                team1_data = next(team for team in sorted_teams if team["team"] == team1_name)
                                                team2_data = next(team for team in sorted_teams if team["team"] == team2_name)

                                                teams_list_for_tiebreaker = [team1_data, team2_data]

                                                sorted_teams = self.apply_tiebreaker_two_teams(sorted_teams, gameweek, match_id, teams_list_for_tiebreaker, points)

                                                print(", ".join([team_data["team"] for team_data in sorted_teams]))

                                            elif len(group) > 2:
                                                teams_with_fairplay = []
                                                for team_name, _ in group:
                                                    team_data = next(team for team in sorted_teams if team["team"] == team_name)
                                                    fairplay = team_data["fairplay"]
                                                    teams_with_fairplay.append((team_name, fairplay))

                                                teams_with_fairplay_sorted = sorted(teams_with_fairplay, key=lambda x: x[1])

                                                sorted_teams_by_fairplay = [team[0] for team in teams_with_fairplay_sorted]

                                                group_sorted = sorted(group, key=lambda team: sorted_teams_by_fairplay.index(team[0]))

                                                sorted_teams = self.reorder_teams_by_new_order(sorted_teams, group_sorted)

                                                print("\n5 - Ranking after fairplay:")
                                                print(", ".join([team_data["team"] for team_data in sorted_teams]))

        return sorted_teams

    def calculate_fairplay(self, yellow_cards, red_cards, second_yellow_cards):
        """
        Compute the fair-play score for a set of card counts.

        The score is
        ``yellow_cards + 3 * (red_cards - second_yellow_cards) + second_yellow_cards``.

        Args:
            yellow_cards: Number of yellow cards.
            red_cards: Number of red cards.
            second_yellow_cards: Number of second yellow cards.

        Returns:
            The fair-play score as computed by the formula above.
        """
        # NOTE(review): the subtraction suggests red_cards already includes the
        # dismissals caused by a second yellow -- confirm against the data source.
        reds_minus_second_yellows = red_cards - second_yellow_cards
        weighted_reds = 3 * reds_minus_second_yellows
        return yellow_cards + weighted_reds + second_yellow_cards

    def process_matches(self):
        """
        Build the standings after every gameweek and append them to ``self.output_file``.

        ``self.df`` is walked row by row (one row per match). For every match the
        team statistics are refreshed through ``update_team_stats`` and one
        record per team is collected. Whenever the ``gameweek`` value changes,
        the records collected so far are sorted by points, passed through
        ``check_tiebreaker_type``, given a 1-based ``rank`` and appended to the
        CSV. The records of the last gameweek are written after the loop.

        Notes:
            * An existing ``self.output_file`` is deleted first, so the file
              always contains the result of a single run.
            * A batch is closed as soon as the ``gameweek`` value differs from
              the previous row, so rows are expected to be grouped by gameweek.
            * The ``yellow_cards``, ``red_cards`` and ``second_yellow_cards``
              columns hold the values stored in ``self.team_stats`` after the
              update, not the single-match values of the row.
            * The ``index`` column holds the DataFrame index label of the row
              that triggered the save (the first row of the following
              gameweek), or of the last row for the final gameweek. The same
              value is handed to ``check_tiebreaker_type`` as third argument.
        """

        def build_entry(row, side, gameweek):
            """Return the output record of the ``side`` ("home"/"away") team of ``row``."""
            team = row[f"{side}_team_name"]
            stats = self.team_stats.get(team, {})
            yellow_cards = stats.get("yellow_cards", 0)
            red_cards = stats.get("red_cards", 0)
            second_yellow_cards = stats.get("second_yellow_cards", 0)
            # Key order defines the CSV column order; the three card keys stay
            # right after "goals" so the file layout is unchanged.
            return {
                "gameweek": gameweek,
                "team": team,
                "goals": row[f"{side}_team_goals"],
                "yellow_cards": yellow_cards,
                "red_cards": red_cards,
                "second_yellow_cards": second_yellow_cards,
                "date_of_match": row["date_of_match"],
                "matches_played": stats.get("matches_played", 0),
                "points": stats.get("points", 0),
                "goals_for": stats.get("goals_for", 0),
                "goals_against": stats.get("goals_against", 0),
                "goal_difference": stats.get("goals_for", 0) - stats.get("goals_against", 0),
                "fairplay": self.calculate_fairplay(yellow_cards, red_cards, second_yellow_cards),
            }

        def save_gameweek(entries, gameweek, index):
            """Rank ``entries`` and append them to the output CSV."""
            for entry in entries:
                entry["index"] = index

            # Coarse ordering by points first; ties are resolved afterwards.
            entries.sort(key=lambda entry: entry["points"], reverse=True)
            ranked = self.check_tiebreaker_type(entries, gameweek, index)

            for rank, entry in enumerate(ranked, start=1):
                entry["rank"] = rank

            # Write the header only when the file does not exist yet.
            pd.DataFrame(ranked).to_csv(
                self.output_file,
                mode="a",
                index=False,
                header=not os.path.exists(self.output_file),
            )

        current_gameweek = None
        gameweek_data = []  # Records of the gameweek currently being collected

        # Start from a clean file because every batch is appended.
        if os.path.exists(self.output_file):
            os.remove(self.output_file)

        for index, row in self.df.iterrows():
            gameweek = row["gameweek"]

            # A new gameweek starts: persist the previous one *before* the
            # statistics are updated with the current match.
            if current_gameweek is not None and gameweek != current_gameweek:
                save_gameweek(gameweek_data, current_gameweek, index)
                print(f"Datos guardados para gameweek {current_gameweek}")
                gameweek_data = []

            self.update_team_stats(
                home_team=row["home_team_name"],
                away_team=row["away_team_name"],
                home_goals=row["home_team_goals"],
                away_goals=row["away_team_goals"],
                home_yellow_cards=row["home_PlayersYellowCards"],
                home_red_cards=row["home_PlayersRedCards"],
                home_second_yellow_cards=row["home_PlayersSecondYellowCards"],
                away_yellow_cards=row["away_PlayersYellowCards"],
                away_red_cards=row["away_PlayersRedCards"],
                away_second_yellow_cards=row["away_PlayersSecondYellowCards"],
            )

            gameweek_data.append(build_entry(row, "home", gameweek))
            gameweek_data.append(build_entry(row, "away", gameweek))
            current_gameweek = gameweek

        # Persist whatever is left from the last gameweek.
        if gameweek_data:
            save_gameweek(gameweek_data, current_gameweek, index)

### Ranking integration

In [108]:
class MatchRankUpdater:
    """Enrich a matches CSV with each team's latest standing before kick-off."""

    def __init__(self, matches_csv, teams_rank_csv):
        """
        Load the matches and the team rankings from their CSV files.

        :param matches_csv: Path of the matches CSV (read here, overwritten by
            ``update_and_save``).
        :param teams_rank_csv: Path of the CSV with the per-team rankings
            (team, date_of_match, rank, points, goals_for, goals_against,
            goal_difference).
        """
        self.matches_csv = matches_csv
        matches = pd.read_csv(matches_csv)
        rankings = pd.read_csv(teams_rank_csv)

        # Dates must be real datetimes so they can be compared below.
        matches['date_of_match'] = pd.to_datetime(matches['date_of_match'])
        rankings['date_of_match'] = pd.to_datetime(rankings['date_of_match'])

        self.matches_df = matches
        self.teams_rank_df = rankings

    def _get_closest_stats(self, team, match_date):
        """
        Return the standing of ``team`` from its latest ranking row dated
        strictly before ``match_date``.

        :return: Tuple ``(rank, points, goals_for, goals_against,
            goal_difference)``; five ``0.0`` values when no earlier row exists.
        """
        rankings = self.teams_rank_df
        is_team = rankings['team'] == team
        is_earlier = rankings['date_of_match'] < match_date
        earlier_rows = rankings[is_team & is_earlier]

        if earlier_rows.empty:
            return 0.0, 0.0, 0.0, 0.0, 0.0

        latest = earlier_rows.loc[earlier_rows['date_of_match'].idxmax()]
        wanted = ('rank', 'points', 'goals_for', 'goals_against', 'goal_difference')
        return tuple(latest[column] for column in wanted)

    def update_and_save(self):
        """
        Add the pre-match standing of the home and away team to every match
        and write the result back to the matches CSV (same path it was read
        from).
        """
        # Output columns, in the order the values come back from
        # _get_closest_stats.
        home_columns = (
            'home_team_rank',
            'home_team_points',
            'home_team_goals_for',
            'home_team_goals_againsts',
            'home_team_goals_difference',
        )
        away_columns = (
            'away_team_rank',
            'away_team_points',
            'away_team_goals_for',
            'away_team_goals_against',
            'away_team_goals_difference',
        )

        # Interleave home/away so the new columns are created as
        # home_x, away_x, home_y, away_y, ...
        collected = {
            column: []
            for pair in zip(home_columns, away_columns)
            for column in pair
        }

        for _, match in self.matches_df.iterrows():
            kick_off = match['date_of_match']  # already a datetime
            home_name = match['home_team_name']
            away_name = match['away_team_name']

            home_values = self._get_closest_stats(home_name, kick_off)
            away_values = self._get_closest_stats(away_name, kick_off)

            for column, value in zip(home_columns, home_values):
                collected[column].append(value)
            for column, value in zip(away_columns, away_values):
                collected[column].append(value)

        for column, values in collected.items():
            self.matches_df[column] = values

        # Overwrite the original matches file with the enriched table.
        self.matches_df.to_csv(self.matches_csv, index=False)

### Obtaining time data from the match

In [109]:
class Match_events:
    """
    Download a match page, extract the events listed for both teams and save
    them to one CSV file per team.
    """

    # Seconds to wait for the server. Without a timeout requests.get() may
    # block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 30

    # Seconds to pause between two page downloads in run().
    REQUEST_DELAY = 6

    def __init__(self, url):
        """
        Initialize the extractor with the URL of the match page.

        Args:
            url (str): URL of the match page to scrape.
        """
        self.url = url  # URL of the match page
        self.soup = None  # Parsed HTML, filled by fetch_html()
        self.events_team_a = []  # Events found under class "event a"
        self.events_team_b = []  # Events found under class "event b"

    def fetch_html(self):
        """
        Download ``self.url`` and store the parsed HTML in ``self.soup``.

        Raises:
            Exception: If the response status code is not 200.
            requests.exceptions.RequestException: If the request itself fails
                (including a timeout after ``REQUEST_TIMEOUT`` seconds).
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",  # Preferred language
            "Accept-Encoding": "gzip, deflate, br",  # Compressed responses are accepted
            "Connection": "keep-alive",  # Keep the connection open
            "Upgrade-Insecure-Requests": "1",  # Client prefers HTTPS
            "DNT": "1",  # Do-not-track request (optional)
        }
        response = requests.get(
            self.url, headers=headers, timeout=self.REQUEST_TIMEOUT
        )
        if response.status_code != 200:
            raise Exception(
                f"Failed to fetch HTML content. Status code: {response.status_code}"
            )
        self.soup = BeautifulSoup(response.text, "html.parser")

    @staticmethod
    def _event_to_record(event):
        """Convert one event ``<div>`` into a Minute/EventType/Player/Team dict."""
        minute_tag = event.find("small")
        icon = event.find("div", class_="event_icon")
        player_tag = event.find("a")
        team_logo = event.find("img", class_="teamlogo")

        # The event type is the icon's second CSS class; an icon carrying only
        # "event_icon" yields None instead of raising IndexError.
        icon_classes = icon.get("class", []) if icon is not None else []
        event_type = icon_classes[1] if len(icon_classes) > 1 else None

        # The team name is derived from the crest's alt text.
        team = (
            team_logo["alt"].replace(" Club Crest", "").replace(" ", "_")
            if team_logo is not None
            else None
        )

        return {
            "Minute": minute_tag.text.strip() if minute_tag is not None else None,
            "EventType": event_type,
            "Player": player_tag.text.strip() if player_tag is not None else None,
            "Team": team,
        }

    def _extract_events(self, events_wrap, css_class):
        """Return the records of every ``<div>`` in ``events_wrap`` whose class is ``css_class``."""
        return [
            self._event_to_record(event)
            for event in events_wrap.find_all("div", class_=css_class)
        ]

    def parse_events(self):
        """
        Extract the events of both teams from the loaded HTML.

        Events are read from the ``<div id="events_wrap">`` container and
        appended to ``self.events_team_a`` (class ``event a``) and
        ``self.events_team_b`` (class ``event b``).

        Raises:
            Exception: If ``fetch_html()`` has not been called, or the event
                container is missing from the page.
        """
        if self.soup is None:
            raise Exception("HTML content not loaded. Call 'fetch_html()' first.")

        events_wrap = self.soup.find("div", id="events_wrap")
        if events_wrap is None:
            raise Exception("Event container not found in the HTML.")

        self.events_team_a.extend(self._extract_events(events_wrap, "event a"))
        self.events_team_b.extend(self._extract_events(events_wrap, "event b"))

    def save_to_csv(self, match, gameweek):
        """
        Save the parsed events to ``{gameweek}_{match}_{team}_events.csv``,
        one file per team, in the current working directory.

        Args:
            match: Match identifier used in the file name.
            gameweek: Gameweek identifier used in the file name.

        Raises:
            Exception: If either team has no parsed events.
        """
        if not self.events_team_a:
            raise Exception(
                "No events for Team A to save. Make sure to call 'parse_events()' first."
            )
        if not self.events_team_b:
            raise Exception(
                "No events for Team B to save. Make sure to call 'parse_events()' first."
            )

        # The team name of the first event names the file of each side.
        for events in (self.events_team_a, self.events_team_b):
            team_name = events[0]["Team"]
            output_filename = f"{gameweek}_{match}_{team_name}_events.csv"
            pd.DataFrame(events).to_csv(output_filename, index=False)

    def run(self, links_file, gameweeks_file):
        """
        Fetch, parse and save the events of every match listed in ``links_file``.

        Args:
            links_file: CSV file with a ``link`` column (one match URL per row).
            gameweeks_file: CSV file with a ``gameweek`` column; row *i*
                belongs to link *i*.
        """
        print("Starting collecting events data...")

        links_df = pd.read_csv(links_file)
        gameweeks_df = pd.read_csv(gameweeks_file)

        for index, link in enumerate(links_df["link"]):
            gameweek = gameweeks_df.iloc[index]["gameweek"]
            match_number = index + 1  # 1-based match counter used in file names

            print(f"Processing link {match_number}: {link}")

            # A fresh extractor per page keeps the event lists separate.
            extractor = Match_events(link)
            extractor.fetch_html()
            extractor.parse_events()
            extractor.save_to_csv(match_number, gameweek)

            # Pause between downloads.
            time.sleep(self.REQUEST_DELAY)

        print("Collecting events data process completed successfully!")

### Obtaining odds data

In [110]:
class Odds_betting:
    """
    Turn bookmaker odds into normalized home/draw/away probabilities and
    attach them to a matches CSV.
    """

    # Column prefixes of the bookmakers that are averaged; each one provides
    # "<prefix>H", "<prefix>D" and "<prefix>A" columns when present.
    _BOOKMAKERS = ("B365", "BW", "IW", "LB", "PS", "WH", "VC")

    # Columns added to the match data by merge_with_match_data().
    _PROBABILITY_COLUMNS = ["prob_home_avg", "prob_draw_avg", "prob_away_avg"]

    def __init__(self):
        """Create an empty processor; data is loaded later by ``load_data``."""
        self.betting_filename = None
        self.filename = None
        self.betting_data = None
        self.match_data = None
        # Team names as written in the betting file -> names used in the
        # match data. Names that are not listed are left untouched.
        self.team_name_mapping = {
            "Leganes": "Leganes",
            "Alaves": "Alaves",
            "Valencia": "Valencia",
            "Las Palmas": "Las_Palmas",
            "Celta": "Celta_Vigo",
            "Sociedad": "Real_Sociedad",
            "Ath Madrid": "Atletico_Madrid",
            "Sevilla": "Sevilla",
            "Espanol": "Espanyol",
            "Ath Bilbao": "Athletic_Club",
            "Getafe": "Getafe",
            "Barcelona": "Barcelona",
            "Betis": "Real_Betis",
            "La Coruna": "Deportivo_La_Coruna",
            "Real Madrid": "Real_Madrid",
            "Levante": "Levante",
            "Villarreal": "Villarreal",
            "Malaga": "Malaga",
            "Eibar": "Eibar",
            "Girona": "Girona"
        }

    def load_data(self, betting_filename, filename):
        """
        Read the betting odds CSV and the matches CSV.

        Args:
            betting_filename: Path of the CSV with the bookmaker odds.
            filename: Path of the matches CSV.
        """
        self.betting_filename = betting_filename
        self.filename = filename
        # pd.read_csv raises on failure and never returns None, so reaching
        # the print below means both files were loaded.
        self.betting_data = pd.read_csv(self.betting_filename)
        self.match_data = pd.read_csv(self.filename)
        print("Data loaded successfully.")

    def rename_teams(self):
        """Map the team names of the betting data to the match-data naming."""
        if self.betting_data is None:
            print("Error: betting data is None.")
            return

        for column in ("HomeTeam", "AwayTeam"):
            self.betting_data[column] = self.betting_data[column].replace(self.team_name_mapping)

    def extract_odds_columns(self):
        """Keep only the date, team and bookmaker odds columns that exist in the file."""
        required_columns = ["Date", "HomeTeam", "AwayTeam"] + [
            f"{bookmaker}{outcome}"
            for bookmaker in self._BOOKMAKERS
            for outcome in "HDA"
        ]

        available_columns = [col for col in required_columns if col in self.betting_data.columns]
        # Copy so that the columns added later are written to an independent
        # frame rather than to a slice of the loaded one.
        self.betting_data = self.betting_data[available_columns].copy()

    def _average_available_odds(self, outcome):
        """Return the row-wise mean of the ``outcome`` ("H"/"D"/"A") odds of the bookmakers present."""
        present = [
            f"{bookmaker}{outcome}"
            for bookmaker in self._BOOKMAKERS
            if f"{bookmaker}{outcome}" in self.betting_data.columns
        ]
        if not present:
            # No bookmaker available at all: the average is undefined.
            return pd.Series(float("nan"), index=self.betting_data.index)
        return self.betting_data[present].mean(axis=1, skipna=True)

    def compute_avg_odds(self):
        """
        Add ``odd_home_avg``, ``odd_draw_avg`` and ``odd_away_avg``.

        Only bookmaker columns that exist in the data are averaged, matching
        ``extract_odds_columns``, which tolerates missing bookmakers.
        """
        self.betting_data["odd_home_avg"] = self._average_available_odds("H")
        self.betting_data["odd_draw_avg"] = self._average_available_odds("D")
        self.betting_data["odd_away_avg"] = self._average_available_odds("A")

    def compute_probabilities(self):
        """
        Convert the average odds into probabilities that sum to 1 per match.

        Each probability is ``1 / odd`` divided by the sum of the three
        inverse odds of the row.
        """
        odds_columns = ["odd_home_avg", "odd_draw_avg", "odd_away_avg"]
        if self.betting_data[odds_columns].isnull().any().any():
            print("Warning: Some odds are missing or NaN.")

        self.betting_data["prob_home_avg"] = 1 / self.betting_data["odd_home_avg"]
        self.betting_data["prob_draw_avg"] = 1 / self.betting_data["odd_draw_avg"]
        self.betting_data["prob_away_avg"] = 1 / self.betting_data["odd_away_avg"]

        total_prob = self.betting_data["prob_home_avg"] + self.betting_data["prob_draw_avg"] + self.betting_data["prob_away_avg"]

        for column in self._PROBABILITY_COLUMNS:
            self.betting_data[column] /= total_prob

    def merge_with_match_data(self):
        """
        Left-join the probabilities onto the match data by home and away team.

        Probability columns already present in the match data (from an earlier
        run saved to the same file) are replaced instead of producing
        ``_x``/``_y`` duplicates.
        """
        if self.betting_data is None or self.match_data is None:
            print("❌ Error: betting_data or match_data is None.")
            return

        base = self.match_data.drop(columns=self._PROBABILITY_COLUMNS, errors="ignore")
        odds = self.betting_data[["HomeTeam", "AwayTeam", "Date"] + self._PROBABILITY_COLUMNS]

        merged = base.merge(
            odds,
            how="left",
            left_on=["home_team_name", "away_team_name"],
            right_on=["HomeTeam", "AwayTeam"]
        )

        # The join keys coming from the betting file are not part of the output.
        self.match_data = merged.drop(columns=["HomeTeam", "AwayTeam", "Date"])

    def save_updated_data(self, filename):
        """Write the match data (with probabilities) to ``filename``."""
        file_path = filename

        if self.match_data is not None:
            self.match_data.to_csv(file_path, index=False)
            print(f"Data with probabilities saved to {file_path}")
        else:
            print("Error: match_data is None. Cannot save data.")

    def process_odds(self, betting_filename, filename):
        """
        Run the whole pipeline and overwrite ``filename`` with the result.

        Args:
            betting_filename: Path of the CSV with the bookmaker odds.
            filename: Path of the matches CSV; it is read and then rewritten
                with the probability columns added.
        """
        self.load_data(betting_filename, filename)
        self.rename_teams()
        self.extract_odds_columns()
        self.compute_avg_odds()
        self.compute_probabilities()
        self.merge_with_match_data()
        self.save_updated_data(filename)

### Full process

In [111]:
def run_full_process(season, test_size, betting_filename):
    """
    Run the data pipeline for one La Liga season.

    The pipeline is made of five stages (match data, player data, rankings,
    match events and betting odds). Only the betting odds stage is currently
    enabled; the others are kept commented out so they can be switched back
    on individually.

    The function reads "matches_{season}.csv" from the working directory,
    writes the temporary files "links_temp.csv" and "gameweeks_temp.csv", and
    always removes them again, even when a stage raises.

    Args:
        season (str): Season identifier used in the URL and the file names,
            e.g. "2017-2018".
        test_size (int or None): Limit handed to the Match_data stage. Unused
            while that stage is commented out.
        betting_filename (str): File with the bookmaker odds, forwarded to
            Odds_betting.process_odds.
    """
    # League URL, needed by the (currently disabled) scraping stages
    league_url = f"https://fbref.com/es/comps/12/{season}/horario/Marcadores-y-partidos-de-{season}-La-Liga"
    # Match data file of the requested season
    filename = f"matches_{season}.csv"

    # First class: Extract match data from the league URL
    # Create an instance of Match_data and run the extraction process
    #Match_data_extractor = Match_data(league_url, test_size)
    #Match_data_extractor.run()

    # Second class: Extract player data for each match
    # Load the existing match data CSV from the working directory
    data = pd.read_csv(filename)
    # Temporary CSV files holding the links and the gameweeks
    links_file = "links_temp.csv"
    gameweeks_file = "gameweeks_temp.csv"

    try:
        # Save only the 'link' column from the match data to the links file
        data[["link"]].to_csv(links_file, index=False)
        # Save only the 'gameweek' column to the gameweeks file
        data[["gameweek"]].to_csv(gameweeks_file, index=False)
        # Create an instance of the Players_data class with the file paths
        #Players_data_extractor = Players_data(filename, links_file, gameweeks_file)
        #Players_data_extractor.run()

        # Third class: rankings
        # Output path, needed by the (currently disabled) ranking stage
        output_file = f"rankings_{season}.csv"
        #ranking = FootballRanking(filename, output_file)
        #ranking.process_matches()
        #updater = MatchRankUpdater(filename, output_file)
        #updater.update_and_save()

        # Fourth class: Extract match events data
        # Create an instance of Match_events and run the extraction process
        # (author's note: remove the parentheses from run)
        # Match_events_extractor = Match_events(league_url)
        # Match_events_extractor.run(links_file, filename)

        # Fifth class: Extract betting odds data
        # Create an instance of the Odds_betting class and run the process
        odds_processor = Odds_betting()
        odds_processor.process_odds(betting_filename, filename)
    finally:
        # Always clean up the temporary files; tolerate a file that was never
        # created so a cleanup error cannot hide the original exception
        for temp_file in (links_file, gameweeks_file):
            if os.path.exists(temp_file):
                os.remove(temp_file)

In [112]:
# Run configuration: season to process, limit forwarded to the (currently
# disabled) scraping stage, and the file with the bookmaker odds
season = "2017-2018"
test_size = 999
betting_filename = "SP1.csv"

# Launch the pipeline with that configuration
run_full_process(
    season=season,
    test_size=test_size,
    betting_filename=betting_filename,
)

Data loaded successfully.
Data with probabilities saved to matches_2017-2018.csv
