In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Lift pandas' display truncation so every column and row is rendered.
for _display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_display_option, None)

# CLEANING DATABASE

#### Reading the CSV files

In [3]:
class MatchDataLoader:
    """
    Load per-season match CSV files, concatenate them, sort the rows by match
    date and hour, and keep only a configured subset of columns.

    Attributes:
        df: The loaded DataFrame, or None until data has been loaded.
        files: Paths of the CSV files to read (missing files are skipped).
        columns_to_keep: Column names retained by filter_columns().
    """

    def __init__(self, files=None, columns_to_keep=None):
        """
        Initialize the MatchDataLoader class with the list of datasets to load.

        Args:
            files (iterable of str, optional): CSV paths to load. Defaults to
                the built-in list of season files below.
            columns_to_keep (iterable of str, optional): Columns to retain.
                Defaults to the built-in column list below.
        """
        self.df = None
        self.files = [
            # "../../initial/Footystats/spain-la-liga-matches-2008-to-2009-stats.csv",
            # "../../initial/Footystats/spain-la-liga-matches-2009-to-2010-stats.csv",
            # "../../initial/Footystats/spain-la-liga-matches-2010-to-2011-stats.csv",
            # "../../initial/Footystats/spain-la-liga-matches-2011-to-2012-stats.csv",
            # "../../initial/Footystats/spain-la-liga-matches-2012-to-2013-stats.csv",
            # "../../initial/Footystats/spain-la-liga-matches-2013-to-2014-stats.csv",
            # "../../initial/Footystats/spain-la-liga-matches-2014-to-2015-stats.csv",
            # "../../initial/Footystats/spain-la-liga-matches-2015-to-2016-stats.csv",
            # "../../initial/Footystats/spain-la-liga-matches-2016-to-2017-stats.csv",
            "2017-2018/matches_2017-2018.csv",
            "2018-2019/matches_2018-2019.csv",
            "2019-2020/matches_2019-2020.csv",
            "2020-2021/matches_2020-2021.csv",
            "2021-2022/matches_2021-2022.csv",
            "2022-2023/matches_2022-2023.csv",
            "2023-2024/matches_2023-2024.csv",
            "2024-2025/matches_2024-2025.csv"
        ]
        if files is not None:
            # Copy so later changes to the caller's list do not affect the loader.
            self.files = list(files)

        self.columns_to_keep = [
            "gameweek",
            "date_of_match",
            "day_of_week",
            "day_of_year",
            "hour_of_day",
            "home_team_name",
            "away_team_name",
            "home_trainer",
            "away_trainer",
            "stadium",
            "attendance",
            "attendance%",
            "referee",
            "var",
            #"home_team_lineup",
            #"away_team_lineup",
            #"home_possession",
            #"away_possession",
            "home_team_goals",
            "away_team_goals",
            "result",
            #"home_Players",
            #"home_PlayersAge",
            #"home_PlayersMinutes",
            #"home_PlayersShots",
            #"home_PlayersShotsOnTarget",
            #"home_PlayersCompletedPasses",
            #"home_PlayersAttemptedPasses",
            #"home_Players%CompletedPasses",
            #"home_PlayersDistancePasses",
            #"home_PlayersDistanceProgression",
            #"home_PlayersShortPasses",
            #"home_PlayersAttemptedShortPasses",
            #"home_Players%ShortCompletedPasses",
            #"home_PlayersMediumPasses",
            #"home_PlayersAttemptedMediumPasses",
            #"home_Players%MediumCompletedPasses",
            #"home_PlayersLongPasses",
            #"home_PlayersAttemptedLongPasses",
            #"home_Players%LongCompletedPasses",
            #"home_PlayersAssistance",
            #"home_PlayersExpectedGoalsAssistance",
            #"home_PlayersExpectedAssistance",
            #"home_PlayersKeyPasses",
            #"home_PlayersLast1/3Passes",
            #"home_PlayersGoalAreaPasses",
            #"home_PlayersGoalAreaCrosses",
            #"home_PlayersGoalPasses",
            #"home_PlayersLiveBallPasses",
            #"home_PlayersDeadBallPasses",
            #"home_PlayersFreeKick",
            #"home_PlayersSidePasses",
            #"home_PlayersCrosses",
            #"home_PlayersStrongcrosses",
            #"home_PlayersCorner",
            #"home_PlayersCornerIn",
            #"home_PlayersCornerOut",
            #"home_PlayersCornerRect",
            #"home_PlayersOffsidePasses",
            #"home_PlayersPassesBlocked",
            #"home_PlayersTackles",
            #"home_PlayersSuccessfulTackles",
            #"home_PlayersTacklesInDefense",
            #"home_PlayersTacklesInMedium",
            #"home_PlayersTacklesInAttack",
            #"home_PlayersDribblerTackles",
            #"home_PlayersAttemptedDribblerTackles",
            #"home_Players%DribblerTacklesCompleted",
            #"home_PlayersDribblerTacklesNonCompleted",
            #"home_PlayersBallsBlocked",
            #"home_PlayersShotsBlocked",
            #"home_PlayersInterceptions",
            #"home_PlayersTackles+Interceptions",
            #"home_PlayersClearances",
            #"home_PlayersMistakesRivalShots",
            #"home_PlayersTouches",
            #"home_PlayersOwnPenaltyAreaTouches",
            #"home_PlayersTouchesInDefense",
            #"home_PlayersTouchesInMedium",
            #"home_PlayersTouchesInAttack",
            #"home_PlayersAwayPenaltyAreaTouches",
            #"home_PlayersLiveBallTouches",
            #"home_PlayersAttemptedDribbles",
            #"home_PlayersDribblesCompleted",
            #"home_Players%DribblesCompleted",
            #"home_PlayersBallCarries",
            #"home_PlayersDistanceCarried",
            #"home_PlayersForwardDistanceCarried",
            #"home_PlayersForwardCarries",
            #"home_PlayersCarriesInAttack",
            #"home_PlayersAwayPenaltyAreaCarries",
            #"home_PlayersLostControlCarries",
            #"home_PlayersLostCarries",
            #"home_PlayersPassesReception",
            #"home_PlayersAttackPassesReception",
            #"home_PlayersYellowCards",
            #"home_PlayersRedCards",
            #"home_PlayersSecondYellowCards",
            #"home_PlayersFouls",
            #"home_PlayersFoulsReceived",
            #"home_PlayersPenalties",
            #"home_PlayersPenaltiesConceded",
            #"home_PlayersLostBallRecoveries",
            #"home_PlayersAerialsWon",
            #"home_PlayersAerialsLost",
            #"home_Players%AerialsWon",
            #"away_Players",
            #"away_PlayersAge",
            #"away_PlayersMinutes",
            #"away_PlayersShots",
            #"away_PlayersShotsOnTarget",
            #"away_PlayersCompletedPasses",
            #"away_PlayersAttemptedPasses",
            #"away_Players%CompletedPasses",
            #"away_PlayersDistancePasses",
            #"away_PlayersDistanceProgression",
            #"away_PlayersShortPasses",
            #"away_PlayersAttemptedShortPasses",
            #"away_Players%ShortCompletedPasses",
            #"away_PlayersMediumPasses",
            #"away_PlayersAttemptedMediumPasses",
            #"away_Players%MediumCompletedPasses",
            #"away_PlayersLongPasses",
            #"away_PlayersAttemptedLongPasses",
            #"away_Players%LongCompletedPasses",
            #"away_PlayersAssistance",
            #"away_PlayersExpectedGoalsAssistance",
            #"away_PlayersExpectedAssistance",
            #"away_PlayersKeyPasses",
            #"away_PlayersLast1/3Passes",
            #"away_PlayersGoalAreaPasses",
            #"away_PlayersGoalAreaCrosses",
            #"away_PlayersGoalPasses",
            #"away_PlayersLiveBallPasses",
            #"away_PlayersDeadBallPasses",
            #"away_PlayersFreeKick",
            #"away_PlayersSidePasses",
            #"away_PlayersCrosses",
            #"away_PlayersStrongcrosses",
            #"away_PlayersCorner",
            #"away_PlayersCornerIn",
            #"away_PlayersCornerOut",
            #"away_PlayersCornerRect",
            #"away_PlayersOffsidePasses",
            #"away_PlayersPassesBlocked",
            #"away_PlayersTackles",
            #"away_PlayersSuccessfulTackles",
            #"away_PlayersTacklesInDefense",
            #"away_PlayersTacklesInMedium",
            #"away_PlayersTacklesInAttack",
            #"away_PlayersDribblerTackles",
            #"away_PlayersAttemptedDribblerTackles",
            #"away_Players%DribblerTacklesCompleted",
            #"away_PlayersDribblerTacklesNonCompleted",
            #"away_PlayersBallsBlocked",
            #"away_PlayersShotsBlocked",
            #"away_PlayersInterceptions",
            #"away_PlayersTackles+Interceptions",
            #"away_PlayersClearances",
            #"away_PlayersMistakesRivalShots",
            #"away_PlayersTouches",
            #"away_PlayersOwnPenaltyAreaTouches",
            #"away_PlayersTouchesInDefense",
            #"away_PlayersTouchesInMedium",
            #"away_PlayersTouchesInAttack",
            #"away_PlayersAwayPenaltyAreaTouches",
            #"away_PlayersLiveBallTouches",
            #"away_PlayersAttemptedDribbles",
            #"away_PlayersDribblesCompleted",
            #"away_Players%DribblesCompleted",
            #"away_PlayersBallCarries",
            #"away_PlayersDistanceCarried",
            #"away_PlayersForwardDistanceCarried",
            #"away_PlayersForwardCarries",
            #"away_PlayersCarriesInAttack",
            #"away_PlayersAwayPenaltyAreaCarries",
            #"away_PlayersLostControlCarries",
            #"away_PlayersLostCarries",
            #"away_PlayersPassesReception",
            #"away_PlayersAttackPassesReception",
            #"away_PlayersYellowCards",
            #"away_PlayersRedCards",
            #"away_PlayersSecondYellowCards",
            #"away_PlayersFouls",
            #"away_PlayersFoulsReceived",
            #"away_PlayersPenalties",
            #"away_PlayersPenaltiesConceded",
            #"away_PlayersLostBallRecoveries",
            #"away_PlayersAerialsWon",
            #"away_PlayersAerialsLost",
            #"away_Players%AerialsWon",
            #"home_KeepersKeepers",
            #"home_KeepersMinutes",
            #"home_KeepersShotsOnTargetAgainst",
            #"home_KeepersGoalsAgainst",
            #"home_KeepersSaved",
            #"home_Keepers%Saved",
            #"home_KeepersxG",
            #"home_KeepersPassesLaunched",
            #"home_KeepersAttemptedPassesLaunched",
            #"home_Keepers%CompletedPassesLaunched",
            #"home_KeepersPasses",
            #"home_KeepersAttemptedPasses",
            #"home_Keepers%CompletedPasses",
            #"home_KeepersPassesDistance",
            #"home_KeepersAttemptedKicks",
            #"home_Keepers%Kicks",
            #"home_KeepersKicksDistance",
            #"home_KeepersCrosses",
            #"home_KeepersCrossesStopped",
            #"home_Keepers%CrossesStopped",
            #"home_KeepersActionsOutsideArea",
            #"home_KeepersDistanceActionsArea",
            #"away_KeepersKeepers",
            #"away_KeepersMinutes",
            #"away_KeepersShotsOnTargetAgainst",
            #"away_KeepersGoalsAgainst",
            #"away_KeepersSaved",
            #"away_Keepers%Saved",
            #"away_KeepersxG",
            #"away_KeepersPassesLaunched",
            #"away_KeepersAttemptedPassesLaunched",
            #"away_Keepers%CompletedPassesLaunched",
            #"away_KeepersPasses",
            #"away_KeepersAttemptedPasses",
            #"away_Keepers%CompletedPasses",
            #"away_KeepersPassesDistance",
            #"away_KeepersAttemptedKicks",
            #"away_Keepers%Kicks",
            #"away_KeepersKicksDistance",
            #"away_KeepersCrosses",
            #"away_KeepersCrossesStopped",
            #"away_Keepers%CrossesStopped",
            #"away_KeepersActionsOutsideArea",
            #"away_KeepersDistanceActionsArea",
            "home_team_rank",
            "away_team_rank",
            "home_team_points",
            "away_team_points",
            "home_team_goals_for",
            "away_team_goals_for",
            # NOTE(review): spelled "againsts" unlike its away counterpart;
            # presumably this matches the CSV header - confirm before renaming.
            "home_team_goals_againsts",
            "away_team_goals_against",
            "home_team_goals_difference",
            "away_team_goals_difference",
            "prob_home_avg",
            "prob_draw_avg",
            "prob_away_avg"
        ]
        if columns_to_keep is not None:
            self.columns_to_keep = list(columns_to_keep)

    def load_data(self):
        """
        Load and concatenate match data from multiple CSV files.

        Files that do not exist are skipped with a printed warning. If no file
        could be read, self.df becomes an empty DataFrame.
        """
        df_list = []
        for file in self.files:
            if os.path.exists(file):
                df_list.append(pd.read_csv(file))  # Read each CSV file
            else:
                print(f"Warning: File {file} not found. Skipping...")  # Handle missing files

        if df_list:
            self.df = pd.concat(df_list, ignore_index=True)  # Concatenate all dataframes
        else:
            print("No valid files were loaded.")
            self.df = pd.DataFrame()  # Return an empty DataFrame to avoid errors

    def sort_data(self):
        """
        Sort the dataset by date and match hour.

        Raises:
            ValueError: If no data has been loaded or the data is empty.
        """
        if self.df is None or self.df.empty:
            raise ValueError("Data has not been loaded yet or is empty.")
        self.df = self.df.sort_values(by=["date_of_match", "hour_of_day"], ascending=True)

    def filter_columns(self):
        """
        Keep only the necessary columns in the dataset.

        Raises:
            ValueError: If no data has been loaded or the data is empty.
            KeyError: If any configured column is absent from the loaded data.
        """
        if self.df is None or self.df.empty:
            raise ValueError("Data has not been loaded yet or is empty.")

        missing = [column for column in self.columns_to_keep if column not in self.df.columns]
        if missing:
            raise KeyError(f"Columns missing from the loaded data: {missing}")

        # Take an explicit copy: callers add columns to the returned frame, and
        # doing that on a column-subset slice can trigger SettingWithCopyWarning.
        self.df = self.df[self.columns_to_keep].copy()

    def process_data(self):
        """
        Run the full data processing pipeline: load, sort, and filter.

        Prints a message and returns early when nothing could be loaded.
        """
        self.load_data()  # Load the data from the files
        if self.df.empty:
            print("No data loaded. Exiting process.")
            return  # Exit if no data is loaded
        self.sort_data()  # Sort the data by date and match hour
        self.filter_columns()  # Filter the columns to keep only the necessary ones

    def get_dataframe(self):
        """
        Return the processed DataFrame, running the pipeline first if needed.

        Returns:
            pd.DataFrame: The processed data (empty if no file could be loaded).
        """
        if self.df is None or self.df.empty:
            self.process_data()  # Run the entire pipeline if data is not yet loaded or is empty
        return self.df  # Return the processed dataframe

#### Computing average results

In [4]:
class ResultStrikeStats:
    """
    Add result-based form features to a match DataFrame: rolling win rates,
    current win/loss streaks and overall win percentages, each computed only
    from matches dated strictly before the row being processed.

    NOTE(review): outcomes are read from the "result" column assuming
    1 = home win, 0 = away win and any other value = draw (the encoding the
    original streak logic implies). Confirm this against the CSV files.
    """

    def __init__(self, df, long_strike, medium_strike, short_strike):
        """
        Initialize the ResultStrikeStats class with a dataset and strike values.

        Args:
            df (pd.DataFrame): The DataFrame to be used for analysis.
            long_strike (int): Number of matches for long-term analysis.
            medium_strike (int): Number of matches for medium-term analysis.
            short_strike (int): Number of matches for short-term analysis.

        Raises:
            ValueError: If df is not a pandas DataFrame.
        """
        if df is None or not isinstance(df, pd.DataFrame):
            raise ValueError("The provided df is not a valid DataFrame.")
        self.df = df
        self.long_strike = long_strike
        self.medium_strike = medium_strike
        self.short_strike = short_strike

    def _previous_matches(self, team, current_date, location="both"):
        """Return the team's matches dated before current_date, oldest first."""
        earlier = self.df["date_of_match"] < current_date
        as_home = self.df["home_team_name"] == team
        as_away = self.df["away_team_name"] == team

        if location == "home":
            selected = earlier & as_home
        elif location == "away":
            selected = earlier & as_away
        else:
            selected = earlier & (as_home | as_away)

        return self.df[selected].sort_values(by="date_of_match")

    @staticmethod
    def _team_outcomes(team_matches, team):
        """Return 1 (win), -1 (loss) or 0 (draw) per match, from the team's side."""
        result = team_matches["result"]
        home_won = result.eq(1).fillna(False).to_numpy(dtype=bool)
        away_won = result.eq(0).fillna(False).to_numpy(dtype=bool)
        # Outcome as seen by the home side; anything that is not 1 or 0 is a draw.
        home_view = np.select([home_won, away_won], [1, -1], default=0)
        played_home = (team_matches["home_team_name"] == team).to_numpy(dtype=bool)
        # Flip the sign for away games: a home win is a loss for the visitor.
        return pd.Series(np.where(played_home, home_view, -home_view), index=team_matches.index)

    def _avg_wins(self, strike, team, current_date, location):
        """Share of wins in the team's last 'strike' matches at the given location."""
        team_matches = self._previous_matches(team, current_date, location)
        if len(team_matches) < strike:
            return np.nan  # not enough history for this window
        outcomes = self._team_outcomes(team_matches.tail(strike), team)
        return (outcomes == 1).mean()

    def _win_percentage(self, team, current_date, location):
        """Percentage of wins over all the team's earlier matches at the location."""
        team_matches = self._previous_matches(team, current_date, location)
        total_matches = len(team_matches)
        if total_matches == 0:
            return np.nan
        wins = (self._team_outcomes(team_matches, team) == 1).sum()
        return (wins / total_matches) * 100

    def calculate_avg_wins(self, strike, team, current_date):
        """
        Calculate the average wins for a team over the last 'strike' matches.

        Args:
            strike (int): Number of matches to consider.
            team (str): Name of the team.
            current_date (str): Date of the current match.

        Returns:
            float: Share of wins (0-1) over the last 'strike' matches, home and
                away combined, or NaN if fewer than 'strike' matches exist.
        """
        return self._avg_wins(strike, team, current_date, "both")

    def calculate_avg_wins_home(self, strike, team, current_date):
        """
        Calculate the average wins for a team when playing at home.

        Args:
            strike (int): Number of matches to consider.
            team (str): Name of the team.
            current_date (str): Date of the current match.

        Returns:
            float: Share of wins (0-1) over the last 'strike' home matches, or
                NaN if fewer than 'strike' home matches exist.
        """
        return self._avg_wins(strike, team, current_date, "home")

    def calculate_avg_wins_away(self, strike, team, current_date):
        """
        Calculate the average wins for a team when playing away.

        Args:
            strike (int): Number of matches to consider.
            team (str): Name of the team.
            current_date (str): Date of the current match.

        Returns:
            float: Share of wins (0-1) over the last 'strike' away matches, or
                NaN if fewer than 'strike' away matches exist.
        """
        return self._avg_wins(strike, team, current_date, "away")

    def calculate_consecutive_wins_losses(self, team, current_date, location='both'):
        """
        Calculate how many consecutive wins or losses a team has had before a given match.

        The streak is counted backwards from the team's most recent match, so a
        draw (or the opposite outcome) ends it. At most one of the two counts
        is non-zero; both are zero after a draw or with no earlier matches.

        Args:
            team (str): Name of the team.
            current_date (str): Date of the current match.
            location (str): 'both', 'home', or 'away' to calculate streak for the specified location.

        Returns:
            tuple: (consecutive_wins, consecutive_losses)
        """
        team_matches = self._previous_matches(team, current_date, location)
        outcomes = self._team_outcomes(team_matches, team).tolist()

        if not outcomes or outcomes[-1] == 0:
            return 0, 0

        latest = outcomes[-1]
        streak = 0
        for outcome in reversed(outcomes):
            if outcome != latest:
                break
            streak += 1

        if latest == 1:
            return streak, 0
        return 0, streak

    def calculate_win_percentage_all(self, team, current_date):
        """
        Calculate the win percentage for a team over all its previous matches before the current date.

        Args:
            team (str): Name of the team.
            current_date (str): Date of the current match.

        Returns:
            float: Win percentage (0-100), or NaN if the team has no earlier matches.
        """
        return self._win_percentage(team, current_date, "both")

    def calculate_win_percentage_home(self, team, current_date):
        """
        Calculate the win percentage for a team in home matches before the current date.

        Args:
            team (str): Name of the team.
            current_date (str): Date of the current match.

        Returns:
            float: Win percentage (0-100), or NaN if the team has no earlier home matches.
        """
        return self._win_percentage(team, current_date, "home")

    def calculate_win_percentage_away(self, team, current_date):
        """
        Calculate the win percentage for a team in away matches before the current date.

        Args:
            team (str): Name of the team.
            current_date (str): Date of the current match.

        Returns:
            float: Win percentage (0-100), or NaN if the team has no earlier away matches.
        """
        return self._win_percentage(team, current_date, "away")

    def run(self):
        """
        Compute average results and update the dataset with new columns.

        The new columns are added to self.df in place.
        """
        strikes = [self.long_strike, self.medium_strike, self.short_strike]

        # Create new columns in the existing DataFrame with NaN as default values
        for column in (
            "home_team_consecutive_wins_global",
            "home_team_consecutive_losses_global",
            "away_team_consecutive_wins_global",
            "away_team_consecutive_losses_global",
            "home_team_consecutive_wins_home",
            "home_team_consecutive_losses_home",
            "away_team_consecutive_wins_away",
            "away_team_consecutive_losses_away",
            "home_team_win_percentage_all",
            "home_team_win_percentage_home",
            "away_team_win_percentage_all",
            "away_team_win_percentage_away",
        ):
            self.df[column] = np.nan

        for strike in strikes:
            self.df[f"avg_home_wins_last_{strike}"] = np.nan
            self.df[f"avg_away_wins_last_{strike}"] = np.nan
            self.df[f"avg_home_wins_last_{strike}_home"] = np.nan
            self.df[f"avg_away_wins_last_{strike}_away"] = np.nan

        # Process each match in the DataFrame
        for index, row in self.df.iterrows():
            home_team = row["home_team_name"]
            away_team = row["away_team_name"]
            current_date = row["date_of_match"]

            # Rolling win rates depend on the window size
            for strike in strikes:
                self.df.at[index, f"avg_home_wins_last_{strike}"] = self.calculate_avg_wins(strike, home_team, current_date)
                self.df.at[index, f"avg_away_wins_last_{strike}"] = self.calculate_avg_wins(strike, away_team, current_date)
                self.df.at[index, f"avg_home_wins_last_{strike}_home"] = self.calculate_avg_wins_home(strike, home_team, current_date)
                self.df.at[index, f"avg_away_wins_last_{strike}_away"] = self.calculate_avg_wins_away(strike, away_team, current_date)

            # Streaks and win percentages do not depend on the window size,
            # so they are computed once per match rather than once per strike.
            wins, losses = self.calculate_consecutive_wins_losses(home_team, current_date, location='both')
            self.df.at[index, "home_team_consecutive_wins_global"] = wins
            self.df.at[index, "home_team_consecutive_losses_global"] = losses

            wins, losses = self.calculate_consecutive_wins_losses(away_team, current_date, location='both')
            self.df.at[index, "away_team_consecutive_wins_global"] = wins
            self.df.at[index, "away_team_consecutive_losses_global"] = losses

            wins, losses = self.calculate_consecutive_wins_losses(home_team, current_date, location='home')
            self.df.at[index, "home_team_consecutive_wins_home"] = wins
            self.df.at[index, "home_team_consecutive_losses_home"] = losses

            wins, losses = self.calculate_consecutive_wins_losses(away_team, current_date, location='away')
            self.df.at[index, "away_team_consecutive_wins_away"] = wins
            self.df.at[index, "away_team_consecutive_losses_away"] = losses

            self.df.at[index, "home_team_win_percentage_all"] = self.calculate_win_percentage_all(home_team, current_date)
            self.df.at[index, "home_team_win_percentage_home"] = self.calculate_win_percentage_home(home_team, current_date)
            self.df.at[index, "away_team_win_percentage_all"] = self.calculate_win_percentage_all(away_team, current_date)
            self.df.at[index, "away_team_win_percentage_away"] = self.calculate_win_percentage_away(away_team, current_date)

        # Ensure all empty or missing values are explicitly NaN
        self.df.fillna(value=np.nan, inplace=True)

#### Adding average attendance columns

In [5]:
class AttendanceStrikeStats:
    """
    Add rolling attendance averages to a match DataFrame, using only matches
    dated strictly before the row being processed.
    """

    def __init__(self, df, long_strike, medium_strike, short_strike):
        """
        Store the dataset and the three window sizes.

        Args:
            df (pd.DataFrame): The DataFrame to be used for analysis.
            long_strike (int): Number of matches for long-term analysis.
            medium_strike (int): Number of matches for medium-term analysis.
            short_strike (int): Number of matches for short-term analysis.

        Raises:
            ValueError: If df is not a pandas DataFrame.
        """
        if not isinstance(df, pd.DataFrame):
            raise ValueError("The provided df is not a valid DataFrame.")
        self.df = df
        self.long_strike = long_strike
        self.medium_strike = medium_strike
        self.short_strike = short_strike

    def _recent_mean(self, strike, team, current_date, team_column, value_column):
        """Mean of value_column over the team's last 'strike' earlier matches."""
        is_earlier = self.df["date_of_match"] < current_date
        is_team = self.df[team_column] == team
        history = self.df[is_earlier & is_team].sort_values(by="date_of_match")

        # Too little history for this window: report a missing value.
        if len(history) < strike:
            return np.nan

        return history.tail(strike)[value_column].mean()

    def calculate_avg_attendance_home(self, strike, team, current_date):
        """
        Mean of the "attendance" column over the team's most recent home matches.

        Args:
            strike (int): Number of matches to consider.
            team (str): Name of the team.
            current_date (str): Date of the current match.

        Returns:
            float: Mean attendance over the last 'strike' home matches before
                current_date, or NaN when fewer than 'strike' exist.
        """
        return self._recent_mean(strike, team, current_date, "home_team_name", "attendance")

    def calculate_avg_attendance_per_home(self, strike, team, current_date):
        """
        Mean of the "attendance%" column over the team's most recent home matches.

        Args:
            strike (int): Number of matches to consider.
            team (str): Name of the team.
            current_date (str): Date of the current match.

        Returns:
            float: Mean "attendance%" over the last 'strike' home matches
                before current_date, or NaN when fewer than 'strike' exist.
        """
        return self._recent_mean(strike, team, current_date, "home_team_name", "attendance%")

    def calculate_avg_attendance_per_away(self, strike, team, current_date):
        """
        Mean of the "attendance%" column over the team's most recent away matches.

        Args:
            strike (int): Number of matches to consider.
            team (str): Name of the team.
            current_date (str): Date of the current match.

        Returns:
            float: Mean "attendance%" over the last 'strike' away matches
                before current_date, or NaN when fewer than 'strike' exist.
        """
        return self._recent_mean(strike, team, current_date, "away_team_name", "attendance%")

    def run(self):
        """
        Compute the attendance averages for every match and window size and
        write them into new columns of self.df (in place).
        """
        windows = [self.long_strike, self.medium_strike, self.short_strike]

        # Pre-create the output columns, filled with NaN.
        for window in windows:
            self.df[f"avg_attendance_{window}_home"] = np.nan
            self.df[f"avg_attendance%_{window}_home"] = np.nan
            self.df[f"avg_attendance%_{window}_away"] = np.nan

        # Fill the columns one match at a time.
        for label, match in self.df.iterrows():
            host = match["home_team_name"]
            visitor = match["away_team_name"]
            kickoff = match["date_of_match"]

            for window in windows:
                computed = {
                    f"avg_attendance_{window}_home": self.calculate_avg_attendance_home(window, host, kickoff),
                    f"avg_attendance%_{window}_home": self.calculate_avg_attendance_per_home(window, host, kickoff),
                    f"avg_attendance%_{window}_away": self.calculate_avg_attendance_per_away(window, visitor, kickoff),
                }
                for column, value in computed.items():
                    self.df.at[label, column] = value

        # Ensure all empty or missing values are explicitly NaN
        self.df.fillna(value=np.nan, inplace=True)

#### Filtering with minimum matches

In [6]:
def FilterTeamsByMinMatches(data, min_matches):
    """
    Keep only matches in which both teams reach the minimum match count.

    A team's match count is the number of rows in which it appears as either
    the home or the away side.

    Args:
        data (pd.DataFrame): Matches with 'home_team_name' and 'away_team_name' columns.
        min_matches (int): Minimum number of appearances (home + away) a team needs.

    Returns:
        pd.DataFrame: The rows of ``data`` where both teams have at least
        ``min_matches`` appearances.
    """
    # Count how often each team shows up on either side of a fixture
    every_appearance = pd.concat([data['home_team_name'], data['away_team_name']])
    appearances = every_appearance.value_counts()

    # Teams that played often enough
    qualified = appearances.loc[appearances.ge(min_matches)].index

    home_ok = data['home_team_name'].isin(qualified)
    away_ok = data['away_team_name'].isin(qualified)
    return data[home_ok & away_ok]

# Saving dataset

In [7]:
def run_full_process(long_strike, medium_strike, short_strike, columns_to_save=None, min_matches=0,
                     output_path="../model_testing/final_dataset.csv"):
    """
    Executes the full data processing pipeline and writes the result to CSV.

    Args:
        long_strike (int): Number of matches for long-term analysis.
        medium_strike (int): Number of matches for medium-term analysis.
        short_strike (int): Number of matches for short-term analysis.
        columns_to_save (list, optional): List of columns to save in the final dataset.
            If None, saves all columns.
        min_matches (int, optional): Minimum number of matches (home + away) a team
            must have played for its matches to be kept. Defaults to 0.
        output_path (str, optional): Destination of the CSV file. Defaults to
            "../model_testing/final_dataset.csv".
    """
    # Step 1: Load the match data through MatchDataLoader.get_dataframe()
    match_data_loader = MatchDataLoader()
    df = match_data_loader.get_dataframe()

    # Step 2: Initialize and run the ResultStrikeStats class
    result_strike_stats = ResultStrikeStats(df, long_strike, medium_strike, short_strike)
    result_strike_stats.run()

    # Step 3: Initialize and run the AttendanceStrikeStats class
    attendance_strike_stats = AttendanceStrikeStats(df, long_strike, medium_strike, short_strike)
    attendance_strike_stats.run()

    # NOTE(review): the final frame is read from result_strike_stats; this presumably
    # relies on both stats objects mutating the same DataFrame in place - confirm.
    df = result_strike_stats.df

    # Step 4: Filter by min_matches. This happens BEFORE the column selection so the
    # team-name columns are still available even when columns_to_save omits them.
    df = FilterTeamsByMinMatches(df, min_matches)

    # Step 5: Select columns to save (if provided)
    if columns_to_save is not None:
        df = df[columns_to_save]

    # Step 6: Save the updated dataset
    df.to_csv(output_path, index=False)

    print("Final dataset saved successfully.")

In [None]:
# Window sizes (in matches) used for the long-, medium- and short-term features
long_strike = 19
medium_strike = 8
short_strike = 3
strikes = [long_strike, medium_strike, short_strike]

# Minimum number of matches a team needs for its fixtures to be kept
min_matches = 0

# One name pattern per rolling feature; each is expanded once per strike window
column_templates = [
    "avg_home_wins_last_{strike}",
    "avg_away_wins_last_{strike}",
    "avg_home_wins_last_{strike}_home",
    "avg_away_wins_last_{strike}_away",
    "avg_attendance_{strike}_home",
    "avg_attendance%_{strike}_home",
    "avg_attendance%_{strike}_away",
]
strike_columns = [
    template.format(strike=strike)
    for template in column_templates
    for strike in strikes
]

# Final column selection: match identifiers, then the rolling features, then the
# streak / table / odds columns. Commented-out entries are available but unused.
columns_to_save = [
    #"gameweek",
    "date_of_match",
    #"day_of_week",
    #"day_of_year",
    #"hour_of_day",
    "home_team_name",
    "away_team_name",
    #"home_trainer",
    #"away_trainer",
    #"stadium",
    #"referee",
    #"var",
    *strike_columns,
    "home_team_consecutive_wins_global",
    "home_team_consecutive_losses_global",
    "away_team_consecutive_wins_global",
    "away_team_consecutive_losses_global",
    #"home_team_consecutive_wins_home",
    #"home_team_consecutive_losses_home",
    #"away_team_consecutive_wins_away",
    #"away_team_consecutive_losses_away",
    #"home_team_win_percentage_all", 
    #"home_team_win_percentage_home",
    #"away_team_win_percentage_all",
    #"away_team_win_percentage_away",
    "result",
    "home_team_rank",
    "away_team_rank",
    "home_team_points",
    "away_team_points",
    #"home_team_goals_for",
    #"away_team_goals_for",
    #"home_team_goals_againsts",
    #"away_team_goals_against",
    #"home_team_goals_difference",
    #"away_team_goals_difference",
    "prob_home_avg",
    "prob_draw_avg",
    "prob_away_avg",
]

# Run full process
run_full_process(long_strike, medium_strike, short_strike, columns_to_save, min_matches)

Final dataset saved successfully.


: 