In [27]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import random
import mlflow
import os
import subprocess
import time

# Model testing

In [28]:
class ModelPipeline:
    def __init__(self, csv_path, categorical_features, circular_max_values, target, random_state):
        self.csv_path = csv_path
        self.categorical_features = categorical_features
        self.circular_max_values = circular_max_values
        self.target = target
        self.random_state = random_state
        self.best_params = None
        self.best_split = None
        self.best_features = None
        self.best_accuracy = None
        self.best_description = None
        self.description = None
        self.number_experiment = 1

    def _load_data(self):
        data = pd.read_csv("final_dataset.csv")

        categorical_features = self.categorical_features

        label_encoders = {col: LabelEncoder() for col in categorical_features}
        for col in categorical_features:
            if col in data.columns:
                data[col] = label_encoders[col].fit_transform(data[col])

        circular_max_values = self.circular_max_values
        for col, max_value in circular_max_values.items():
            if col in data.columns:
                data[f"{col}_sin"] = np.sin(2 * np.pi * data[col] / max_value)
                data[f"{col}_cos"] = np.cos(2 * np.pi * data[col] / max_value)
                data.drop(columns=[col], inplace=True)

        return data

    def fetch_latest_mlflow_run(self, open_ui=True):
        """Load the most recent "BetPredictions" run from MLflow into this object.

        Sets ``number_experiment``, ``best_params``, ``best_split``,
        ``best_features``, ``best_accuracy`` and ``best_description`` from the
        latest run of the local ``mlruns`` store. When no experiment or run
        exists the attributes are left untouched.

        Args:
            open_ui: When true (the default, matching the previous behaviour)
                start ``mlflow ui`` and open the experiment page in a browser.

        Returns:
            tuple: ``(best_split, best_features, best_params)``, or
            ``(None, None, None)`` when no experiment or run was found.
        """
        import webbrowser

        experiment_name = "BetPredictions"
        tracking_uri = os.path.abspath("mlruns")
        mlflow.set_tracking_uri(f"file:///{tracking_uri}")

        client = mlflow.tracking.MlflowClient()
        experiment = client.get_experiment_by_name(experiment_name)
        if experiment is None:
            print("No experiment found with name:", experiment_name)
            return None, None, None

        runs = client.search_runs(experiment.experiment_id, order_by=["start_time desc"], max_results=1)
        if not runs:
            print("No runs found in experiment:", experiment_name)
            return None, None, None

        latest_run = runs[0]
        run_name = latest_run.data.tags.get('mlflow.runName', 'No name available')

        if "RandomForest_" in run_name:
            try:
                self.number_experiment = int(run_name.split("RandomForest_")[1])
            except ValueError:
                # Keep the previous counter: setting it to None would make the
                # later ``number_experiment += 1`` in log_to_mlflow fail.
                print(f"Could not extract number from run_name: {run_name}")
        else:
            print(f"Run name does not contain 'RandomForest_' pattern.")

        # Work on a copy so the run object's own params are not mutated.
        params = dict(latest_run.data.params)

        split_raw = params.pop("Best split number", None)
        if isinstance(split_raw, str) and split_raw.isdigit():
            self.best_split = int(split_raw)
        else:
            self.best_split = None

        features_str = params.pop("Features", "")
        self.best_features = features_str.split(',') if features_str else []

        self.best_accuracy = latest_run.data.metrics.get("Test Accuracy", None)
        if isinstance(self.best_accuracy, str):
            try:
                self.best_accuracy = float(self.best_accuracy)
            except ValueError:
                self.best_accuracy = None

        self.best_description = latest_run.data.tags.get('mlflow.note.content', 'No description available')

        # The values handled here are strings: restore None and integers for
        # every hyperparameter, not only max_features (e.g. max_depth=None is
        # read back as the string "None").
        for key, value in params.items():
            if not isinstance(value, str):
                continue
            if value == "None":
                params[key] = None
            elif value.isdigit():
                params[key] = int(value)
        self.best_params = params

        if open_ui:
            try:
                # DEVNULL instead of PIPE: the pipes were never read, so a
                # chatty server could block once the pipe buffer filled up.
                subprocess.Popen(["mlflow", "ui"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            except OSError as exc:
                print(f"Could not start the MLflow UI: {exc}")
            else:
                time.sleep(5)  # give the server a moment to start listening
                mlflow_url = f"http://127.0.0.1:5000/#/experiments/{experiment.experiment_id}"
                # Portable replacement for the Windows-only ``start <url>``.
                webbrowser.open(mlflow_url)

        return self.best_split, self.best_features, self.best_params

    def _cv_accuracy(self, X, y, params, n_splits):
        """Return the mean test accuracy of a random forest over TimeSeriesSplit folds."""
        scores = []
        for train_index, test_index in TimeSeriesSplit(n_splits=n_splits).split(X):
            model = RandomForestClassifier(**params, random_state=self.random_state)
            model.fit(X.iloc[train_index], y.iloc[train_index])
            y_test_pred = model.predict(X.iloc[test_index])
            scores.append(accuracy_score(y.iloc[test_index], y_test_pred))
        return np.mean(scores)

    def train(self):
        """Search for a model that beats the best run stored in MLflow.

        The loop repeats these steps:

        1. Draw random hyperparameters and a split count near the current best
           and score them on all features.
        2. If that beats the reference score, run a greedy forward feature
           selection starting from the two team-name columns.
        3. If the selected feature set beats the best known accuracy, store
           it, log it to MLflow and plot feature importances.

        The loop has no exit condition; it runs until interrupted
        (e.g. ``KeyboardInterrupt``).

        Raises:
            RuntimeError: if no usable baseline run (parameters, split number
                and accuracy) could be loaded from MLflow.
        """
        data = self._load_data()

        self.fetch_latest_mlflow_run()

        if not isinstance(self.best_params, dict) or self.best_split is None or self.best_accuracy is None:
            # Previously this surfaced later as an opaque TypeError.
            raise RuntimeError(
                "No usable baseline run found in MLflow (hyperparameters, 'Best split number' "
                "and 'Test Accuracy' are required); log a baseline run before training."
            )

        target = self.target
        original_base_features = ["home_team_name", "away_team_name"]

        enriched_features = [col for col in data.columns if col != target]

        best_params = self.best_params
        best_split = int(self.best_split)
        best_features = self.best_features
        best_accuracy = self.best_accuracy

        # The hyperparameter stage is always scored on every available feature.
        X = data[enriched_features]
        y = data[target]

        hyperparam_score = self._cv_accuracy(X, y, best_params, best_split)
        print(f"Best model until now. Params: {best_params} and {best_split} splits -> Accuracy: {hyperparam_score}.")
        print(f"Upgrade with best feature set: {best_features} with accuracy : {best_accuracy}")

        def around(name, lower, offsets):
            """Sorted candidate values around the current best value of *name*."""
            base = best_params[name]
            return sorted({base} | {max(lower, base + i) for i in offsets})

        param_variations = {
            "n_estimators": around("n_estimators", 1, range(-50, 101, 4)),
            "max_depth": around("max_depth", 1, range(-5, 6)),
            "min_samples_split": around("min_samples_split", 2, range(-5, 6)),
            "min_samples_leaf": around("min_samples_leaf", 1, range(-3, 4)),
            "max_features": ["sqrt", "log2", None],
            "criterion": ["gini", "entropy"]
        }

        n_splits_options = [best_split + i for i in range(-10, 11) if best_split + i > 1]

        # Features the forward selection may add on top of the base pair.
        candidate_features = [f for f in enriched_features if f not in original_base_features]

        while True:
            base_features = original_base_features.copy()
            train_features = candidate_features.copy()

            best_score = 0

            # Iterating param_variations keeps the original draw order
            # (n_estimators, max_depth, ..., criterion).
            params = {name: random.choice(options) for name, options in param_variations.items()}
            n_splits = random.choice(n_splits_options)

            first_score = self._cv_accuracy(X, y, params, n_splits)

            if not first_score > hyperparam_score:
                continue

            test_hyperparams = params
            test_n_splits = n_splits
            print(f"New Best Found. Hyperparameters: {test_hyperparams} and {test_n_splits} splits with accuracy {first_score}")

            # Greedy forward selection: each round adds the single feature that
            # improves accuracy the most, and stops when none improves it.
            while train_features:
                improved = False
                current_best_score = best_score
                best_feature_set = None

                for feature in train_features:
                    current_features = base_features + [feature]

                    test_score = self._cv_accuracy(
                        data[current_features], data[target], test_hyperparams, test_n_splits
                    )

                    if test_score > current_best_score:
                        current_best_score = test_score
                        best_feature_set = current_features.copy()
                        improved = True
                        print(f"Improved. New best feature set: {best_feature_set} with accuracy {test_score}")

                if improved:
                    best_score = current_best_score
                    base_features = best_feature_set.copy()
                    train_features = [f for f in train_features if f not in best_feature_set]
                else:
                    print(f"No improvement found. Best feature set remains: {base_features} with accuracy {best_score}.")
                    break

            if best_score > best_accuracy:
                # Refresh the local threshold too; it used to keep the value
                # read at start-up, so later, worse candidates could replace
                # (and be logged over) a better model found in between.
                best_accuracy = best_score
                self.best_accuracy = best_score
                self.best_params = test_hyperparams
                self.best_split = test_n_splits
                self.best_features = base_features.copy()

                hyperparam_score = first_score

                self.log_to_mlflow()

                print(f"New Global Best Model Found! Hyperparameters: {test_hyperparams} and {test_n_splits} splits with accuracy {best_score}.")
                self.feature_importance()

                print(f"Saving and retrying with new parameters...")

            else:
                print(f"Model discarded. Best accuracy still {best_accuracy}. Retrying with new parameters...")
    
    def feature_importance(self):
        """Refit the current best model on the full dataset and plot its feature importances.

        Uses ``self.best_features`` as inputs and ``self.best_params`` as
        hyperparameters, then shows a horizontal bar chart with the most
        important feature at the top.
        """
        dataset = self._load_data()

        forest = RandomForestClassifier(**self.best_params, random_state=self.random_state)
        forest.fit(dataset[self.best_features], dataset[self.target])

        ranking = pd.DataFrame(
            {'Feature': self.best_features, 'Importance': forest.feature_importances_}
        ).sort_values(by='Importance', ascending=False)

        plt.figure(figsize=(10, 6))
        plt.barh(ranking['Feature'], ranking['Importance'], color='skyblue')
        plt.xlabel("Feature Importance")
        plt.ylabel("Feature")
        plt.title("Feature Importance using Final Global Best Model")
        # barh draws the first row at the bottom; flip so the top feature is first.
        plt.gca().invert_yaxis()
        plt.show()

    def log_to_mlflow(self):
        """Log the current best model state as a new "RandomForest_<n>" MLflow run.

        Increments ``self.number_experiment`` and records, when available, the
        test accuracy (metric), the feature list, the split count and every
        hyperparameter (params) in the "BetPredictions" experiment of the
        local ``mlruns`` store. The run description is ``self.description``
        when set, otherwise ``self.best_description``.
        """
        # fetch_latest_mlflow_run can leave the counter as None when the last
        # run name has no numeric suffix; treat that as 0 instead of failing
        # on ``None += 1``.
        self.number_experiment = (self.number_experiment or 0) + 1
        run_name = f"RandomForest_{self.number_experiment}"

        description = self.description if self.description is not None else self.best_description

        experiment_name = "BetPredictions"
        tracking_uri = os.path.abspath("mlruns")
        mlflow.set_tracking_uri(f"file:///{tracking_uri}")

        mlflow.set_experiment(experiment_name)

        with mlflow.start_run(run_name=run_name, description=description):
            if self.best_accuracy is not None:
                mlflow.log_metric("Test Accuracy", self.best_accuracy)

            if self.best_features:
                # Stored as one comma-joined string; fetch_latest_mlflow_run
                # splits it on "," again.
                mlflow.log_param("Features", ",".join(self.best_features))

            if self.best_split is not None:
                mlflow.log_param("Best split number", int(self.best_split))

            if isinstance(self.best_params, dict):
                for param, value in self.best_params.items():
                    mlflow.log_param(param, value)

            print("Model, metrics, parameters, and features logged to MLflow.")

    def prepare_features(self):
        """Ask for the fixtures to predict and return them as an encoded DataFrame.

        Prompts for the gameweek, the number of matches and, per match, the
        teams, date, time, referee, VAR and (optionally) new coaches. The
        answers are then encoded the same way ``_load_data`` encodes the
        training data:

        * categorical columns are label-encoded with an encoder fitted on the
          corresponding column of ``self.csv_path``;
        * circular columns are replaced by ``<column>_sin`` / ``<column>_cos``.

        Returns:
            pandas.DataFrame: one row per match. When the latest MLflow run
            lists features, the frame has exactly those columns in that order;
            features that cannot be derived from the answers are NaN and are
            reported with a warning.

        Raises:
            ValueError: if the match count is not an integer, or a categorical
                answer does not occur in the training data.
        """
        # Numeric so the sin/cos transform works; invalid input becomes NaN,
        # mirroring the errors="coerce" handling of date and time below.
        gameweek = pd.to_numeric(
            input("Please, tell me the gameweek you want to predict: ").strip(), errors="coerce"
        )

        num_matches = int(input("Please, tell me how many matches you want me to predict: "))
        match_data = []

        for i in range(num_matches):
            print(f"\nMatch {i+1} data:")

            home_team = input("Enter the home team name: ").strip()
            away_team = input("Enter the away team name: ").strip()
            match_date = input("Enter the match date (YYYY-MM-DD): ").strip()
            match_time = input("Enter the match time (HH:MM): ").strip()
            referee = input("Enter the referee's name: ").strip()
            var = input("Enter the VAR's name: ").strip()

            coach_change_home = input(f"Has there been a change of coach for {home_team}? (yes/no): ").strip().lower()
            coach_change_away = input(f"Has there been a change of coach for {away_team}? (yes/no): ").strip().lower()

            home_coach = None
            if coach_change_home in ["yes", "y"]:
                home_coach = input(f"Enter the new coach for {home_team}: ").strip()

            away_coach = None
            if coach_change_away in ["yes", "y"]:
                away_coach = input(f"Enter the new coach for {away_team}: ").strip()

            match_date = pd.to_datetime(match_date, format="%Y-%m-%d", errors="coerce")
            match_time = pd.to_datetime(match_time, format="%H:%M", errors="coerce")

            match_info = {
                "gameweek": gameweek,
                "home_team_name": home_team,
                "away_team_name": away_team,
                "day_of_week": match_date.dayofweek,
                "day_of_year": match_date.dayofyear,
                "hour_of_day": match_time.hour + match_time.minute / 60,
                "referee": referee,
                "var": var,
                # TODO: decide how to fill the coach when there was no change.
                "home_trainer": home_coach if home_coach else None,
                "away_trainer": away_coach if away_coach else None,
            }

            # The row was previously built but never stored, leaving the
            # resulting DataFrame empty.
            match_data.append({key: value for key, value in match_info.items() if value is not None})

        match_data_df = pd.DataFrame(match_data)

        self.fetch_latest_mlflow_run()
        features = self.best_features or []

        # Fit each encoder on the training column so a label maps to the same
        # integer as during training; fitting on the single entered value
        # would always yield 0.
        present_categoricals = [c for c in self.categorical_features if c in match_data_df.columns]
        if present_categoricals:
            training = pd.read_csv(self.csv_path, usecols=lambda c: c in present_categoricals)
            for col in present_categoricals:
                if col not in training.columns:
                    continue
                encoder = LabelEncoder().fit(training[col])
                filled = match_data_df[col].notna()
                encoded = pd.Series(np.nan, index=match_data_df.index)
                if filled.any():
                    # transform() raises ValueError for labels unseen in training.
                    encoded[filled] = encoder.transform(match_data_df.loc[filled, col])
                match_data_df[col] = encoded

        for col, max_value in self.circular_max_values.items():
            if col not in match_data_df.columns:
                continue
            angle = 2 * np.pi * match_data_df[col] / max_value
            match_data_df[f"{col}_sin"] = np.sin(angle)
            match_data_df[f"{col}_cos"] = np.cos(angle)
            match_data_df = match_data_df.drop(columns=[col])

        missing = [f for f in features if f not in match_data_df.columns]
        if missing:
            print(f"Warning: the model expects features that were not provided: {missing}")

        if features:
            # Same columns and order as the stored model; missing ones are NaN.
            match_data_df = match_data_df.reindex(columns=features)

        return match_data_df

    def predict(self):

        self.prepare_features()

        if self.model is None:
            print("Model is not loaded yet.")
            return None

        # Prepare the features for prediction
        prepared_data = self.prepare_features(match_data)

        # Make predictions (0 = Local Win, 1 = Draw, 2 = Away Win)
        predictions = self.model.predict(prepared_data)

        # Get prediction probabilities
        probabilities = self.model.predict_proba(prepared_data)

        # Prepare a list to store results
        results = []

        for i, prob in enumerate(probabilities):
            match_result = {
                "match_id": match_data.iloc[i]["match_id"],  # Assuming match_id is a column in your data
                "local_win_probability": prob[0] * 100,     # Probability of home team win
                "draw_probability": prob[1] * 100,          # Probability of draw
                "away_win_probability": prob[2] * 100       # Probability of away team win
            }
            results.append(match_result)

        return results

    def run(self):
        decision = input("Do you want to train the model, predict or open MLFlow? (train/predict/MLFlow): ").strip().lower()
        if decision in ["train", "tr"]:
            decision2 = input("Have you changed the data? (yes/no): ").strip().lower()
            if decision2 in ["yes", "y"]:
                decision3 = input("Have you introduced more features or more seasons? (features/seasons): ").strip().lower()
                if decision3 in ["features", "feature", "f"]:
                    self.description = input("Please, introduce a description for this new model: ")
                    self.train()
                elif decision3 in ["seasons", "season", "s"]:
                    self.description = input("Please, introduce a description for this new model: ")
                    self.train()
            elif decision2 in ["no", "n"]:
                self.train()
        elif decision in ["predict", "pr"]:
            self.predict()
        elif decision in ["MLFlow", "mlflow", "ml"]:
            self.fetch_latest_mlflow_run()

In [29]:
# Dataset read by the pipeline.
csv_path = "final_dataset.csv"

# Columns that are label-encoded to integers.
categorical_features = [
    "home_team_name",
    "away_team_name",
    "home_trainer",
    "away_trainer",
    "stadium",
    "referee",
    "var",
]

# Periodic columns and the period used for their sin/cos encoding.
circular_max_values = {
    "hour_of_day": 24,
    "day_of_week": 7,
    "gameweek": 38,
    "day_of_year": 365,
    "home_team_rank": 20,
    "away_team_rank": 20,
}

target = "result"
random_state = 42

pipeline = ModelPipeline(
    csv_path=csv_path,
    categorical_features=categorical_features,
    circular_max_values=circular_max_values,
    target=target,
    random_state=random_state,
)

# Starts the interactive train / predict / MLflow prompt.
pipeline.run()

Best model until now. Params: {'criterion': 'entropy', 'max_depth': 3, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 23, 'n_estimators': 64} and 2 splits -> Accuracy: 0.5268041237113402.
Upgrade with best feature set: ['home_team_name', 'away_team_name', 'prob_home_avg', 'home_AgeAvg_10', 'avg_home_wins_last_12', 'avg_home_wins_between_last_6', 'avg_away_wins_last_1', 'avg_attendance_1_home', 'avg_home_wins_last_6_home', 'away_team_consecutive_losses_global', 'away_team_consecutive_wins_global', 'avg_away_wins_last_1_away', 'avg_home_wins_between_last_4'] with accuracy : 0.5484536082474227
New Best Found. Hyperparameters: {'n_estimators': 94, 'max_depth': 3, 'min_samples_split': 20, 'min_samples_leaf': 3, 'max_features': None, 'criterion': 'entropy'} and 3 splits with accuracy 0.5272810637322329
Improved. New best feature set: ['home_team_name', 'away_team_name', 'home_trainer'] with accuracy 0.48693259972489683
Improved. New best feature set: ['home_team_name', 'aw


KeyboardInterrupt



In [None]:
# Disabled cell: the code below is wrapped in a string literal, so it is only
# evaluated as a string and never executed. It logs a hand-written baseline
# (hyperparameters, split count, feature list and accuracy) as run
# "RandomForest_1" with description "2017-2025" to the "BetPredictions"
# MLflow experiment in the local "mlruns" store.
'''best_params = {
    "n_estimators": 109,
    "max_depth": 4,
    "min_samples_split": 19,
    "min_samples_leaf": 1,
    "max_features": None,
    "criterion": "entropy"
}

best_split = 3

best_features = ["home_team_name", "away_team_name", "prob_away_avg", "prob_home_avg", "home_team_goals_difference", 
                    "home_AgeAvg_6", "avg_away_wins_last_4", "avg_home_wins_last_13", "avg_attendance_14_home", 
                    "avg_away_wins_last_19_away", "avg_attendance%_19_away", "away_AgeAvg_5", "avg_attendance%_9_home", 
                    "away_team_win_percentage_between", "referee", "away_AgeAvg_3", "avg_attendance_20_home", 
                    "avg_attendance_12_home", "avg_away_wins_last_5"
]

best_accuracy = 0.5469967904630902


number_experiment = 1
run_name = f"RandomForest_{number_experiment}"

description = "2017-2025"

experiment_name = "BetPredictions"
tracking_uri = os.path.abspath("mlruns")
mlflow.set_tracking_uri(f"file:///{tracking_uri}")

mlflow.set_experiment(experiment_name)

with mlflow.start_run(run_name=run_name, description=description):
    if best_accuracy is not None:
        mlflow.log_metric("Test Accuracy", best_accuracy)
    
    if best_features:
        mlflow.log_param("Features", ",".join(best_features))
    
    if best_split is not None:
        mlflow.log_param("Best split number", int(best_split))
    
    if isinstance(best_params, dict):
        for param, value in best_params.items():
            mlflow.log_param(param, value)

    print("Model, metrics, parameters, and features logged to MLflow.")'''

'best_params = {\n    "n_estimators": 109,\n    "max_depth": 4,\n    "min_samples_split": 19,\n    "min_samples_leaf": 1,\n    "max_features": None,\n    "criterion": "entropy"\n}\n\nbest_split = 3\n\nbest_features = ["home_team_name", "away_team_name", "prob_away_avg", "prob_home_avg", "home_team_goals_difference", \n                    "home_AgeAvg_6", "avg_away_wins_last_4", "avg_home_wins_last_13", "avg_attendance_14_home", \n                    "avg_away_wins_last_19_away", "avg_attendance%_19_away", "away_AgeAvg_5", "avg_attendance%_9_home", \n                    "away_team_win_percentage_between", "referee", "away_AgeAvg_3", "avg_attendance_20_home", \n                    "avg_attendance_12_home", "avg_away_wins_last_5"\n]\n\nbest_accuracy = 0.5469967904630902\n\n\nnumber_experiment = 1\nrun_name = f"RandomForest_{number_experiment}"\n\ndescription = "2017-2025"\n\nexperiment_name = "BetPredictions"\ntracking_uri = os.path.abspath("mlruns")\nmlflow.set_tracking_uri(f"file:///{tr

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so it is only
# evaluated as a string and never executed. It holds a standalone
# MLFlowHandler class whose fetch_latest_mlflow_run reads the latest
# "BetPredictions" run and returns (best_split, best_features, best_params),
# followed by a small script that calls it and prints the three values.
'''class MLFlowHandler:
    def __init__(self):
        self.number_experiment = None
        self.best_params = None
        self.best_split = None
        self.best_features = None
        self.best_accuracy = None
        self.best_description = None

    def fetch_latest_mlflow_run(self):
        experiment_name = "BetPredictions"
        tracking_uri = os.path.abspath("mlruns")
        mlflow.set_tracking_uri(f"file:///{tracking_uri}")

        client = mlflow.tracking.MlflowClient()
        
        experiment = client.get_experiment_by_name(experiment_name)

        if experiment is None:
            print("No experiment found with name:", experiment_name)
            return None, None, None

        runs = client.search_runs(experiment.experiment_id, order_by=["start_time desc"], max_results=1)

        if not runs:
            print("No runs found in experiment:", experiment_name)
            return None, None, None

        latest_run = runs[0]

        # Get the run name from tags
        run_name = latest_run.data.tags.get('mlflow.runName', 'No name available')
        print(f"Latest run name: {run_name}")

        # Extract the experiment number from the run name
        if "RandomForest_" in run_name:
            try:
                self.number_experiment = int(run_name.split("RandomForest_")[1])
            except ValueError:
                print(f"Could not extract number from run_name: {run_name}")
                self.number_experiment = None
        else:
            print(f"Run name does not contain 'RandomForest_' pattern.")

        # Extract best parameters from the params section
        self.best_params = latest_run.data.params
        
        # Separate best split and features from the params
        self.best_split = self.best_params.get("Best split number")
        if isinstance(self.best_split, str) and self.best_split.isdigit():
            self.best_split = int(self.best_split)
        else:
            self.best_split = None
        
        features_str = self.best_params.get("Features", "")
        self.best_features = features_str.split(',') if features_str else []

        # Extract best accuracy from metrics
        self.best_accuracy = latest_run.data.metrics.get("Test Accuracy", None)
        if isinstance(self.best_accuracy, str):
            try:
                self.best_accuracy = float(self.best_accuracy)
            except ValueError:
                self.best_accuracy = None

        self.best_params.pop("Best split number", None)
        self.best_params.pop("Features", None)
        print(f"Best parameters (all): {self.best_params}")

        # Extract description from the tags
        self.best_description = latest_run.data.tags.get('mlflow.note.content', 'No description available')
        print(f"Best description: {self.best_description}")

        print("Fetched latest run data.")

        # Return the components separately
        return self.best_split, self.best_features, self.best_params


# Llamar a la función
handler = MLFlowHandler()
best_split, best_features, best_params = handler.fetch_latest_mlflow_run()

# Imprimir los resultados separados
print("\nReturned components:")
print(f"Best split: {best_split}")
print(f"Best features: {best_features}")
print(f"Best parameters: {best_params}")

'''

'class MLFlowHandler:\n    def __init__(self):\n        self.number_experiment = None\n        self.best_params = None\n        self.best_split = None\n        self.best_features = None\n        self.best_accuracy = None\n        self.best_description = None\n\n    def fetch_latest_mlflow_run(self):\n        experiment_name = "BetPredictions"\n        tracking_uri = os.path.abspath("mlruns")\n        mlflow.set_tracking_uri(f"file:///{tracking_uri}")\n\n        client = mlflow.tracking.MlflowClient()\n        \n        experiment = client.get_experiment_by_name(experiment_name)\n\n        if experiment is None:\n            print("No experiment found with name:", experiment_name)\n            return None, None, None\n\n        runs = client.search_runs(experiment.experiment_id, order_by=["start_time desc"], max_results=1)\n\n        if not runs:\n            print("No runs found in experiment:", experiment_name)\n            return None, None, None\n\n        latest_run = runs[0]\n\n  