In [45]:
import os
import subprocess
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

# Model testing

In [48]:
class ModelPipeline:
    def __init__(self, csv_path, skip_rows, random_state):
        # Initialize the class with the file path, rows to skip, and random state for reproducibility
        self.csv_path = csv_path
        self.skip_rows = skip_rows
        self.df = self._load_data()  # Load the dataset
        self.X = self.df[[
            "home_team_name",
            "away_team_name",
            "home_team_rank",
            "away_team_rank",
            "day_of_week",
            "hour_of_day",
            "home_team_points",
            "away_team_points",
            "home_team_goals_for",
            "away_team_goals_for",
            "home_team_goals_againsts",
            "away_team_goals_against",
            "home_team_goals_difference",
            "away_team_goals_difference"
        ]]
        self.y = self.df["result"]  # Target variable: match result
        self.random_state = random_state
        self.pipeline = self._create_pipeline()  # Create the machine learning pipeline
        self.param_grid = self._create_param_grid()  # Define the hyperparameter grid for tuning
        self.model = GridSearchCV(
            estimator=self.pipeline,  # The model pipeline to tune
            param_grid=self.param_grid,  # The hyperparameters to search
            scoring="accuracy",  # Scoring method for evaluation
            cv=TimeSeriesSplit(n_splits=5),  # Cross-validation strategy for time series
            n_jobs=-1,  # Use all available CPU cores for parallel processing
            verbose=4,  # Verbosity level for detailed output during fitting
        )
        # Initialize accuracy tracking variables
        self.average_train_accuracy = None
        self.average_test_accuracy = None
        self.last_split_train_accuracy = None
        self.last_split_test_accuracy = None

    def _load_data(self):
        # Load the CSV data, skipping the specified number of rows, and reset the index
        df = pd.read_csv(self.csv_path)
        return df.iloc[self.skip_rows :].reset_index(drop=True)

    def _create_pipeline(self):
        # Create a machine learning pipeline with the following steps:
        return Pipeline(
            steps=[
                (
                    "CategoricalFeatures",  # Step name for categorical feature processing
                    ColumnTransformer(
                        transformers=[
                            (
                                "cat",  # Transformer name
                                OneHotEncoder(handle_unknown="ignore", sparse_output=False),  # One-hot encoding for categorical features
                                ["home_team_name", "away_team_name"],  # Columns to apply one-hot encoding
                            ),
                            (
                                'home_team_rank', FunctionTransformer(lambda x: np.column_stack([
                                    np.cos(2 * np.pi * x / 20),
                                    np.sin(2 * np.pi * x / 20)
                                    ]), validate=True), ["home_team_rank"]
                            ),
                            (
                                'away_team_rank', FunctionTransformer(lambda x: np.column_stack([
                                    np.cos(2 * np.pi * x / 20),
                                    np.sin(2 * np.pi * x / 20)
                                    ]), validate=True), ["away_team_rank"]
                            ),
                            (
                                'day_of_week', FunctionTransformer(lambda x: np.column_stack([
                                    np.cos(2 * np.pi * x / 7),
                                    np.sin(2 * np.pi * x / 7)
                                    ]), validate=True), ["day_of_week"]
                            ),
                            (
                                'hour_of_day', FunctionTransformer(lambda x: np.column_stack([
                                    np.cos(2 * np.pi * x / 24),
                                    np.sin(2 * np.pi * x / 24)
                                    ]), validate=True), ["hour_of_day"]
                            ),
                            ('home_team_points', SimpleImputer(strategy='mean'), ['home_team_points']),
                            ('away_team_points', SimpleImputer(strategy='mean'), ['away_team_points']),
                            ('home_team_goals_for', SimpleImputer(strategy='mean'), ['home_team_goals_for']),
                            ('away_team_goals_for', SimpleImputer(strategy='mean'), ['away_team_goals_for']),
                            ('home_team_goals_againsts', SimpleImputer(strategy='mean'), ['home_team_goals_againsts']),
                            ('away_team_goals_against', SimpleImputer(strategy='mean'), ['away_team_goals_against']),
                            ('home_team_goals_difference', SimpleImputer(strategy='mean'), ['home_team_goals_difference']),
                            ('away_team_goals_difference', SimpleImputer(strategy='mean'), ['away_team_goals_difference'])
                        ]
                    ),
                ),
                ("StandardScaler", StandardScaler(with_mean=True)),  # Standardize the features
                ("SGDClassifier", SGDClassifier(random_state=self.random_state, max_iter=1000)),  # SGD classifier for classification
            ]
        )

    def _create_param_grid(self):
        # Define the hyperparameter grid for tuning the SGDClassifier
        return {
            "SGDClassifier__tol": [1e-2, 1e-3, 1e-4],  # Tolerance for stopping criterion
            "SGDClassifier__alpha": [1e-3, 1e-4, 1e-5, 1e-6, 1e-7],  # Regularization strength
            "SGDClassifier__penalty": ["l2", "l1", "elasticnet"],  # Regularization type
            "SGDClassifier__loss": ["hinge", "log_loss", "modified_huber"],  # Loss function
        }

    def train(self):
        # Train the model using time series cross-validation
        tscv = TimeSeriesSplit(n_splits=5)  # Split data for time series cross-validation
        scores = []  # List to store accuracy scores for each fold

        # Iterate over the splits
        for split, (train_index, test_index) in enumerate(tscv.split(self.X), 1):
            # Split data into training and test sets
            X_train, X_test = self.X.iloc[train_index], self.X.iloc[test_index]
            y_train, y_test = self.y.iloc[train_index], self.y.iloc[test_index]
            self.model.fit(X_train, y_train)  # Fit the model to the training data
            accuracy_train = self.model.score(X_train, y_train)  # Calculate training accuracy
            accuracy_test = self.model.score(X_test, y_test)  # Calculate test accuracy
            scores.append((accuracy_train, accuracy_test))  # Store the scores for this split

        # Calculate the average accuracy across all splits
        self.average_train_accuracy = sum([score[0] for score in scores]) / len(scores)
        self.average_test_accuracy = sum([score[1] for score in scores]) / len(scores)
        # Store the accuracy for the last split
        self.last_split_train_accuracy = scores[-1][0]
        self.last_split_test_accuracy = scores[-1][1]

        # Print the accuracy results
        print(f"Average Train Accuracy: {self.average_train_accuracy:.4f}")
        print(f"Average Test Accuracy: {self.average_test_accuracy:.4f}")
        print(f"Last Split Train Accuracy: {self.last_split_train_accuracy:.4f}")
        print(f"Last Split Test Accuracy: {self.last_split_test_accuracy:.4f}")

        # Verificar si el modelo tiene coef_
        if hasattr(self.model, "coef_"):
            feature_importance = np.abs(self.model.coef_).mean(axis=0)  # Promediar si es multi-clase
            feature_importance_df = pd.DataFrame({
                'Feature': self.X.columns,
                'Importance': feature_importance
            }).sort_values(by='Importance', ascending=False)

            print("\nFeature Importance:")
            print(feature_importance_df)

            self.feature_importance_ = feature_importance_df  # Guardar para acceso posterior
        else:
            print("Feature importance not available for this model.")

        # Call the function to log the model to MLflow
        self.log_to_mlflow()

    def log_to_mlflow(self):
        # Ask if the user wants to save the model to MLflow
        save_model = input("Do you want to save the model in MLflow? (yes/no): ").strip().lower()
        if save_model not in ["yes", "y"]:
            print("Model was not saved to MLflow.")
            return  # Exit the function if the model is not to be saved

        # Ask for the run name and description
        run_name = input("Enter the run name: ").strip()
        description = input("Enter the run description: ").strip()

        # Set the experiment name and tracking URI for MLflow
        experiment_name = "BetPredictions"
        tracking_uri = os.path.abspath("mlruns")
        mlflow.set_tracking_uri(f"file:///{tracking_uri}")

        # Start the MLflow UI if it's not already running
        subprocess.Popen(f"mlflow ui --backend-store-uri file:///{tracking_uri}", shell=True)

        mlflow.set_experiment(experiment_name)

        # Log the model, metrics, parameters, and features to MLflow
        with mlflow.start_run(run_name=run_name, description=description):
            # Log the accuracy metrics
            mlflow.log_metric("Average Train Accuracy", self.average_train_accuracy)
            mlflow.log_metric("Average Test Accuracy", self.average_test_accuracy)
            mlflow.log_metric("Last Split Train Accuracy", self.last_split_train_accuracy)
            mlflow.log_metric("Last Split Test Accuracy", self.last_split_test_accuracy)

            # Log the features used in the model
            features_used = self.X.columns.tolist()
            mlflow.log_param("features", features_used)

            # Log the best hyperparameters if available
            if hasattr(self.model, 'best_params_'):
                mlflow.log_params(self.model.best_params_)

            # Print a message indicating the model has been logged
            print("Model, metrics, parameters, and features logged to MLflow.")

        # Ask if the user wants to open the MLflow UI
        open_mlflow = input("Do you want to open the MLflow UI page? (yes/no): ").strip().lower()
        if open_mlflow in ["yes", "y"]:
            # Open the MLflow experiment page in the browser
            experiment_id = "492606161886242227"  # Replace with your experiment ID
            mlflow_url = f"http://127.0.0.1:5000/#/experiments/{experiment_id}"
            # For Windows, use the 'start' command
            subprocess.Popen(f"start {mlflow_url}", shell=True)

# Results

In [49]:
csv_path = "final_dataset.csv"

skip_rows = 0

random_state = 0

pipeline = ModelPipeline(csv_path, skip_rows, random_state)

pipeline.train()

Fitting 5 folds for each of 135 candidates, totalling 675 fits
Fitting 5 folds for each of 135 candidates, totalling 675 fits
Fitting 5 folds for each of 135 candidates, totalling 675 fits
Fitting 5 folds for each of 135 candidates, totalling 675 fits
Fitting 5 folds for each of 135 candidates, totalling 675 fits
Average Train Accuracy: 0.6877
Average Test Accuracy: 0.3889
Last Split Train Accuracy: 0.6085
Last Split Test Accuracy: 0.3953
Feature importance not available for this model.
Model was not saved to MLflow.


In [50]:
"""class ModelPipeline:
    def __init__(self, csv_path, skip_rows, random_state):
        self.csv_path = csv_path
        self.skip_rows = skip_rows
        self.df = self._load_data()
        self.X = self.df.drop(columns=['target'])
        self.y = self.df['target']
        self.random_state = random_state
        self.pipeline = self._create_pipeline()
        self.param_grid = self._create_param_grid()
        self.model = GridSearchCV(
            estimator=self.pipeline,
            param_grid=self.param_grid,
            scoring='f1_weighted',
            cv=TimeSeriesSplit(n_splits=5),
            n_jobs=-1,
            verbose=4
        )

    def _load_data(self):
        df = pd.read_csv(self.csv_path)
        return df.iloc[self.skip_rows:].reset_index(drop=True)

    def _create_pipeline(self):
        return Pipeline(
            steps=[
                ('CategoricalFeatures', ColumnTransformer(
                    transformers=[
                        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), [
                            "home_team_name",
                            "away_team_name",
                            "referee",
                            "var",
                            "stadium",
                        ]),
                        ('gameweek', FunctionTransformer(lambda x: np.column_stack([
                            np.cos(2 * np.pi * x / 38),
                            np.sin(2 * np.pi * x / 38)
                        ]), validate=True), ["gameweek"]),
                        ('hour_of_the_day', FunctionTransformer(lambda x: np.column_stack([
                            np.cos(2 * np.pi * x / 24),
                            np.sin(2 * np.pi * x / 24)
                        ]), validate=True), ["hour_of_the_day"]),
                        ('avg_home_wins_last_10', SimpleImputer(strategy='mean'), ['avg_home_wins_last_10']),
                        ('avg_away_wins_last_10', SimpleImputer(strategy='mean'), ['avg_away_wins_last_10']),
                        ('avg_home_wins_last_5_home', SimpleImputer(strategy='mean'), ['avg_home_wins_last_3_home']),
                        ('avg_away_wins_last_5_away', SimpleImputer(strategy='mean'), ['avg_away_wins_last_3_away']),
                        ('avg_home_attendance_perc_last_10', SimpleImputer(strategy='mean'), ['avg_home_attendance_perc_last_10']),
                        ('avg_home_attendance_last_3', SimpleImputer(strategy='mean'), ['avg_home_attendance_last_3']),
                        ('avg_home_attendance_perc_last_3', SimpleImputer(strategy='mean'), ['avg_home_attendance_perc_last_3']),
                        ('avg_away_attendance_perc_last_10', SimpleImputer(strategy='mean'), ['avg_away_attendance_perc_last_10']),
                        ('avg_away_attendance_perc_last_3', SimpleImputer(strategy='mean'), ['avg_away_attendance_perc_last_3'])
                    ]
                )),
                ("StandardScaler", StandardScaler(with_mean=True)),
                ("SGDClassifier", SGDClassifier(
                    random_state=self.random_state,
                    max_iter=1000
                ))
            ]
        )

    def _create_param_grid(self):
        return {
            "SGDClassifier__tol": [1e-2, 1e-3, 1e-4],
            "SGDClassifier__alpha": [1e-3, 1e-4, 1e-5, 1e-6, 1e-7],
            "SGDClassifier__penalty": ['l2', 'l1', 'elasticnet'],
            "SGDClassifier__loss": ['hinge', 'log_loss', 'modified_huber']
        }

    def train(self):
        tscv = TimeSeriesSplit(n_splits=5)
        scores = []
        f1_scores_train = []
        f1_scores_test = []

        for split, (train_index, test_index) in enumerate(tscv.split(self.X), 1):
            X_train, X_test = self.X.iloc[train_index], self.X.iloc[test_index]
            y_train, y_test = self.y.iloc[train_index], self.y.iloc[test_index]
            self.model.fit(X_train, y_train)
            accuracy_train = self.model.score(X_train, y_train)
            accuracy_test = self.model.score(X_test, y_test)
            scores.append((accuracy_train, accuracy_test))
            y_pred_train = self.model.predict(X_train)
            y_pred_test = self.model.predict(X_test)
            f1_train = f1_score(y_train, y_pred_train, average='weighted')
            f1_test = f1_score(y_test, y_pred_test, average='weighted')
            f1_scores_train.append(f1_train)
            f1_scores_test.append(f1_test)

        self.plot_splits(tscv)

        average_train_accuracy = sum([score[0] for score in scores]) / len(scores)
        average_test_accuracy = sum([score[1] for score in scores]) / len(scores)
        average_train_f1 = sum(f1_scores_train) / len(f1_scores_train)
        average_test_f1 = sum(f1_scores_test) / len(f1_scores_test)

        print(f"Average Train Accuracy: {average_train_accuracy:.4f}")
        print(f"Average Test Accuracy: {average_test_accuracy:.4f}")
        print(f"Average Train F1-Score: {average_train_f1:.4f}")
        print(f"Average Test F1-Score: {average_test_f1:.4f}")

    def plot_splits(self, tscv):
        plt.figure(figsize=(12, 6))
        for i, (train_index, test_index) in enumerate(tscv.split(self.X)):
            plt.plot(train_index, [i + 1] * len(train_index), '|', color='blue', label='Train Set' if i == 0 else "", markersize=10)
            plt.plot(test_index, [i + 1] * len(test_index), '|', color='orange', label='Test Set' if i == 0 else "", markersize=10)
        plt.title("División Temporal: Train vs Test")
        plt.xlabel("Índice")
        plt.ylabel("División")
        plt.legend()
        plt.grid(axis='x', linestyle='--', alpha=0.7)
        plt.show()

    def log_to_mlflow(self):
        experiment_name = "BetPredictions"
        mlflow.set_experiment(experiment_name)

        save_results = input("Do you want to save results to MLflow? (yes/no): ").strip().lower()
        if save_results == 'yes':
            run_name = input("Enter run name: ").strip()
            description = input("Enter run description: ").strip()

            with mlflow.start_run(run_name=run_name, description=description):
                if hasattr(self.model, 'best_params_'):
                    mlflow.log_params(self.model.best_params_)

                features_used = self.X.columns.tolist()
                mlflow.log_param("features", features_used)

                if hasattr(self.model, 'best_estimator_'):
                    mlflow.sklearn.log_model(self.model.best_estimator_, "best_model_pipeline")

                print("Model, metrics, parameters, and features logged to MLflow.")"""

'class ModelPipeline:\n    def __init__(self, csv_path, skip_rows, random_state):\n        self.csv_path = csv_path\n        self.skip_rows = skip_rows\n        self.df = self._load_data()\n        self.X = self.df.drop(columns=[\'target\'])\n        self.y = self.df[\'target\']\n        self.random_state = random_state\n        self.pipeline = self._create_pipeline()\n        self.param_grid = self._create_param_grid()\n        self.model = GridSearchCV(\n            estimator=self.pipeline,\n            param_grid=self.param_grid,\n            scoring=\'f1_weighted\',\n            cv=TimeSeriesSplit(n_splits=5),\n            n_jobs=-1,\n            verbose=4\n        )\n\n    def _load_data(self):\n        df = pd.read_csv(self.csv_path)\n        return df.iloc[self.skip_rows:].reset_index(drop=True)\n\n    def _create_pipeline(self):\n        return Pipeline(\n            steps=[\n                (\'CategoricalFeatures\', ColumnTransformer(\n                    transformers=[\n     