In [12]:
import os
import subprocess
import sys
import webbrowser
from pathlib import Path

import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer

# Model testing

In [13]:
class ModelPipeline:
    def __init__(self, csv_path, skip_rows, random_state, X_new):
        """Load the match history and build the model-selection objects.

        Args:
            csv_path: Path of the CSV file holding the historical matches.
            skip_rows: Number of leading rows of that file to discard.
            random_state: Seed forwarded to ``SGDClassifier``.
            X_new: DataFrame with the upcoming fixtures used by ``predict``.

        Raises:
            KeyError: If the CSV lacks one of the feature columns or the
                ``result`` column.
        """
        self.csv_path = csv_path
        self.skip_rows = skip_rows
        self.df = self._load_data()

        feature_columns = [
            "home_team_name",
            "away_team_name",
            "home_team_rank",
            "away_team_rank",
            "prob_home_avg",
            "prob_draw_avg",
            "prob_away_avg",
            "home_team_points",
            "away_team_points",
            "home_team_consecutive_wins_global",
            "home_team_consecutive_losses_global",
            "away_team_consecutive_wins_global",
            "away_team_consecutive_losses_global",
        ]
        self.X = self.df[feature_columns]
        self.y = self.df["result"]

        self.random_state = random_state
        self.pipeline = self._create_pipeline()
        self.param_grid = self._create_param_grid()

        # train() walks forward through the rows, so they are treated as
        # time-ordered (assumes the CSV is sorted chronologically - TODO confirm).
        # The inner cross-validation therefore uses TimeSeriesSplit, so every
        # validation fold lies after its training fold; the default K-fold
        # would score a candidate on matches older than its training data.
        self.model = GridSearchCV(
            estimator=self.pipeline,
            param_grid=self.param_grid,
            scoring="accuracy",
            cv=TimeSeriesSplit(n_splits=5),
            n_jobs=-1,
            verbose=4,
        )

        # Results filled in by train().
        self.last_train_accuracy = None
        self.mean_test_accuracy = None
        self.best_global_test_accuracy = None
        self.best_global_train_accuracy = None
        self.best_params = None
        self.best_split_name = None

        self.X_new = X_new
        self.probabilities_df = None

    def _load_data(self):
        df = pd.read_csv(self.csv_path)
        return df.iloc[self.skip_rows :].reset_index(drop=True)

    def _create_pipeline(self):
        """Build the preprocessing + ``SGDClassifier`` pipeline.

        Steps:
            1. ``CategoricalFeatures``: a ``ColumnTransformer`` that one-hot
               encodes the team names, maps each rank to a (cos, sin) pair and
               mean-imputes every other numeric feature. Columns not listed
               here are dropped (``ColumnTransformer`` default).
            2. ``StandardScaler``: standardises every resulting column.
            3. ``SGDClassifier``: seeded with ``self.random_state``.

        Returns:
            An unfitted ``sklearn.pipeline.Pipeline``.
        """
        team_columns = ["home_team_name", "away_team_name"]
        rank_columns = ["home_team_rank", "away_team_rank"]
        imputed_columns = [
            "prob_home_avg",
            "prob_draw_avg",
            "prob_away_avg",
            "home_team_points",
            "away_team_points",
            "home_team_consecutive_wins_global",
            "home_team_consecutive_losses_global",
            "away_team_consecutive_wins_global",
            "away_team_consecutive_losses_global",
        ]

        transformers = [
            (
                "cat",
                # Teams unseen during fit are encoded as all zeros.
                OneHotEncoder(handle_unknown="ignore", sparse_output=False),
                team_columns,
            )
        ]
        # A named function (instead of a lambda) keeps the fitted pipeline
        # picklable with the standard pickle module.
        transformers += [
            (
                column,
                FunctionTransformer(self._encode_rank_cyclically, validate=True),
                [column],
            )
            for column in rank_columns
        ]
        transformers += [
            (column, SimpleImputer(strategy="mean"), [column])
            for column in imputed_columns
        ]

        return Pipeline(
            steps=[
                ("CategoricalFeatures", ColumnTransformer(transformers=transformers)),
                ("StandardScaler", StandardScaler(with_mean=True)),
                ("SGDClassifier", SGDClassifier(random_state=self.random_state)),
            ]
        )

    @staticmethod
    def _encode_rank_cyclically(x, period=20):
        """Map rank values to (cos, sin) coordinates on a circle.

        Args:
            x: Array of ranks (2-D when called through ``FunctionTransformer``
                with ``validate=True``).
            period: Length of one full turn; 20 is presumably the number of
                teams in the league - TODO confirm.

        Returns:
            Array with the cosine column(s) followed by the sine column(s).
        """
        # NOTE(review): a cyclic encoding places rank 1 next to rank `period`;
        # confirm that this is intended for league positions.
        angle = 2 * np.pi * x / period
        return np.column_stack([np.cos(angle), np.sin(angle)])

    def _create_param_grid(self):
        return {
            "SGDClassifier__tol": [
                1e-3
            ],
            "SGDClassifier__alpha": [
                #1e-1, 
                1e-2, 
                #1e-3
            ],
            "SGDClassifier__penalty": [
                #"l2", 
                "l1", 
                #"elasticnet"
            ],
            "SGDClassifier__loss": [
                "log_loss", 
                #"modified_huber"
            ],
            "SGDClassifier__max_iter": [3000],
            "SGDClassifier__learning_rate": [
                #"optimal", 
                "constant", 
                #"invscaling"
            ],
            "SGDClassifier__eta0": [
                #1e-3, 
                1e-4,
                #1e-5
            ]
        }

    def train(self):
        print("\n🔹 Iniciando Walk-Forward Validation...")

        initial_train_size = 380  # Empezamos con 380 partidos
        step_size = 10  # Avanzamos de 10 en 10 partidos

        n_steps = (len(self.X) - initial_train_size) // step_size

        test_accuracies = []
        last_train_accuracy = None

        for step in range(0, n_steps * step_size, step_size):
            train_end = initial_train_size + step
            train_index = list(range(train_end))
            test_index = list(range(train_end, min(train_end + step_size, len(self.X))))

            X_train, X_test = self.X.iloc[train_index], self.X.iloc[test_index]
            y_train, y_test = self.y.iloc[train_index], self.y.iloc[test_index]

            self.model.fit(X_train, y_train)
            accuracy_train = self.model.score(X_train, y_train)
            accuracy_test = self.model.score(X_test, y_test)

            test_accuracies.append(accuracy_test)
            self.last_train_accuracy = accuracy_train  # Guardamos el último train accuracy

        # 🔹 Calculamos la media de test accuracy
        self.mean_test_accuracy = sum(test_accuracies) / len(test_accuracies)

        print("\n📊 **Resultados Finales:**")
        print(f"  🔹 Última Train Accuracy: {self.last_train_accuracy:.4f}")
        print(f"  🔹 Media Test Accuracy: {self.mean_test_accuracy:.4f}")

        self.log_to_mlflow()

    def log_to_mlflow(self):
        """Interactively record the latest training results in MLflow.

        The user is asked whether to log at all, then for a run name and a
        description. Runs are stored in the local ``mlruns`` directory under
        the ``BetPredictions`` experiment. An ``mlflow ui`` server is started
        for that store (once per instance while it is still running), and the
        experiment page can optionally be opened in the default browser.

        Logged values: last-window train accuracy, mean test accuracy, the
        feature list, ``best_split_name`` and, when available, the selected
        hyper-parameters. No model artifact is logged.
        """
        save_model = input("Do you want to save the model in MLflow? (yes/no): ").strip().lower()
        if save_model not in ("yes", "y"):
            print("Model was not saved to MLflow.")
            return

        run_name = input("Enter the run name: ").strip()
        description = input("Enter the run description: ").strip()

        experiment_name = "BetPredictions"
        # as_uri() yields a well-formed file:// URI on both POSIX and Windows.
        tracking_uri = Path("mlruns").resolve().as_uri()
        mlflow.set_tracking_uri(tracking_uri)

        # Start the UI server only if this instance has not already got one
        # running; an argument list (no shell) is safe for paths with spaces.
        ui_process = getattr(self, "_mlflow_ui_process", None)
        if ui_process is None or ui_process.poll() is not None:
            self._mlflow_ui_process = subprocess.Popen(
                [sys.executable, "-m", "mlflow", "ui", "--backend-store-uri", tracking_uri]
            )

        mlflow.set_experiment(experiment_name)

        with mlflow.start_run(run_name=run_name, description=description):
            mlflow.log_metric("Last Split Train Accuracy", self.last_train_accuracy)
            # The value is the mean over all windows, so it is named as such.
            mlflow.log_metric("Mean Test Accuracy", self.mean_test_accuracy)

            mlflow.log_param("features", self.X.columns.tolist())
            mlflow.log_param("best_split_name", self.best_split_name)

            best_params = getattr(self, "best_params", None)
            if best_params:
                mlflow.log_params(best_params)

            print("Metrics, parameters, and features logged to MLflow.")

        open_mlflow = input("Do you want to open the MLflow UI page? (yes/no): ").strip().lower()
        if open_mlflow in ("yes", "y"):
            # Look the id up instead of hard-coding it: it differs per store.
            experiment = mlflow.get_experiment_by_name(experiment_name)
            mlflow_url = f"http://127.0.0.1:5000/#/experiments/{experiment.experiment_id}"
            webbrowser.open(mlflow_url)

    def predict(self):
        """
        Realiza predicciones utilizando el modelo entrenado.
        
        X_new: DataFrame
            El conjunto de datos para realizar la predicción (con las mismas características que X durante el entrenamiento).
            
        Returns:
        y_pred: Array
            Las predicciones del modelo para los nuevos datos.
        """
        # Verifica si el modelo ya ha sido entrenado
        if not hasattr(self.model, 'best_estimator_'):
            raise Exception("Model has not been trained yet. Please train the model first.")
        
        # Accede al mejor modelo entrenado
        best_model = self.model.best_estimator_
        
        self.X_new["prob_home_avg"] = 1 / self.X_new["odds_home"]
        self.X_new["prob_draw_avg"] = 1 / self.X_new["odds_draw"]
        self.X_new["prob_away_avg"] = 1 / self.X_new["odds_away"]

        # Realiza las predicciones de probabilidad
        y_pred_proba = best_model.predict_proba(self.X_new)
        
        # Obtener el orden de las clases detectadas por el modelo
        class_order = best_model.classes_  # Esto devuelve algo como [-1, 0, 1]
        
        # Crear el DataFrame con las probabilidades, manteniendo el orden correcto
        probabilities_df = pd.DataFrame(y_pred_proba, columns=class_order)

        # Renombrar las columnas para mayor claridad
        probabilities_df = probabilities_df.rename(
            columns={-1: "prob_away_avg", 0: "prob_draw_avg", 1: "prob_home_avg"}
        )

        # Reordenar columnas para que sean ["Local Win", "Draw", "Away Win"]
        probabilities_df = probabilities_df[["prob_home_avg", "prob_draw_avg", "prob_away_avg"]]

        # Agregar los nombres de los equipos al DataFrame de salida
        probabilities_df.insert(0, "home_team_name", self.X_new["home_team_name"].values)
        probabilities_df.insert(1, "away_team_name", self.X_new["away_team_name"].values)
        probabilities_df.insert(5, "odds_home", self.X_new["odds_home"].values)
        probabilities_df.insert(6, "odds_draw", self.X_new["odds_draw"].values)
        probabilities_df.insert(7, "odds_away", self.X_new["odds_away"].values)

        print(probabilities_df)

        return probabilities_df
    
    def adjusted_prob(self, prob, prob_type):
        if prob_type == 'home':
            if prob > 0.67:
                return prob * 1.5  # Si la probabilidad de victoria local es mayor al 67%, multiplicamos por 1.5
            elif prob >= 0.50:
                return prob  # Si la probabilidad de victoria local está entre 50% y 67%, la dejamos igual
            else:
                return 0  # Si la probabilidad de victoria local es menor al 50%, no apostamos nada
        elif prob_type == 'away':
            if prob > 0.67:
                return prob  # Si la probabilidad de victoria visitante es mayor al 67%, la dejamos igual
            elif prob >= 0.50:
                return 0.5  # Si la probabilidad de victoria visitante está entre 50% y 67%, la ajustamos a 0.5
            else:
                return 0  # Si la probabilidad de victoria visitante es menor al 50%, no apostamos nada
        elif prob_type == 'draw':
            if prob > 0.67:
                return prob * 0.5  # Si la probabilidad de empate es mayor al 67%, la multiplicamos por 0.5
            elif prob >= 0.50:
                return prob * 0.25  # Si la probabilidad de empate está entre 50% y 67%, la multiplicamos por 0.25
            else:
                return 0  # Si la probabilidad de empate es menor al 50%, no apostamos nada

    def kelly_bet(self, prob, odds):
        return (odds * prob - (1 - prob)) / odds

    def best_kelly_bet(self, row):
        # Seleccionar el mayor porcentaje ajustado por cada tipo de resultado
        home_prob = self.adjusted_prob(row['prob_home_avg'], 'home')
        draw_prob = self.adjusted_prob(row['prob_draw_avg'], 'draw')
        away_prob = self.adjusted_prob(row['prob_away_avg'], 'away')
        
        # Encontrar el máximo de las probabilidades ajustadas
        max_prob = max(home_prob, draw_prob, away_prob)
        
        # Determinar cuál opción tiene el mayor porcentaje ajustado y calcular la apuesta de Kelly
        if max_prob == home_prob:
            return self.kelly_bet(home_prob, row['odds_home'])
        elif max_prob == draw_prob:
            return self.kelly_bet(draw_prob, row['odds_draw'])
        else:
            return self.kelly_bet(away_prob, row['odds_away'])

    def apply_bet_logic(self, row):
        # Si la probabilidad de cualquiera de las opciones es mayor al 50%, apostamos
        if row['prob_home_avg'] > 0.50 or row['prob_draw_avg'] > 0.50 or row['prob_away_avg'] > 0.50:
            # Si la probabilidad de victoria local es mayor al 50%, apostamos lo que diga Kelly
            if row['prob_home_avg'] > 0.50:
                return self.kelly_bet(self.adjusted_prob(row['prob_home_avg'], 'home'), row['odds_home'])
            # Si la probabilidad de victoria visitante es mayor al 50%, apostamos la mitad de lo que dice Kelly
            elif row['prob_away_avg'] > 0.50:
                return self.kelly_bet(self.adjusted_prob(row['prob_away_avg'], 'away'), row['odds_away']) / 2
            # Si la probabilidad de empate es mayor al 50%, apostamos un cuarto de lo que dice Kelly
            elif row['prob_draw_avg'] > 0.50:
                return self.kelly_bet(self.adjusted_prob(row['prob_draw_avg'], 'draw'), row['odds_draw']) / 4
        return 0  # Si no hay ninguna probabilidad mayor al 50%, no apostamos nada

    def calculate_bets(self):

        df = self.predict()

        # Aplicar la fórmula de Kelly con la lógica de apuestas
        df['bet_amount'] = df.apply(lambda row: self.apply_bet_logic(row), axis=1)

        # Normalizar las apuestas
        total_bet = df['bet_amount'].sum()
        df['bet_amount_norm'] = df['bet_amount'] / total_bet if total_bet > 0 else 0

        # Determinar la mejor opción de apuesta para cada partido
        df['best_bet'] = df.apply(lambda row: "Home" if row['bet_amount'] == self.kelly_bet(self.adjusted_prob(row['prob_home_avg'], 'home'), row['odds_home'])
                                            else ("Draw" if row['bet_amount'] == self.kelly_bet(self.adjusted_prob(row['prob_draw_avg'], 'draw'), row['odds_draw'])
                                            else "Away"), axis=1)

        # Modificar la columna 'bet_amount' para que no tenga valores superiores a 0.25
        df['bet_amount_fix'] = df.apply(lambda row: 
                                                  0.25 if row['best_bet'] == 'Home' and row['bet_amount_norm'] > 0.25 else
                                                  (0.25 if row['best_bet'] == 'Away' and row['bet_amount_norm'] > 0.2 else
                                                   (0.1 if row['best_bet'] == 'Draw' and row['bet_amount_norm'] > 0.1 else row['bet_amount_norm'])),
                                                  axis=1)

        # Mostrar el DataFrame con los resultados
        return df[df['bet_amount_norm'] > 0][["home_team_name", "away_team_name", "bet_amount_fix", "best_bet"]].round(2)

# Results

In [14]:
# Inputs for the run: historical data file, rows to skip, and the seed.
csv_path = "final_dataset.csv"
skip_rows = 0
random_state = 0

# Upcoming fixtures, one tuple per match, in the column order given below.
fixture_columns = [
    "home_team_name",
    "away_team_name",
    "home_team_rank",
    "away_team_rank",
    "home_team_points",
    "away_team_points",
    "odds_home",
    "odds_draw",
    "odds_away",
    "home_team_consecutive_wins_global",
    "home_team_consecutive_losses_global",
    "away_team_consecutive_wins_global",
    "away_team_consecutive_losses_global",
]
fixtures = [
    ("Celta_Vigo", "Osasuna", 14, 9, 29.0, 32.0, 1.9, 3.5, 4.1, 0, 0, 0, 0),
    ("Alaves", "Espanyol", 19, 15, 22.0, 24.0, 1.83, 3.3, 4.75, 0, 0, 0, 0),
    ("Rayo_Vallecano", "Villarreal", 6, 5, 35.0, 41.0, 2.9, 3.4, 2.37, 0, 1, 0, 0),
    ("Valencia", "Atletico_Madrid", 18, 3, 23.0, 50.0, 4.33, 3.4, 1.88, 0, 0, 0, 0),
    ("Las_Palmas", "Barcelona", 17, 1, 23.0, 51.0, 10.5, 6.55, 1.25, 0, 3, 4, 0),
    ("Athletic_Club", "Valladolid", 4, 20, 45.0, 15.0, 1.2, 6.75, 15.5, 0, 0, 0, 5),
    ("Real_Madrid", "Girona", 2, 10, 51.0, 31.0, 1.31, 5.75, 8.75, 0, 0, 0, 2),
    ("Getafe", "Real_Betis", 13, 8, 30.0, 32.0, 2.5, 3.1, 3.0, 2, 0, 2, 0),
    ("Real_Sociedad", "Leganes", 11, 16, 31.0, 24.0, 1.55, 3.75, 6.75, 0, 1, 0, 0),
    ("Sevilla", "Mallorca", 12, 7, 31.0, 34.0, 2.05, 3.2, 3.9, 1, 0, 1, 0),
]
X_new = pd.DataFrame(fixtures, columns=fixture_columns)

# Build the pipeline and run the walk-forward validation (prompts for MLflow).
pipeline = ModelPipeline(csv_path, skip_rows, random_state, X_new)
pipeline.train()

#pipeline.calculate_bets()


🔹 Iniciando Walk-Forward Validation...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds 

In [15]:
# Parked diagnostic code, kept as a bare string literal so it is not executed
# (the cell only echoes the text). It prints train/test scores per result
# value (1, 0, -1) and per team, split by home/away, for a `last_split` dict
# with the keys 'train_index', 'test_index' and 'split'.
# NOTE(review): `last_split` is not defined by ModelPipeline.train() in this
# notebook and the fragment uses `self`, so it would have to be moved into a
# ModelPipeline method that builds `last_split` before it could run.
'''if last_split:
                for result in [1, 0, -1]:
                    train_mask = self.y.iloc[last_split['train_index']] == result
                    test_mask = self.y.iloc[last_split['test_index']] == result
                    
                    train_score = self.model.score(self.X.iloc[last_split['train_index']][train_mask], self.y.iloc[last_split['train_index']][train_mask]) if train_mask.sum() > 0 else None
                    test_score = self.model.score(self.X.iloc[last_split['test_index']][test_mask], self.y.iloc[last_split['test_index']][test_mask]) if test_mask.sum() > 0 else None
                    
                    print(f"  📊 Train Score (Result {result}): {train_score:.4f}" if train_score is not None else f"  📊 Train Score (Result {result}): N/A")
                    print(f"  📊 Test Score (Result {result}): {test_score:.4f}" if test_score is not None else f"  📊 Test Score (Result {result}): N/A")
                
                teams = pd.concat([self.df.iloc[last_split['test_index']]["home_team_name"], self.df.iloc[last_split['test_index']]["away_team_name"]]).unique()
                for team in teams:
                    print(f"\n🔹 Team: {team} (Split {last_split['split']})")
                    
                    team_home_mask_train = self.df.iloc[last_split['train_index']]["home_team_name"] == team
                    team_home_mask_test = self.df.iloc[last_split['test_index']]["home_team_name"] == team
                    team_away_mask_train = self.df.iloc[last_split['train_index']]["away_team_name"] == team
                    team_away_mask_test = self.df.iloc[last_split['test_index']]["away_team_name"] == team
                    
                    for result in [1, 0, -1]:
                        home_train_mask = team_home_mask_train & (self.df.iloc[last_split['train_index']]["result"] == result)
                        home_test_mask = team_home_mask_test & (self.df.iloc[last_split['test_index']]["result"] == result)
                        away_train_mask = team_away_mask_train & (self.df.iloc[last_split['train_index']]["result"] == result)
                        away_test_mask = team_away_mask_test & (self.df.iloc[last_split['test_index']]["result"] == result)
                        
                        home_train_score = self.model.score(self.X.iloc[last_split['train_index']][home_train_mask], self.y.iloc[last_split['train_index']][home_train_mask]) if home_train_mask.sum() > 0 else None
                        home_test_score = self.model.score(self.X.iloc[last_split['test_index']][home_test_mask], self.y.iloc[last_split['test_index']][home_test_mask]) if home_test_mask.sum() > 0 else None
                        away_train_score = self.model.score(self.X.iloc[last_split['train_index']][away_train_mask], self.y.iloc[last_split['train_index']][away_train_mask]) if away_train_mask.sum() > 0 else None
                        away_test_score = self.model.score(self.X.iloc[last_split['test_index']][away_test_mask], self.y.iloc[last_split['test_index']][away_test_mask]) if away_test_mask.sum() > 0 else None
                        
                        print(f"  🏠 Home Train Score (Result {result}): {home_train_score:.4f}" if home_train_score is not None else f"  🏠 Home Train Score (Result {result}): N/A")
                        print(f"  🏠 Home Test Score (Result {result}): {home_test_score:.4f}" if home_test_score is not None else f"  🏠 Home Test Score (Result {result}): N/A")
                        print(f"  ✈️ Away Train Score (Result {result}): {away_train_score:.4f}" if away_train_score is not None else f"  ✈️ Away Train Score (Result {result}): N/A")
                        print(f"  ✈️ Away Test Score (Result {result}): {away_test_score:.4f}" if away_test_score is not None else f"  ✈️ Away Test Score (Result {result}): N/A")'''

'if last_split:\n                for result in [1, 0, -1]:\n                    train_mask = self.y.iloc[last_split[\'train_index\']] == result\n                    test_mask = self.y.iloc[last_split[\'test_index\']] == result\n                    \n                    train_score = self.model.score(self.X.iloc[last_split[\'train_index\']][train_mask], self.y.iloc[last_split[\'train_index\']][train_mask]) if train_mask.sum() > 0 else None\n                    test_score = self.model.score(self.X.iloc[last_split[\'test_index\']][test_mask], self.y.iloc[last_split[\'test_index\']][test_mask]) if test_mask.sum() > 0 else None\n                    \n                    print(f"  📊 Train Score (Result {result}): {train_score:.4f}" if train_score is not None else f"  📊 Train Score (Result {result}): N/A")\n                    print(f"  📊 Test Score (Result {result}): {test_score:.4f}" if test_score is not None else f"  📊 Test Score (Result {result}): N/A")\n                \n                

In [16]:
# Parked ColumnTransformer entries, kept as a bare string literal so they are
# not executed (the cell only echoes the text).
# - 'day_of_week': (cos, sin) encoding with period 7; `day_of_week` is not
#   among the feature columns selected in ModelPipeline.__init__.
# - The four *_consecutive_*_global imputers duplicate entries that are
#   already present in ModelPipeline._create_pipeline.
'''                            (
                                'day_of_week', FunctionTransformer(lambda x: np.column_stack([
                                    np.cos(2 * np.pi * x / 7),
                                    np.sin(2 * np.pi * x / 7)
                                    ]), validate=True), ["day_of_week"]
                            ),
                            (
                                "home_team_consecutive_wins_global",
                                SimpleImputer(strategy="mean"),
                                ["home_team_consecutive_wins_global"],
                            ),
                            (
                                "home_team_consecutive_losses_global",
                                SimpleImputer(strategy="mean"),
                                ["home_team_consecutive_losses_global"],
                            ),
                            (
                                "away_team_consecutive_wins_global",
                                SimpleImputer(strategy="mean"),
                                ["away_team_consecutive_wins_global"],
                            ),
                            (
                                "away_team_consecutive_losses_global",
                                SimpleImputer(strategy="mean"),
                                ["away_team_consecutive_losses_global"],
                            )'''

'                            (\n                                \'day_of_week\', FunctionTransformer(lambda x: np.column_stack([\n                                    np.cos(2 * np.pi * x / 7),\n                                    np.sin(2 * np.pi * x / 7)\n                                    ]), validate=True), ["day_of_week"]\n                            ),\n                            (\n                                "home_team_consecutive_wins_global",\n                                SimpleImputer(strategy="mean"),\n                                ["home_team_consecutive_wins_global"],\n                            ),\n                            (\n                                "home_team_consecutive_losses_global",\n                                SimpleImputer(strategy="mean"),\n                                ["home_team_consecutive_losses_global"],\n                            ),\n                            (\n                                "away_team_consecutive_wins_glo