In [9]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import FunctionTransformer
import numpy as np

# Model testing

In [10]:
# 📂 Cargar el CSV
csv_path = "final_dataset.csv"  # Ajusta la ruta si es necesario
csv = pd.read_csv(csv_path)

# Define your preprocessing pipeline for categorical and numerical features
categorical_features = ["home_team_name", "away_team_name"]
numerical_features = [
    "home_team_rank", "away_team_rank", "prob_home_avg", "prob_draw_avg", 
    "prob_away_avg", "home_team_points", "away_team_points",
    "home_team_consecutive_wins_global", "home_team_consecutive_losses_global",
    "away_team_consecutive_wins_global", "away_team_consecutive_losses_global"
]

# Custom function for encoding ranks
def encode_ranks(x):
    return np.column_stack([np.cos(2 * np.pi * x / 20), np.sin(2 * np.pi * x / 20)])

# Preprocessing step (handling both numerical and categorical features)
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_features),
        ("home_team_rank", FunctionTransformer(lambda x: encode_ranks(x), validate=True), ["home_team_rank"]),
        ("away_team_rank", FunctionTransformer(lambda x: encode_ranks(x), validate=True), ["away_team_rank"]),
        ("prob_home_avg", SimpleImputer(strategy="mean"), ["prob_home_avg"]),
        ("prob_draw_avg", SimpleImputer(strategy="mean"), ["prob_draw_avg"]),
        ("prob_away_avg", SimpleImputer(strategy="mean"), ["prob_away_avg"]),
        ("home_team_points", SimpleImputer(strategy="mean"), ["home_team_points"]),
        ("away_team_points", SimpleImputer(strategy="mean"), ["away_team_points"]),
        ("home_team_consecutive_wins_global", SimpleImputer(strategy="mean"), ["home_team_consecutive_wins_global"]),
        ("home_team_consecutive_losses_global", SimpleImputer(strategy="mean"), ["home_team_consecutive_losses_global"]),
        ("away_team_consecutive_wins_global", SimpleImputer(strategy="mean"), ["away_team_consecutive_wins_global"]),
        ("away_team_consecutive_losses_global", SimpleImputer(strategy="mean"), ["away_team_consecutive_losses_global"])
    ]
)

# Define the model pipeline including preprocessing and classification
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("scaler", StandardScaler(with_mean=True)),
    ("classifier", RandomForestClassifier())
])

# Use RFECV for feature selection
selector = RFECV(pipeline, step=1, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the RFECV selector
selector.fit(X, y)

# Get the selected features
selected_features = X.columns[selector.support_]
print(f"Selected features: {selected_features}")

# 🚀 Pipeline con modelo optimizado
model = RandomForestClassifier(random_state=42)
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", model)
])

# 🔍 Espacio de hiperparámetros para RandomizedSearchCV
param_dist = {
    "initial_train_size": list(range(380, len(X) - 10, 10)),  # Optimizamos tamaño del train
    "model__n_estimators": np.arange(50, 500, 50),
    "model__max_depth": [None, 10, 20, 30, 40, 50],
    "model__min_samples_split": np.arange(2, 20, 2),
    "model__min_samples_leaf": np.arange(1, 20, 2),
    "model__criterion": ["gini", "entropy"],
}

# ⏳ Validación temporal
tscv = TimeSeriesSplit(n_splits=5)

# 🏆 RandomizedSearchCV
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=20,  # Prueba 20 combinaciones aleatorias
    scoring="accuracy",
    cv=tscv,  # TimeSeriesSplit para datos temporales
    random_state=42,
    n_jobs=-1
)

# 🚀 Entrenar
random_search.fit(X, y)

# 📊 Resultados
print("Mejores parámetros:", random_search.best_params_)
print("Mejor accuracy en validación:", random_search.best_score_)

ValueError: could not convert string to float: 'Leganes'

# Results

In [3]:
csv_path = "final_dataset.csv"

skip_rows = 0

random_state = 0

X_new = pd.DataFrame({
    "home_team_name": ["Celta_Vigo", "Alaves", "Rayo_Vallecano", "Valencia", "Las_Palmas", 
                       "Athletic_Club", "Real_Madrid", "Getafe", "Real_Sociedad", "Sevilla"],
    "away_team_name": ["Osasuna", "Espanyol", "Villarreal", "Atletico_Madrid", "Barcelona", 
                       "Valladolid", "Girona", "Real_Betis", "Leganes", "Mallorca"],
    "home_team_rank": [14, 19, 6, 18, 17, 4, 2, 13, 11, 12],
    "away_team_rank": [9, 15, 5, 3, 1, 20, 10, 8, 16, 7],
    "home_team_points": [29.0, 22.0, 35.0, 23.0, 23.0, 45.0, 51.0, 30.0, 31.0, 31.0],
    "away_team_points": [32.0, 24.0, 41.0, 50.0, 51.0, 15.0, 31.0, 32.0, 24.0, 34.0],
    "odds_home": [1.9, 1.83, 2.85, 4.33, 9.5, 1.2, 1.31, 2.5, 1.57, 2.05], 
    "odds_draw": [3.5, 3.3, 3.6, 3.4, 6.25, 6.75, 5.75, 3.1, 3.7, 3.2], 
    "odds_away": [4.1, 4.75, 2.35, 1.88, 1.28, 15.5, 8.75, 3.0, 7.0, 3.9],
    "home_team_consecutive_wins_global": [0, 0, 0, 0, 0, 0, 0, 2, 0, 1],
    "home_team_consecutive_losses_global": [0, 0, 1, 0, 3, 0, 0, 0, 1, 0],
    "away_team_consecutive_wins_global": [0, 0, 0, 0, 4, 0, 0, 2, 0, 1],
    "away_team_consecutive_losses_global": [0, 0, 0, 0, 0, 5, 2, 0, 0, 0]
})

pipeline = ModelPipeline(csv_path, skip_rows, random_state, X_new)

pipeline.train()

#pipeline.calculate_bets()


🔹 Iniciando Walk-Forward Validation...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds 

In [4]:
'''if last_split:
                for result in [1, 0, -1]:
                    train_mask = self.y.iloc[last_split['train_index']] == result
                    test_mask = self.y.iloc[last_split['test_index']] == result
                    
                    train_score = self.model.score(self.X.iloc[last_split['train_index']][train_mask], self.y.iloc[last_split['train_index']][train_mask]) if train_mask.sum() > 0 else None
                    test_score = self.model.score(self.X.iloc[last_split['test_index']][test_mask], self.y.iloc[last_split['test_index']][test_mask]) if test_mask.sum() > 0 else None
                    
                    print(f"  📊 Train Score (Result {result}): {train_score:.4f}" if train_score is not None else f"  📊 Train Score (Result {result}): N/A")
                    print(f"  📊 Test Score (Result {result}): {test_score:.4f}" if test_score is not None else f"  📊 Test Score (Result {result}): N/A")
                
                teams = pd.concat([self.df.iloc[last_split['test_index']]["home_team_name"], self.df.iloc[last_split['test_index']]["away_team_name"]]).unique()
                for team in teams:
                    print(f"\n🔹 Team: {team} (Split {last_split['split']})")
                    
                    team_home_mask_train = self.df.iloc[last_split['train_index']]["home_team_name"] == team
                    team_home_mask_test = self.df.iloc[last_split['test_index']]["home_team_name"] == team
                    team_away_mask_train = self.df.iloc[last_split['train_index']]["away_team_name"] == team
                    team_away_mask_test = self.df.iloc[last_split['test_index']]["away_team_name"] == team
                    
                    for result in [1, 0, -1]:
                        home_train_mask = team_home_mask_train & (self.df.iloc[last_split['train_index']]["result"] == result)
                        home_test_mask = team_home_mask_test & (self.df.iloc[last_split['test_index']]["result"] == result)
                        away_train_mask = team_away_mask_train & (self.df.iloc[last_split['train_index']]["result"] == result)
                        away_test_mask = team_away_mask_test & (self.df.iloc[last_split['test_index']]["result"] == result)
                        
                        home_train_score = self.model.score(self.X.iloc[last_split['train_index']][home_train_mask], self.y.iloc[last_split['train_index']][home_train_mask]) if home_train_mask.sum() > 0 else None
                        home_test_score = self.model.score(self.X.iloc[last_split['test_index']][home_test_mask], self.y.iloc[last_split['test_index']][home_test_mask]) if home_test_mask.sum() > 0 else None
                        away_train_score = self.model.score(self.X.iloc[last_split['train_index']][away_train_mask], self.y.iloc[last_split['train_index']][away_train_mask]) if away_train_mask.sum() > 0 else None
                        away_test_score = self.model.score(self.X.iloc[last_split['test_index']][away_test_mask], self.y.iloc[last_split['test_index']][away_test_mask]) if away_test_mask.sum() > 0 else None
                        
                        print(f"  🏠 Home Train Score (Result {result}): {home_train_score:.4f}" if home_train_score is not None else f"  🏠 Home Train Score (Result {result}): N/A")
                        print(f"  🏠 Home Test Score (Result {result}): {home_test_score:.4f}" if home_test_score is not None else f"  🏠 Home Test Score (Result {result}): N/A")
                        print(f"  ✈️ Away Train Score (Result {result}): {away_train_score:.4f}" if away_train_score is not None else f"  ✈️ Away Train Score (Result {result}): N/A")
                        print(f"  ✈️ Away Test Score (Result {result}): {away_test_score:.4f}" if away_test_score is not None else f"  ✈️ Away Test Score (Result {result}): N/A")'''

'if last_split:\n                for result in [1, 0, -1]:\n                    train_mask = self.y.iloc[last_split[\'train_index\']] == result\n                    test_mask = self.y.iloc[last_split[\'test_index\']] == result\n                    \n                    train_score = self.model.score(self.X.iloc[last_split[\'train_index\']][train_mask], self.y.iloc[last_split[\'train_index\']][train_mask]) if train_mask.sum() > 0 else None\n                    test_score = self.model.score(self.X.iloc[last_split[\'test_index\']][test_mask], self.y.iloc[last_split[\'test_index\']][test_mask]) if test_mask.sum() > 0 else None\n                    \n                    print(f"  📊 Train Score (Result {result}): {train_score:.4f}" if train_score is not None else f"  📊 Train Score (Result {result}): N/A")\n                    print(f"  📊 Test Score (Result {result}): {test_score:.4f}" if test_score is not None else f"  📊 Test Score (Result {result}): N/A")\n                \n                

In [5]:
'''                            (
                                'day_of_week', FunctionTransformer(lambda x: np.column_stack([
                                    np.cos(2 * np.pi * x / 7),
                                    np.sin(2 * np.pi * x / 7)
                                    ]), validate=True), ["day_of_week"]
                            ),
                            (
                                "home_team_consecutive_wins_global",
                                SimpleImputer(strategy="mean"),
                                ["home_team_consecutive_wins_global"],
                            ),
                            (
                                "home_team_consecutive_losses_global",
                                SimpleImputer(strategy="mean"),
                                ["home_team_consecutive_losses_global"],
                            ),
                            (
                                "away_team_consecutive_wins_global",
                                SimpleImputer(strategy="mean"),
                                ["away_team_consecutive_wins_global"],
                            ),
                            (
                                "away_team_consecutive_losses_global",
                                SimpleImputer(strategy="mean"),
                                ["away_team_consecutive_losses_global"],
                            )'''

'                            (\n                                \'day_of_week\', FunctionTransformer(lambda x: np.column_stack([\n                                    np.cos(2 * np.pi * x / 7),\n                                    np.sin(2 * np.pi * x / 7)\n                                    ]), validate=True), ["day_of_week"]\n                            ),\n                            (\n                                "home_team_consecutive_wins_global",\n                                SimpleImputer(strategy="mean"),\n                                ["home_team_consecutive_wins_global"],\n                            ),\n                            (\n                                "home_team_consecutive_losses_global",\n                                SimpleImputer(strategy="mean"),\n                                ["home_team_consecutive_losses_global"],\n                            ),\n                            (\n                                "away_team_consecutive_wins_glo