In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import warnings
import joblib
pd.set_option('display.float_format', '{:.2f}'.format)

In [2]:
# Session-level filter thresholds (both comparisons are strict).
MAX_SESSION_MINUTES = 600   # keep sessions with TIME_ON_DEVICE_MIN below this
MAX_NET_SPEND = 10000       # keep sessions with NET_SPEND below this

df_data_general = pd.read_csv('../../../data/data_general.csv')

initial_time = pd.to_datetime(df_data_general['INITIAL_TIME'])
final_time = pd.to_datetime(df_data_general['FINAL_TIME'])

# Read the hour from the full timestamp. Previously it was read after INITIAL_TIME
# had been truncated to midnight, which made 'Hour' equal to 0 on every row.
session_start_hour = initial_time.dt.hour

# Truncate both timestamps to day precision.
df_data_general['INITIAL_TIME'] = initial_time.dt.to_period('D').dt.to_timestamp()
df_data_general['FINAL_TIME'] = final_time.dt.to_period('D').dt.to_timestamp()

# Calendar features. 'Weekday' used to be written twice (day name, then integer);
# only the integer version survived, so it is assigned once here.
df_data_general['Weekday'] = df_data_general['INITIAL_TIME'].dt.weekday   # 0=Monday, 6=Sunday
df_data_general['number_of_day'] = df_data_general['INITIAL_TIME'].dt.day_of_week

df_data_general['TIME_ON_DEVICE_MIN'] = df_data_general['TIME_ON_DEVICE_SEC'] / 60

df_data_general['Hour'] = session_start_hour
df_data_general['Weekend'] = (df_data_general['Weekday'] >= 5).astype(int)
df_data_general['Month'] = df_data_general['INITIAL_TIME'].dt.month

# Drop very long sessions and sessions without any win. `.copy()` makes the result an
# independent frame so the column assignment below does not write into a slice.
keep_rows = (
    (df_data_general['TIME_ON_DEVICE_MIN'] < MAX_SESSION_MINUTES)
    & (df_data_general['WIN_TOTAL'] > 0)
)
df_data_general = df_data_general[keep_rows].copy()

df_data_general['NET_SPEND'] = df_data_general['FINAL_AMOUNT'] - df_data_general['INITIAL_AMOUNT']
df_data_general = df_data_general[df_data_general['NET_SPEND'] < MAX_NET_SPEND].copy()

In [3]:
# Binary target: True where NET_SPEND is strictly below INITIAL_AMOUNT.
# NOTE(review): NET_SPEND is FINAL_AMOUNT - INITIAL_AMOUNT, so this condition is
# equivalent to FINAL_AMOUNT < 2 * INITIAL_AMOUNT. If "casino won" is meant to be
# "the player ended with less than they started", the test should presumably be
# NET_SPEND < 0 -- confirm the intended label definition before retraining.
df_data_general['CASINO_WON'] = df_data_general['NET_SPEND'].lt(df_data_general['INITIAL_AMOUNT'])

In [4]:
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
from sklearn.utils import resample
from tensorflow.keras.models import load_model
import warnings
warnings.filterwarnings('ignore')

class CasinoEnsembleModelOptimized:
    def __init__(self, model_paths):
        """
        Inicializa el modelo ensemble cargando los modelos pre-entrenados
        
        Args:
            model_paths (dict): Diccionario con las rutas de los modelos y scalers
        """
        self.model_paths = model_paths
        self.models = {}
        self.scalers = {}
        self.ensemble_scaler = StandardScaler()
        self.trained_classifiers = {}
        
        # Cargar modelos pre-entrenados
        self.load_pretrained_models()
    
    def load_pretrained_models(self):
        """Carga los modelos y scalers pre-entrenados"""
        try:
            # Cargar modelos de Keras
            self.models['time'] = load_model(self.model_paths['tiempo_model'])
            self.models['bet'] = load_model(self.model_paths['bet_model'])
            self.models['win'] = load_model(self.model_paths['win_model'])
            
            # Cargar scalers con joblib
            self.scalers['time'] = joblib.load(self.model_paths['tiempo_scaler'])
            self.scalers['bet'] = joblib.load(self.model_paths['bet_scaler'])
            self.scalers['win'] = joblib.load(self.model_paths['win_scaler'])
            
            print("✅ Todos los modelos y scalers cargados correctamente")
            
        except Exception as e:
            print(f"❌ Error cargando modelos: {e}")
            raise
    
    def create_business_features(self, base_data, tiempo_pred=None, bet_pred=None):
        """
        Crea las features de negocio necesarias para los modelos (IDÉNTICA a tu CasinoMLPModel)
        """
        features = base_data[['INITIAL_AMOUNT', 'AVG_BET', 'Cluster']].copy()
        
        if tiempo_pred is not None:
            features['tiempo_pred'] = tiempo_pred
            
        if bet_pred is not None:
            features['bet_pred'] = bet_pred
            features['total_money_handled'] = bet_pred
            features['house_edge_effect'] = bet_pred * 0.05
            features['money_multiplier'] = bet_pred / (base_data['INITIAL_AMOUNT'] + 1)
            features['reinvestment_indicator'] = np.where(bet_pred > base_data['INITIAL_AMOUNT'], 1, 0)
            features['excess_betting'] = np.maximum(0, bet_pred - base_data['INITIAL_AMOUNT'])
            features['money_at_risk'] = np.minimum(bet_pred, base_data['INITIAL_AMOUNT'])
            features['cluster_risk_adjusted'] = base_data['Cluster'] * features['money_multiplier']
            
        return features
    
    def predict_session(self, initial_amount, avg_bet, cluster, weekday=1, weekend=0, month=1):
        """
        Hacer predicción completa para una sesión usando los modelos MLP en secuencia
        """
        if not self.models:
            return {"error": "Modelos no cargados correctamente"}
        
        # Preparar datos base
        base_data = pd.DataFrame({
            'INITIAL_AMOUNT': [initial_amount],
            'AVG_BET': [avg_bet], 
            'Cluster': [cluster],
            'Weekday': [weekday],
            'Weekend': [weekend],
            'Month': [month]
        })
        
        try:
            results = {}
            
            # 1. Predecir TIEMPO
            if 'time' in self.models:
                try:
                    X_tiempo = self.create_business_features(base_data)
                    X_tiempo_scaled = self.scalers['time'].transform(X_tiempo)
                    tiempo_pred = float(self.models['time'].predict(X_tiempo_scaled, verbose=0)[0][0])
                    tiempo_pred = max(0, tiempo_pred)  # Asegurar valor positivo
                    results['time_on_device'] = tiempo_pred
                except Exception as e:
                    print(f"Error prediciendo tiempo: {e}")
                    tiempo_pred = 30.0
                    results['time_on_device'] = tiempo_pred
            else:
                tiempo_pred = 30.0
                results['time_on_device'] = tiempo_pred
            
            # 2. Predecir BET TOTAL
            if 'bet' in self.models:
                try:
                    X_bet = self.create_business_features(base_data, tiempo_pred=tiempo_pred)
                    X_bet_scaled = self.scalers['bet'].transform(X_bet)
                    bet_pred = float(self.models['bet'].predict(X_bet_scaled, verbose=0)[0][0])
                    bet_pred = max(0, bet_pred)  # Asegurar valor positivo
                    results['bet_total'] = bet_pred
                except Exception as e:
                    print(f"Error prediciendo bet: {e}")
                    bet_pred = initial_amount * 2
                    results['bet_total'] = bet_pred
            else:
                bet_pred = initial_amount * 2
                results['bet_total'] = bet_pred
            
            # 3. Predecir WIN TOTAL
            if 'win' in self.models:
                try:
                    X_win = self.create_business_features(base_data, tiempo_pred=tiempo_pred, bet_pred=bet_pred)
                    X_win_scaled = self.scalers['win'].transform(X_win)
                    win_pred = float(self.models['win'].predict(X_win_scaled, verbose=0)[0][0])
                    win_pred = max(0, win_pred)  # Asegurar valor positivo
                    results['win_total'] = win_pred
                except Exception as e:
                    print(f"Error prediciendo win: {e}")
                    win_pred = bet_pred * 0.95
                    results['win_total'] = win_pred
            else:
                win_pred = bet_pred * 0.95
                results['win_total'] = win_pred
            
            return results
            
        except Exception as e:
            return {"error": f"Error en predicción: {str(e)}"}
    
    def generate_ensemble_features_batch(self, input_data, batch_size=1000):
        """
        Versión OPTIMIZADA que procesa en lotes para mejor rendimiento
        """
        ensemble_features = []
        total_batches = len(input_data) // batch_size + (1 if len(input_data) % batch_size > 0 else 1)
        
        print(f"🔄 Procesando {len(input_data)} sesiones en {total_batches} lotes de {batch_size}...")
        
        for batch_idx in range(0, len(input_data), batch_size):
            batch_end = min(batch_idx + batch_size, len(input_data))
            batch_data = input_data.iloc[batch_idx:batch_end]
            
            current_batch = (batch_idx // batch_size) + 1
            print(f"📦 Procesando lote {current_batch}/{total_batches} ({len(batch_data)} sesiones)...")
            
            # Procesar tiempo en batch
            X_tiempo_batch = []
            for _, row in batch_data.iterrows():
                base_data = pd.DataFrame({
                    'INITIAL_AMOUNT': [row['INITIAL_AMOUNT']],
                    'AVG_BET': [row['AVG_BET']],
                    'Cluster': [row['Cluster']]
                })
                X_tiempo = self.create_business_features(base_data)
                X_tiempo_batch.append(X_tiempo.iloc[0].values)
            
            X_tiempo_batch = np.array(X_tiempo_batch)
            X_tiempo_scaled = self.scalers['time'].transform(X_tiempo_batch)
            tiempo_preds = self.models['time'].predict(X_tiempo_scaled, verbose=0).flatten()
            tiempo_preds = np.maximum(0, tiempo_preds)
            
            # Procesar bet en batch
            X_bet_batch = []
            for i, (_, row) in enumerate(batch_data.iterrows()):
                base_data = pd.DataFrame({
                    'INITIAL_AMOUNT': [row['INITIAL_AMOUNT']],
                    'AVG_BET': [row['AVG_BET']],
                    'Cluster': [row['Cluster']]
                })
                X_bet = self.create_business_features(base_data, tiempo_pred=tiempo_preds[i])
                X_bet_batch.append(X_bet.iloc[0].values)
            
            X_bet_batch = np.array(X_bet_batch)
            X_bet_scaled = self.scalers['bet'].transform(X_bet_batch)
            bet_preds = self.models['bet'].predict(X_bet_scaled, verbose=0).flatten()
            bet_preds = np.maximum(0, bet_preds)
            
            # Procesar win en batch
            X_win_batch = []
            for i, (_, row) in enumerate(batch_data.iterrows()):
                base_data = pd.DataFrame({
                    'INITIAL_AMOUNT': [row['INITIAL_AMOUNT']],
                    'AVG_BET': [row['AVG_BET']],
                    'Cluster': [row['Cluster']]
                })
                X_win = self.create_business_features(base_data, tiempo_pred=tiempo_preds[i], bet_pred=bet_preds[i])
                X_win_batch.append(X_win.iloc[0].values)
            
            X_win_batch = np.array(X_win_batch)
            X_win_scaled = self.scalers['win'].transform(X_win_batch)
            win_preds = self.models['win'].predict(X_win_scaled, verbose=0).flatten()
            win_preds = np.maximum(0, win_preds)
            
            # Crear features del ensemble para este batch
            for i, (_, row) in enumerate(batch_data.iterrows()):
                feature_vector = {
                    'time_on_device': tiempo_preds[i],
                    'bet_total': bet_preds[i],
                    'win_total': win_preds[i],
                    'initial_amount': row['INITIAL_AMOUNT'],
                    'cluster': row['Cluster'],
                    'avg_bet': row['AVG_BET']
                }
                ensemble_features.append(feature_vector)
            
            # Progreso
            progress = min(((batch_idx + batch_size) / len(input_data)) * 100, 100)
            print(f"✅ Lote {current_batch} completado. Progreso: {progress:.1f}%")
        
        result_df = pd.DataFrame(ensemble_features)
        
        print(f"✅ Features del ensemble generadas exitosamente:")
        print(f"   - {len(result_df)} sesiones procesadas")
        print(f"   - Variables generadas por modelos MLP: time_on_device, bet_total, win_total")
        print(f"   - Variables originales: initial_amount, cluster, avg_bet")
        print(f"   - Total features para clasificación: {result_df.shape[1]}")
        
        # Mostrar estadísticas de las predicciones generadas
        print(f"\n📊 Estadísticas de predicciones generadas:")
        print(f"   time_on_device: {result_df['time_on_device'].mean():.2f} ± {result_df['time_on_device'].std():.2f}")
        print(f"   bet_total: ${result_df['bet_total'].mean():.2f} ± ${result_df['bet_total'].std():.2f}")
        print(f"   win_total: ${result_df['win_total'].mean():.2f} ± ${result_df['win_total'].std():.2f}")
        
        return result_df
    
    def generate_ensemble_features(self, input_data):
        """
        Wrapper que usa la versión batch optimizada
        """
        return self.generate_ensemble_features_batch(input_data, batch_size=1000)
    
    def prepare_ensemble_features(self, data, target_col='CASINO_WON'):
        """
        Prepara las features para el modelo ensemble usando predicciones batch
        """
        # Verificar que tenemos las features base necesarias
        required_base_features = ['INITIAL_AMOUNT', 'AVG_BET', 'Cluster']
        missing_base = [f for f in required_base_features if f not in data.columns]
        if missing_base:
            raise ValueError(f"Features base faltantes para generar predicciones: {missing_base}")
        
        # Verificar que tenemos la columna objetivo
        if target_col not in data.columns:
            raise ValueError(f"Columna objetivo '{target_col}' no encontrada")
        
        print("🔄 Generando predicciones usando modelos individuales (procesamiento batch)...")
        X = self.generate_ensemble_features(data)
        y = data[target_col].iloc[:len(X)]  # Asegurar mismo length en caso de filas fallidas
        
        return X, y
    
    def train_ensemble_models_sampled(self, data, target_col='CASINO_WON', sample_size=10000, test_size=0.2):
        """
        Entrena usando una muestra estratificada para acelerar el proceso
        """
        print(f"🎯 Usando muestreo estratificado de {sample_size} sesiones de {len(data)} totales")
        
        # Separar por clase
        class_0 = data[data[target_col] == 0]
        class_1 = data[data[target_col] == 1]
        
        print(f"📊 Distribución original:")
        print(f"   Clase 0 (Jugador gana): {len(class_0)} ({len(class_0)/len(data)*100:.1f}%)")
        print(f"   Clase 1 (Casino gana): {len(class_1)} ({len(class_1)/len(data)*100:.1f}%)")
        
        # Calcular muestras por clase manteniendo la proporción
        proportion_class_1 = len(class_1) / len(data)
        n_class_1 = min(len(class_1), int(sample_size * proportion_class_1))
        n_class_0 = min(len(class_0), sample_size - n_class_1)
        
        # Ajustar si no hay suficientes datos de alguna clase
        if n_class_0 + n_class_1 < sample_size:
            print(f"⚠️ Ajustando tamaño de muestra a {n_class_0 + n_class_1} (datos disponibles)")
            sample_size = n_class_0 + n_class_1
        
        # Muestrear
        sampled_class_0 = resample(class_0, n_samples=n_class_0, random_state=42, replace=False)
        sampled_class_1 = resample(class_1, n_samples=n_class_1, random_state=42, replace=False)
        
        sampled_data = pd.concat([sampled_class_0, sampled_class_1]).reset_index(drop=True)
        
        print(f"📊 Muestra seleccionada: {len(sampled_data)} sesiones")
        print(f"   Distribución muestreada:")
        print(f"   Clase 0: {len(sampled_class_0)} ({len(sampled_class_0)/len(sampled_data)*100:.1f}%)")
        print(f"   Clase 1: {len(sampled_class_1)} ({len(sampled_class_1)/len(sampled_data)*100:.1f}%)")
        
        # Entrenar con la muestra
        return self.train_ensemble_models(sampled_data, target_col, test_size)
    
    def train_knn_model(self, X_train, y_train, X_test, y_test):
        """Entrena y evalúa modelo KNN"""
        print("\n🔹 Entrenando modelo KNN...")
        
        # Grid search para encontrar mejores parámetros
        param_grid = {
            'n_neighbors': [3, 5, 7, 9, 11],
            'weights': ['uniform', 'distance'],
            'metric': ['euclidean', 'manhattan']
        }
        
        knn = KNeighborsClassifier()
        grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        
        best_knn = grid_search.best_estimator_
        self.trained_classifiers['KNN'] = best_knn
        
        # Predicciones
        y_pred = best_knn.predict(X_test)
        y_proba = best_knn.predict_proba(X_test)[:, 1]
        
        # Métricas
        accuracy = accuracy_score(y_test, y_pred)
        auc_score = roc_auc_score(y_test, y_proba)
        
        print(f"Mejores parámetros: {grid_search.best_params_}")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"AUC-ROC: {auc_score:.4f}")
        print("\nReporte de clasificación:")
        print(classification_report(y_test, y_pred))
        
        return {
            'model': best_knn,
            'accuracy': accuracy,
            'auc_score': auc_score,
            'predictions': y_pred,
            'probabilities': y_proba
        }
    
    def train_random_forest_model(self, X_train, y_train, X_test, y_test):
        """Entrena y evalúa modelo Random Forest"""
        print("\n🌳 Entrenando modelo Random Forest...")
        
        # Parámetros para Random Forest
        param_grid = {
            'max_depth': [10, 20, 30, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': ['sqrt', 'log2', None]
        }
        
        rf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
        grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        
        best_rf = grid_search.best_estimator_
        self.trained_classifiers['RandomForest'] = best_rf
        
        # Predicciones
        y_pred = best_rf.predict(X_test)
        y_proba = best_rf.predict_proba(X_test)[:, 1]
        
        # Métricas
        accuracy = accuracy_score(y_test, y_pred)
        auc_score = roc_auc_score(y_test, y_proba)
        
        print(f"Mejores parámetros: {grid_search.best_params_}")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"AUC-ROC: {auc_score:.4f}")
        
        # Importancia de features
        feature_importance = pd.DataFrame({
            'feature': X_train.columns,
            'importance': best_rf.feature_importances_
        }).sort_values('importance', ascending=False)
        
        print("\nImportancia de features:")
        print(feature_importance)
        print("\nReporte de clasificación:")
        print(classification_report(y_test, y_pred))
        
        return {
            'model': best_rf,
            'accuracy': accuracy,
            'auc_score': auc_score,
            'predictions': y_pred,
            'probabilities': y_proba,
            'feature_importance': feature_importance
        }
    
    def train_mlp_model(self, X_train, y_train, X_test, y_test):
        """Entrena y evalúa modelo MLP"""
        print("\n🧠 Entrenando modelo MLP...")
        
        # Parámetros para MLP
        param_grid = {
            'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50), (100, 100)],
            'activation': ['relu', 'tanh'],
            'learning_rate_init': [0.001, 0.01, 0.1],
            'alpha': [0.0001, 0.001, 0.01]
        }
        
        mlp = MLPClassifier(random_state=42, max_iter=1000)
        grid_search = GridSearchCV(mlp, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        
        best_mlp = grid_search.best_estimator_
        self.trained_classifiers['MLP'] = best_mlp
        
        # Predicciones
        y_pred = best_mlp.predict(X_test)
        y_proba = best_mlp.predict_proba(X_test)[:, 1]
        
        # Métricas
        accuracy = accuracy_score(y_test, y_pred)
        auc_score = roc_auc_score(y_test, y_proba)
        
        print(f"Mejores parámetros: {grid_search.best_params_}")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"AUC-ROC: {auc_score:.4f}")
        print("\nReporte de clasificación:")
        print(classification_report(y_test, y_pred))
        
        return {
            'model': best_mlp,
            'accuracy': accuracy,
            'auc_score': auc_score,
            'predictions': y_pred,
            'probabilities': y_proba
        }
    
    def train_ensemble_models(self, data, target_col='CASINO_WON', test_size=0.2):
        """
        Entrena todos los modelos ensemble usando las predicciones de los modelos individuales
        """
        print("🚀 Iniciando entrenamiento de modelos ensemble...")
        print("📝 Las features time_on_device, bet_total, win_total se generarán desde los modelos pre-entrenados")
        
        # Verificar columnas requeridas
        required_cols = ['INITIAL_AMOUNT', 'AVG_BET', 'Cluster', target_col]
        missing_cols = [col for col in required_cols if col not in data.columns]
        if missing_cols:
            raise ValueError(f"Columnas requeridas faltantes: {missing_cols}")
        
        print(f"📋 Columnas disponibles: {list(data.columns)}")
        print(f"📊 Dataset shape: {data.shape}")
        
        # Preparar features (genera predicciones desde modelos MLP usando batch processing)
        X, y = self.prepare_ensemble_features(data, target_col)
        
        print(f"📊 Datos preparados: {X.shape[0]} muestras, {X.shape[1]} features")
        print(f"📈 Distribución objetivo: {y.value_counts().to_dict()}")
        print(f"🎯 Features del ensemble: {list(X.columns)}")
        
        # Dividir datos
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42, stratify=y
        )
        
        # Escalar features
        X_train_scaled = self.ensemble_scaler.fit_transform(X_train)
        X_test_scaled = self.ensemble_scaler.transform(X_test)
        
        X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
        X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)
        
        # Entrenar modelos
        results = {}
        print("KNN Start")
        # KNN
        results['KNN'] = self.train_knn_model(X_train_scaled, y_train, X_test_scaled, y_test)
        print("KNN End")

        print("RF Start")
        # Random Forest (no necesita escalado, pero lo usaremos para consistencia)
        results['RandomForest'] = self.train_random_forest_model(X_train, y_train, X_test, y_test)
        print("RF Start")

        print("MLP Start")
        # MLP
        results['MLP'] = self.train_mlp_model(X_train_scaled, y_train, X_test_scaled, y_test)
        print("KNN MLP")
        # Resumen de resultados
        self.print_results_summary(results)
        
        return results
    
    def predict_casino_won(self, initial_amount, avg_bet, cluster, weekday=1, weekend=0, month=1, model_name='best'):
        """
        Predice si el casino ganará para una sesión específica
        """
        # Generar predicciones de los modelos individuales
        predictions = self.predict_session(initial_amount, avg_bet, cluster, weekday, weekend, month)
        
        if 'error' in predictions:
            return predictions
        
        # Crear feature vector para ensemble
        feature_vector = pd.DataFrame({
            'time_on_device': [predictions['time_on_device']],
            'bet_total': [predictions['bet_total']],
            'win_total': [predictions['win_total']],
            'initial_amount': [initial_amount],
            'cluster': [cluster],
            'avg_bet': [avg_bet]
        })
        
        # Escalar features
        feature_vector_scaled = self.ensemble_scaler.transform(feature_vector)
        feature_vector_scaled = pd.DataFrame(feature_vector_scaled, columns=feature_vector.columns)
        
        # Seleccionar modelo
        if model_name == 'best':
            # Aquí podrías implementar lógica para seleccionar el mejor modelo
            # Por ahora usaremos Random Forest como default
            model_name = 'RandomForest'
        
        if model_name not in self.trained_classifiers:
            return {"error": f"Modelo {model_name} no encontrado"}
        
        model = self.trained_classifiers[model_name]
        
        # Hacer predicción
        if model_name == 'RandomForest':
            # Random Forest no usa escalado
            casino_won_pred = model.predict(feature_vector)[0]
            casino_won_proba = model.predict_proba(feature_vector)[0]
        else:
            # KNN y MLP usan escalado
            casino_won_pred = model.predict(feature_vector_scaled)[0]
            casino_won_proba = model.predict_proba(feature_vector_scaled)[0]
        
        result = {
            'casino_won_prediction': bool(casino_won_pred),
            'casino_won_probability': float(casino_won_proba[1]),  # Probabilidad de que casino gane
            'player_win_probability': float(casino_won_proba[0]),  # Probabilidad de que jugador gane
            'model_used': model_name,
            'individual_predictions': {
                'time_on_device': predictions['time_on_device'],
                'bet_total': predictions['bet_total'],
                'win_total': predictions['win_total']
            }
        }
        
        return result
    
    def print_results_summary(self, results):
        """Imprime un resumen comparativo de todos los modelos"""
        print("\n" + "="*60)
        print("📊 RESUMEN COMPARATIVO DE MODELOS")
        print("="*60)
        
        summary_df = pd.DataFrame({
            'Modelo': list(results.keys()),
            'Accuracy': [results[model]['accuracy'] for model in results.keys()],
            'AUC-ROC': [results[model]['auc_score'] for model in results.keys()]
        })
        
        summary_df = summary_df.sort_values('AUC-ROC', ascending=False)
        print(summary_df.to_string(index=False))
        
        best_model = summary_df.iloc[0]['Modelo']
        print(f"\n🏆 Mejor modelo: {best_model}")
        print(f"   AUC-ROC: {summary_df.iloc[0]['AUC-ROC']:.4f}")
        print(f"   Accuracy: {summary_df.iloc[0]['Accuracy']:.4f}")
    
    def save_ensemble_models(self, save_path):
        """Guarda los modelos entrenados usando joblib"""
        import os
        
        # Crear directorio si no existe
        os.makedirs(save_path, exist_ok=True)
        
        for model_name, model in self.trained_classifiers.items():
            model_path = f"{save_path}/ensemble_{model_name.lower()}_model.pkl"
            joblib.dump(model, model_path)
            print(f"✅ Modelo {model_name} guardado en: {model_path}")
        
        # Guardar scaler con joblib
        scaler_path = f"{save_path}/ensemble_scaler.pkl"
        joblib.dump(self.ensemble_scaler, scaler_path)
        print(f"✅ Scaler guardado en: {scaler_path}")
    
    def load_ensemble_models(self, load_path):
        """Carga los modelos ensemble previamente entrenados"""
        try:
            # Cargar modelos ensemble
            model_files = {
                'KNN': f"{load_path}/ensemble_knn_model.pkl",
                'RandomForest': f"{load_path}/ensemble_randomforest_model.pkl", 
                'MLP': f"{load_path}/ensemble_mlp_model.pkl"
            }
            
            for model_name, model_path in model_files.items():
                try:
                    self.trained_classifiers[model_name] = joblib.load(model_path)
                    print(f"✅ Modelo {model_name} cargado desde: {model_path}")
                except FileNotFoundError:
                    print(f"⚠️ Modelo {model_name} no encontrado en: {model_path}")
            
            # Cargar scaler
            scaler_path = f"{load_path}/ensemble_scaler.pkl"
            self.ensemble_scaler = joblib.load(scaler_path)
            print(f"✅ Scaler cargado desde: {scaler_path}")
            
            print(f"🎯 Modelos ensemble cargados: {list(self.trained_classifiers.keys())}")
            
        except Exception as e:
            print(f"❌ Error cargando modelos ensemble: {e}")
            raise

# ============================================================================
# FUNCIONES DE USO Y EJEMPLOS
# ============================================================================

def main_train_ensemble_sampled(data=None, model_paths=None, sample_size=10000, save_path='models/ensemble/'):
    """
    Train the ensemble on a stratified sample and save the resulting classifiers.

    The previous version referenced a name ``data`` that the function never defined
    (its loading line was commented out), so it only worked if a global ``data``
    happened to exist. The DataFrame is now an explicit argument; for backward
    compatibility a module-level ``data`` is still used when none is passed.

    Args:
        data (DataFrame, optional): Sessions to train on.
        model_paths (dict, optional): Paths of the pre-trained models and scalers.
            Defaults to the placeholder paths below, which must be adjusted.
        sample_size (int): Number of sessions to sample.
        save_path (str): Directory where the trained ensemble is stored.

    Returns:
        tuple: ``(ensemble_model, results)``.

    Raises:
        ValueError: If no data is passed and no global ``data`` exists.
    """
    if data is None:
        # Legacy behaviour: fall back to a notebook-level `data` variable.
        data = globals().get('data')
    if data is None:
        # Fail before the (slow) model loading instead of with a NameError after it.
        raise ValueError("No se proporcionaron datos: pasa un DataFrame en el argumento 'data'")

    if model_paths is None:
        # Placeholder paths -- adjust to the real locations.
        model_paths = {
            'tiempo_model': 'path/to/tiempo_model.h5',
            'bet_model': 'path/to/bet_model.h5',
            'win_model': 'path/to/win_model.h5',
            'tiempo_scaler': 'path/to/tiempo_scaler.pkl',
            'bet_scaler': 'path/to/bet_scaler.pkl',
            'win_scaler': 'path/to/win_scaler.pkl'
        }

    # Loads the pre-trained models on construction.
    ensemble_model = CasinoEnsembleModelOptimized(model_paths)

    print("🎯 Entrenamiento con muestreo estratificado...")
    results = ensemble_model.train_ensemble_models_sampled(
        data=data,
        sample_size=sample_size,
        target_col='CASINO_WON',
        test_size=0.2
    )

    ensemble_model.save_ensemble_models(save_path)

    return ensemble_model, results

def main_train_ensemble_full(data=None, model_paths=None, save_path='models/ensemble/'):
    """
    Train the ensemble on the complete dataset and save the resulting classifiers.

    The previous version referenced a name ``data`` that the function never defined
    (its loading line was commented out), so it only worked if a global ``data``
    happened to exist. The DataFrame is now an explicit argument; for backward
    compatibility a module-level ``data`` is still used when none is passed.

    Args:
        data (DataFrame, optional): Sessions to train on.
        model_paths (dict, optional): Paths of the pre-trained models and scalers.
            Defaults to the placeholder paths below, which must be adjusted.
        save_path (str): Directory where the trained ensemble is stored.

    Returns:
        tuple: ``(ensemble_model, results)``.

    Raises:
        ValueError: If no data is passed and no global ``data`` exists.
    """
    if data is None:
        # Legacy behaviour: fall back to a notebook-level `data` variable.
        data = globals().get('data')
    if data is None:
        # Fail before the (slow) model loading instead of with a NameError after it.
        raise ValueError("No se proporcionaron datos: pasa un DataFrame en el argumento 'data'")

    if model_paths is None:
        # Placeholder paths -- adjust to the real locations.
        model_paths = {
            'tiempo_model': 'path/to/tiempo_model.h5',
            'bet_model': 'path/to/bet_model.h5',
            'win_model': 'path/to/win_model.h5',
            'tiempo_scaler': 'path/to/tiempo_scaler.pkl',
            'bet_scaler': 'path/to/bet_scaler.pkl',
            'win_scaler': 'path/to/win_scaler.pkl'
        }

    # Loads the pre-trained models on construction.
    ensemble_model = CasinoEnsembleModelOptimized(model_paths)

    print("🚀 Entrenamiento con dataset completo (procesamiento batch)...")
    results = ensemble_model.train_ensemble_models(
        data=data,
        target_col='CASINO_WON',
        test_size=0.2
    )

    ensemble_model.save_ensemble_models(save_path)

    return ensemble_model, results

def example_predictions():
    """
    Demonstrate single-session predictions with an already trained ensemble.

    Builds the ensemble wrapper from placeholder model paths, loads the saved
    ensemble models from 'models/ensemble/', and prints the prediction report
    for three hard-coded example sessions. Returns nothing.
    """
    # Placeholder locations of the pre-trained models and scalers.
    pretrained_paths = {
        'tiempo_model': 'path/to/tiempo_model.h5',
        'bet_model': 'path/to/bet_model.h5',
        'win_model': 'path/to/win_model.h5',
        'tiempo_scaler': 'path/to/tiempo_scaler.pkl',
        'bet_scaler': 'path/to/bet_scaler.pkl',
        'win_scaler': 'path/to/win_scaler.pkl'
    }
    predictor = CasinoEnsembleModelOptimized(pretrained_paths)

    # Load the previously trained ensemble models.
    predictor.load_ensemble_models('models/ensemble/')

    # Each entry is (initial amount in $, average bet in $, cluster id).
    sample_sessions = [
        (100, 5, 0),
        (500, 20, 1),
        (50, 2, 2),
    ]

    print("🎲 Ejemplos de predicciones:")
    print("-" * 80)

    for session in sample_sessions:
        start_amount, mean_bet, cluster_id = session
        outcome = predictor.predict_casino_won(
            initial_amount=start_amount,
            avg_bet=mean_bet,
            cluster=cluster_id,
            # Alternatives noted by the original author: 'KNN', 'MLP', 'best'.
            model_name='RandomForest'
        )

        print(f"\n💰 Sesión: ${start_amount} inicial, ${mean_bet} apuesta promedio, cluster {cluster_id}")
        print(f"   🎯 Casino ganará: {outcome['casino_won_prediction']}")
        print(f"   📊 Probabilidad casino: {outcome['casino_won_probability']:.3f}")
        print(f"   📊 Probabilidad jugador: {outcome['player_win_probability']:.3f}")
        print(f"   🤖 Modelo usado: {outcome['model_used']}")
        print(f"   📈 Predicciones individuales:")
        # Looked up only after the header line above has been printed, so the
        # order of output and key lookups stays the same.
        per_model = outcome['individual_predictions']
        print(f"       Tiempo en dispositivo: {per_model['time_on_device']:.1f} min")
        print(f"       Apuesta total: ${per_model['bet_total']:.2f}")
        print(f"       Ganancia total: ${per_model['win_total']:.2f}")

# ============================================================================
# INSTRUCCIONES DE USO
# ============================================================================

"""
CÓMO USAR ESTE CÓDIGO:

1. OPCIÓN RÁPIDA (RECOMENDADA) - Muestreo estratificado:
   
   ensemble_model, results = main_train_ensemble_sampled()
   
   - Usa solo 10,000 sesiones representativas
   - Tiempo estimado: 15-30 minutos
   - Mantiene distribución de clases
   - Ideal para desarrollo y pruebas

2. OPCIÓN COMPLETA - Dataset completo optimizado:
   
   ensemble_model, results = main_train_ensemble_full()
   
   - Usa todo el dataset con procesamiento batch
   - Tiempo estimado: 1-2 horas (vs 4+ horas original)
   - Mejor rendimiento final
   - Para producción

3. HACER PREDICCIONES:
   
   example_predictions()
   
   - Requiere modelos ya entrenados
   - Predicciones instantáneas

CONFIGURACIÓN NECESARIA:

1. Actualizar model_paths con tus rutas reales:
   model_paths = {
       'tiempo_model': 'tu/ruta/tiempo_model.h5',
       'bet_model': 'tu/ruta/bet_model.h5', 
       'win_model': 'tu/ruta/win_model.h5',
       'tiempo_scaler': 'tu/ruta/tiempo_scaler.pkl',
       'bet_scaler': 'tu/ruta/bet_scaler.pkl',
       'win_scaler': 'tu/ruta/win_scaler.pkl'
   }

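2. Cargar tu dataset en una variable global llamada `data` antes de entrenar
   (al llamarse sin argumentos, main_train_ensemble_full() lee esa variable global):
   data = pd.read_csv('tu/dataset.csv')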
   
3. Verificar columnas requeridas:
   - INITIAL_AMOUNT
   - AVG_BET  
   - Cluster
   - CASINO_WON (target)

MEJORAS INCLUIDAS:

✅ Procesamiento batch (1000x más rápido)
✅ Muestreo estratificado 
✅ Progreso en tiempo real
✅ Manejo robusto de errores
✅ Guardado/carga optimizada
✅ Estadísticas detalladas
✅ Comparación de modelos
✅ Predicciones individuales
✅ Documentación completa
"""

if __name__ == "__main__":
    # Uncomment the option you prefer:

    # OPTION 1: quick training on a sample
    # ensemble_model, results = main_train_ensemble_sampled()

    # OPTION 2: optimized training on the full dataset
    # ensemble_model, results = main_train_ensemble_full()

    # OPTION 3: run predictions (requires trained models)
    # example_predictions()

    # Two-line banner shown when the cell/script is executed directly.
    for banner_line in (
        "🚀 Ensemble Model listo para usar!",
        "📖 Lee las instrucciones en los comentarios para comenzar.",
    ):
        print(banner_line)

🚀 Ensemble Model listo para usar!
📖 Lee las instrucciones en los comentarios para comenzar.


In [5]:
# Columns kept for ensemble training: session inputs, calendar features and
# the CASINO_WON target (last).
training_columns = [
    'INITIAL_AMOUNT', 'AVG_BET', 'Cluster',
    'Weekday', 'Weekend', 'Month',
    'CASINO_WON',
]
df_train = df_data_general[training_columns]

In [6]:
# Class distribution of the target before any resampling.
target_counts = df_train['CASINO_WON'].value_counts()
target_counts

True     190755
False     18324
Name: CASINO_WON, dtype: int64

In [7]:
from sklearn.utils import resample

# Extra majority-class rows kept on top of the minority-class count, so the
# undersampled set is close to balanced but NOT exactly 1:1.
EXTRA_MAJORITY_ROWS = 1000

# Split the training frame by target class.
is_casino_win = df_train['CASINO_WON'].astype(bool)
casino_won_true = df_train[is_casino_win]
casino_won_false = df_train[~is_casino_win]

# Undersample the majority class (True) without replacement. The request is
# capped at the rows actually available, because resample(replace=False)
# raises ValueError when asked for more samples than the input contains.
n_majority_samples = min(
    len(casino_won_true),
    len(casino_won_false) + EXTRA_MAJORITY_ROWS,
)
casino_won_true_sampled = resample(
    casino_won_true,
    replace=False,                  # sample without replacement
    n_samples=n_majority_samples,   # minority size + EXTRA_MAJORITY_ROWS (capped)
    random_state=42,                # fixed seed for reproducibility
)

# Recombine both classes and shuffle the rows with a fixed seed.
df_balanced = pd.concat([casino_won_true_sampled, casino_won_false])
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Class distribution after undersampling.
df_balanced['CASINO_WON'].value_counts()

True     19324
False    18324
Name: CASINO_WON, dtype: int64

In [8]:
# Locations of the pre-trained per-target models and their scalers.
model_paths = dict(
    tiempo_model='../../../models/tiempo_model.h5',
    tiempo_scaler='../../../models/tiempo_scaler.pkl',
    bet_model='../../../models/bet_model.h5',
    bet_scaler='../../../models/bet_scaler.pkl',
    win_model='../../../models/win_model.h5',
    win_scaler='../../../models/win_scaler.pkl',
)

# Build the ensemble wrapper and fit it on the undersampled frame.
ensemble_model = CasinoEnsembleModelOptimized(model_paths)
results = ensemble_model.train_ensemble_models(data=df_balanced)

2025-09-23 09:48:48.190255: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


✅ Todos los modelos y scalers cargados correctamente
🚀 Iniciando entrenamiento de modelos ensemble...
📝 Las features time_on_device, bet_total, win_total se generarán desde los modelos pre-entrenados
📋 Columnas disponibles: ['INITIAL_AMOUNT', 'AVG_BET', 'Cluster', 'Weekday', 'Weekend', 'Month', 'CASINO_WON']
📊 Dataset shape: (37648, 7)
🔄 Generando predicciones usando modelos individuales (procesamiento batch)...
🔄 Procesando 37648 sesiones en 38 lotes de 1000...
📦 Procesando lote 1/38 (1000 sesiones)...


2025-09-23 09:48:53.743270: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)


✅ Lote 1 completado. Progreso: 2.7%
📦 Procesando lote 2/38 (1000 sesiones)...
✅ Lote 2 completado. Progreso: 5.3%
📦 Procesando lote 3/38 (1000 sesiones)...
✅ Lote 3 completado. Progreso: 8.0%
📦 Procesando lote 4/38 (1000 sesiones)...
✅ Lote 4 completado. Progreso: 10.6%
📦 Procesando lote 5/38 (1000 sesiones)...
✅ Lote 5 completado. Progreso: 13.3%
📦 Procesando lote 6/38 (1000 sesiones)...
✅ Lote 6 completado. Progreso: 15.9%
📦 Procesando lote 7/38 (1000 sesiones)...
✅ Lote 7 completado. Progreso: 18.6%
📦 Procesando lote 8/38 (1000 sesiones)...
✅ Lote 8 completado. Progreso: 21.2%
📦 Procesando lote 9/38 (1000 sesiones)...
✅ Lote 9 completado. Progreso: 23.9%
📦 Procesando lote 10/38 (1000 sesiones)...
✅ Lote 10 completado. Progreso: 26.6%
📦 Procesando lote 11/38 (1000 sesiones)...
✅ Lote 11 completado. Progreso: 29.2%
📦 Procesando lote 12/38 (1000 sesiones)...
✅ Lote 12 completado. Progreso: 31.9%
📦 Procesando lote 13/38 (1000 sesiones)...
✅ Lote 13 completado. Progreso: 34.5%
📦 Procesan

In [11]:
# NOTE: despite the ".pkl" suffix, this path is used as a DIRECTORY: the saved
# files end up inside it (e.g. modelo.pkl/ensemble_knn_model.pkl).
# NOTE(review): example_predictions() loads from 'models/ensemble/' instead -
# confirm which location is the intended one.
ensemble_output_dir = 'modelo.pkl'
ensemble_model.save_ensemble_models(ensemble_output_dir)

✅ Modelo KNN guardado en: modelo.pkl/ensemble_knn_model.pkl
✅ Modelo RandomForest guardado en: modelo.pkl/ensemble_randomforest_model.pkl
✅ Modelo MLP guardado en: modelo.pkl/ensemble_mlp_model.pkl
✅ Scaler guardado en: modelo.pkl/ensemble_scaler.pkl
