In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import warnings
import joblib
pd.set_option('display.float_format', '{:.2f}'.format)

In [2]:
# Load the raw session log (the path is relative to the notebook's working directory).
df_data_general = pd.read_csv('../../../data/data_general.csv')

df_data_general['INITIAL_TIME'] = pd.to_datetime(df_data_general['INITIAL_TIME'])
df_data_general['FINAL_TIME'] = pd.to_datetime(df_data_general['FINAL_TIME'])

# Capture the start hour NOW: both timestamps are truncated to midnight just
# below, after which .dt.hour would be 0 for every single row.
initial_hour = df_data_general['INITIAL_TIME'].dt.hour

# Truncate both timestamps to day resolution.
for time_col in ('INITIAL_TIME', 'FINAL_TIME'):
    df_data_general[time_col] = df_data_general[time_col].dt.to_period('D').dt.to_timestamp()

# Calendar features derived from the session start (0 = Monday, 6 = Sunday).
# 'Weekday' is written directly in its numeric form; a weekday-name string used
# to be stored here first and was overwritten before anything could read it.
df_data_general['Weekday'] = df_data_general['INITIAL_TIME'].dt.weekday
df_data_general['number_of_day'] = df_data_general['INITIAL_TIME'].dt.day_of_week

df_data_general['TIME_ON_DEVICE_MIN'] = df_data_general['TIME_ON_DEVICE_SEC'] / 60

df_data_general['Hour'] = initial_hour
df_data_general['Weekend'] = (df_data_general['Weekday'] >= 5).astype(int)
df_data_general['Month'] = df_data_general['INITIAL_TIME'].dt.month

# Compute NET_SPEND on the full frame so that no column is ever assigned onto a
# filtered slice (which raises SettingWithCopyWarning).
df_data_general['NET_SPEND'] = df_data_general['FINAL_AMOUNT'] - df_data_general['INITIAL_AMOUNT']

# Keep sessions shorter than 600 minutes, with a positive WIN_TOTAL and a
# NET_SPEND below 10000. Rows where any of these values is NaN are dropped,
# because every comparison against NaN is False.
rows_to_keep = (
    (df_data_general['TIME_ON_DEVICE_MIN'] < 600)
    & (df_data_general['WIN_TOTAL'] > 0)
    & (df_data_general['NET_SPEND'] < 10000)
)
df_data_general = df_data_general.loc[rows_to_keep].copy()

In [None]:
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.utils import resample
from tensorflow.keras.models import load_model, Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

class CasinoEnsembleRegressionOptimized:
    def __init__(self, model_paths):
        """
        Inicializa el modelo ensemble de regresión cargando los modelos pre-entrenados
        
        Args:
            model_paths (dict): Diccionario con las rutas de los modelos y scalers
        """
        self.model_paths = model_paths
        self.models = {}
        self.scalers = {}
        self.ensemble_scaler = StandardScaler()
        self.target_scaler = StandardScaler()  # Para escalar el target en algunos modelos
        self.trained_regressors = {}
        
        # Cargar modelos pre-entrenados
        self.load_pretrained_models()
    
    def load_pretrained_models(self):
        """Carga los modelos y scalers pre-entrenados"""
        try:
            # Cargar modelos de Keras
            self.models['time'] = load_model(self.model_paths['tiempo_model'])
            self.models['bet'] = load_model(self.model_paths['bet_model'])
            self.models['win'] = load_model(self.model_paths['win_model'])
            
            # Cargar scalers con joblib
            self.scalers['time'] = joblib.load(self.model_paths['tiempo_scaler'])
            self.scalers['bet'] = joblib.load(self.model_paths['bet_scaler'])
            self.scalers['win'] = joblib.load(self.model_paths['win_scaler'])
            
            print("✅ Todos los modelos y scalers cargados correctamente")
            
        except Exception as e:
            print(f"❌ Error cargando modelos: {e}")
            raise
    
    def create_business_features(self, base_data, tiempo_pred=None, bet_pred=None):
        """
        Crea las features de negocio necesarias para los modelos
        """
        features = base_data[['INITIAL_AMOUNT', 'AVG_BET', 'Cluster']].copy()
        
        if tiempo_pred is not None:
            features['tiempo_pred'] = tiempo_pred
            
        if bet_pred is not None:
            features['bet_pred'] = bet_pred
            features['total_money_handled'] = bet_pred
            features['house_edge_effect'] = bet_pred * 0.05
            features['money_multiplier'] = bet_pred / (base_data['INITIAL_AMOUNT'] + 1)
            features['reinvestment_indicator'] = np.where(bet_pred > base_data['INITIAL_AMOUNT'], 1, 0)
            features['excess_betting'] = np.maximum(0, bet_pred - base_data['INITIAL_AMOUNT'])
            features['money_at_risk'] = np.minimum(bet_pred, base_data['INITIAL_AMOUNT'])
            features['cluster_risk_adjusted'] = base_data['Cluster'] * features['money_multiplier']
            
        return features
    
    def predict_session(self, initial_amount, avg_bet, cluster, weekday=1, weekend=0, month=1):
        """
        Hacer predicción completa para una sesión usando los modelos MLP en secuencia
        """
        if not self.models:
            return {"error": "Modelos no cargados correctamente"}
        
        # Preparar datos base
        base_data = pd.DataFrame({
            'INITIAL_AMOUNT': [initial_amount],
            'AVG_BET': [avg_bet], 
            'Cluster': [cluster],
            'Weekday': [weekday],
            'Weekend': [weekend],
            'Month': [month]
        })
        
        try:
            results = {}
            
            # 1. Predecir TIEMPO
            if 'time' in self.models:
                try:
                    X_tiempo = self.create_business_features(base_data)
                    X_tiempo_scaled = self.scalers['time'].transform(X_tiempo)
                    tiempo_pred = float(self.models['time'].predict(X_tiempo_scaled, verbose=0)[0][0])
                    tiempo_pred = max(0, tiempo_pred)
                    results['time_on_device'] = tiempo_pred
                except Exception as e:
                    print(f"Error prediciendo tiempo: {e}")
                    tiempo_pred = 30.0
                    results['time_on_device'] = tiempo_pred
            else:
                tiempo_pred = 30.0
                results['time_on_device'] = tiempo_pred
            
            # 2. Predecir BET TOTAL
            if 'bet' in self.models:
                try:
                    X_bet = self.create_business_features(base_data, tiempo_pred=tiempo_pred)
                    X_bet_scaled = self.scalers['bet'].transform(X_bet)
                    bet_pred = float(self.models['bet'].predict(X_bet_scaled, verbose=0)[0][0])
                    bet_pred = max(0, bet_pred)
                    results['bet_total'] = bet_pred
                except Exception as e:
                    print(f"Error prediciendo bet: {e}")
                    bet_pred = initial_amount * 2
                    results['bet_total'] = bet_pred
            else:
                bet_pred = initial_amount * 2
                results['bet_total'] = bet_pred
            
            # 3. Predecir WIN TOTAL
            if 'win' in self.models:
                try:
                    X_win = self.create_business_features(base_data, tiempo_pred=tiempo_pred, bet_pred=bet_pred)
                    X_win_scaled = self.scalers['win'].transform(X_win)
                    win_pred = float(self.models['win'].predict(X_win_scaled, verbose=0)[0][0])
                    win_pred = max(0, win_pred)
                    results['win_total'] = win_pred
                except Exception as e:
                    print(f"Error prediciendo win: {e}")
                    win_pred = bet_pred * 0.95
                    results['win_total'] = win_pred
            else:
                win_pred = bet_pred * 0.95
                results['win_total'] = win_pred
            
            return results
            
        except Exception as e:
            return {"error": f"Error en predicción: {str(e)}"}
    
    def generate_ensemble_features_batch(self, input_data, batch_size=1000):
        """
        Versión OPTIMIZADA que procesa en lotes para mejor rendimiento
        """
        ensemble_features = []
        total_batches = len(input_data) // batch_size + (1 if len(input_data) % batch_size > 0 else 1)
        
        print(f"🔄 Procesando {len(input_data)} sesiones en {total_batches} lotes de {batch_size}...")
        
        for batch_idx in range(0, len(input_data), batch_size):
            batch_end = min(batch_idx + batch_size, len(input_data))
            batch_data = input_data.iloc[batch_idx:batch_end]
            
            current_batch = (batch_idx // batch_size) + 1
            print(f"📦 Procesando lote {current_batch}/{total_batches} ({len(batch_data)} sesiones)...")
            
            # Procesar tiempo en batch
            X_tiempo_batch = []
            for _, row in batch_data.iterrows():
                base_data = pd.DataFrame({
                    'INITIAL_AMOUNT': [row['INITIAL_AMOUNT']],
                    'AVG_BET': [row['AVG_BET']],
                    'Cluster': [row['Cluster']]
                })
                X_tiempo = self.create_business_features(base_data)
                X_tiempo_batch.append(X_tiempo.iloc[0].values)
            
            X_tiempo_batch = np.array(X_tiempo_batch)
            X_tiempo_scaled = self.scalers['time'].transform(X_tiempo_batch)
            tiempo_preds = self.models['time'].predict(X_tiempo_scaled, verbose=0).flatten()
            tiempo_preds = np.maximum(0, tiempo_preds)
            
            # Procesar bet en batch
            X_bet_batch = []
            for i, (_, row) in enumerate(batch_data.iterrows()):
                base_data = pd.DataFrame({
                    'INITIAL_AMOUNT': [row['INITIAL_AMOUNT']],
                    'AVG_BET': [row['AVG_BET']],
                    'Cluster': [row['Cluster']]
                })
                X_bet = self.create_business_features(base_data, tiempo_pred=tiempo_preds[i])
                X_bet_batch.append(X_bet.iloc[0].values)
            
            X_bet_batch = np.array(X_bet_batch)
            X_bet_scaled = self.scalers['bet'].transform(X_bet_batch)
            bet_preds = self.models['bet'].predict(X_bet_scaled, verbose=0).flatten()
            bet_preds = np.maximum(0, bet_preds)
            
            # Procesar win en batch
            X_win_batch = []
            for i, (_, row) in enumerate(batch_data.iterrows()):
                base_data = pd.DataFrame({
                    'INITIAL_AMOUNT': [row['INITIAL_AMOUNT']],
                    'AVG_BET': [row['AVG_BET']],
                    'Cluster': [row['Cluster']]
                })
                X_win = self.create_business_features(base_data, tiempo_pred=tiempo_preds[i], bet_pred=bet_preds[i])
                X_win_batch.append(X_win.iloc[0].values)
            
            X_win_batch = np.array(X_win_batch)
            X_win_scaled = self.scalers['win'].transform(X_win_batch)
            win_preds = self.models['win'].predict(X_win_scaled, verbose=0).flatten()
            win_preds = np.maximum(0, win_preds)
            
            # Crear features del ensemble para este batch
            for i, (_, row) in enumerate(batch_data.iterrows()):
                feature_vector = {
                    'time_on_device': tiempo_preds[i],
                    'bet_total': bet_preds[i],
                    'win_total': win_preds[i],
                    'initial_amount': row['INITIAL_AMOUNT'],
                    'cluster': row['Cluster'],
                    'avg_bet': row['AVG_BET']
                }
                ensemble_features.append(feature_vector)
            
            # Progreso
            progress = min(((batch_idx + batch_size) / len(input_data)) * 100, 100)
            print(f"✅ Lote {current_batch} completado. Progreso: {progress:.1f}%")
        
        result_df = pd.DataFrame(ensemble_features)
        
        print(f"✅ Features del ensemble generadas exitosamente:")
        print(f"   - {len(result_df)} sesiones procesadas")
        print(f"   - Variables generadas por modelos MLP: time_on_device, bet_total, win_total")
        print(f"   - Variables originales: initial_amount, cluster, avg_bet")
        print(f"   - Total features para regresión: {result_df.shape[1]}")
        
        # Mostrar estadísticas de las predicciones generadas
        print(f"\n📊 Estadísticas de predicciones generadas:")
        print(f"   time_on_device: {result_df['time_on_device'].mean():.2f} ± {result_df['time_on_device'].std():.2f}")
        print(f"   bet_total: ${result_df['bet_total'].mean():.2f} ± ${result_df['bet_total'].std():.2f}")
        print(f"   win_total: ${result_df['win_total'].mean():.2f} ± ${result_df['win_total'].std():.2f}")
        
        return result_df
    
    def generate_ensemble_features(self, input_data):
        """
        Wrapper que usa la versión batch optimizada
        """
        return self.generate_ensemble_features_batch(input_data, batch_size=1000)
    
    def prepare_ensemble_features(self, data, target_col='GAMES_PLAYED_TOTAL'):
        """
        Prepara las features para el modelo ensemble usando predicciones batch
        """
        # Verificar que tenemos las features base necesarias
        required_base_features = ['INITIAL_AMOUNT', 'AVG_BET', 'Cluster']
        missing_base = [f for f in required_base_features if f not in data.columns]
        if missing_base:
            raise ValueError(f"Features base faltantes para generar predicciones: {missing_base}")
        
        # Verificar que tenemos la columna objetivo
        if target_col not in data.columns:
            raise ValueError(f"Columna objetivo '{target_col}' no encontrada")
        
        print("🔄 Generando predicciones usando modelos individuales (procesamiento batch)...")
        X = self.generate_ensemble_features(data)
        y = data[target_col].iloc[:len(X)]  # Asegurar mismo length en caso de filas fallidas
        
        return X, y
    
    def train_ensemble_models_sampled(self, data, target_col='GAMES_PLAYED_TOTAL', sample_size=10000, test_size=0.2):
        """
        Entrena usando una muestra aleatoria para acelerar el proceso
        """
        print(f"🎯 Usando muestreo aleatorio de {sample_size} sesiones de {len(data)} totales")
        
        # Muestreo aleatorio simple
        if len(data) > sample_size:
            sampled_data = resample(data, n_samples=sample_size, random_state=42, replace=False)
        else:
            sampled_data = data.copy()
            sample_size = len(sampled_data)
        
        print(f"📊 Muestra seleccionada: {len(sampled_data)} sesiones")
        print(f"📊 Distribución del target:")
        print(f"   Media: {sampled_data[target_col].mean():.2f}")
        print(f"   Mediana: {sampled_data[target_col].median():.2f}")
        print(f"   Std: {sampled_data[target_col].std():.2f}")
        print(f"   Min: {sampled_data[target_col].min()}")
        print(f"   Max: {sampled_data[target_col].max()}")
        
        # Entrenar con la muestra
        return self.train_ensemble_models(sampled_data, target_col, test_size)
    
    def create_rnn_model(self, input_shape, target_mean=None, target_std=None):
        """Crea modelo RNN/LSTM para regresión"""
        model = Sequential([
            LSTM(64, return_sequences=True, input_shape=(input_shape, 1)),
            Dropout(0.2),
            LSTM(32, return_sequences=False),
            Dropout(0.2),
            Dense(16, activation='relu'),
            Dense(1, activation='linear')  # Para regresión
        ])
        
        model.compile(optimizer=Adam(learning_rate=0.001), 
                     loss='mse', 
                     metrics=['mae'])
        
        return model
    
    def train_mlp_model(self, X_train, y_train, X_test, y_test):
        """Entrena y evalúa modelo MLP para regresión"""
        print("\n🧠 Entrenando modelo MLP...")
        
        # Parámetros para MLP
        param_grid = {
            'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50), (100, 100)],
            'activation': ['relu', 'tanh'],
            'learning_rate_init': [0.001, 0.01],
            'alpha': [0.0001, 0.001, 0.01],
            'max_iter': [1000]
        }
        
        mlp = MLPRegressor(random_state=42)
        grid_search = GridSearchCV(mlp, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        
        best_mlp = grid_search.best_estimator_
        self.trained_regressors['MLP'] = best_mlp
        
        # Predicciones
        y_pred = best_mlp.predict(X_test)
        
        # Métricas
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        rmse = np.sqrt(mse)
        
        print(f"Mejores parámetros: {grid_search.best_params_}")
        print(f"MSE: {mse:.4f}")
        print(f"RMSE: {rmse:.4f}")
        print(f"MAE: {mae:.4f}")
        print(f"R²: {r2:.4f}")
        
        return {
            'model': best_mlp,
            'mse': mse,
            'rmse': rmse,
            'mae': mae,
            'r2_score': r2,
            'predictions': y_pred
        }
    
    def train_random_forest_model(self, X_train, y_train, X_test, y_test):
        """Entrena y evalúa modelo Random Forest para regresión"""
        print("\n🌳 Entrenando modelo Random Forest...")
        
        # Parámetros para Random Forest
        param_grid = {
            'max_depth': [10, 20, 30, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': ['sqrt', 'log2', None]
        }
        
        rf = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
        grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        
        best_rf = grid_search.best_estimator_
        self.trained_regressors['RandomForest'] = best_rf
        
        # Predicciones
        y_pred = best_rf.predict(X_test)
        
        # Métricas
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        rmse = np.sqrt(mse)
        
        print(f"Mejores parámetros: {grid_search.best_params_}")
        print(f"MSE: {mse:.4f}")
        print(f"RMSE: {rmse:.4f}")
        print(f"MAE: {mae:.4f}")
        print(f"R²: {r2:.4f}")
        
        # Importancia de features
        feature_importance = pd.DataFrame({
            'feature': X_train.columns,
            'importance': best_rf.feature_importances_
        }).sort_values('importance', ascending=False)
        
        print("\nImportancia de features:")
        print(feature_importance)
        
        return {
            'model': best_rf,
            'mse': mse,
            'rmse': rmse,
            'mae': mae,
            'r2_score': r2,
            'predictions': y_pred,
            'feature_importance': feature_importance
        }
    
    def train_xgboost_model(self, X_train, y_train, X_test, y_test):
        """Entrena y evalúa modelo XGBoost para regresión"""
        print("\n🚀 Entrenando modelo XGBoost...")
        
        # Parámetros para XGBoost
        param_grid = {
            'max_depth': [3, 6, 10],
            'learning_rate': [0.01, 0.1, 0.2],
            'n_estimators': [100, 200, 300],
            'subsample': [0.8, 1.0],
            'colsample_bytree': [0.8, 1.0]
        }
        
        xgb_reg = xgb.XGBRegressor(random_state=42, n_jobs=-1)
        grid_search = GridSearchCV(xgb_reg, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        
        best_xgb = grid_search.best_estimator_
        self.trained_regressors['XGBoost'] = best_xgb
        
        # Predicciones
        y_pred = best_xgb.predict(X_test)
        
        # Métricas
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        rmse = np.sqrt(mse)
        
        print(f"Mejores parámetros: {grid_search.best_params_}")
        print(f"MSE: {mse:.4f}")
        print(f"RMSE: {rmse:.4f}")
        print(f"MAE: {mae:.4f}")
        print(f"R²: {r2:.4f}")
        
        # Importancia de features
        feature_importance = pd.DataFrame({
            'feature': X_train.columns,
            'importance': best_xgb.feature_importances_
        }).sort_values('importance', ascending=False)
        
        print("\nImportancia de features:")
        print(feature_importance)
        
        return {
            'model': best_xgb,
            'mse': mse,
            'rmse': rmse,
            'mae': mae,
            'r2_score': r2,
            'predictions': y_pred,
            'feature_importance': feature_importance
        }
    
    def train_rnn_model(self, X_train, y_train, X_test, y_test):
        """Entrena y evalúa modelo RNN para regresión"""
        print("\n🔄 Entrenando modelo RNN/LSTM...")
        
        # Ajustar datos para RNN (necesita 3D: [samples, timesteps, features])
        X_train_rnn = X_train.values.reshape((X_train.shape[0], X_train.shape[1], 1))
        X_test_rnn = X_test.values.reshape((X_test.shape[0], X_test.shape[1], 1))
        
        # Escalar target para mejor entrenamiento
        y_train_scaled = self.target_scaler.fit_transform(y_train.values.reshape(-1, 1)).flatten()
        
        # Crear y entrenar modelo
        rnn_model = self.create_rnn_model(X_train.shape[1])
        
        # Callbacks
        early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
        
        # Entrenar
        history = rnn_model.fit(
            X_train_rnn, y_train_scaled,
            epochs=100,
            batch_size=32,
            validation_split=0.2,
            callbacks=[early_stopping],
            verbose=0
        )
        
        self.trained_regressors['RNN'] = rnn_model
        
        # Predicciones
        y_pred_scaled = rnn_model.predict(X_test_rnn, verbose=0).flatten()
        y_pred = self.target_scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()
        
        # Métricas
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        rmse = np.sqrt(mse)
        
        print(f"MSE: {mse:.4f}")
        print(f"RMSE: {rmse:.4f}")
        print(f"MAE: {mae:.4f}")
        print(f"R²: {r2:.4f}")
        print(f"Épocas entrenadas: {len(history.history['loss'])}")
        
        return {
            'model': rnn_model,
            'mse': mse,
            'rmse': rmse,
            'mae': mae,
            'r2_score': r2,
            'predictions': y_pred,
            'history': history
        }
    
    def train_ensemble_models(self, data, target_col='GAMES_PLAYED_TOTAL', test_size=0.2):
        """
        Entrena todos los modelos ensemble usando las predicciones de los modelos individuales
        """
        print("🚀 Iniciando entrenamiento de modelos ensemble para regresión...")
        print("🎯 Target: Predicción de GAMES_PLAYED_TOTAL")
        print("📝 Las features time_on_device, bet_total, win_total se generarán desde los modelos pre-entrenados")
        
        # Verificar columnas requeridas
        required_cols = ['INITIAL_AMOUNT', 'AVG_BET', 'Cluster', target_col]
        missing_cols = [col for col in required_cols if col not in data.columns]
        if missing_cols:
            raise ValueError(f"Columnas requeridas faltantes: {missing_cols}")
        
        print(f"📋 Columnas disponibles: {list(data.columns)}")
        print(f"📊 Dataset shape: {data.shape}")
        
        # Preparar features (genera predicciones desde modelos MLP usando batch processing)
        X, y = self.prepare_ensemble_features(data, target_col)
        
        print(f"📊 Datos preparados: {X.shape[0]} muestras, {X.shape[1]} features")
        print(f"📈 Estadísticas del target:")
        print(f"   Media: {y.mean():.2f}")
        print(f"   Mediana: {y.median():.2f}")
        print(f"   Std: {y.std():.2f}")
        print(f"   Min: {y.min()}, Max: {y.max()}")
        print(f"🎯 Features del ensemble: {list(X.columns)}")
        
        # Dividir datos
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42
        )
        
        # Escalar features
        X_train_scaled = self.ensemble_scaler.fit_transform(X_train)
        X_test_scaled = self.ensemble_scaler.transform(X_test)
        
        X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
        X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)
        
        # Entrenar modelos
        results = {}
        
        print("\n" + "="*60)
        print("MLP Start")
        results['MLP'] = self.train_mlp_model(X_train_scaled, y_train, X_test_scaled, y_test)
        print("MLP End")

        print("\n" + "="*60)
        print("RF Start")
        results['RandomForest'] = self.train_random_forest_model(X_train, y_train, X_test, y_test)
        print("RF End")

        print("\n" + "="*60)
        print("XGBoost Start")
        results['XGBoost'] = self.train_xgboost_model(X_train, y_train, X_test, y_test)
        print("XGBoost End")

        print("\n" + "="*60)
        print("RNN Start")
        results['RNN'] = self.train_rnn_model(X_train_scaled, y_train, X_test_scaled, y_test)
        print("RNN End")
        
        # Resumen de resultados
        self.print_results_summary(results)
        
        return results
    
    def predict_games_played(self, initial_amount, avg_bet, cluster, weekday=1, weekend=0, month=1, model_name='best'):
        """
        Predice el número de juegos que se jugarán para una sesión específica
        """
        # Generar predicciones de los modelos individuales
        predictions = self.predict_session(initial_amount, avg_bet, cluster, weekday, weekend, month)
        
        if 'error' in predictions:
            return predictions
        
        # Crear feature vector para ensemble
        feature_vector = pd.DataFrame({
            'time_on_device': [predictions['time_on_device']],
            'bet_total': [predictions['bet_total']],
            'win_total': [predictions['win_total']],
            'initial_amount': [initial_amount],
            'cluster': [cluster],
            'avg_bet': [avg_bet]
        })
        
        # Seleccionar modelo
        if model_name == 'best':
            # Seleccionar el modelo con mejor R² (por defecto RandomForest)
            model_name = 'RandomForest'
        
        if model_name not in self.trained_regressors:
            return {"error": f"Modelo {model_name} no encontrado"}
        
        model = self.trained_regressors[model_name]
        
        # Hacer predicción
        if model_name in ['MLP', 'RNN']:
            # MLP y RNN usan escalado
            feature_vector_scaled = self.ensemble_scaler.transform(feature_vector)
            feature_vector_scaled = pd.DataFrame(feature_vector_scaled, columns=feature_vector.columns)
            
            if model_name == 'RNN':
                # RNN necesita formato 3D
                feature_vector_rnn = feature_vector_scaled.values.reshape((1, feature_vector_scaled.shape[1], 1))
                games_pred_scaled = model.predict(feature_vector_rnn, verbose=0)[0][0]
                games_pred = self.target_scaler.inverse_transform([[games_pred_scaled]])[0][0]
            else:
                games_pred = model.predict(feature_vector_scaled)[0]
        else:
            # RandomForest y XGBoost no necesitan escalado
            games_pred = model.predict(feature_vector)[0]
        
        # Asegurar que la predicción sea positiva
        games_pred = max(0, games_pred)
        
        result = {
            'games_played_prediction': float(games_pred),
            'model_used': model_name,
            'individual_predictions': {
                'time_on_device': predictions['time_on_device'],
                'bet_total': predictions['bet_total'],
                'win_total': predictions['win_total']
            }
        }
        
        return result
    
    def print_results_summary(self, results):
        """Imprime un resumen comparativo de todos los modelos"""
        print("\n" + "="*80)
        print("📊 RESUMEN COMPARATIVO DE MODELOS DE REGRESIÓN")
        print("="*80)
        
        summary_data = []
        for model in results.keys():
            summary_data.append({
                'Modelo': model,
                'RMSE': results[model]['rmse'],
                'MAE': results[model]['mae'],
                'R²': results[model]['r2_score'],
                'MSE': results[model]['mse']
            })
        
        summary_df = pd.DataFrame(summary_data)
        summary_df = summary_df.sort_values('R²', ascending=False)
        
        print(summary_df.round(4).to_string(index=False))
        
        best_model = summary_df.iloc[0]['Modelo']
        print(f"\n🏆 Mejor modelo: {best_model}")
        print(f"   R²: {summary_df.iloc[0]['R²']:.4f}")
        print(f"   RMSE: {summary_df.iloc[0]['RMSE']:.4f}")
        print(f"   MAE: {summary_df.iloc[0]['MAE']:.4f}")
        
        print(f"\n📈 Interpretación de R²:")
        best_r2 = summary_df.iloc[0]['R²']
        if best_r2 >= 0.9:
            print("   ✅ Excelente ajuste (R² ≥ 0.9)")
        elif best_r2 >= 0.7:
            print("   ✅ Buen ajuste (0.7 ≤ R² < 0.9)")
        elif best_r2 >= 0.5:
            print("   ⚠️ Ajuste moderado (0.5 ≤ R² < 0.7)")
        else:
            print("   ❌ Ajuste pobre (R² < 0.5)")
    
    def save_ensemble_models(self, save_path):
        """Guarda los modelos entrenados"""
        import os
        
        # Crear directorio si no existe
        os.makedirs(save_path, exist_ok=True)
        
        for model_name, model in self.trained_regressors.items():
            if model_name == 'RNN':
                # Guardar modelo RNN (Keras)
                model_path = f"{save_path}/ensemble_{model_name.lower()}_model.h5"
                model.save(model_path)
                print(f"✅ Modelo {model_name} guardado en: {model_path}")
            else:
                # Guardar otros modelos (scikit-learn/XGBoost)
                model_path = f"{save_path}/ensemble_{model_name.lower()}_model.pkl"
                joblib.dump(model, model_path)
                print(f"✅ Modelo {model_name} guardado en: {model_path}")
        
        # Guardar scalers
        scaler_path = f"{save_path}/ensemble_scaler.pkl"
        joblib.dump(self.ensemble_scaler, scaler_path)
        print(f"✅ Ensemble scaler guardado en: {scaler_path}")
        
        target_scaler_path = f"{save_path}/target_scaler.pkl"
        joblib.dump(self.target_scaler, target_scaler_path)
        print(f"✅ Target scaler guardado en: {target_scaler_path}")
    
    def load_ensemble_models(self, load_path):
        """Carga los modelos ensemble previamente entrenados"""
        try:
            import tensorflow as tf
            
            # Cargar modelos ensemble
            model_files = {
                'MLP': f"{load_path}/ensemble_mlp_model.pkl",
                'RandomForest': f"{load_path}/ensemble_randomforest_model.pkl", 
                'XGBoost': f"{load_path}/ensemble_xgboost_model.pkl",
                'RNN': f"{load_path}/ensemble_rnn_model.h5"
            }
            
            for model_name, model_path in model_files.items():
                try:
                    if model_name == 'RNN':
                        self.trained_regressors[model_name] = tf.keras.models.load_model(model_path)
                    else:
                        self.trained_regressors[model_name] = joblib.load(model_path)
                    print(f"✅ Modelo {model_name} cargado desde: {model_path}")
                except FileNotFoundError:
                    print(f"⚠️ Modelo {model_name} no encontrado en: {model_path}")
                except Exception as e:
                    print(f"❌ Error cargando {model_name}: {e}")
            
            # Cargar scalers
            try:
                scaler_path = f"{load_path}/ensemble_scaler.pkl"
                self.ensemble_scaler = joblib.load(scaler_path)
                print(f"✅ Ensemble scaler cargado desde: {scaler_path}")
            except Exception as e:
                print(f"❌ Error cargando ensemble scaler: {e}")
            
            try:
                target_scaler_path = f"{load_path}/target_scaler.pkl"
                self.target_scaler = joblib.load(target_scaler_path)
                print(f"✅ Target scaler cargado desde: {target_scaler_path}")
            except Exception as e:
                print(f"❌ Error cargando target scaler: {e}")
            
            print(f"🎯 Modelos ensemble cargados: {list(self.trained_regressors.keys())}")
            
        except Exception as e:
            print(f"❌ Error cargando modelos ensemble: {e}")
            raise

# ============================================================================
# FUNCIONES DE USO Y EJEMPLOS
# ============================================================================

def main_train_ensemble_sampled(data=None, model_paths=None, sample_size=10000,
                                save_path='models/ensemble_regression/'):
    """
    Train the ensemble on a sample of the sessions (recommended entry point).

    Args:
        data: dataset handed to ``train_ensemble_models_sampled``. If omitted,
            a module/notebook-level variable named ``data`` is used when one
            exists (this is what the original implementation relied on).
        model_paths: mapping with the locations of the pre-trained base models
            and scalers. Defaults to the placeholder ``path/to/...`` entries,
            which must be adjusted to real paths.
        sample_size: number of sessions to sample for training.
        save_path: directory passed to ``save_ensemble_models``.

    Returns:
        Tuple ``(ensemble_model, results)``: the trained ensemble object and
        whatever ``train_ensemble_models_sampled`` returned.

    Raises:
        ValueError: if no dataset was passed and no global ``data`` exists.
    """
    if data is None:
        # Backward compatibility: the original body read a global `data`.
        data = globals().get('data')
    if data is None:
        # Fail before the (slow) model loading instead of with a NameError after it.
        raise ValueError(
            "No se proporcionaron datos: pasa un DataFrame en el argumento `data`."
        )

    if model_paths is None:
        # Placeholder locations; adjust to the real model/scaler files.
        model_paths = {
            'tiempo_model': 'path/to/tiempo_model.h5',
            'bet_model': 'path/to/bet_model.h5',
            'win_model': 'path/to/win_model.h5',
            'tiempo_scaler': 'path/to/tiempo_scaler.pkl',
            'bet_scaler': 'path/to/bet_scaler.pkl',
            'win_scaler': 'path/to/win_scaler.pkl'
        }

    # Build the ensemble wrapper around the pre-trained base models
    ensemble_model = CasinoEnsembleRegressionOptimized(model_paths)

    # Train on a sample of `sample_size` sessions
    print("🎯 Entrenamiento con muestreo para regresión...")
    results = ensemble_model.train_ensemble_models_sampled(
        data=data,
        sample_size=sample_size,
        target_col='GAMES_PLAYED_TOTAL',
        test_size=0.2
    )

    # Persist the trained models
    ensemble_model.save_ensemble_models(save_path)

    return ensemble_model, results

def main_train_ensemble_full(data=None, model_paths=None,
                             save_path='models/ensemble_regression/'):
    """
    Train the ensemble on the complete dataset.

    Args:
        data: dataset handed to ``train_ensemble_models``. If omitted, a
            module/notebook-level variable named ``data`` is used when one
            exists (this is what the original implementation relied on).
        model_paths: mapping with the locations of the pre-trained base models
            and scalers. Defaults to the placeholder ``path/to/...`` entries,
            which must be adjusted to real paths.
        save_path: directory passed to ``save_ensemble_models``.

    Returns:
        Tuple ``(ensemble_model, results)``: the trained ensemble object and
        whatever ``train_ensemble_models`` returned.

    Raises:
        ValueError: if no dataset was passed and no global ``data`` exists.
    """
    if data is None:
        # Backward compatibility: the original body read a global `data`.
        data = globals().get('data')
    if data is None:
        # Fail before the (slow) model loading instead of with a NameError after it.
        raise ValueError(
            "No se proporcionaron datos: pasa un DataFrame en el argumento `data`."
        )

    if model_paths is None:
        # Placeholder locations; adjust to the real model/scaler files.
        model_paths = {
            'tiempo_model': 'path/to/tiempo_model.h5',
            'bet_model': 'path/to/bet_model.h5',
            'win_model': 'path/to/win_model.h5',
            'tiempo_scaler': 'path/to/tiempo_scaler.pkl',
            'bet_scaler': 'path/to/bet_scaler.pkl',
            'win_scaler': 'path/to/win_scaler.pkl'
        }

    # Build the ensemble wrapper around the pre-trained base models
    ensemble_model = CasinoEnsembleRegressionOptimized(model_paths)

    # Train on the complete dataset
    print("🚀 Entrenamiento con dataset completo (procesamiento batch)...")
    results = ensemble_model.train_ensemble_models(
        data=data,
        target_col='GAMES_PLAYED_TOTAL',
        test_size=0.2
    )

    # Persist the trained models
    ensemble_model.save_ensemble_models(save_path)

    return ensemble_model, results

def example_predictions(model_paths=None, load_path='models/ensemble_regression/'):
    """
    Demonstrate regression predictions for a few hand-picked sessions.

    For each example session and each ensemble regressor, calls
    ``predict_games_played`` and prints the result. A result that carries an
    ``'error'`` key is reported explicitly instead of being dropped silently.

    Args:
        model_paths: mapping with the locations of the pre-trained base models
            and scalers. Defaults to the placeholder ``path/to/...`` entries,
            which must be adjusted to real paths.
        load_path: directory passed to ``load_ensemble_models``.

    Returns:
        None. Everything is written to stdout.
    """
    if model_paths is None:
        # Placeholder locations; adjust to the real model/scaler files.
        model_paths = {
            'tiempo_model': 'path/to/tiempo_model.h5',
            'bet_model': 'path/to/bet_model.h5',
            'win_model': 'path/to/win_model.h5',
            'tiempo_scaler': 'path/to/tiempo_scaler.pkl',
            'bet_scaler': 'path/to/bet_scaler.pkl',
            'win_scaler': 'path/to/win_scaler.pkl'
        }

    ensemble_model = CasinoEnsembleRegressionOptimized(model_paths)

    # Load the previously trained ensemble regressors
    ensemble_model.load_ensemble_models(load_path)

    # (initial amount, average bet, cluster) for each example session
    examples = [
        (100, 5, 0),    # $100 initial, $5 average bet, cluster 0
        (500, 20, 1),   # $500 initial, $20 average bet, cluster 1
        (50, 2, 2),     # $50 initial, $2 average bet, cluster 2
    ]

    print("🎮 Ejemplos de predicciones de GAMES_PLAYED_TOTAL:")
    print("-" * 80)

    for initial, avg_bet, cluster in examples:
        for model_name in ['MLP', 'RandomForest', 'XGBoost', 'RNN']:
            result = ensemble_model.predict_games_played(
                initial_amount=initial,
                avg_bet=avg_bet,
                cluster=cluster,
                model_name=model_name
            )

            if 'error' in result:
                # Surface failures; previously a failing model printed nothing.
                print(f"\n❌ Error en la predicción ({model_name}) para sesión "
                      f"${initial} inicial, ${avg_bet} apuesta promedio, "
                      f"cluster {cluster}: {result['error']}")
                continue

            individual = result['individual_predictions']
            print(f"\n💰 Sesión: ${initial} inicial, ${avg_bet} apuesta promedio, cluster {cluster}")
            print(f"   🎯 Juegos predichos ({model_name}): {result['games_played_prediction']:.1f}")
            print(f"   📈 Predicciones individuales:")
            print(f"       Tiempo en dispositivo: {individual['time_on_device']:.1f} min")
            print(f"       Apuesta total: ${individual['bet_total']:.2f}")
            print(f"       Ganancia total: ${individual['win_total']:.2f}")

def analyze_model_performance(results):
    """
    Print a per-model report of the regression metrics.

    Args:
        results: mapping of model name -> metrics mapping. Each metrics
            mapping is indexed with 'rmse', 'mae', 'r2_score' and 'mse'.
            When a 'feature_importance' entry is present, its first three
            rows are read and their 'feature' / 'importance' fields printed.

    Returns:
        None. The report is written to stdout.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("🔍 ANÁLISIS DETALLADO DEL RENDIMIENTO")
    print(rule)

    for name, scores in results.items():
        print(f"\n📊 {name}:")
        print(f"   RMSE: {scores['rmse']:.4f}")
        print(f"   MAE: {scores['mae']:.4f}")
        print(f"   R²: {scores['r2_score']:.4f}")
        print(f"   MSE: {scores['mse']:.4f}")

        # Feature importances are optional; nothing more to print without them.
        if 'feature_importance' not in scores:
            continue

        print(f"   Top 3 features importantes:")
        for _, entry in scores['feature_importance'].head(3).iterrows():
            print(f"     - {entry['feature']}: {entry['importance']:.4f}")


if __name__ == "__main__":
    # Uncomment whichever option you want to run.
    #
    # OPTION 1: quick training on a sample
    # ensemble_model, results = main_train_ensemble_sampled()
    #
    # OPTION 2: training on the complete dataset
    # ensemble_model, results = main_train_ensemble_full()
    #
    # OPTION 3: run example predictions (requires trained models)
    # example_predictions()
    #
    # OPTION 4: performance analysis
    # analyze_model_performance(results)

    # Single call; sep="\n" emits the same two lines as two separate prints.
    print(
        "🎮 Ensemble Regression Model listo para predecir GAMES_PLAYED_TOTAL!",
        "📖 Lee las instrucciones en los comentarios para comenzar.",
        sep="\n",
    )

🎮 Ensemble Regression Model listo para predecir GAMES_PLAYED_TOTAL!
📖 Lee las instrucciones en los comentarios para comenzar.


In [5]:
# Locations of the pre-trained base models (tiempo / bet / win) and their scalers.
model_paths = dict(
    tiempo_model='../../../models/tiempo_model.h5',
    tiempo_scaler='../../../models/tiempo_scaler.pkl',
    bet_model='../../../models/bet_model.h5',
    bet_scaler='../../../models/bet_scaler.pkl',
    win_model='../../../models/win_model.h5',
    win_scaler='../../../models/win_scaler.pkl',
)
ensemble_model = CasinoEnsembleRegressionOptimized(model_paths)

# Train on the complete dataset, holding out 20% of it for testing.
print("🚀 Entrenamiento con dataset completo (procesamiento batch)...")
results = ensemble_model.train_ensemble_models(
    data=df_data_general, target_col='GAMES_PLAYED_TOTAL', test_size=0.2
)

2025-09-23 11:31:51.559743: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


✅ Todos los modelos y scalers cargados correctamente
🚀 Entrenamiento con dataset completo (procesamiento batch)...
🚀 Iniciando entrenamiento de modelos ensemble para regresión...
🎯 Target: Predicción de GAMES_PLAYED_TOTAL
📝 Las features time_on_device, bet_total, win_total se generarán desde los modelos pre-entrenados
📋 Columnas disponibles: ['Unnamed: 0.1', 'Unnamed: 0', 'PLAYER_ID', 'DOB', 'GENDER', 'AVG_BET', 'BET_TOTAL', 'INITIAL_AMOUNT', 'INITIAL_TIME', 'FINAL_TIME', 'INITIAL_PROMO_AMOUNT', 'FINAL_AMOUNT', 'FINAL_PROMO_AMOUNT', 'MACHINE_ID', 'WIN_TOTAL', 'GAMES_PLAYED_TOTAL', 'GAMES_WON_TOTAL', 'TIME_ON_DEVICE_SEC', 'PLAYER_LEVEL_ID', 'Casino', 'AVG_BET_std', 'BET_TOTAL_std', 'INITIAL_AMOUNT_std', 'INITIAL_PROMO_AMOUNT_std', 'FINAL_AMOUNT_std', 'FINAL_PROMO_AMOUNT_std', 'WIN_TOTAL_std', 'GAMES_PLAYED_TOTAL_std', 'GAMES_WON_TOTAL_std', 'TIME_ON_DEVICE_SEC_std', 'Edad', 'Rango_Edad', 'Rango_Edad_le', 'Cluster', 'Weekday', 'number_of_day', 'TIME_ON_DEVICE_MIN', 'Hour', 'Weekend', 'Mo

2025-09-23 11:31:53.707776: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)


✅ Lote 1 completado. Progreso: 0.5%
📦 Procesando lote 2/210 (1000 sesiones)...
✅ Lote 2 completado. Progreso: 1.0%
📦 Procesando lote 3/210 (1000 sesiones)...
✅ Lote 3 completado. Progreso: 1.4%
📦 Procesando lote 4/210 (1000 sesiones)...
✅ Lote 4 completado. Progreso: 1.9%
📦 Procesando lote 5/210 (1000 sesiones)...
✅ Lote 5 completado. Progreso: 2.4%
📦 Procesando lote 6/210 (1000 sesiones)...
✅ Lote 6 completado. Progreso: 2.9%
📦 Procesando lote 7/210 (1000 sesiones)...
✅ Lote 7 completado. Progreso: 3.3%
📦 Procesando lote 8/210 (1000 sesiones)...
✅ Lote 8 completado. Progreso: 3.8%
📦 Procesando lote 9/210 (1000 sesiones)...
✅ Lote 9 completado. Progreso: 4.3%
📦 Procesando lote 10/210 (1000 sesiones)...
✅ Lote 10 completado. Progreso: 4.8%
📦 Procesando lote 11/210 (1000 sesiones)...
✅ Lote 11 completado. Progreso: 5.3%
📦 Procesando lote 12/210 (1000 sesiones)...
✅ Lote 12 completado. Progreso: 5.7%
📦 Procesando lote 13/210 (1000 sesiones)...
✅ Lote 13 completado. Progreso: 6.2%
📦 Proces

In [6]:
# Persist the trained ensemble regressors and both scalers under the relative
# directory 'games_played' (resolved against the notebook's working directory).
ensemble_model.save_ensemble_models('games_played')

✅ Modelo MLP guardado en: games_played/ensemble_mlp_model.pkl
✅ Modelo RandomForest guardado en: games_played/ensemble_randomforest_model.pkl
✅ Modelo XGBoost guardado en: games_played/ensemble_xgboost_model.pkl
✅ Modelo RNN guardado en: games_played/ensemble_rnn_model.h5
✅ Ensemble scaler guardado en: games_played/ensemble_scaler.pkl
✅ Target scaler guardado en: games_played/target_scaler.pkl
