In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import Ridge, ElasticNet, HuberRegressor
import xgboost as xgb
from sklearn.compose import TransformedTargetRegressor
import warnings
warnings.filterwarnings('ignore')

In [2]:
df_data_general = pd.read_csv('../../../data/data_general.csv')

In [3]:
df_data_general['INITIAL_TIME'] = pd.to_datetime(df_data_general['INITIAL_TIME'])
df_data_general['FINAL_TIME'] = pd.to_datetime(df_data_general['FINAL_TIME'])

df_data_general['INITIAL_TIME'] = df_data_general['INITIAL_TIME'].dt.to_period('D')
df_data_general['INITIAL_TIME'] = df_data_general['INITIAL_TIME'].dt.to_timestamp()


df_data_general['FINAL_TIME'] = df_data_general['FINAL_TIME'].dt.to_period('D')
df_data_general['FINAL_TIME'] = df_data_general['FINAL_TIME'].dt.to_timestamp()

df_data_general['Weekday']= df_data_general['INITIAL_TIME'].dt.strftime('%A')
df_data_general['number_of_day'] = df_data_general['INITIAL_TIME'].dt.day_of_week

df_data_general['TIME_ON_DEVICE_MIN'] = df_data_general['TIME_ON_DEVICE_SEC'] / 60

df_data_general['Hour'] = df_data_general['INITIAL_TIME'].dt.hour
df_data_general['Weekday'] = df_data_general['INITIAL_TIME'].dt.weekday   # 0=Lunes, 6=Domingo
df_data_general['Weekend'] = (df_data_general['Weekday'] >= 5).astype(int)
df_data_general['Month'] = df_data_general['INITIAL_TIME'].dt.month

df_data_general = df_data_general[df_data_general['TIME_ON_DEVICE_MIN'] < 600 ]

df_data_general = df_data_general[df_data_general['WIN_TOTAL'] > 0]
df_data_general['NET_SPEND'] = df_data_general['FINAL_AMOUNT'] - df_data_general['INITIAL_AMOUNT']
df_data_general = df_data_general[df_data_general['NET_SPEND'] < 10000 ]

In [15]:
def train_cascade_models_correctly(df):
    """
    Entrena los 4 modelos en cascada usando las predicciones reales
    de los modelos anteriores, no los valores del dataframe.
    """
    
    # Preparar datos base
    X_base = df[['INITIAL_AMOUNT','AVG_BET','Cluster','Weekday','Weekend','Month']]
    y_tiempo = df['TIME_ON_DEVICE_MIN']
    y_bet = df['BET_TOTAL'] 
    y_win = df['WIN_TOTAL']
    y_final = df['FINAL_AMOUNT']
    
    # Split train/test
    X_base_train, X_base_test, y_tiempo_train, y_tiempo_test = train_test_split(
        X_base, y_tiempo, test_size=0.3, random_state=42
    )
    
    # También dividir los otros targets manteniendo los mismos índices
    y_bet_train = y_bet.loc[X_base_train.index]
    y_bet_test = y_bet.loc[X_base_test.index]
    y_win_train = y_win.loc[X_base_train.index]
    y_win_test = y_win.loc[X_base_test.index]
    y_final_train = y_final.loc[X_base_train.index]
    y_final_test = y_final.loc[X_base_test.index]
    
    print("=== ENTRENANDO MODELO 1: TIEMPO ===")
    # Modelo 1: Tiempo en máquina
    model_tiempo = RandomForestRegressor(random_state=42)
    model_tiempo.fit(X_base_train, y_tiempo_train)
    
    # Generar predicciones de tiempo para usar en el siguiente modelo
    tiempo_pred_train = model_tiempo.predict(X_base_train)
    tiempo_pred_test = model_tiempo.predict(X_base_test)
    
    r2_tiempo = r2_score(y_tiempo_test, tiempo_pred_test)
    print(f"R² Tiempo: {r2_tiempo:.4f}")
    
    print("\n=== ENTRENANDO MODELO 2: BET TOTAL ===")
    # Modelo 2: Bet total (usando predicción de tiempo, NO valor real)
    X_bet_train = pd.concat([
        X_base_train[['INITIAL_AMOUNT','AVG_BET','Cluster']].reset_index(drop=True),
        pd.Series(tiempo_pred_train, name='TIME_ON_DEVICE_MIN')
    ], axis=1)
    
    X_bet_test = pd.concat([
        X_base_test[['INITIAL_AMOUNT','AVG_BET','Cluster']].reset_index(drop=True),
        pd.Series(tiempo_pred_test, name='TIME_ON_DEVICE_MIN')
    ], axis=1)
    
    model_bet = RandomForestRegressor(random_state=42)
    model_bet.fit(X_bet_train, y_bet_train.values)
    
    # Generar predicciones de bet para usar en el siguiente modelo
    bet_pred_train = model_bet.predict(X_bet_train)
    bet_pred_test = model_bet.predict(X_bet_test)
    
    r2_bet = r2_score(y_bet_test, bet_pred_test)
    print(f"R² Bet Total: {r2_bet:.4f}")
    
    print("\n=== ENTRENANDO MODELO 3: WIN TOTAL ===")
    # Modelo 3: Win total (usando predicciones de tiempo y bet)
    X_win_train = pd.concat([
        X_base_train[['INITIAL_AMOUNT','AVG_BET','Cluster']].reset_index(drop=True),
        pd.Series(tiempo_pred_train, name='TIME_ON_DEVICE_MIN'),
        pd.Series(bet_pred_train, name='BET_TOTAL')
    ], axis=1)
    
    X_win_test = pd.concat([
        X_base_test[['INITIAL_AMOUNT','AVG_BET','Cluster']].reset_index(drop=True),
        pd.Series(tiempo_pred_test, name='TIME_ON_DEVICE_MIN'),
        pd.Series(bet_pred_test, name='BET_TOTAL')
    ], axis=1)
    
    model_win = RandomForestRegressor(random_state=42)
    model_win.fit(X_win_train, y_win_train.values)
    
    # Generar predicciones de win para usar en el siguiente modelo
    win_pred_train = model_win.predict(X_win_train)
    win_pred_test = model_win.predict(X_win_test)
    
    r2_win = r2_score(y_win_test, win_pred_test)
    print(f"R² Win Total: {r2_win:.4f}")
    
    print("\n=== ENTRENANDO MODELO 4: FINAL AMOUNT ===")
    # Modelo 4: Final amount (usando predicciones de bet y win)
    X_final_train = pd.concat([
        X_base_train[['INITIAL_AMOUNT','AVG_BET','Cluster']].reset_index(drop=True),
        pd.Series(bet_pred_train, name='BET_TOTAL'),
        pd.Series(win_pred_train, name='WIN_TOTAL')
    ], axis=1)
    
    X_final_test = pd.concat([
        X_base_test[['INITIAL_AMOUNT','AVG_BET','Cluster']].reset_index(drop=True),
        pd.Series(bet_pred_test, name='BET_TOTAL'),
        pd.Series(win_pred_test, name='WIN_TOTAL')
    ], axis=1)
    
    model_final = RandomForestRegressor(random_state=42)
    model_final.fit(X_final_train, y_final_train.values)
    
    # Evaluar modelo final
    final_pred_test = model_final.predict(X_final_test)
    r2_final = r2_score(y_final_test, final_pred_test)
    print(f"R² Final Amount: {r2_final:.4f}")
    
    print("\n=== RESUMEN DE RESULTADOS ===")
    print(f"Tiempo en máquina: {r2_tiempo:.4f}")
    print(f"Bet total: {r2_bet:.4f}")
    print(f"Win total: {r2_win:.4f}")
    print(f"Final amount: {r2_final:.4f}")
    
    # Guardar modelos
    models = {
        'tiempo': model_tiempo,
        'bet': model_bet, 
        'win': model_win,
        'final': model_final
    }
    
    return models, (X_base_test, y_tiempo_test, y_bet_test, y_win_test, y_final_test)


In [16]:

def predict_cascade(models, initial_amount, avg_bet, cluster, weekday, weekend, month):
    """
    Función para hacer predicciones en producción usando el pipeline entrenado
    """
    # Predicción 1: Tiempo
    X_tiempo = np.array([[initial_amount, avg_bet, cluster, weekday, weekend, month]])
    tiempo_pred = models['tiempo'].predict(X_tiempo)[0]
    
    # Predicción 2: Bet total
    X_bet = np.array([[initial_amount, avg_bet, cluster, tiempo_pred]])
    bet_pred = models['bet'].predict(X_bet)[0]
    
    # Predicción 3: Win total  
    X_win = np.array([[initial_amount, avg_bet, cluster, tiempo_pred, bet_pred]])
    win_pred = models['win'].predict(X_win)[0]
    
    # Predicción 4: Final amount
    X_final = np.array([[initial_amount, avg_bet, cluster, bet_pred, win_pred]])
    final_pred = models['final'].predict(X_final)[0]
    
    return {
        'tiempo_maquina': tiempo_pred,
        'bet_total': bet_pred, 
        'win_total': win_pred,
        'final_amount': final_pred
    }


In [17]:
models, results = train_cascade_models_correctly(df_data_general)

=== ENTRENANDO MODELO 1: TIEMPO ===
R² Tiempo: 0.5965

=== ENTRENANDO MODELO 2: BET TOTAL ===
R² Bet Total: 0.6579

=== ENTRENANDO MODELO 3: WIN TOTAL ===
R² Win Total: 0.1870

=== ENTRENANDO MODELO 4: FINAL AMOUNT ===
R² Final Amount: -0.0344

=== RESUMEN DE RESULTADOS ===
Tiempo en máquina: 0.5965
Bet total: 0.6579
Win total: 0.1870
Final amount: -0.0344


In [20]:
class ImprovedCascadeModels:
    def __init__(self):
        self.scalers = {}
        self.models = {}
        self.target_transformers = {}
        
    def create_engineered_features(self, df, tiempo_pred=None, bet_pred=None):
        """Crear features adicionales para mejorar el rendimiento"""
        features = df.copy()
        
        # Features de ratio
        features['bet_to_initial_ratio'] = df['AVG_BET'] / (df['INITIAL_AMOUNT'] + 1)
        features['initial_amount_log'] = np.log1p(df['INITIAL_AMOUNT'])
        features['avg_bet_log'] = np.log1p(df['AVG_BET'])
        
        # Features de interacción
        features['amount_x_cluster'] = df['INITIAL_AMOUNT'] * df['Cluster']
        features['bet_x_cluster'] = df['AVG_BET'] * df['Cluster']
        
        # Si tenemos predicciones previas, agregar features derivados
        if tiempo_pred is not None:
            features['tiempo_pred'] = tiempo_pred
            features['bet_per_minute'] = bet_pred / (tiempo_pred + 1) if bet_pred is not None else 0
            features['tiempo_x_initial'] = tiempo_pred * df['INITIAL_AMOUNT']
        
        if bet_pred is not None:
            features['bet_pred'] = bet_pred
            features['bet_efficiency'] = bet_pred / (df['INITIAL_AMOUNT'] + 1)
        
        return features

    def train_improved_models(self, df):
        """Entrenar modelos con mejoras en features y algoritmos"""
        
        # Preparar datos base
        X_base = df[['INITIAL_AMOUNT','AVG_BET','Cluster','Weekday','Weekend','Month']]
        y_tiempo = df['TIME_ON_DEVICE_MIN']
        y_bet = df['BET_TOTAL'] 
        y_win = df['WIN_TOTAL']
        y_final = df['FINAL_AMOUNT']
        
        # Split train/test
        X_base_train, X_base_test, y_tiempo_train, y_tiempo_test = train_test_split(
            X_base, y_tiempo, test_size=0.2, random_state=42, stratify=df['Cluster']
        )
        
        # Obtener índices para otros targets
        train_idx, test_idx = X_base_train.index, X_base_test.index
        y_bet_train, y_bet_test = y_bet.loc[train_idx], y_bet.loc[test_idx]
        y_win_train, y_win_test = y_win.loc[train_idx], y_win.loc[test_idx]
        y_final_train, y_final_test = y_final.loc[train_idx], y_final.loc[test_idx]
        
        print("=== MODELO 1: TIEMPO (MEJORADO) ===")
        # Features engineered para tiempo
        X_tiempo_train = self.create_engineered_features(X_base_train)
        X_tiempo_test = self.create_engineered_features(X_base_test)
        
        # Scaler para features
        self.scalers['tiempo'] = RobustScaler()
        X_tiempo_train_scaled = self.scalers['tiempo'].fit_transform(X_tiempo_train)
        X_tiempo_test_scaled = self.scalers['tiempo'].transform(X_tiempo_test)
        
        # Ensemble de modelos para tiempo
        rf_tiempo = RandomForestRegressor(n_estimators=300, max_depth=20, min_samples_split=5, random_state=42)
        xgb_tiempo = xgb.XGBRegressor(n_estimators=200, max_depth=8, learning_rate=0.1, random_state=42)
        
        rf_tiempo.fit(X_tiempo_train_scaled, y_tiempo_train)
        xgb_tiempo.fit(X_tiempo_train_scaled, y_tiempo_train)
        
        # Predicciones ensemble
        rf_pred_train = rf_tiempo.predict(X_tiempo_train_scaled)
        xgb_pred_train = xgb_tiempo.predict(X_tiempo_train_scaled)
        tiempo_pred_train = (rf_pred_train + xgb_pred_train) / 2
        
        rf_pred_test = rf_tiempo.predict(X_tiempo_test_scaled)
        xgb_pred_test = xgb_tiempo.predict(X_tiempo_test_scaled)
        tiempo_pred_test = (rf_pred_test + xgb_pred_test) / 2
        
        self.models['tiempo'] = {'rf': rf_tiempo, 'xgb': xgb_tiempo}
        r2_tiempo = r2_score(y_tiempo_test, tiempo_pred_test)
        print(f"R² Tiempo: {r2_tiempo:.4f}")
        
        print("\n=== MODELO 2: BET TOTAL (MEJORADO) ===")
        # Features para bet con predicción de tiempo
        df_train_bet = X_base_train.copy()
        df_train_bet['tiempo_pred'] = tiempo_pred_train
        df_test_bet = X_base_test.copy()
        df_test_bet['tiempo_pred'] = tiempo_pred_test
        
        X_bet_train = self.create_engineered_features(df_train_bet, tiempo_pred_train)
        X_bet_test = self.create_engineered_features(df_test_bet, tiempo_pred_test)
        
        # Scaler para bet
        self.scalers['bet'] = RobustScaler()
        X_bet_train_scaled = self.scalers['bet'].fit_transform(X_bet_train)
        X_bet_test_scaled = self.scalers['bet'].transform(X_bet_test)
        
        # Modelo para bet con transformación de target
        base_model_bet = xgb.XGBRegressor(n_estimators=300, max_depth=10, learning_rate=0.05, random_state=42)
        self.models['bet'] = TransformedTargetRegressor(
            regressor=base_model_bet,
            transformer=StandardScaler()
        )
        
        self.models['bet'].fit(X_bet_train_scaled, y_bet_train)
        bet_pred_train = self.models['bet'].predict(X_bet_train_scaled)
        bet_pred_test = self.models['bet'].predict(X_bet_test_scaled)
        
        r2_bet = r2_score(y_bet_test, bet_pred_test)
        print(f"R² Bet Total: {r2_bet:.4f}")
        
        print("\n=== MODELO 3: WIN TOTAL (MEJORADO) ===")
        # Features para win con predicciones anteriores
        df_train_win = X_base_train.copy()
        df_train_win['tiempo_pred'] = tiempo_pred_train
        df_train_win['bet_pred'] = bet_pred_train
        df_test_win = X_base_test.copy()
        df_test_win['tiempo_pred'] = tiempo_pred_test
        df_test_win['bet_pred'] = bet_pred_test
        
        X_win_train = self.create_engineered_features(df_train_win, tiempo_pred_train, bet_pred_train)
        X_win_test = self.create_engineered_features(df_test_win, tiempo_pred_test, bet_pred_test)
        
        # Features adicionales específicas para WIN
        X_win_train['win_potential'] = X_win_train['bet_pred'] * 0.95  # RTP aproximado
        X_win_train['risk_score'] = X_win_train['bet_pred'] / (X_win_train['INITIAL_AMOUNT'] + 1)
        X_win_test['win_potential'] = X_win_test['bet_pred'] * 0.95
        X_win_test['risk_score'] = X_win_test['bet_pred'] / (X_win_test['INITIAL_AMOUNT'] + 1)
        
        # Scaler para win
        self.scalers['win'] = RobustScaler()
        X_win_train_scaled = self.scalers['win'].fit_transform(X_win_train)
        X_win_test_scaled = self.scalers['win'].transform(X_win_test)
        
        # Múltiples modelos para WIN (el más problemático)
        models_win = {
            'xgb': xgb.XGBRegressor(
                n_estimators=400, 
                max_depth=12, 
                learning_rate=0.03,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=42
            ),
            'rf': RandomForestRegressor(
                n_estimators=400,
                max_depth=25,
                min_samples_split=3,
                min_samples_leaf=2,
                random_state=42
            ),
            'gbr': GradientBoostingRegressor(
                n_estimators=300,
                max_depth=8,
                learning_rate=0.05,
                random_state=42
            )
        }
        
        win_predictions_train = []
        win_predictions_test = []
        
        for name, model in models_win.items():
            model.fit(X_win_train_scaled, y_win_train)
            win_predictions_train.append(model.predict(X_win_train_scaled))
            win_predictions_test.append(model.predict(X_win_test_scaled))
        
        # Ensemble con pesos optimizados
        win_pred_train = np.average(win_predictions_train, axis=0, weights=[0.5, 0.3, 0.2])
        win_pred_test = np.average(win_predictions_test, axis=0, weights=[0.5, 0.3, 0.2])
        
        self.models['win'] = models_win
        r2_win = r2_score(y_win_test, win_pred_test)
        print(f"R² Win Total: {r2_win:.4f}")
        
        print("\n=== MODELO 4: FINAL AMOUNT (MEJORADO) ===")
        # Features para final amount
        df_train_final = X_base_train.copy()
        df_train_final['bet_pred'] = bet_pred_train
        df_train_final['win_pred'] = win_pred_train
        df_test_final = X_base_test.copy()
        df_test_final['bet_pred'] = bet_pred_test  
        df_test_final['win_pred'] = win_pred_test
        
        X_final_train = self.create_engineered_features(df_train_final, bet_pred=bet_pred_train)
        X_final_test = self.create_engineered_features(df_test_final, bet_pred=bet_pred_test)
        
        # Features específicas para FINAL AMOUNT
        X_final_train['net_result'] = X_final_train['win_pred'] - X_final_train['bet_pred']
        X_final_train['return_rate'] = X_final_train['win_pred'] / (X_final_train['bet_pred'] + 1)
        X_final_train['final_estimate'] = X_final_train['INITIAL_AMOUNT'] + X_final_train['net_result']
        
        X_final_test['net_result'] = X_final_test['win_pred'] - X_final_test['bet_pred']
        X_final_test['return_rate'] = X_final_test['win_pred'] / (X_final_test['bet_pred'] + 1)
        X_final_test['final_estimate'] = X_final_test['INITIAL_AMOUNT'] + X_final_test['net_result']
        
        # Scaler para final
        self.scalers['final'] = RobustScaler()
        X_final_train_scaled = self.scalers['final'].fit_transform(X_final_train)
        X_final_test_scaled = self.scalers['final'].transform(X_final_test)
        
        # Modelo robusto para final amount
        self.models['final'] = xgb.XGBRegressor(
            n_estimators=500,
            max_depth=15,
            learning_rate=0.02,
            subsample=0.9,
            colsample_bytree=0.9,
            reg_alpha=0.1,
            reg_lambda=0.1,
            random_state=42
        )
        
        self.models['final'].fit(X_final_train_scaled, y_final_train)
        final_pred_test = self.models['final'].predict(X_final_test_scaled)
        
        r2_final = r2_score(y_final_test, final_pred_test)
        print(f"R² Final Amount: {r2_final:.4f}")
        
        # Métricas adicionales
        mae_final = mean_absolute_error(y_final_test, final_pred_test)
        rmse_final = np.sqrt(mean_squared_error(y_final_test, final_pred_test))
        print(f"MAE Final Amount: {mae_final:.2f}")
        print(f"RMSE Final Amount: {rmse_final:.2f}")
        
        print("\n=== RESUMEN FINAL ===")
        print(f"Tiempo en máquina: {r2_tiempo:.4f}")
        print(f"Bet total: {r2_bet:.4f}")
        print(f"Win total: {r2_win:.4f}")
        print(f"Final amount: {r2_final:.4f}")
        
        return {
            'tiempo': r2_tiempo,
            'bet': r2_bet,
            'win': r2_win,
            'final': r2_final
        }

    def predict_improved(self, initial_amount, avg_bet, cluster, weekday, weekend, month):
        """Predicción usando los modelos mejorados"""
        
        # Base features
        base_features = pd.DataFrame([[initial_amount, avg_bet, cluster, weekday, weekend, month]], 
                                   columns=['INITIAL_AMOUNT','AVG_BET','Cluster','Weekday','Weekend','Month'])
        
        # Predicción 1: Tiempo
        X_tiempo = self.create_engineered_features(base_features)
        X_tiempo_scaled = self.scalers['tiempo'].transform(X_tiempo)
        
        rf_pred = self.models['tiempo']['rf'].predict(X_tiempo_scaled)[0]
        xgb_pred = self.models['tiempo']['xgb'].predict(X_tiempo_scaled)[0]
        tiempo_pred = (rf_pred + xgb_pred) / 2
        
        # Predicción 2: Bet
        base_bet = base_features.copy()
        base_bet['tiempo_pred'] = tiempo_pred
        X_bet = self.create_engineered_features(base_bet, tiempo_pred)
        X_bet_scaled = self.scalers['bet'].transform(X_bet)
        bet_pred = self.models['bet'].predict(X_bet_scaled)[0]
        
        # Predicción 3: Win
        base_win = base_features.copy()
        base_win['tiempo_pred'] = tiempo_pred
        base_win['bet_pred'] = bet_pred
        X_win = self.create_engineered_features(base_win, tiempo_pred, bet_pred)
        
        # Features adicionales para win
        X_win['win_potential'] = X_win['bet_pred'] * 0.95
        X_win['risk_score'] = X_win['bet_pred'] / (X_win['INITIAL_AMOUNT'] + 1)
        
        X_win_scaled = self.scalers['win'].transform(X_win)
        
        # Ensemble para win
        win_preds = []
        for model in self.models['win'].values():
            win_preds.append(model.predict(X_win_scaled)[0])
        win_pred = np.average(win_preds, weights=[0.5, 0.3, 0.2])
        
        # Predicción 4: Final
        base_final = base_features.copy()
        base_final['bet_pred'] = bet_pred
        base_final['win_pred'] = win_pred
        X_final = self.create_engineered_features(base_final, bet_pred=bet_pred)
        
        # Features adicionales para final
        X_final['net_result'] = X_final['win_pred'] - X_final['bet_pred']
        X_final['return_rate'] = X_final['win_pred'] / (X_final['bet_pred'] + 1)
        X_final['final_estimate'] = X_final['INITIAL_AMOUNT'] + X_final['net_result']
        
        X_final_scaled = self.scalers['final'].transform(X_final)
        final_pred = self.models['final'].predict(X_final_scaled)[0]
        
        return {
            'tiempo_maquina': tiempo_pred,
            'bet_total': bet_pred,
            'win_total': win_pred,
            'final_amount': final_pred
        }


In [21]:
def diagnose_model_issues(df):
    """Analizar posibles problemas en los datos"""
    print("=== DIAGNÓSTICO DE DATOS ===")
    
    # Distribución de targets
    targets = ['TIME_ON_DEVICE_MIN', 'BET_TOTAL', 'WIN_TOTAL', 'FINAL_AMOUNT']
    for target in targets:
        print(f"\n{target}:")
        print(f"  Mean: {df[target].mean():.2f}")
        print(f"  Std: {df[target].std():.2f}")
        print(f"  Min: {df[target].min():.2f}")
        print(f"  Max: {df[target].max():.2f}")
        print(f"  Zeros: {(df[target] == 0).sum()}")
        print(f"  Negatives: {(df[target] < 0).sum()}")
    
    # Correlaciones
    print(f"\nCorrelaciones con WIN_TOTAL:")
    corr_win = df[['INITIAL_AMOUNT', 'AVG_BET', 'TIME_ON_DEVICE_MIN', 'BET_TOTAL', 'WIN_TOTAL']].corr()['WIN_TOTAL']
    for col, corr in corr_win.items():
        if col != 'WIN_TOTAL':
            print(f"  {col}: {corr:.3f}")
    
    print(f"\nCorrelaciones con FINAL_AMOUNT:")
    corr_final = df[['INITIAL_AMOUNT', 'AVG_BET', 'BET_TOTAL', 'WIN_TOTAL', 'FINAL_AMOUNT']].corr()['FINAL_AMOUNT']
    for col, corr in corr_final.items():
        if col != 'FINAL_AMOUNT':
            print(f"  {col}: {corr:.3f}")


In [22]:
diagnose_model_issues(df_data_general)

=== DIAGNÓSTICO DE DATOS ===

TIME_ON_DEVICE_MIN:
  Mean: 15.88
  Std: 24.27
  Min: 0.08
  Max: 537.87
  Zeros: 0
  Negatives: 0

BET_TOTAL:
  Mean: 856.40
  Std: 1879.99
  Min: 0.00
  Max: 134666.60
  Zeros: 8
  Negatives: 0

WIN_TOTAL:
  Mean: 795.96
  Std: 2281.02
  Min: 0.02
  Max: 461502.00
  Zeros: 0
  Negatives: 0

FINAL_AMOUNT:
  Mean: 215.21
  Std: 533.06
  Min: 0.00
  Max: 10601.67
  Zeros: 8734
  Negatives: 0

Correlaciones con WIN_TOTAL:
  INITIAL_AMOUNT: 0.191
  AVG_BET: 0.218
  TIME_ON_DEVICE_MIN: 0.524
  BET_TOTAL: 0.870

Correlaciones con FINAL_AMOUNT:
  INITIAL_AMOUNT: 0.312
  AVG_BET: 0.145
  BET_TOTAL: 0.297
  WIN_TOTAL: 0.449


In [23]:
cascade = ImprovedCascadeModels()


In [24]:
results = cascade.train_improved_models(df_data_general)


=== MODELO 1: TIEMPO (MEJORADO) ===
R² Tiempo: 0.6267

=== MODELO 2: BET TOTAL (MEJORADO) ===
R² Bet Total: 0.6330

=== MODELO 3: WIN TOTAL (MEJORADO) ===
R² Win Total: 0.4351

=== MODELO 4: FINAL AMOUNT (MEJORADO) ===
R² Final Amount: -0.0552
MAE Final Amount: 219.26
RMSE Final Amount: 537.92

=== RESUMEN FINAL ===
Tiempo en máquina: 0.6267
Bet total: 0.6330
Win total: 0.4351
Final amount: -0.0552


In [25]:
class FixedFinalAmountModel:
    def __init__(self):
        self.scalers = {}
        self.models = {}
        self.zero_classifier = None
        
    def preprocess_targets(self, df):
        """Preprocesamiento específico para los targets problemáticos"""
        df_clean = df.copy()
        
        # 1. Limpiar outliers extremos
        def remove_outliers(series, factor=3):
            Q1 = series.quantile(0.25)
            Q3 = series.quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - factor * IQR
            upper_bound = Q3 + factor * IQR
            return series.clip(lower_bound, upper_bound)
        
        # Limpiar WIN_TOTAL y FINAL_AMOUNT
        df_clean['WIN_TOTAL'] = remove_outliers(df_clean['WIN_TOTAL'])
        df_clean['BET_TOTAL'] = remove_outliers(df_clean['BET_TOTAL'])
        df_clean['FINAL_AMOUNT'] = remove_outliers(df_clean['FINAL_AMOUNT'])
        
        # 2. Verificar consistencia de FINAL_AMOUNT
        expected_final = df_clean['INITIAL_AMOUNT'] + df_clean['WIN_TOTAL'] - df_clean['BET_TOTAL']
        actual_final = df_clean['FINAL_AMOUNT']
        
        # Si hay inconsistencias, usar la fórmula lógica
        inconsistent_mask = abs(expected_final - actual_final) > 10  # threshold de 10
        print(f"Registros inconsistentes en FINAL_AMOUNT: {inconsistent_mask.sum()}")
        
        if inconsistent_mask.sum() > len(df_clean) * 0.1:  # Si más del 10% es inconsistente
            print("Recalculando FINAL_AMOUNT usando la fórmula: INITIAL + WIN - BET")
            df_clean['FINAL_AMOUNT'] = expected_final
        
        return df_clean
    
    def create_final_amount_features(self, df, bet_pred, win_pred):
        """Features específicos para predecir FINAL_AMOUNT"""
        features = df[['INITIAL_AMOUNT', 'AVG_BET', 'Cluster']].copy()
        
        # Features básicos
        features['bet_pred'] = bet_pred
        features['win_pred'] = win_pred
        
        # Features derivados más específicos
        features['net_result'] = win_pred - bet_pred
        features['theoretical_final'] = df['INITIAL_AMOUNT'] + features['net_result']
        
        # Features de ratio y eficiencia
        features['win_rate'] = win_pred / (bet_pred + 1)
        features['loss_rate'] = (bet_pred - win_pred) / (bet_pred + 1)
        features['balance_change_pct'] = features['net_result'] / (df['INITIAL_AMOUNT'] + 1)
        
        # Features de cluster y comportamiento
        features['cluster_risk'] = df['Cluster'] * features['loss_rate']
        features['initial_bet_ratio'] = df['AVG_BET'] / (df['INITIAL_AMOUNT'] + 1)
        
        # Features de threshold
        features['is_winning'] = (win_pred > bet_pred).astype(int)
        features['big_loss'] = ((bet_pred - win_pred) > df['INITIAL_AMOUNT'] * 0.5).astype(int)
        
        return features
    
    def train_with_zero_handling(self, df):
        """Entrenar con manejo específico de zeros en FINAL_AMOUNT"""
        
        # Preprocesar datos
        df_clean = self.preprocess_targets(df)
        
        # Separar casos de FINAL_AMOUNT = 0 vs > 0
        zero_mask = df_clean['FINAL_AMOUNT'] == 0
        print(f"Registros con FINAL_AMOUNT = 0: {zero_mask.sum()} ({zero_mask.sum()/len(df_clean)*100:.1f}%)")
        
        # Preparar datos base (usando modelos existentes para bet y win)
        X_base = df_clean[['INITIAL_AMOUNT','AVG_BET','Cluster','Weekday','Weekend','Month']]
        y_tiempo = df_clean['TIME_ON_DEVICE_MIN']
        y_bet = df_clean['BET_TOTAL'] 
        y_win = df_clean['WIN_TOTAL']
        y_final = df_clean['FINAL_AMOUNT']
        
        # Split manteniendo la proporción de zeros
        X_base_train, X_base_test, y_final_train, y_final_test = train_test_split(
            X_base, y_final, test_size=0.2, random_state=42, 
            stratify=(y_final == 0).astype(int)  # Estratificar por zeros
        )
        
        # Obtener índices para otros targets
        train_idx, test_idx = X_base_train.index, X_base_test.index
        
        # Entrenar modelos básicos primero (simplificado)
        print("Entrenando modelos base para obtener predicciones...")
        
        # Modelo tiempo
        model_tiempo = xgb.XGBRegressor(n_estimators=200, max_depth=8, random_state=42)
        model_tiempo.fit(X_base_train, y_tiempo.loc[train_idx])
        tiempo_pred_train = model_tiempo.predict(X_base_train)
        tiempo_pred_test = model_tiempo.predict(X_base_test)
        
        # Modelo bet
        X_bet_train = np.column_stack([X_base_train[['INITIAL_AMOUNT','AVG_BET','Cluster']], tiempo_pred_train])
        X_bet_test = np.column_stack([X_base_test[['INITIAL_AMOUNT','AVG_BET','Cluster']], tiempo_pred_test])
        model_bet = xgb.XGBRegressor(n_estimators=200, max_depth=8, random_state=42)
        model_bet.fit(X_bet_train, y_bet.loc[train_idx])
        bet_pred_train = model_bet.predict(X_bet_train)
        bet_pred_test = model_bet.predict(X_bet_test)
        
        # Modelo win mejorado
        X_win_train = np.column_stack([X_base_train[['INITIAL_AMOUNT','AVG_BET','Cluster']], 
                                      tiempo_pred_train, bet_pred_train])
        X_win_test = np.column_stack([X_base_test[['INITIAL_AMOUNT','AVG_BET','Cluster']], 
                                     tiempo_pred_test, bet_pred_test])
        model_win = xgb.XGBRegressor(n_estimators=300, max_depth=10, learning_rate=0.05, random_state=42)
        model_win.fit(X_win_train, y_win.loc[train_idx])
        win_pred_train = model_win.predict(X_win_train)
        win_pred_test = model_win.predict(X_win_test)
        
        print(f"R² Win mejorado: {r2_score(y_win.loc[test_idx], win_pred_test):.4f}")
        
        # Ahora entrenar FINAL_AMOUNT con estrategias múltiples
        print("\n=== ENTRENANDO FINAL_AMOUNT CON MÚLTIPLES ESTRATEGIAS ===")
        
        # Crear features específicos
        X_final_train = self.create_final_amount_features(X_base_train, bet_pred_train, win_pred_train)
        X_final_test = self.create_final_amount_features(X_base_test, bet_pred_test, win_pred_test)
        
        # Scaler
        self.scalers['final'] = RobustScaler()
        X_final_train_scaled = self.scalers['final'].fit_transform(X_final_train)
        X_final_test_scaled = self.scalers['final'].transform(X_final_test)
        
        # ESTRATEGIA 1: Modelo directo con fórmula teórica como baseline
        theoretical_final_train = X_final_train['theoretical_final']
        theoretical_final_test = X_final_test['theoretical_final']
        r2_theoretical = r2_score(y_final_test, theoretical_final_test)
        print(f"R² usando fórmula teórica: {r2_theoretical:.4f}")
        
        # ESTRATEGIA 2: Modelo híbrido (combinación de fórmula + ML)
        # Calcular residuos de la fórmula teórica
        residuals_train = y_final_train - theoretical_final_train
        residuals_test = y_final_test - theoretical_final_test
        
        # Entrenar modelo para predecir los residuos
        residual_model = xgb.XGBRegressor(
            n_estimators=400,
            max_depth=12,
            learning_rate=0.03,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_alpha=0.1,
            reg_lambda=0.1,
            random_state=42
        )
        
        residual_model.fit(X_final_train_scaled, residuals_train)
        residuals_pred_test = residual_model.predict(X_final_test_scaled)
        
        # Predicción híbrida
        final_pred_hybrid = theoretical_final_test + residuals_pred_test
        r2_hybrid = r2_score(y_final_test, final_pred_hybrid)
        print(f"R² modelo híbrido: {r2_hybrid:.4f}")
        
        # ESTRATEGIA 3: Modelo puro de ML con features engineered
        pure_ml_model = xgb.XGBRegressor(
            n_estimators=500,
            max_depth=15,
            learning_rate=0.02,
            subsample=0.9,
            colsample_bytree=0.9,
            reg_alpha=0.2,
            reg_lambda=0.2,
            random_state=42
        )
        
        pure_ml_model.fit(X_final_train_scaled, y_final_train)
        final_pred_ml = pure_ml_model.predict(X_final_test_scaled)
        r2_ml = r2_score(y_final_test, final_pred_ml)
        print(f"R² modelo ML puro: {r2_ml:.4f}")
        
        # ESTRATEGIA 4: Ensemble de los tres enfoques
        weights = [0.3, 0.4, 0.3]  # theoretical, hybrid, pure_ml
        final_pred_ensemble = (weights[0] * theoretical_final_test + 
                              weights[1] * final_pred_hybrid + 
                              weights[2] * final_pred_ml)
        
        r2_ensemble = r2_score(y_final_test, final_pred_ensemble)
        mae_ensemble = mean_absolute_error(y_final_test, final_pred_ensemble)
        rmse_ensemble = np.sqrt(mean_squared_error(y_final_test, final_pred_ensemble))
        
        print(f"R² ensemble final: {r2_ensemble:.4f}")
        print(f"MAE ensemble: {mae_ensemble:.2f}")
        print(f"RMSE ensemble: {rmse_ensemble:.2f}")
        
        # Guardar los modelos
        self.models = {
            'tiempo': model_tiempo,
            'bet': model_bet,
            'win': model_win,
            'residual': residual_model,
            'pure_ml': pure_ml_model,
            'weights': weights
        }
        
        # Análisis de errores por segmento
        print("\n=== ANÁLISIS DE ERRORES POR SEGMENTO ===")
        errors = abs(y_final_test - final_pred_ensemble)
        
        # Por rango de FINAL_AMOUNT
        for i, (low, high) in enumerate([(0, 0), (0, 100), (100, 500), (500, float('inf'))]):
            if low == 0 and high == 0:
                mask = y_final_test == 0
                print(f"FINAL_AMOUNT = 0: MAE = {errors[mask].mean():.2f}, Count = {mask.sum()}")
            else:
                mask = (y_final_test > low) & (y_final_test <= high)
                if mask.sum() > 0:
                    print(f"FINAL_AMOUNT {low}-{high}: MAE = {errors[mask].mean():.2f}, Count = {mask.sum()}")
        
        return {
            'r2_theoretical': r2_theoretical,
            'r2_hybrid': r2_hybrid,
            'r2_ml': r2_ml,
            'r2_ensemble': r2_ensemble,
            'mae': mae_ensemble,
            'rmse': rmse_ensemble
        }
    
    def predict_final_amount(self, initial_amount, avg_bet, cluster, weekday, weekend, month):
        """Predicción usando el modelo ensemble"""
        
        # Predicciones base
        X_base = np.array([[initial_amount, avg_bet, cluster, weekday, weekend, month]])
        tiempo_pred = self.models['tiempo'].predict(X_base)[0]
        
        X_bet = np.array([[initial_amount, avg_bet, cluster, tiempo_pred]])
        bet_pred = self.models['bet'].predict(X_bet)[0]
        
        X_win = np.array([[initial_amount, avg_bet, cluster, tiempo_pred, bet_pred]])
        win_pred = self.models['win'].predict(X_win)[0]
        
        # Features para final amount
        df_temp = pd.DataFrame([[initial_amount, avg_bet, cluster]], 
                              columns=['INITIAL_AMOUNT', 'AVG_BET', 'Cluster'])
        X_final = self.create_final_amount_features(df_temp, bet_pred, win_pred)
        X_final_scaled = self.scalers['final'].transform(X_final)
        
        # Tres predicciones
        theoretical = X_final['theoretical_final'].iloc[0]
        residual = self.models['residual'].predict(X_final_scaled)[0]
        hybrid = theoretical + residual
        pure_ml = self.models['pure_ml'].predict(X_final_scaled)[0]
        
        # Ensemble
        weights = self.models['weights']
        final_pred = weights[0] * theoretical + weights[1] * hybrid + weights[2] * pure_ml
        
        return {
            'tiempo_maquina': tiempo_pred,
            'bet_total': bet_pred,
            'win_total': win_pred,
            'final_amount': final_pred,
            'final_components': {
                'theoretical': theoretical,
                'hybrid': hybrid,
                'pure_ml': pure_ml
            }
        }


In [26]:
def validate_data_consistency(df):
    """Validar si FINAL_AMOUNT sigue la lógica esperada"""
    print("=== VALIDACIÓN DE CONSISTENCIA DE DATOS ===")
    
    expected = df['INITIAL_AMOUNT'] + df['WIN_TOTAL'] - df['BET_TOTAL']
    actual = df['FINAL_AMOUNT']
    
    difference = abs(expected - actual)
    consistent = difference <= 10  # threshold de consistencia
    
    print(f"Registros consistentes: {consistent.sum()} ({consistent.sum()/len(df)*100:.1f}%)")
    print(f"Registros inconsistentes: {(~consistent).sum()}")
    print(f"Diferencia promedio: {difference.mean():.2f}")
    print(f"Diferencia máxima: {difference.max():.2f}")
    
    if consistent.sum() < len(df) * 0.8:
        print("⚠️  ADVERTENCIA: Muchos registros inconsistentes. Considera recalcular FINAL_AMOUNT.")
    
    return consistent

In [27]:
validate_data_consistency(df_data_general)

=== VALIDACIÓN DE CONSISTENCIA DE DATOS ===
Registros consistentes: 207415 (99.2%)
Registros inconsistentes: 1664
Diferencia promedio: 4.21
Diferencia máxima: 460300.70


0         True
1         True
2         True
3         True
4         True
          ... 
226728    True
226729    True
226730    True
226731    True
226732    True
Length: 209079, dtype: bool

In [28]:
model = FixedFinalAmountModel()
results = model.train_with_zero_handling(df_data_general)
print(f"Mejores resultados: {results}")

Registros inconsistentes en FINAL_AMOUNT: 20463
Registros con FINAL_AMOUNT = 0: 8734 (4.2%)
Entrenando modelos base para obtener predicciones...
R² Win mejorado: 0.3752

=== ENTRENANDO FINAL_AMOUNT CON MÚLTIPLES ESTRATEGIAS ===
R² usando fórmula teórica: -0.1429
R² modelo híbrido: 0.1510
R² modelo ML puro: 0.1207
R² ensemble final: 0.1181
MAE ensemble: 155.94
RMSE ensemble: 242.71

=== ANÁLISIS DE ERRORES POR SEGMENTO ===
FINAL_AMOUNT = 0: MAE = 106.06, Count = 1747
FINAL_AMOUNT 0-100: MAE = 115.84, Count = 24512
FINAL_AMOUNT 100-500: MAE = 115.67, Count = 10429
FINAL_AMOUNT 500-inf: MAE = 446.52, Count = 5128
Mejores resultados: {'r2_theoretical': -0.14288665264585543, 'r2_hybrid': 0.15095737853013436, 'r2_ml': 0.12067442820075014, 'r2_ensemble': 0.11805899174324885, 'mae': 155.93871907468312, 'rmse': np.float64(242.7125129571589)}


In [4]:
class BusinessLogicCorrectModel:
    def __init__(self):
        self.scalers = {}
        self.models = {}
        
    def create_business_features(self, df, tiempo_pred=None, bet_pred=None, win_pred=None):
        """
        Crear features que reflejen la lógica real del negocio de casino
        """
        features = df[['INITIAL_AMOUNT', 'AVG_BET', 'Cluster']].copy()
        
        if tiempo_pred is not None:
            features['tiempo_pred'] = tiempo_pred
            
        if bet_pred is not None and win_pred is not None:
            # Features que reflejan el comportamiento real del casino
            features['bet_pred'] = bet_pred
            features['win_pred'] = win_pred
            
            # FEATURES CLAVE PARA CASINO:
            
            # 1. Indicadores de comportamiento de juego
            features['total_money_handled'] = bet_pred  # Dinero total manejado
            features['house_edge_effect'] = bet_pred * 0.05  # Estimación de ventaja de la casa
            features['net_gaming_result'] = win_pred - bet_pred  # Resultado neto del juego
            
            # 2. Ratios de eficiencia y riesgo
            features['win_rate'] = win_pred / (bet_pred + 1)  # Tasa de ganancia
            features['money_multiplier'] = bet_pred / (df['INITIAL_AMOUNT'] + 1)  # Cuántas veces apostó su dinero inicial
            features['reinvestment_indicator'] = np.where(bet_pred > df['INITIAL_AMOUNT'], 1, 0)  # Si reinvirtió ganancias
            
            # 3. Patrones de gestión de dinero
            features['excess_betting'] = np.maximum(0, bet_pred - df['INITIAL_AMOUNT'])  # Apuestas con dinero ganado
            features['potential_redemptions'] = win_pred * 0.7  # Estimación de dinero que podría haber retirado
            features['money_at_risk'] = np.minimum(bet_pred, df['INITIAL_AMOUNT'] + win_pred)
            
            # 4. Indicadores de comportamiento de salida
            features['likely_loss_scenario'] = np.where(win_pred < bet_pred * 0.5, 1, 0)
            features['likely_win_scenario'] = np.where(win_pred > bet_pred * 1.2, 1, 0)
            features['breakeven_scenario'] = np.where(
                (win_pred >= bet_pred * 0.8) & (win_pred <= bet_pred * 1.2), 1, 0
            )
            
            # 5. Estimaciones de flujo de efectivo durante la sesión
            # Simulación simplificada del flujo de efectivo
            available_money_estimate = df['INITIAL_AMOUNT'] + win_pred * 0.6  # Asumiendo que retira 40% de ganancias
            features['estimated_available_money'] = available_money_estimate
            features['final_money_simple_estimate'] = available_money_estimate - bet_pred + win_pred * 0.4
            
            # 6. Features específicos por cluster (comportamiento por tipo de jugador)
            features['cluster_risk_adjusted'] = df['Cluster'] * features['money_multiplier']
            features['cluster_win_pattern'] = df['Cluster'] * features['win_rate']
            
        return features
    
    def train_corrected_models(self, df):
        """
        Entrenar modelos con la lógica de negocio correcta
        """
        print("=" * 70)
        print("ENTRENANDO CON LÓGICA DE NEGOCIO CORRECTA")
        print("=" * 70)
        
        # Análisis inicial de patrones
        self.analyze_business_patterns(df)
        
        # Preparar datos base
        X_base = df[['INITIAL_AMOUNT','AVG_BET','Cluster','Weekday','Weekend','Month']]
        y_tiempo = df['TIME_ON_DEVICE_MIN']
        y_bet = df['BET_TOTAL'] 
        y_win = df['WIN_TOTAL']
        y_final = df['FINAL_AMOUNT']
        
        # Split estratificado
        X_base_train, X_base_test, y_final_train, y_final_test = train_test_split(
            X_base, y_final, test_size=0.3, random_state=42, 
            stratify=pd.cut(y_final, bins=5, labels=False)  # Estratificar por rangos de final_amount
        )
        
        # Obtener índices para otros targets
        train_idx, test_idx = X_base_train.index, X_base_test.index
        
        print("\n=== MODELO 1: TIEMPO (OPTIMIZADO) ===")
        # Modelo tiempo mejorado
        X_tiempo_train = self.create_business_features(X_base_train)
        X_tiempo_test = self.create_business_features(X_base_test)
        
        self.scalers['tiempo'] = RobustScaler()
        X_tiempo_train_scaled = self.scalers['tiempo'].fit_transform(X_tiempo_train)
        X_tiempo_test_scaled = self.scalers['tiempo'].transform(X_tiempo_test)
        
        self.models['tiempo'] = xgb.XGBRegressor(
            n_estimators=400, max_depth=10, learning_rate=0.05, 
            subsample=0.9, colsample_bytree=0.9, random_state=42
        )
        self.models['tiempo'].fit(X_tiempo_train_scaled, y_tiempo.loc[train_idx])
        
        tiempo_pred_train = self.models['tiempo'].predict(X_tiempo_train_scaled)
        tiempo_pred_test = self.models['tiempo'].predict(X_tiempo_test_scaled)
        
        r2_tiempo = r2_score(y_tiempo.loc[test_idx], tiempo_pred_test)
        print(f"R² Tiempo: {r2_tiempo:.4f}")
        
        print("\n=== MODELO 2: BET TOTAL (OPTIMIZADO) ===")
        # Modelo bet con features de negocio
        X_bet_train = self.create_business_features(X_base_train, tiempo_pred_train)
        X_bet_test = self.create_business_features(X_base_test, tiempo_pred_test)
        
        self.scalers['bet'] = RobustScaler()
        X_bet_train_scaled = self.scalers['bet'].fit_transform(X_bet_train)
        X_bet_test_scaled = self.scalers['bet'].transform(X_bet_test)
        
        self.models['bet'] = xgb.XGBRegressor(
            n_estimators=500, max_depth=12, learning_rate=0.04,
            subsample=0.8, colsample_bytree=0.8, reg_alpha=0.1, random_state=42
        )
        self.models['bet'].fit(X_bet_train_scaled, y_bet.loc[train_idx])
        
        bet_pred_train = self.models['bet'].predict(X_bet_train_scaled)
        bet_pred_test = self.models['bet'].predict(X_bet_test_scaled)
        
        r2_bet = r2_score(y_bet.loc[test_idx], bet_pred_test)
        print(f"R² Bet Total: {r2_bet:.4f}")
        
        print("\n=== MODELO 3: WIN TOTAL (OPTIMIZADO) ===")
        # Modelo win con lógica de casino
        X_win_train = self.create_business_features(X_base_train, tiempo_pred_train, bet_pred_train)
        X_win_test = self.create_business_features(X_base_test, tiempo_pred_test, bet_pred_test)
        
        # Remover win_pred de las features para win (evitar data leakage)
        win_features_train = X_win_train.drop(['win_pred', 'net_gaming_result', 'win_rate', 
                                              'excess_betting', 'potential_redemptions',
                                              'likely_loss_scenario', 'likely_win_scenario',
                                              'breakeven_scenario', 'estimated_available_money',
                                              'final_money_simple_estimate', 'cluster_win_pattern'], axis=1, errors='ignore')
        win_features_test = X_win_test.drop(['win_pred', 'net_gaming_result', 'win_rate',
                                            'excess_betting', 'potential_redemptions', 
                                            'likely_loss_scenario', 'likely_win_scenario',
                                            'breakeven_scenario', 'estimated_available_money',
                                            'final_money_simple_estimate', 'cluster_win_pattern'], axis=1, errors='ignore')
        
        self.scalers['win'] = RobustScaler()
        X_win_train_scaled = self.scalers['win'].fit_transform(win_features_train)
        X_win_test_scaled = self.scalers['win'].transform(win_features_test)
        
        # Ensemble para WIN
        win_models = {
            'xgb1': xgb.XGBRegressor(n_estimators=500, max_depth=12, learning_rate=0.03, random_state=42),
            'xgb2': xgb.XGBRegressor(n_estimators=400, max_depth=15, learning_rate=0.04, 
                                    subsample=0.9, colsample_bytree=0.8, random_state=43),
            'rf': RandomForestRegressor(n_estimators=400, max_depth=20, min_samples_split=3, random_state=42)
        }
        
        win_predictions_train = []
        win_predictions_test = []
        
        for name, model in win_models.items():
            model.fit(X_win_train_scaled, y_win.loc[train_idx])
            win_predictions_train.append(model.predict(X_win_train_scaled))
            win_predictions_test.append(model.predict(X_win_test_scaled))
        
        # Ensemble con pesos optimizados para WIN
        win_pred_train = np.average(win_predictions_train, axis=0, weights=[0.4, 0.4, 0.2])
        win_pred_test = np.average(win_predictions_test, axis=0, weights=[0.4, 0.4, 0.2])
        
        self.models['win'] = win_models
        r2_win = r2_score(y_win.loc[test_idx], win_pred_test)
        print(f"R² Win Total: {r2_win:.4f}")
        
        print("\n=== MODELO 4: FINAL AMOUNT (LÓGICA DE NEGOCIO CORRECTA) ===")
        # Ahora con TODAS las features de lógica de negocio
        X_final_train = self.create_business_features(X_base_train, tiempo_pred_train, 
                                                     bet_pred_train, win_pred_train)
        X_final_test = self.create_business_features(X_base_test, tiempo_pred_test, 
                                                    bet_pred_test, win_pred_test)
        
        # Features adicionales específicos para FINAL_AMOUNT
        X_final_train['session_volatility'] = abs(X_final_train['win_pred'] - X_final_train['bet_pred'])
        X_final_train['money_management_score'] = X_final_train['INITIAL_AMOUNT'] / (X_final_train['bet_pred'] + 1)
        X_final_train['expected_house_profit'] = X_final_train['bet_pred'] * 0.05  # 5% house edge típico
        
        X_final_test['session_volatility'] = abs(X_final_test['win_pred'] - X_final_test['bet_pred'])
        X_final_test['money_management_score'] = X_final_test['INITIAL_AMOUNT'] / (X_final_test['bet_pred'] + 1)
        X_final_test['expected_house_profit'] = X_final_test['bet_pred'] * 0.05
        
        self.scalers['final'] = RobustScaler()
        X_final_train_scaled = self.scalers['final'].fit_transform(X_final_train)
        X_final_test_scaled = self.scalers['final'].transform(X_final_test)
        
        # Múltiples enfoques para FINAL_AMOUNT
        final_models = {
            'xgb_deep': xgb.XGBRegressor(
                n_estimators=800, max_depth=15, learning_rate=0.02,
                subsample=0.9, colsample_bytree=0.9, 
                reg_alpha=0.1, reg_lambda=0.1, random_state=42
            ),
            'xgb_wide': xgb.XGBRegressor(
                n_estimators=600, max_depth=8, learning_rate=0.03,
                subsample=0.8, colsample_bytree=1.0, random_state=43
            ),
            'gbr': GradientBoostingRegressor(
                n_estimators=500, max_depth=10, learning_rate=0.02, 
                subsample=0.9, random_state=42
            ),
            'rf': RandomForestRegressor(
                n_estimators=500, max_depth=25, min_samples_split=2,
                min_samples_leaf=1, random_state=42
            )
        }
        
        final_predictions_test = []
        final_r2_scores = []
        
        print("Entrenando ensemble para FINAL_AMOUNT:")
        for name, model in final_models.items():
            model.fit(X_final_train_scaled, y_final_train)
            pred = model.predict(X_final_test_scaled)
            final_predictions_test.append(pred)
            r2 = r2_score(y_final_test, pred)
            final_r2_scores.append(r2)
            print(f"  {name}: R² = {r2:.4f}")
        
        # Ensemble final con pesos basados en performance
        weights = np.array(final_r2_scores)
        weights = np.maximum(weights, 0)  # Solo pesos positivos
        if weights.sum() > 0:
            weights = weights / weights.sum()
        else:
            weights = np.ones(len(weights)) / len(weights)
        
        final_pred_ensemble = np.average(final_predictions_test, axis=0, weights=weights)
        
        self.models['final'] = final_models
        self.models['final_weights'] = weights
        
        # Métricas finales
        r2_final = r2_score(y_final_test, final_pred_ensemble)
        mae_final = mean_absolute_error(y_final_test, final_pred_ensemble)
        rmse_final = np.sqrt(mean_squared_error(y_final_test, final_pred_ensemble))
        
        print(f"\nRESULTADOS ENSEMBLE FINAL_AMOUNT:")
        print(f"R² = {r2_final:.4f}")
        print(f"MAE = {mae_final:.2f}")
        print(f"RMSE = {rmse_final:.2f}")
        
        # Análisis por segmentos
        self.analyze_predictions_by_segment(y_final_test, final_pred_ensemble, X_base_test)
        
        print(f"\n" + "=" * 70)
        print("RESUMEN FINAL - MODELOS CON LÓGICA DE NEGOCIO")
        print("=" * 70)
        print(f"Tiempo en máquina: {r2_tiempo:.4f}")
        print(f"Bet total: {r2_bet:.4f}")
        print(f"Win total: {r2_win:.4f}")
        print(f"Final amount: {r2_final:.4f}")
        
        return {
            'tiempo': r2_tiempo,
            'bet': r2_bet,
            'win': r2_win,
            'final': r2_final,
            'mae_final': mae_final,
            'rmse_final': rmse_final
        }
    
    def analyze_business_patterns(self, df):
        """Análisis de patrones de negocio específicos"""
        print(f"\n=== ANÁLISIS DE PATRONES DE NEGOCIO ===")
        
        # 1. Patrón de reinversión
        reinvested = df['BET_TOTAL'] > df['INITIAL_AMOUNT']
        print(f"Sesiones con reinversión de ganancias: {reinvested.sum()} ({reinvested.sum()/len(df)*100:.1f}%)")
        
        # 2. Comportamiento por resultado final
        net_winners = df['FINAL_AMOUNT'] > df['INITIAL_AMOUNT']
        net_losers = df['FINAL_AMOUNT'] < df['INITIAL_AMOUNT']
        breakeven = df['FINAL_AMOUNT'] == df['INITIAL_AMOUNT']
        
        print(f"Ganadores netos: {net_winners.sum()} ({net_winners.sum()/len(df)*100:.1f}%)")
        print(f"Perdedores netos: {net_losers.sum()} ({net_losers.sum()/len(df)*100:.1f}%)")
        print(f"Breakeven: {breakeven.sum()} ({breakeven.sum()/len(df)*100:.1f}%)")
        
        # 3. Correlación WIN vs BET (indica reinversión)
        win_bet_corr = np.corrcoef(df['WIN_TOTAL'], df['BET_TOTAL'])[0,1]
        print(f"Correlación WIN_TOTAL vs BET_TOTAL: {win_bet_corr:.3f}")
        
        # 4. Análisis de final amount vs predicciones ingenuas
        naive_final = df['INITIAL_AMOUNT'] + df['WIN_TOTAL'] - df['BET_TOTAL']
        actual_final = df['FINAL_AMOUNT']
        naive_r2 = r2_score(actual_final, naive_final)
        print(f"R² fórmula ingenua (INITIAL + WIN - BET): {naive_r2:.4f}")
        
    def analyze_predictions_by_segment(self, y_true, y_pred, X_test):
        """Analizar predicciones por segmentos de negocio"""
        print(f"\n=== ANÁLISIS POR SEGMENTOS ===")
        
        errors = abs(y_true - y_pred)
        
        # Por rangos de initial amount
        initial_ranges = [(0, 100), (100, 500), (500, 1000), (1000, float('inf'))]
        for low, high in initial_ranges:
            if high == float('inf'):
                mask = X_test['INITIAL_AMOUNT'] >= low
                label = f">= {low}"
            else:
                mask = (X_test['INITIAL_AMOUNT'] >= low) & (X_test['INITIAL_AMOUNT'] < high)
                label = f"{low}-{high}"
            
            if mask.sum() > 0:
                segment_r2 = r2_score(y_true[mask], y_pred[mask])
                segment_mae = errors[mask].mean()
                print(f"INITIAL_AMOUNT {label}: R²={segment_r2:.3f}, MAE={segment_mae:.1f}, n={mask.sum()}")
        
        # Por cluster
        for cluster in sorted(X_test['Cluster'].unique()):
            mask = X_test['Cluster'] == cluster
            if mask.sum() > 0:
                segment_r2 = r2_score(y_true[mask], y_pred[mask])
                segment_mae = errors[mask].mean()
                print(f"Cluster {cluster}: R²={segment_r2:.3f}, MAE={segment_mae:.1f}, n={mask.sum()}")



In [5]:
model = BusinessLogicCorrectModel()
results = model.train_corrected_models(df_data_general)

ENTRENANDO CON LÓGICA DE NEGOCIO CORRECTA

=== ANÁLISIS DE PATRONES DE NEGOCIO ===
Sesiones con reinversión de ganancias: 160049 (76.5%)
Ganadores netos: 36184 (17.3%)
Perdedores netos: 172469 (82.5%)
Breakeven: 426 (0.2%)
Correlación WIN_TOTAL vs BET_TOTAL: 0.870
R² fórmula ingenua (INITIAL + WIN - BET): -2.5809

=== MODELO 1: TIEMPO (OPTIMIZADO) ===
R² Tiempo: 0.6195

=== MODELO 2: BET TOTAL (OPTIMIZADO) ===
R² Bet Total: 0.6310

=== MODELO 3: WIN TOTAL (OPTIMIZADO) ===
R² Win Total: 0.5423

=== MODELO 4: FINAL AMOUNT (LÓGICA DE NEGOCIO CORRECTA) ===
Entrenando ensemble para FINAL_AMOUNT:
  xgb_deep: R² = -0.2863
  xgb_wide: R² = -0.2261
  gbr: R² = -0.2112
  rf: R² = -0.2215

RESULTADOS ENSEMBLE FINAL_AMOUNT:
R² = -0.2212
MAE = 239.31
RMSE = 591.48

=== ANÁLISIS POR SEGMENTOS ===
INITIAL_AMOUNT 0-100: R²=-0.246, MAE=80.7, n=12736
INITIAL_AMOUNT 100-500: R²=-0.332, MAE=239.5, n=42367
INITIAL_AMOUNT 500-1000: R²=-0.346, MAE=485.2, n=7254
INITIAL_AMOUNT >= 1000: R²=-0.105, MAE=863.2, n

In [14]:
class FinalConstrainedModel:
    def __init__(self):
        self.scalers = {}
        self.models = {}
        self.business_constraints = {}
        
    def apply_business_constraints(self, predictions, initial_amounts, bet_preds, win_preds):
        """
        Aplicar restricciones de negocio lógicas a las predicciones
        """
        # Restricción 1: FINAL_AMOUNT no puede ser negativo
        predictions = np.maximum(predictions, 0)
        
        # Restricción 2: FINAL_AMOUNT no puede exceder dramáticamente ciertos límites lógicos
        # Máximo teórico: INITIAL + WIN_TOTAL (si nunca hubiera apostado después de ganar)
        max_theoretical = initial_amounts + win_preds
        predictions = np.minimum(predictions, max_theoretical * 1.1)  # 10% de buffer
        
        # Restricción 3: Si WIN_TOTAL es muy bajo comparado con BET_TOTAL, 
        # es probable que FINAL_AMOUNT sea bajo
        losing_sessions = win_preds < bet_preds * 0.3  # Perdió más del 70%
        predictions[losing_sessions] = np.minimum(
            predictions[losing_sessions], 
            initial_amounts[losing_sessions] * 0.3  # Máximo 30% del dinero inicial
        )
        
        return predictions
    
    def handle_nan_values(self, df, method='median'):
        """
        Manejo robusto de valores NaN en el DataFrame
        """
        df_clean = df.copy()
        
        # Identificar columnas numéricas
        numeric_columns = df_clean.select_dtypes(include=[np.number]).columns
        
        for col in numeric_columns:
            if df_clean[col].isnull().any():
                if method == 'median':
                    fill_value = df_clean[col].median()
                elif method == 'mean':
                    fill_value = df_clean[col].mean()
                elif method == 'zero':
                    fill_value = 0
                else:
                    fill_value = df_clean[col].median()  # default
                
                df_clean[col].fillna(fill_value, inplace=True)
                print(f"Filled {df_clean[col].isnull().sum()} NaN values in {col} with {method}: {fill_value:.2f}")
        
        # Verificar que no queden NaN
        remaining_nans = df_clean.isnull().sum().sum()
        if remaining_nans > 0:
            print(f"Warning: {remaining_nans} NaN values remain after cleaning")
            # Llenar cualquier NaN restante con 0
            df_clean.fillna(0, inplace=True)
        
        return df_clean
    
    def create_advanced_business_features(self, df, tiempo_pred=None, bet_pred=None, win_pred=None):
        """
        Features avanzados basados en patrones de casino identificados
        """
        # Verificar que las columnas requeridas existan
        required_cols = ['INITIAL_AMOUNT', 'AVG_BET', 'Cluster']
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            raise ValueError(f"Missing required columns: {missing_cols}")
        
        features = df[required_cols].copy()
        
        # Features básicos con manejo de valores extremos
        features['initial_log'] = np.log1p(np.maximum(df['INITIAL_AMOUNT'], 0))
        features['avg_bet_log'] = np.log1p(np.maximum(df['AVG_BET'], 0))
        features['bet_to_initial'] = df['AVG_BET'] / (df['INITIAL_AMOUNT'] + 1)
        
        if bet_pred is not None and win_pred is not None:
            # Asegurar que las predicciones no sean NaN
            bet_pred = np.nan_to_num(bet_pred, nan=0.0, posinf=0.0, neginf=0.0)
            win_pred = np.nan_to_num(win_pred, nan=0.0, posinf=0.0, neginf=0.0)
            
            features['bet_pred'] = bet_pred
            features['win_pred'] = win_pred
            
            # FEATURES ESPECÍFICOS PARA PREDICCIÓN DE FINAL_AMOUNT
            
            # 1. Indicadores de pérdida (82.5% pierden)
            features['expected_loss'] = bet_pred * 0.05  # House edge típico
            features['severe_loss_indicator'] = (win_pred < bet_pred * 0.3).astype(int)
            features['moderate_loss_indicator'] = ((win_pred >= bet_pred * 0.3) & (win_pred < bet_pred * 0.8)).astype(int)
            
            # 2. Gestión de bankroll
            features['bankroll_depletion'] = np.maximum(0, bet_pred - df['INITIAL_AMOUNT'] - win_pred)
            features['money_preservation_score'] = df['INITIAL_AMOUNT'] / (bet_pred + 1)
            
            # 3. Patrones de reinversión (76.5% reinvierten)
            features['reinvestment_amount'] = np.maximum(0, bet_pred - df['INITIAL_AMOUNT'])
            features['reinvestment_ratio'] = features['reinvestment_amount'] / (win_pred + 1)
            
            # 4. Probabilidad de diferentes escenarios finales
            # Basado en los patrones: 17.3% ganan, 82.5% pierden, 0.2% breakeven
            win_diff = np.clip((win_pred - bet_pred) / 100, -10, 10)  # Clip para evitar overflow
            features['win_probability'] = 1 / (1 + np.exp(-win_diff))
            features['loss_severity'] = np.maximum(0, bet_pred - win_pred) / (df['INITIAL_AMOUNT'] + 1)
            
            # 5. Features específicos por cluster (algunos clusters tienen peor performance)
            features['cluster_risk'] = df['Cluster'] * features['loss_severity']
            features['cluster_money_mgmt'] = df['Cluster'] * features['money_preservation_score']
            
            # 6. Estimaciones mejoradas de FINAL_AMOUNT basadas en patrones observados
            # Escenario conservador: El jugador retira parte de las ganancias
            conservative_final = df['INITIAL_AMOUNT'] + win_pred * 0.4 - bet_pred * 0.6
            features['conservative_estimate'] = np.maximum(0, conservative_final)
            
            # Escenario agresivo: El jugador reinvierte todo
            aggressive_final = df['INITIAL_AMOUNT'] + win_pred - bet_pred
            features['aggressive_estimate'] = np.maximum(0, aggressive_final)
            
            # Escenario promedio ponderado
            features['weighted_estimate'] = (
                0.3 * features['conservative_estimate'] + 
                0.7 * features['aggressive_estimate']
            )
            
            # 7. Features temporales si disponibles
            if tiempo_pred is not None:
                tiempo_pred = np.nan_to_num(tiempo_pred, nan=0.0, posinf=0.0, neginf=0.0)
                features['tiempo_pred'] = tiempo_pred
                features['money_burn_rate'] = bet_pred / (tiempo_pred + 1)
                features['win_rate_per_minute'] = win_pred / (tiempo_pred + 1)
        
        # Limpiar cualquier NaN que pueda haberse generado
        features = self.handle_nan_values(features, method='median')
        
        return features
    
    def train_probabilistic_final_amount(self, df):
        """
        Entrenar modelo probabilístico para FINAL_AMOUNT considerando su naturaleza estocástica
        """
        print("=" * 80)
        print("MODELO FINAL AMOUNT - ENFOQUE PROBABILÍSTICO CON RESTRICCIONES")
        print("=" * 80)
        
        # Verificar columnas requeridas
        required_cols = ['INITIAL_AMOUNT','AVG_BET','Cluster','Weekday','Weekend','Month',
                        'TIME_ON_DEVICE_MIN','BET_TOTAL','WIN_TOTAL','FINAL_AMOUNT']
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            raise ValueError(f"Missing required columns: {missing_cols}")
        
        # Limpiar datos de entrada
        print("Limpiando datos de entrada...")
        df_clean = self.handle_nan_values(df, method='median')
        
        # Preparar datos con modelos base ya entrenados
        X_base = df_clean[['INITIAL_AMOUNT','AVG_BET','Cluster','Weekday','Weekend','Month']]
        y_final = df_clean['FINAL_AMOUNT']
        
        # Verificar que y_final no tenga valores infinitos o extremos
        y_final = np.clip(y_final, 0, y_final.quantile(0.99) * 2)  # Clip outliers extremos
        
        # Split estratificado por quartiles de FINAL_AMOUNT
        try:
            final_quartiles = pd.qcut(y_final, q=4, labels=False, duplicates='drop')
        except ValueError as e:
            print(f"Error en qcut: {e}. Usando split aleatorio en su lugar.")
            final_quartiles = None
        
        if final_quartiles is not None:
            X_train, X_test, y_train, y_test, q_train, q_test = train_test_split(
                X_base, y_final, final_quartiles, test_size=0.8, random_state=42, stratify=final_quartiles
            )
        else:
            X_train, X_test, y_train, y_test = train_test_split(
                X_base, y_final, test_size=0.8, random_state=42
            )
            q_test = None
        
        # Entrenar modelos base rápidamente para obtener predicciones
        print("Obteniendo predicciones de modelos base...")
        
        # Modelo tiempo simplificado
        tiempo_model = xgb.XGBRegressor(n_estimators=200, max_depth=8, random_state=42, verbosity=0)
        tiempo_model.fit(X_train, df_clean['TIME_ON_DEVICE_MIN'].loc[X_train.index])
        tiempo_pred_train = tiempo_model.predict(X_train)
        tiempo_pred_test = tiempo_model.predict(X_test)
        
        # Modelo bet simplificado
        bet_features_train = np.column_stack([X_train[['INITIAL_AMOUNT','AVG_BET','Cluster']], tiempo_pred_train])
        bet_features_test = np.column_stack([X_test[['INITIAL_AMOUNT','AVG_BET','Cluster']], tiempo_pred_test])
        bet_model = xgb.XGBRegressor(n_estimators=200, max_depth=8, random_state=42, verbosity=0)
        bet_model.fit(bet_features_train, df_clean['BET_TOTAL'].loc[X_train.index])
        bet_pred_train = bet_model.predict(bet_features_train)
        bet_pred_test = bet_model.predict(bet_features_test)
        
        # Modelo win simplificado
        win_features_train = np.column_stack([X_train[['INITIAL_AMOUNT','AVG_BET','Cluster']], 
                                            tiempo_pred_train, bet_pred_train])
        win_features_test = np.column_stack([X_test[['INITIAL_AMOUNT','AVG_BET','Cluster']], 
                                           tiempo_pred_test, bet_pred_test])
        win_model = xgb.XGBRegressor(n_estimators=300, max_depth=10, random_state=42, verbosity=0)
        win_model.fit(win_features_train, df_clean['WIN_TOTAL'].loc[X_train.index])
        win_pred_train = win_model.predict(win_features_train)
        win_pred_test = win_model.predict(win_features_test)
        
        print(f"R² modelos base - Bet: {r2_score(df_clean['BET_TOTAL'].loc[X_test.index], bet_pred_test):.3f}, "
              f"Win: {r2_score(df_clean['WIN_TOTAL'].loc[X_test.index], win_pred_test):.3f}")
        
        # Crear features avanzados para FINAL_AMOUNT
        print("\nCreando features avanzados para FINAL_AMOUNT...")
        X_final_train = self.create_advanced_business_features(X_train, tiempo_pred_train, 
                                                              bet_pred_train, win_pred_train)
        X_final_test = self.create_advanced_business_features(X_test, tiempo_pred_test, 
                                                             bet_pred_test, win_pred_test)
        
        # Verificar que no haya NaN después del feature engineering
        print(f"NaN en X_final_train: {X_final_train.isnull().sum().sum()}")
        print(f"NaN en X_final_test: {X_final_test.isnull().sum().sum()}")
        
        # Transformación robusta de features
        self.scalers['final'] = RobustScaler()
        X_final_train_scaled = self.scalers['final'].fit_transform(X_final_train)
        X_final_test_scaled = self.scalers['final'].transform(X_final_test)
        
        # Verificar que no haya NaN después del scaling
        print(f"NaN en X_final_train_scaled: {np.isnan(X_final_train_scaled).sum()}")
        print(f"NaN en X_final_test_scaled: {np.isnan(X_final_test_scaled).sum()}")
        
        # ESTRATEGIA 1: Modelos robustos a outliers
        print("\n=== ESTRATEGIA 1: MODELOS ROBUSTOS ===")
        
        robust_models = {
            'huber': HuberRegressor(epsilon=1.5, alpha=0.01, max_iter=1000),
            'xgb_robust': xgb.XGBRegressor(
                n_estimators=600, max_depth=10, learning_rate=0.02,
                subsample=0.8, colsample_bytree=0.8, 
                reg_alpha=0.2, reg_lambda=0.2, random_state=42, verbosity=0
            ),
            'elastic_net': ElasticNet(alpha=0.1, l1_ratio=0.5, max_iter=2000),
            'rf_robust': RandomForestRegressor(
                n_estimators=400, max_depth=15, min_samples_split=10,
                min_samples_leaf=5, random_state=42, n_jobs=-1
            )
        }
        
        robust_predictions_test = []
        robust_scores = []
        
        for name, model in robust_models.items():
            try:
                if name == 'elastic_net' or name == 'huber':
                    model.fit(X_final_train_scaled, y_train)
                    pred = model.predict(X_final_test_scaled)
                else:
                    model.fit(X_final_train_scaled, y_train)
                    pred = model.predict(X_final_test_scaled)
                
                # Verificar predicciones válidas
                pred = np.nan_to_num(pred, nan=0.0, posinf=0.0, neginf=0.0)
                
                # Aplicar restricciones de negocio
                pred_constrained = self.apply_business_constraints(
                    pred, X_test['INITIAL_AMOUNT'].values, bet_pred_test, win_pred_test
                )
                
                robust_predictions_test.append(pred_constrained)
                r2 = r2_score(y_test, pred_constrained)
                robust_scores.append(r2)
                print(f"  {name}: R² = {r2:.4f}")
                
            except Exception as e:
                print(f"  Error en modelo {name}: {e}")
                # Agregar predicción dummy
                dummy_pred = np.full(len(y_test), y_train.mean())
                robust_predictions_test.append(dummy_pred)
                robust_scores.append(0.0)
        
        # ESTRATEGIA 2: Modelo de quantiles (predecir rangos probables)
        print("\n=== ESTRATEGIA 2: MODELO DE QUANTILES ===")
        
        try:
            # Dividir en rangos de FINAL_AMOUNT para clasificación + regresión
            # Usar percentiles más conservadores para evitar rangos vacíos
            bins = [0, y_train.quantile(0.2), y_train.quantile(0.5), 
                   y_train.quantile(0.8), float('inf')]
            final_ranges = pd.cut(y_train, bins=bins, 
                                 labels=['low', 'medium', 'high', 'very_high'])
            
            # Verificar que todos los rangos tengan al menos algunas muestras
            range_counts = final_ranges.value_counts()
            print(f"Distribución de rangos: {range_counts.to_dict()}")
            
            range_classifier = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)
            range_classifier.fit(X_final_train_scaled, final_ranges)
            
            predicted_ranges = range_classifier.predict(X_final_test_scaled)
            
            # Modelo por rango
            range_models = {}
            range_predictions = {}
            
            for range_name in ['low', 'medium', 'high', 'very_high']:
                mask = final_ranges == range_name
                if mask.sum() > 5:  # Reducir mínimo de muestras
                    try:
                        range_model = xgb.XGBRegressor(n_estimators=200, max_depth=8, 
                                                     random_state=42, verbosity=0)
                        range_model.fit(X_final_train_scaled[mask], y_train.iloc[mask.values])
                        range_models[range_name] = range_model
                        
                        # Predicción para casos de este rango en test
                        test_mask = predicted_ranges == range_name
                        if test_mask.sum() > 0:
                            range_pred = range_model.predict(X_final_test_scaled[test_mask])
                            range_pred = np.nan_to_num(range_pred, nan=0.0, posinf=0.0, neginf=0.0)
                            range_predictions[range_name] = (test_mask, range_pred)
                    except Exception as e:
                        print(f"Error en modelo de rango {range_name}: {e}")
            
            # Combinar predicciones por rango
            quantile_predictions = np.full(len(y_test), y_train.mean())  # Fallback
            for range_name, (mask, pred) in range_predictions.items():
                quantile_predictions[mask] = pred
            
            # Aplicar restricciones
            quantile_predictions = self.apply_business_constraints(
                quantile_predictions, X_test['INITIAL_AMOUNT'].values, bet_pred_test, win_pred_test
            )
            
            r2_quantile = r2_score(y_test, quantile_predictions)
            print(f"  Modelo por quantiles: R² = {r2_quantile:.4f}")
            
        except Exception as e:
            print(f"Error en modelo de quantiles: {e}")
            quantile_predictions = np.full(len(y_test), y_train.mean())
            r2_quantile = 0.0
            range_models = {}
            range_classifier = None
        
        # ESTRATEGIA 3: Ensemble final
        print("\n=== ESTRATEGIA 3: ENSEMBLE FINAL ===")
        
        # Incluir quantile predictions
        all_predictions = robust_predictions_test + [quantile_predictions]
        all_scores = robust_scores + [r2_quantile]
        
        # Pesos basados en performance (solo positivos)
        weights = np.array(all_scores)
        weights = np.maximum(weights, 0.1)  # Mínimo peso de 0.1
        weights = weights / weights.sum()
        
        final_ensemble = np.average(all_predictions, axis=0, weights=weights)
        
        # Métricas finales
        r2_ensemble = r2_score(y_test, final_ensemble)
        mae_ensemble = mean_absolute_error(y_test, final_ensemble)
        rmse_ensemble = np.sqrt(mean_squared_error(y_test, final_ensemble))
        
        # Análisis de performance por segmentos
        print(f"\n=== RESULTADOS FINALES ===")
        print(f"R² Ensemble: {r2_ensemble:.4f}")
        print(f"MAE: {mae_ensemble:.2f}")
        print(f"RMSE: {rmse_ensemble:.2f}")
        
        # Comparación con baseline
        baseline_pred = np.full(len(y_test), y_train.mean())
        baseline_r2 = r2_score(y_test, baseline_pred)
        improvement = r2_ensemble - baseline_r2
        print(f"Mejora vs baseline: {improvement:.4f}")
        
        # Análisis por quartiles (solo si q_test está disponible)
        if q_test is not None:
            print(f"\n=== ANÁLISIS POR QUARTILES DE FINAL_AMOUNT ===")
            for q in range(4):
                mask = q_test == q
                if mask.sum() > 0:
                    q_r2 = r2_score(y_test[mask], final_ensemble[mask])
                    q_mae = mean_absolute_error(y_test[mask], final_ensemble[mask])
                    q_mean = y_test[mask].mean()
                    print(f"Quartil {q+1} (mean=${q_mean:.0f}): R²={q_r2:.3f}, MAE=${q_mae:.1f}, n={mask.sum()}")
        
        # Guardar modelos
        self.models = {
            'robust_models': robust_models,
            'quantile_models': range_models if 'range_models' in locals() else {},
            'range_classifier': range_classifier,
            'weights': weights,
            'tiempo': tiempo_model,
            'bet': bet_model,
            'win': win_model
        }
        
        return {
            'r2_final': r2_ensemble,
            'mae_final': mae_ensemble,
            'rmse_final': rmse_ensemble,
            'improvement': improvement,
            'individual_scores': dict(zip(['huber', 'xgb_robust', 'elastic_net', 'rf_robust', 'quantile'], all_scores))
        }
    
    def predict_with_confidence(self, initial_amount, avg_bet, cluster, weekday, weekend, month):
        """
        Predecir FINAL_AMOUNT con intervalos de confianza
        """
        # Preparar input
        X_input = pd.DataFrame([[initial_amount, avg_bet, cluster, weekday, weekend, month]],
                              columns=['INITIAL_AMOUNT','AVG_BET','Cluster','Weekday','Weekend','Month'])
        
        # Predicciones base
        tiempo_pred = self.models['tiempo'].predict(X_input)[0]
        
        bet_input = np.array([[initial_amount, avg_bet, cluster, tiempo_pred]])
        bet_pred = self.models['bet'].predict(bet_input)[0]
        
        win_input = np.array([[initial_amount, avg_bet, cluster, tiempo_pred, bet_pred]])
        win_pred = self.models['win'].predict(win_input)[0]
        
        # Features para final amount
        X_final = self.create_advanced_business_features(X_input, tiempo_pred, bet_pred, win_pred)
        X_final_scaled = self.scalers['final'].transform(X_final)
        
        # Predicciones de todos los modelos
        predictions = []
        
        # Modelos robustos
        for name, model in self.models['robust_models'].items():
            try:
                pred = model.predict(X_final_scaled)[0]
                pred = np.nan_to_num(pred, nan=initial_amount * 0.5)
                predictions.append(pred)
            except:
                predictions.append(initial_amount * 0.5)  # Fallback conservador
        
        # Modelo quantile
        if self.models['range_classifier'] is not None:
            try:
                predicted_range = self.models['range_classifier'].predict(X_final_scaled)[0]
                if predicted_range in self.models['quantile_models']:
                    quantile_pred = self.models['quantile_models'][predicted_range].predict(X_final_scaled)[0]
                    quantile_pred = np.nan_to_num(quantile_pred, nan=initial_amount * 0.5)
                else:
                    quantile_pred = initial_amount * 0.5  # Fallback conservador
            except:
                quantile_pred = initial_amount * 0.5
                predicted_range = 'unknown'
        else:
            quantile_pred = initial_amount * 0.5
            predicted_range = 'unknown'
        
        predictions.append(quantile_pred)
        
        # Ensemble
        ensemble_pred = np.average(predictions, weights=self.models['weights'])
        
        # Aplicar restricciones
        final_pred = self.apply_business_constraints(
            np.array([ensemble_pred]), 
            np.array([initial_amount]), 
            np.array([bet_pred]), 
            np.array([win_pred])
        )[0]
        
        # Calcular intervalo de confianza basado en varianza de predicciones
        std_pred = np.std(predictions) if len(predictions) > 1 else final_pred * 0.2
        confidence_interval = (
            max(0, final_pred - 1.96 * std_pred),
            final_pred + 1.96 * std_pred
        )
        
        return {
            'tiempo_maquina': tiempo_pred,
            'bet_total': bet_pred,
            'win_total': win_pred,
            'final_amount': final_pred,
            'confidence_interval': confidence_interval,
            'predicted_range': predicted_range,
            'individual_predictions': {
                'huber': predictions[0] if len(predictions) > 0 else 0,
                'xgb_robust': predictions[1] if len(predictions) > 1 else 0,
                'elastic_net': predictions[2] if len(predictions) > 2 else 0,
                'rf_robust': predictions[3] if len(predictions) > 3 else 0,
                'quantile': predictions[4] if len(predictions) > 4 else 0
            }
        }


In [15]:
model = FinalConstrainedModel()
results = model.train_probabilistic_final_amount(df_data_general)

MODELO FINAL AMOUNT - ENFOQUE PROBABILÍSTICO CON RESTRICCIONES
Limpiando datos de entrada...
Obteniendo predicciones de modelos base...
R² modelos base - Bet: 0.586, Win: 0.351

Creando features avanzados para FINAL_AMOUNT...
NaN en X_final_train: 0
NaN en X_final_test: 0
NaN en X_final_train_scaled: 0
NaN en X_final_test_scaled: 0

=== ESTRATEGIA 1: MODELOS ROBUSTOS ===
  huber: R² = -0.4574
  xgb_robust: R² = -0.2347
  elastic_net: R² = -0.2772
  rf_robust: R² = -0.2284

=== ESTRATEGIA 2: MODELO DE QUANTILES ===
Distribución de rangos: {'high': 12544, 'medium': 12485, 'very_high': 8363, 'low': 6671}
Error en modelo de quantiles: name 'RandomForestClassifier' is not defined

=== ESTRATEGIA 3: ENSEMBLE FINAL ===

=== RESULTADOS FINALES ===
R² Ensemble: -0.1094
MAE: 230.54
RMSE: 509.20
Mejora vs baseline: -0.1094

=== ANÁLISIS POR QUARTILES DE FINAL_AMOUNT ===
Quartil 1 (mean=$0): R²=-834826.911, MAE=$148.4, n=42109
Quartil 2 (mean=$2): R²=-116319.852, MAE=$176.6, n=41523
Quartil 3 (mea