In [1]:
# ============================================================================
# CÉLULA 1: Importações e Configurações
# ============================================================================
import d3rlpy
import numpy as np
import pandas as pd
import joblib
import torch
import os
import shutil
import json
from d3rlpy.algos import CQLConfig
from d3rlpy.models import QRQFunctionFactory
from d3rlpy.dataset import ReplayBuffer, FIFOBuffer
from d3rlpy.metrics import AverageValueEstimationEvaluator
from sklearn.model_selection import train_test_split

# Fix the RNG seeds up front so repeated runs of the notebook are comparable.
for _seed_fn in (d3rlpy.seed, np.random.seed):
    _seed_fn(42)

# Report library versions and whether a CUDA device is visible to torch.
print(
    f"d3rlpy version: {d3rlpy.__version__}",
    f"Torch version: {torch.__version__}",
    f"GPU dispon√≠vel? {torch.cuda.is_available()}",
    sep="\n",
)

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.
  from .autonotebook import tqdm as notebook_tqdm


d3rlpy version: 2.8.1
Torch version: 2.9.1+cpu
GPU dispon√≠vel? False


In [2]:
# ============================================================================
# CÉLULA 2: Carregar Artefatos Gerados (Buffer e Scalers de Assinatura)
# ============================================================================
BUFFER_PATH = "rl_assinatura_buffer.h5"  # File produced by block 4 of Generator_NEW

print(f"\n[1/5] Carregando ReplayBuffer de Assinatura de '{BUFFER_PATH}' ...")

# Load the subscription-specific replay buffer.
# NOTE(review): the FIFO limit is hard-coded to 100000; presumably a larger
# file would lose its oldest transitions on load - confirm against the generator.
with open(BUFFER_PATH, "rb") as f:
    dataset = ReplayBuffer.load(f, FIFOBuffer(limit=100000))

print(f"‚úì Buffer de Assinatura carregado com sucesso.")
print(f"  # Transi√ß√µes totais: {dataset.transition_count}")

# Load the subscription-specific scalers written by Generator_NEW.
# SECURITY: joblib.load unpickles its input, which can execute arbitrary code.
# Only point these paths at artifacts produced by a trusted pipeline.
print("\n[2/5] Carregando Scalers de Assinatura...")
try:
    # The base state reuses the common scaler; the memory features have their own.
    scaler_estado_base = joblib.load("scaler_estado.joblib")
    scaler_memoria = joblib.load("scaler_assinatura_memoria.joblib")

    # Action and reward scalers are specific to this task (monthly fee and LTV).
    scaler_acao = joblib.load("scaler_assinatura_acao.joblib")
    scaler_recompensa = joblib.load("scaler_assinatura_recompensa.joblib")

    # Column names of the state vector, kept for reference.
    with open('colunas_estado_assinatura.json', 'r') as f:
        colunas_assinatura = json.load(f)

    print("‚úì Scalers e metadados de assinatura carregados.")
    print(f"  Total de features no estado: {len(colunas_assinatura)}")

except FileNotFoundError as e:
    print(f"‚ùå ERRO: Arquivo n√£o encontrado: {e.filename}")
    print("Execute o 'Generator_NEW.py' novamente para gerar os artefatos de assinatura.")
    # Re-raise so execution stops here. Swallowing the error would leave the
    # scalers undefined and surface later as an unrelated NameError.
    raise


[1/5] Carregando ReplayBuffer de Assinatura de 'rl_assinatura_buffer.h5' ...
2025-11-21 09:46.47 [info     ] Signatures have been automatically determined. action_signature=Signature(dtype=[dtype('float32')], shape=[(1,)]) observation_signature=Signature(dtype=[dtype('float32')], shape=[(36,)]) reward_signature=Signature(dtype=[dtype('float32')], shape=[()])
2025-11-21 09:46.47 [info     ] Action-space has been automatically determined. action_space=<ActionSpace.CONTINUOUS: 1>
2025-11-21 09:46.47 [info     ] Action size has been automatically determined. action_size=1
‚úì Buffer de Assinatura carregado com sucesso.
  # Transi√ß√µes totais: 50000

[2/5] Carregando Scalers de Assinatura...
‚úì Scalers e metadados de assinatura carregados.
  Total de features no estado: 36


In [None]:
# ============================================================================
# CÉLULA 3: Divisão Treino / Validação (Split Robusto) - CORRIGIDO
# ============================================================================
import numpy as np
from d3rlpy.dataset import Episode

print("\n[3/5] Dividindo dados em Treino e Teste...")


def create_ep(o, a, r, t):
    """Build an Episode from array slices, passing arguments positionally.

    Args:
        o: observations for the sub-episode.
        a: actions for the sub-episode.
        r: rewards; a 1-D array is reshaped to a column, other ranks pass through.
        t: per-step terminal flags; only the last entry decides ``terminated``.

    Returns:
        An ``Episode`` whose ``terminated`` flag is True only when the last
        entry of ``t`` is truthy (False for an empty ``t``).
    """
    r = r.reshape(-1, 1) if r.ndim == 1 else r
    is_term = bool(t[-1]) if len(t) > 0 else False
    # Positional order: (observations, actions, rewards, terminated).
    return Episode(o, a, r, is_term)


if len(dataset.episodes) == 0:
    raise ValueError("Dataset sem episodios: nao ha dados para dividir em treino/teste.")

# With very few episodes (e.g. one giant episode) an episode-level split is not
# meaningful, so each episode is cut 80/20 along its time axis instead.
if len(dataset.episodes) < 10:
    print(f"‚ö†Ô∏è Aviso: Encontrados apenas {len(dataset.episodes)} epis√≥dios.")
    print("   -> Dividindo o epis√≥dio gigante em sub-epis√≥dios menores para valida√ß√£o...")

    train_episodes, test_episodes = [], []

    # Split every available episode, not only the first one, so that no data is
    # silently dropped when there are between 2 and 9 episodes.
    for source_ep in dataset.episodes:
        obs = source_ep.observations
        act = source_ep.actions
        rew = source_ep.rewards

        total_len = len(obs)
        if total_len == 0:
            continue

        # Rebuild the terminal flags, assuming only the final step can be terminal.
        term = np.zeros((total_len, 1), dtype=np.float32)
        # Accept either attribute name; default to terminal when neither exists.
        is_global_term = getattr(source_ep, 'terminal', getattr(source_ep, 'terminated', True))
        if is_global_term:
            term[-1] = 1.0

        split_idx = int(total_len * 0.8)

        # Very short episodes can yield an empty training slice; skip it rather
        # than creating a zero-length Episode.
        if split_idx > 0:
            train_episodes.append(
                create_ep(obs[:split_idx], act[:split_idx], rew[:split_idx], term[:split_idx])
            )
        test_episodes.append(
            create_ep(obs[split_idx:], act[split_idx:], rew[split_idx:], term[split_idx:])
        )

else:
    # Enough episodes: use the standard episode-level random split.
    train_episodes, test_episodes = train_test_split(dataset.episodes, test_size=0.2, random_state=42)

print(f"  Epis√≥dios de Treino: {len(train_episodes)}")
print(f"  Epis√≥dios de Teste:  {len(test_episodes)}")


[3/5] Dividindo dados em Treino e Teste...
  Epis√≥dios de Treino: 40000
  Epis√≥dios de Teste:  10000


In [4]:
# ============================================================================
# CÉLULA 4: Configuração do Agente CQL (Foco em LTV)
# ============================================================================
print("\n[4/5] Configurando Agente CQL (Assinatura/LTV)...")

# Key choice for the subscription task: gamma=0.99.
# Long-term value (LTV) matters more than immediate profit here, and a higher
# discount factor makes the agent weigh future retention more heavily.
cql_config = CQLConfig(
    q_func_factory=QRQFunctionFactory(n_quantiles=64),
    conservative_weight=5.0,  # conservatism is kept high
    critic_learning_rate=3e-4,
    actor_learning_rate=1e-4,
    alpha_learning_rate=1e-4,
    # No scalers: the data was already scaled by the generator.
    reward_scaler=None,
    action_scaler=None,
    observation_scaler=None,
    gamma=0.99,  # long-horizon (LTV) focus
    batch_size=256,
)

# Prefer the first CUDA device when torch reports one; otherwise stay on CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

cql = cql_config.create(device=device)

print(f"‚úì Agente CQL (LTV) configurado no dispositivo: {device}")


[4/5] Configurando Agente CQL (Assinatura/LTV)...
‚úì Agente CQL (LTV) configurado no dispositivo: cpu


In [9]:
# ============================================================================
# CÉLULA 5: Treinamento com Governança (Loop Manual)
# ============================================================================
print("\n[5/5] Iniciando Treinamento de Assinatura...")

N_STEPS = 50000
STEPS_PER_EPOCH = 1000
PATIENCE = 20
BEST_MODEL_PATH = "modelo_rl_assinatura.pt"  # Checkpoint name specific to this model

# Evaluator run on the held-out episodes at the end of every epoch.
# NOTE(review): 'average_q' is the critic's own value estimate on the test set;
# treating "higher is better" as a model-selection rule is an assumption that
# should be confirmed (a conservative critic can lower its estimates as it trains).
avg_q_evaluator = AverageValueEstimationEvaluator(test_episodes)
evaluators = {'average_q': avg_q_evaluator}

# Rebuild a training buffer containing only the training episodes.
train_buffer = ReplayBuffer(
    FIFOBuffer(limit=dataset.transition_count),
    episodes=train_episodes
)

best_score = -float('inf')
patience_counter = 0
# Tracks whether THIS run wrote the checkpoint, so that a stale file left by a
# previous run is never loaded as if it were the best model of this run.
checkpoint_saved = False

# Manual training loop: the fitter yields (epoch, metrics) after each epoch.
for epoch, metrics in cql.fitter(
    train_buffer,
    n_steps=N_STEPS,
    n_steps_per_epoch=STEPS_PER_EPOCH,
    evaluators=evaluators,
    experiment_name="cql_assinatura_run",
    with_timestamp=False,
    show_progress=True
):
    current_score = metrics.get('average_q')
    # A missing or non-finite score cannot be compared or formatted with ':.4f';
    # count it as "no improvement" instead of crashing mid-training.
    score_is_valid = current_score is not None and bool(np.isfinite(current_score))

    if score_is_valid and current_score > best_score:
        best_score = current_score
        patience_counter = 0
        cql.save_model(BEST_MODEL_PATH)
        checkpoint_saved = True
        print(f"    [Epoch {epoch}] Novo recorde LTV! Score: {current_score:.4f}")
    elif score_is_valid:
        patience_counter += 1
        print(f"    [Epoch {epoch}] Sem melhora. Score: {current_score:.4f} (Paci√™ncia: {patience_counter}/{PATIENCE})")
    else:
        patience_counter += 1
        print(f"    [Epoch {epoch}] Score invalido ({current_score!r}); contado como sem melhora. ({patience_counter}/{PATIENCE})")

    if patience_counter >= PATIENCE:
        print(f"\nüõë Early Stopping: Treino de assinatura encerrado.")
        break

# Restore the best weights, but only if they were produced by this run.
if checkpoint_saved and os.path.exists(BEST_MODEL_PATH):
    cql.load_model(BEST_MODEL_PATH)
    print(f"\n‚úì Melhor modelo de Assinatura carregado de '{BEST_MODEL_PATH}'.")


[5/5] Iniciando Treinamento de Assinatura...
2025-11-21 10:53.57 [info     ] Signatures have been automatically determined. action_signature=Signature(dtype=[dtype('float32')], shape=[(1,)]) observation_signature=Signature(dtype=[dtype('float32')], shape=[(36,)]) reward_signature=Signature(dtype=[dtype('float32')], shape=[(1,)])
2025-11-21 10:53.57 [info     ] Action-space has been automatically determined. action_space=<ActionSpace.CONTINUOUS: 1>
2025-11-21 10:53.57 [info     ] Action size has been automatically determined. action_size=1
2025-11-21 10:53.57 [info     ] dataset info                   dataset_info=DatasetInfo(observation_signature=Signature(dtype=[dtype('float32')], shape=[(36,)]), action_signature=Signature(dtype=[dtype('float32')], shape=[(1,)]), reward_signature=Signature(dtype=[dtype('float32')], shape=[(1,)]), action_space=<ActionSpace.CONTINUOUS: 1>, action_size=1)
2025-11-21 10:53.58 [info     ] Directory is created at d3rlpy_logs\cql_assinatura_run
2025-11-21 1

Epoch 1/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1000/1000 [05:33<00:00,  3.00it/s, critic_loss=-58.2, conservative_loss=-61, alpha=0.951, actor_loss=-0.0856, temp=0.955, temp_loss=1.39]


2025-11-21 10:59.45 [info     ] cql_assinatura_run: epoch=1 step=1000 epoch=1 metrics={'time_sample_batch': 0.009842555284500122, 'time_algorithm_update': 0.32185865807533265, 'critic_loss': -58.21702280235291, 'conservative_loss': -60.996804996490475, 'alpha': 0.9509843505620956, 'actor_loss': -0.08032187248929404, 'temp': 0.9542893256545066, 'temp_loss': 1.3902101123332977, 'time_step': 0.3320035345554352, 'average_q': -0.18790360231772066} step=1000
2025-11-21 10:59.45 [info     ] Model parameters are saved to d3rlpy_logs\cql_assinatura_run\model_1000.d3
    [Epoch 1] Novo recorde LTV! Score: -0.1879


Epoch 2/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1000/1000 [04:41<00:00,  3.56it/s, critic_loss=-62, conservative_loss=-64.7, alpha=0.858, actor_loss=1.06, temp=0.885, temp_loss=0.764]  


2025-11-21 11:04.39 [info     ] cql_assinatura_run: epoch=2 step=2000 epoch=2 metrics={'time_sample_batch': 0.00908719825744629, 'time_algorithm_update': 0.2705505242347717, 'critic_loss': -61.97115071105957, 'conservative_loss': -64.76444079208375, 'alpha': 0.8579332389235497, 'actor_loss': 1.06933987878263, 'temp': 0.8845283337235451, 'temp_loss': 0.7620396106541156, 'time_step': 0.2798525855541229, 'average_q': -1.0150455513812602} step=2000
2025-11-21 11:04.39 [info     ] Model parameters are saved to d3rlpy_logs\cql_assinatura_run\model_2000.d3
    [Epoch 2] Sem melhora. Score: -1.0150 (Paci√™ncia: 1/20)


Epoch 3/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1000/1000 [06:12<00:00,  2.68it/s, critic_loss=-67.8, conservative_loss=-74.3, alpha=0.772, actor_loss=3.34, temp=0.83, temp_loss=0.467]


2025-11-21 11:11.06 [info     ] cql_assinatura_run: epoch=3 step=3000 epoch=3 metrics={'time_sample_batch': 0.009180633783340454, 'time_algorithm_update': 0.36232124066352844, 'critic_loss': -67.94938455963134, 'conservative_loss': -74.39723878860474, 'alpha': 0.7720177013874054, 'actor_loss': 3.364053863286972, 'temp': 0.8300682639479637, 'temp_loss': 0.4650001800954342, 'time_step': 0.3717142481803894, 'average_q': -3.3496940682578833} step=3000
2025-11-21 11:11.07 [info     ] Model parameters are saved to d3rlpy_logs\cql_assinatura_run\model_3000.d3
    [Epoch 3] Sem melhora. Score: -3.3497 (Paci√™ncia: 2/20)


Epoch 4/50: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1000/1000 [07:54<00:00,  2.11it/s, critic_loss=-87, conservative_loss=-101, alpha=0.688, actor_loss=9.38, temp=0.79, temp_loss=0.22]    


2025-11-21 11:19.17 [info     ] cql_assinatura_run: epoch=4 step=4000 epoch=4 metrics={'time_sample_batch': 0.009501899242401123, 'time_algorithm_update': 0.4632861466407776, 'critic_loss': -87.1162483215332, 'conservative_loss': -101.60956675720215, 'alpha': 0.6871868042349816, 'actor_loss': 9.417908043146133, 'temp': 0.789750754058361, 'temp_loss': 0.2193962922496721, 'time_step': 0.4730088891983032, 'average_q': -9.578665587516129} step=4000
2025-11-21 11:19.18 [info     ] Model parameters are saved to d3rlpy_logs\cql_assinatura_run\model_4000.d3
    [Epoch 4] Sem melhora. Score: -9.5787 (Paci√™ncia: 3/20)


Epoch 5/50:  86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 865/1000 [04:54<00:45,  2.94it/s, critic_loss=-116, conservative_loss=-141, alpha=0.611, actor_loss=19, temp=0.769, temp_loss=0.0539]  


KeyboardInterrupt: 

In [8]:
# ============================================================================
# CÉLULA 6: Avaliação Final e Métricas de Negócio - CORRIGIDA (Format Fix)
# ============================================================================
import os
import numpy as np
from d3rlpy.metrics import AverageValueEstimationEvaluator

print("\n" + "="*40)
print("AVALIA√á√ÉO DE ASSINATURA (LTV)")
print("="*40)

# 1. Build the networks only when the agent has none yet. Building again on an
#    agent that already has networks would re-create them and discard any
#    weights currently held in memory.
model_built_here = getattr(cql, "impl", None) is None
if model_built_here:
    print("Construindo arquitetura do modelo...")
    cql.build_with_dataset(dataset)

# 2. Load the best saved checkpoint when it exists.
if os.path.exists("modelo_rl_assinatura.pt"):
    cql.load_model("modelo_rl_assinatura.pt")
    print("‚úì Melhor modelo ('modelo_rl_assinatura.pt') carregado com sucesso.")
elif model_built_here:
    print("‚ö†Ô∏è Aviso: 'modelo_rl_assinatura.pt' n√£o encontrado. Usando modelo n√£o treinado.")
else:
    print("Aviso: 'modelo_rl_assinatura.pt' nao encontrado. Mantendo os pesos atuais em memoria.")

# 3. Example inference on the first observation of the first test episode.
print(f"\n--- Exemplo de Infer√™ncia (Um Caso do Teste) ---")

try:
    sample_episode = test_episodes[0]
    sample_state = sample_episode.observations[0]

    # Add the batch dimension expected by predict().
    state_batch = sample_state.reshape(1, -1)

    # A. Predict the normalized action (monthly fee) for the single-row batch.
    acao_norm_array = np.asarray(cql.predict(state_batch)[0])

    # Pull out the scalar by indexing. Calling float() on an array with
    # ndim > 0 is deprecated in recent NumPy and emits a warning.
    acao_norm_valor = float(acao_norm_array.reshape(-1)[0])

    # B. Map the normalized action back to the original fee scale.
    mensalidade_real = scaler_acao.inverse_transform(acao_norm_array.reshape(1, -1))[0][0]

    # C. Expected LTV from the critic, when the agent exposes predict_value.
    if hasattr(cql, "predict_value"):
        action_batch = acao_norm_array.reshape(1, -1)
        ltv_esperado_norm = cql.predict_value(state_batch, action_batch)[0]
        # NOTE(review): this inverse-transforms a value estimate with the
        # per-step reward scaler; confirm that is the intended unit conversion.
        ltv_esperado_real = scaler_recompensa.inverse_transform(ltv_esperado_norm.reshape(1, -1))[0][0]
    else:
        ltv_esperado_real = 0.0

    print(f"  Estado (Shape): {sample_state.shape} (Inclui mem√≥ria)")
    print(f"  A√ß√£o (Mensalidade Normalizada): {acao_norm_valor:.4f}")
    print(f"  ------------------------------------------------")
    print(f"  ‚úÖ MENSALIDADE RECOMENDADA: ${mensalidade_real:.2f} / m√™s")
    print(f"  üí∞ LTV ESPERADO (Lifetime): ${ltv_esperado_real:.2f}")

    if mensalidade_real < 0:
        print("\n‚ùå ALERTA: Mensalidade negativa. Verifique custos de assinatura no Generator.")
    else:
        print("\n‚úÖ SUCESSO: O modelo de assinatura est√° funcional!")

except Exception as e:
    print(f"\n‚ùå Erro durante a infer√™ncia: {e}")
    # Print the full traceback to help debugging.
    import traceback
    traceback.print_exc()


AVALIA√á√ÉO DE ASSINATURA (LTV)
Construindo arquitetura do modelo...
‚ö†Ô∏è Aviso: 'modelo_rl_assinatura.pt' n√£o encontrado. Usando modelo n√£o treinado.

--- Exemplo de Infer√™ncia (Um Caso do Teste) ---


  acao_norm_valor = float(acao_norm_array) if isinstance(acao_norm_array, np.ndarray) else acao_norm_array


  Estado (Shape): (36,) (Inclui mem√≥ria)
  A√ß√£o (Mensalidade Normalizada): -0.1181
  ------------------------------------------------
  ‚úÖ MENSALIDADE RECOMENDADA: $91.04 / m√™s
  üí∞ LTV ESPERADO (Lifetime): $486.73

‚úÖ SUCESSO: O modelo de assinatura est√° funcional!


In [5]:
# --- Etapa 4: Formatando dados de TREINO e TESTE para o d3rlpy (CORRIGIDO v2) ---
print("\n--- Etapa 4: Formatando dados de TREINO e TESTE para o d3rlpy ---")

# Defaults for the names this step exposes, set before the guard so they exist
# even when the step is skipped.
state_features, category_state_features, observation_cols_map = [], [], []
train_replay_buffer = test_replay_buffer = None
train_buffer_creation_successful = test_buffer_creation_successful = False

if not (df_train_rl.empty or df_original.empty):

    # 4.1. State features: every original column except the excluded ones.
    cols_to_exclude = {'Churn', 'MonthlyCharges', 'TotalCharges'}
    all_original_cols = set(df_original.columns)
    state_features = sorted(all_original_cols - cols_to_exclude)
    # Categorical columns that are also state features (to be one-hot encoded).
    category_state_features = sorted(set(category_cols).intersection(state_features))

    print(f"Features de Estado (State) identificadas ({len(state_features)}): {state_features}")
    print(f"Features Categ√≥ricas (para One-Hot) ({len(category_state_features)}): {category_state_features}")

    # 4.2. Função para processar DataFrame (Treino ou Teste)
    # --- CORREÇÃO AQUI: Adicionado 'is_inference=False' ---
    def process_df_for_d3rlpy(df, state_cols, category_cols_in_state, observation_cols_map=None, is_inference=False):
        """Turn a DataFrame into the NumPy arrays used to build d3rlpy episodes.

        Args:
            df: input frame; it is copied, never modified.
            state_cols: columns that make up the state.
            category_cols_in_state: subset of ``state_cols`` to one-hot encode.
            observation_cols_map: ordered list of encoded column names to align
                to. When None, the sorted encoded columns of ``df`` are used.
            is_inference: when True, only observations are produced and the
                'Price_Action' / 'Simulated_Profit_Reward' columns are not read.

        Returns:
            Tuple ``(observations, actions, rewards, terminated, observation_cols_map)``.
            All arrays are float32; actions, rewards and terminated are column
            vectors, or None in inference mode. ``terminated`` is all ones.
        """
        frame = df.copy()

        # One-hot encode the categorical state columns.
        encoded = pd.get_dummies(
            frame[state_cols],
            columns=category_cols_in_state,
            dummy_na=False,
        )

        # Align to a known column layout, or define the layout from this frame.
        if observation_cols_map is not None:
            encoded = encoded.reindex(columns=observation_cols_map, fill_value=0)
        else:
            observation_cols_map = sorted(encoded.columns)

        observations = encoded[observation_cols_map].to_numpy().astype(np.float32)

        # Inference only needs observations.
        if is_inference:
            return observations, None, None, None, observation_cols_map

        actions = frame[['Price_Action']].to_numpy().astype(np.float32)
        rewards = frame['Simulated_Profit_Reward'].to_numpy().astype(np.float32).reshape(-1, 1)
        # Every row is flagged as terminal (one flag per reward row).
        terminated = np.ones((len(rewards), 1), dtype=np.float32)
        return observations, actions, rewards, terminated, observation_cols_map
    # -----------------------------------------------------------------

    # 4.3. Função para criar ReplayBuffer
    def create_replay_buffer(observations, actions, rewards, terminated):
        """Create a d3rlpy ReplayBuffer with one single-step episode per row.

        Args:
            observations: array of observations, one row per transition.
            actions: array of actions, one row per transition.
            rewards: array of rewards, one row per transition.
            terminated: per-row terminal flags (any non-zero value means terminal).

        Returns:
            The populated ReplayBuffer, or None when construction fails. The
            failure is printed rather than raised, so callers must check for None.
        """
        try:
            buffer_size = len(rewards)

            # Mismatched lengths would otherwise slice into empty arrays and
            # produce malformed episodes without any error.
            lengths = (len(observations), len(actions), buffer_size, len(terminated))
            if len(set(lengths)) != 1:
                raise ValueError(
                    f"observations/actions/rewards/terminated lengths differ: {lengths}"
                )

            episodes = [
                Episode(
                    observations=observations[i:i+1],
                    actions=actions[i:i+1],
                    rewards=rewards[i:i+1],
                    # Pass a plain bool instead of an array slice for the
                    # episode-level terminal flag.
                    terminated=bool(np.asarray(terminated[i]).reshape(-1)[0]),
                )
                for i in range(buffer_size)
            ]

            replay_buffer = ReplayBuffer(
                buffer=FIFOBuffer(limit=buffer_size),
                episodes=episodes,
                cache_size=16
            )
            return replay_buffer
        except Exception as e:
            # Deliberate best-effort: report the problem and signal it with None.
            print(f"!!! ERRO CR√çTICO ao criar Buffer: {e}")
            return None

    # 4.4. Process the TRAIN set (is_inference keeps its default of False).
    # Passing observation_cols_map=None makes process_df_for_d3rlpy derive the
    # column layout from the training frame and return it for reuse below.
    print("\n--- Processando Conjunto de Treino ---")
    train_obs, train_act, train_rew, train_term, observation_cols_map = \
        process_df_for_d3rlpy(df_train_rl, state_features, category_state_features, observation_cols_map=None)

    print(f"N√∫mero de colunas de observa√ß√£o (features) ap√≥s one-hot: {len(observation_cols_map)}")

    # create_replay_buffer returns None on failure, so the success flag is only
    # set when a buffer object came back.
    train_replay_buffer = create_replay_buffer(train_obs, train_act, train_rew, train_term)
    if train_replay_buffer:
        print(f"SUCESSO: ReplayBuffer de TREINO pronto com {len(train_replay_buffer.buffer)} transi√ß√µes.")
        train_buffer_creation_successful = True

    # 4.5. Process the TEST set (is_inference keeps its default of False).
    # The training column layout is passed in so the test observations are
    # aligned to exactly the same columns.
    # NOTE(review): the guard of the enclosing `if` does not check df_test_rl -
    # confirm it is always defined and non-empty at this point.
    print("\n--- Processando Conjunto de Teste ---")
    test_obs, test_act, test_rew, test_term, _ = \
        process_df_for_d3rlpy(df_test_rl, state_features, category_state_features, observation_cols_map=observation_cols_map)

    test_replay_buffer = create_replay_buffer(test_obs, test_act, test_rew, test_term)
    if test_replay_buffer:
        print(f"SUCESSO: ReplayBuffer de TESTE pronto com {len(test_replay_buffer.buffer)} transi√ß√µes.")
        test_buffer_creation_successful = True

else:
    # Reached when the guard of the enclosing `if` found an empty input frame.
    print("PULANDO Etapa 4: Datasets de treino/teste est√£o vazios.")


--- Etapa 4: Formatando dados de TREINO e TESTE para o d3rlpy ---


NameError: name 'df_train_rl' is not defined

In [None]:
# --- Etapa 5: Configurando, Construindo e Treinando o Agente (CORRIGIDO v4) ---
print("\n--- Etapa 5: Configurando, Construindo e Treinando o Agente ---")

cql_pricer = None  # The agent object
model_built_successfully = agent_trained_successfully = False

if not train_buffer_creation_successful:
    print("Aviso: Treinamento PULADO. 'train_replay_buffer' n√£o foi criado.")
else:
    try:
        print("Configurando o agente CQL com QRQFunctionFactory (para A√ß√µes Cont√≠nuas)...")

        # 5.1. Agent configuration.
        cql_config = CQLConfig(
            conservative_weight=5.0,
            alpha_learning_rate=1e-4,
            n_action_samples=10,
            batch_size=256,
            q_func_factory=QRQFunctionFactory(n_quantiles=64),
        )

        # 5.2. Instantiate the agent on CUDA when available, otherwise on CPU.
        if torch.cuda.is_available():
            device_to_use_str = "cuda"
        else:
            device_to_use_str = "cpu"
        cql_pricer = cql_config.create(device=device_to_use_str)
        print(f"Agente criado e rodando em: {cql_pricer._device}")

        # 5.3. Build the networks from the shapes found in the TRAIN buffer.
        print("Construindo o agente com as assinaturas do ReplayBuffer de TREINO...")
        cql_pricer.build_with_dataset(train_replay_buffer)
        model_built_successfully = True
        print("Agente constru√≠do com sucesso.")

        # 5.4. Offline training.
        N_TRAINING_EPOCHS = 10
        N_STEPS_PER_EPOCH = 100

        print(f"Iniciando treinamento offline por {N_TRAINING_EPOCHS} √©pocas ({N_STEPS_PER_EPOCH} steps/epoch)...")

        # No evaluators are passed to fit(): 'scorers' and 'eval_dataset' were
        # dropped earlier because they raised a TypeError.
        cql_pricer.fit(
            train_replay_buffer,
            n_steps_per_epoch=N_STEPS_PER_EPOCH,
            n_steps=N_TRAINING_EPOCHS * N_STEPS_PER_EPOCH,
        )

        agent_trained_successfully = True
        print("\n--- Treinamento Conclu√≠do com Sucesso ---")

    except Exception as e:
        import traceback
        print(f"\n!!! ERRO CR√çTICO durante a Etapa 5 (Constru√ß√£o/Treinamento): {e} !!!")
        traceback.print_exc()


--- Etapa 5: Configurando, Construindo e Treinando o Agente ---
Configurando o agente CQL com QRQFunctionFactory (para A√ß√µes Cont√≠nuas)...
Agente criado e rodando em: cpu
Construindo o agente com as assinaturas do ReplayBuffer de TREINO...
Agente constru√≠do com sucesso.
Iniciando treinamento offline por 10 √©pocas (100 steps/epoch)...
[2m2025-11-05 16:27.34[0m [[32m[1minfo     [0m] [1mdataset info                  [0m [36mdataset_info[0m=[35mDatasetInfo(observation_signature=Signature(dtype=[dtype('float32')], shape=[(42,)]), action_signature=Signature(dtype=[dtype('float32')], shape=[(1,)]), reward_signature=Signature(dtype=[dtype('float32')], shape=[(1,)]), action_space=<ActionSpace.CONTINUOUS: 1>, action_size=1)[0m
[2m2025-11-05 16:27.34[0m [[32m[1minfo     [0m] [1mDirectory is created at d3rlpy_logs/CQL_20251105162734[0m
[2m2025-11-05 16:27.34[0m [[32m[1minfo     [0m] [1mParameters                    [0m [36mparams[0m=[35m{'observation_shape': [42],

Epoch 1/10:   0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-11-05 16:28.09[0m [[32m[1minfo     [0m] [1mCQL_20251105162734: epoch=1 step=100[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.008755757808685302, 'time_algorithm_update': 0.3387613415718079, 'critic_loss': 87.24948081970214, 'conservative_loss': -70.18566291809083, 'alpha': 0.994828377366066, 'actor_loss': -7.992976068854332, 'temp': 0.9967465716600418, 'temp_loss': 1.408757402896881, 'time_step': 0.3476544260978699}[0m [36mstep[0m=[35m100[0m
[2m2025-11-05 16:28.09[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/CQL_20251105162734/model_100.d3[0m


Epoch 2/10:   0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-11-05 16:28.43[0m [[32m[1minfo     [0m] [1mCQL_20251105162734: epoch=2 step=200[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009012703895568847, 'time_algorithm_update': 0.3325561237335205, 'critic_loss': -135.73632007598877, 'conservative_loss': -151.86371910095215, 'alpha': 0.9821200197935105, 'actor_loss': -1.0730998655594886, 'temp': 0.986779014468193, 'temp_loss': 1.5636797916889191, 'time_step': 0.3417008757591248}[0m [36mstep[0m=[35m200[0m
[2m2025-11-05 16:28.43[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/CQL_20251105162734/model_200.d3[0m


Epoch 3/10:   0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-11-05 16:29.18[0m [[32m[1minfo     [0m] [1mCQL_20251105162734: epoch=3 step=300[0m [36mepoch[0m=[35m3[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009009251594543457, 'time_algorithm_update': 0.3390130877494812, 'critic_loss': -176.17510330200196, 'conservative_loss': -201.58816375732422, 'alpha': 0.9680697363615036, 'actor_loss': 4.017619581222534, 'temp': 0.9773836869001389, 'temp_loss': 1.4014581656455993, 'time_step': 0.34816861152648926}[0m [36mstep[0m=[35m300[0m
[2m2025-11-05 16:29.18[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/CQL_20251105162734/model_300.d3[0m


Epoch 4/10:   0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-11-05 16:29.53[0m [[32m[1minfo     [0m] [1mCQL_20251105162734: epoch=4 step=400[0m [36mepoch[0m=[35m4[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009188096523284912, 'time_algorithm_update': 0.337014844417572, 'critic_loss': -265.91147323608396, 'conservative_loss': -323.0822149658203, 'alpha': 0.9536868917942047, 'actor_loss': 16.677198824882506, 'temp': 0.9693650352954865, 'temp_loss': 0.9296234628558159, 'time_step': 0.3463454818725586}[0m [36mstep[0m=[35m400[0m
[2m2025-11-05 16:29.53[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/CQL_20251105162734/model_400.d3[0m


Epoch 5/10:   0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-11-05 16:30.27[0m [[32m[1minfo     [0m] [1mCQL_20251105162734: epoch=5 step=500[0m [36mepoch[0m=[35m5[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.008849611282348633, 'time_algorithm_update': 0.3337511348724365, 'critic_loss': -583.5466445922851, 'conservative_loss': -734.1053744506836, 'alpha': 0.9355727994441986, 'actor_loss': 60.25308586120605, 'temp': 0.9657328498363494, 'temp_loss': -0.10506286058574915, 'time_step': 0.34273936033248903}[0m [36mstep[0m=[35m500[0m
[2m2025-11-05 16:30.27[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/CQL_20251105162734/model_500.d3[0m


Epoch 6/10:   0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-11-05 16:31.02[0m [[32m[1minfo     [0m] [1mCQL_20251105162734: epoch=6 step=600[0m [36mepoch[0m=[35m6[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00938244104385376, 'time_algorithm_update': 0.3397496676445007, 'critic_loss': -1450.58513671875, 'conservative_loss': -1838.4996704101563, 'alpha': 0.9131769669055939, 'actor_loss': 183.2165399169922, 'temp': 0.9705697011947632, 'temp_loss': -1.2670828765630722, 'time_step': 0.34927752494812014}[0m [36mstep[0m=[35m600[0m
[2m2025-11-05 16:31.02[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/CQL_20251105162734/model_600.d3[0m


Epoch 7/10:   0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-11-05 16:32.33[0m [[32m[1minfo     [0m] [1mCQL_20251105162734: epoch=7 step=700[0m [36mepoch[0m=[35m7[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009200904369354248, 'time_algorithm_update': 0.900698094367981, 'critic_loss': -3174.222282714844, 'conservative_loss': -4047.823837890625, 'alpha': 0.889368606209755, 'actor_loss': 434.2947772216797, 'temp': 0.9834701561927796, 'temp_loss': -2.2141484558582305, 'time_step': 0.9100351333618164}[0m [36mstep[0m=[35m700[0m
[2m2025-11-05 16:32.33[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/CQL_20251105162734/model_700.d3[0m


Epoch 8/10:   0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-11-05 16:34.05[0m [[32m[1minfo     [0m] [1mCQL_20251105162734: epoch=8 step=800[0m [36mepoch[0m=[35m8[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009103341102600098, 'time_algorithm_update': 0.9102189230918885, 'critic_loss': -6006.136733398437, 'conservative_loss': -7691.232084960938, 'alpha': 0.8662542647123337, 'actor_loss': 850.3849615478516, 'temp': 1.0002096778154372, 'temp_loss': -2.9471511054039, 'time_step': 0.919463529586792}[0m [36mstep[0m=[35m800[0m
[2m2025-11-05 16:34.05[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/CQL_20251105162734/model_800.d3[0m


Epoch 9/10:   0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-11-05 16:35.19[0m [[32m[1minfo     [0m] [1mCQL_20251105162734: epoch=9 step=900[0m [36mepoch[0m=[35m9[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009109461307525634, 'time_algorithm_update': 0.7249660420417786, 'critic_loss': -10101.297177734376, 'conservative_loss': -12957.51720703125, 'alpha': 0.8443763309717178, 'actor_loss': 1472.6126647949218, 'temp': 1.0179425823688506, 'temp_loss': -3.58377925157547, 'time_step': 0.734211950302124}[0m [36mstep[0m=[35m900[0m
[2m2025-11-05 16:35.19[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/CQL_20251105162734/model_900.d3[0m


Epoch 10/10:   0%|          | 0/100 [00:00<?, ?it/s]

[2m2025-11-05 16:36.22[0m [[32m[1minfo     [0m] [1mCQL_20251105162734: epoch=10 step=1000[0m [36mepoch[0m=[35m10[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.009180173873901368, 'time_algorithm_update': 0.6224578094482421, 'critic_loss': -15580.564853515625, 'conservative_loss': -20202.903984375, 'alpha': 0.8236886262893677, 'actor_loss': 2343.871702880859, 'temp': 1.0354898595809936, 'temp_loss': -4.040697162151337, 'time_step': 0.6317691874504089}[0m [36mstep[0m=[35m1000[0m
[2m2025-11-05 16:36.22[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/CQL_20251105162734/model_1000.d3[0m

--- Treinamento Conclu√≠do com Sucesso ---


In [None]:
# --- Sub-Etapa 5.5: Definição das Funções para Análise de Risco ---
print("\n--- Sub-Etapa 5.5: Definindo fun√ß√µes para an√°lise de risco (VaR, CVaR) ---")

def get_quantile_values(agent, observation_np, action_np):
    """Return the quantile estimates of the agent's first critic for one (state, action) pair.

    Args:
        agent: Algorithm object whose private ``_device`` attribute and
            ``impl._q_func_forwarder._forwarders`` list are read here.
        observation_np: Observation values; converted to a float32 tensor and
            reshaped to a single row.
        action_np: Action values; converted to a float32 tensor and reshaped
            to a single row.

    Returns:
        A 1-D ``np.ndarray`` of quantile values, or ``None`` when the agent is
        not ready (this also consults the notebook-level flags
        ``model_built_successfully`` and ``agent_trained_successfully``), when
        the expected internals are missing, when the squeezed output is not
        one-dimensional, or when any exception is raised (its traceback is
        printed).
    """
    # NOTE(review): the inputs are handed to the critic exactly as received; if
    # the agent was configured with observation/action scalers, confirm whether
    # they must be applied before this call.
    agent_ready = agent and model_built_successfully and agent_trained_successfully
    if not agent_ready:
        print("[Debug get_quantile_values]: Agente n√£o est√° pronto.")
        return None

    try:
        target_device = agent._device
        state_batch = torch.tensor(
            observation_np, dtype=torch.float32, device=target_device
        ).reshape(1, -1)
        action_batch = torch.tensor(
            action_np, dtype=torch.float32, device=target_device
        ).reshape(1, -1)

        # Relies on private attributes, so verify they exist before touching them.
        has_forwarders = (
            hasattr(agent, 'impl')
            and hasattr(agent.impl, '_q_func_forwarder')
            and agent.impl._q_func_forwarder._forwarders
        )
        if not has_forwarders:
            print("[Debug get_quantile_values]: Estrutura do agente (impl._q_func_forwarder) n√£o encontrada.")
            return None

        # Only the first forwarder's Q-function is queried.
        critic = agent.impl._q_func_forwarder._forwarders[0]._q_func
        critic.eval()  # evaluation mode; the previous mode is not restored

        with torch.no_grad():
            critic_output = critic(state_batch, action_batch)

        quantiles = critic_output.quantiles.cpu().numpy().squeeze()

        if not (isinstance(quantiles, np.ndarray) and quantiles.ndim == 1):
            print("[Debug get_quantile_values]: A sa√≠da dos quantis n√£o √© um array 1D.")
            return None
        return quantiles

    except Exception as e:
        print(f"!!! Erro inesperado em get_quantile_values: {e} !!!")
        import traceback
        traceback.print_exc()
        return None

def calculate_var(distribution_values, alpha=0.05):
    """Compute an empirical Value at Risk (VaR) from a sample of outcomes.

    The sample is sorted in ascending order and the element at position
    ``int(alpha * n)`` is returned, with the position clamped to ``[0, n - 1]``.

    Args:
        distribution_values: ``np.ndarray`` of outcomes (e.g. predicted quantiles).
        alpha: Tail level used to pick the position; defaults to 0.05.

    Returns:
        The selected element, or ``np.nan`` when the input is ``None``, is not
        an ``np.ndarray``, or is empty.
    """
    is_usable = (
        distribution_values is not None
        and isinstance(distribution_values, np.ndarray)
        and distribution_values.size != 0
    )
    if not is_usable:
        return np.nan

    ordered = np.sort(distribution_values)
    last_position = len(ordered) - 1
    # Clamp so that alpha <= 0 maps to the smallest value and alpha >= 1 to the largest.
    position = min(max(int(alpha * len(ordered)), 0), last_position)
    return ordered[position]

def calculate_cvar(distribution_values, alpha=0.05):
    """Compute an empirical Conditional Value at Risk (CVaR).

    The result is the mean of every sample value that is less than or equal to
    the VaR returned by ``calculate_var`` for the same ``alpha``.

    Args:
        distribution_values: ``np.ndarray`` of outcomes (e.g. predicted quantiles).
        alpha: Tail level forwarded to ``calculate_var``; defaults to 0.05.

    Returns:
        The tail mean, or ``np.nan`` when the input is ``None``, is not an
        ``np.ndarray``, is empty, the VaR itself is NaN, or no value falls at
        or below the VaR.
    """
    # ``isinstance`` is False for ``None``, so this also covers the missing-input case.
    if not isinstance(distribution_values, np.ndarray) or distribution_values.size == 0:
        return np.nan

    threshold = calculate_var(distribution_values, alpha)
    if np.isnan(threshold):
        return np.nan

    tail = distribution_values[distribution_values <= threshold]
    if tail.size > 0:
        return np.mean(tail)
    return np.nan

# Signal in the cell output that the risk-analysis helpers have been defined.
print("Fun√ß√µes de an√°lise de risco definidas.")


--- Sub-Etapa 5.5: Definindo fun√ß√µes para an√°lise de risco (VaR, CVaR) ---
Fun√ß√µes de an√°lise de risco definidas.


In [None]:
# --- Sub-step 5.6: Policy risk evaluation (on TEST data) ---
# For a random sample of test transitions: ask the policy for its action, read
# the critic's quantile estimates for that (state, action) pair, and summarise
# mean / VaR / CVaR. The mean estimate is then compared with the logged reward.
print("\n--- Sub-Etapa 5.6: Avalia√ß√£o de Risco da Pol√≠tica (em dados de TESTE) ---")

if agent_trained_successfully and test_buffer_creation_successful:
    SAMPLE_SIZE = min(1000, len(test_replay_buffer.buffer))
    ALPHA_RISK = 0.05

    # Distinct positions drawn over the buffer's entries (one entry per transition).
    sample_indices = np.random.choice(len(test_replay_buffer.buffer), SAMPLE_SIZE, replace=False)

    results = []
    actual_rewards_eval = []
    predicted_mean_profits_eval = []

    print(f"Processando {SAMPLE_SIZE} transi√ß√µes aleat√≥rias do ReplayBuffer de TESTE...")

    for index in sample_indices:
        try:
            # The indices were sampled over buffer entries, so they must be
            # resolved through the buffer as well. Each entry is an
            # (episode, step) pair; indexing `episodes` with a transition index
            # and always reading step 0 only works when every episode holds
            # exactly one transition.
            episode, step = test_replay_buffer.buffer[int(index)]
            obs = np.asarray(episode.observations[step])
            actual_reward = np.asarray(episode.rewards[step])[0]

            # Predict the policy's action for this observation
            optimal_action = cql_pricer.predict(obs.reshape(1, -1))[0]

            # Get the quantile distribution for (observation, action)
            predicted_quantiles = get_quantile_values(cql_pricer, obs, optimal_action)

            if predicted_quantiles is not None and predicted_quantiles.size > 0:
                mean_profit = np.mean(predicted_quantiles)
                var_value = calculate_var(predicted_quantiles, alpha=ALPHA_RISK)
                cvar_value = calculate_cvar(predicted_quantiles, alpha=ALPHA_RISK)

                results.append({
                    'predicted_price': optimal_action[0],
                    'mean_sim_profit_agent': mean_profit,
                    'VaR_5': var_value,
                    'CVaR_5': cvar_value
                })

                # Kept in lockstep with `results` so both lists line up for the MAE.
                actual_rewards_eval.append(actual_reward)
                predicted_mean_profits_eval.append(mean_profit)
        except Exception as e:
            # Best effort: a failing sample is reported and skipped.
            print(f"Erro ao processar amostra {index}: {e}")

    # 5.7. Present results
    if results:
        results_df = pd.DataFrame(results)

        print(f"\n--- Resultados Agregados da Avalia√ß√£o de Risco (Buffer de TESTE) ---")
        print(f" N√≠vel Alpha: {ALPHA_RISK*100:.1f}% | Amostras: {len(results_df)}")
        print("-" * 70)
        print(f" Pre√ßo M√©dio Recomendado pela Pol√≠tica:        ${results_df['predicted_price'].mean():,.2f}")
        print(f" Lucro Simulado M√©dio PREVISTO pelo Agente:    ${results_df['mean_sim_profit_agent'].mean():,.2f}")
        print(f" VaR (5%) M√©dio Simulado (Previsto Agente): ${results_df['VaR_5'].mean(skipna=True):,.2f}")
        print(f" CVaR (5%) M√©dio Simulado (Previsto Agente):${results_df['CVaR_5'].mean(skipna=True):,.2f}")
        print("-" * 70)

        if actual_rewards_eval:
            # Compares the critic's mean estimate with the reward logged for the
            # sampled transition.
            mae_eval = mean_absolute_error(actual_rewards_eval, predicted_mean_profits_eval)
            avg_actual_profit_buffer = np.mean(actual_rewards_eval)
            print(f"\n--- M√©tricas de PRECIS√ÉO da Previs√£o M√©dia do Agente (vs Buffer TESTE) ---")
            print(f" Lucro Simulado M√©dio REAL no Buffer Avaliado: ${avg_actual_profit_buffer:,.2f}")
            print(f" MAE (Erro M√©dio Absoluto da Previs√£o M√©dia):  ${mae_eval:,.2f}")
            print("-" * 70)
    else:
        print("Nenhum resultado de avalia√ß√£o de risco foi gerado.")

else:
    print("Avalia√ß√£o de Risco PULADA. Agente ou buffer de teste n√£o est√£o prontos.")


--- Sub-Etapa 5.6: Avalia√ß√£o de Risco da Pol√≠tica (em dados de TESTE) ---
Processando 1000 transi√ß√µes aleat√≥rias do ReplayBuffer de TESTE...

--- Resultados Agregados da Avalia√ß√£o de Risco (Buffer de TESTE) ---
 N√≠vel Alpha: 5.0% | Amostras: 1000
----------------------------------------------------------------------
 Pre√ßo M√©dio Recomendado pela Pol√≠tica:        $1.00
 Lucro Simulado M√©dio PREVISTO pelo Agente:    $-2,649.53
 VaR (5%) M√©dio Simulado (Previsto Agente): $-7,438.05
 CVaR (5%) M√©dio Simulado (Previsto Agente):$-8,913.42
----------------------------------------------------------------------

--- M√©tricas de PRECIS√ÉO da Previs√£o M√©dia do Agente (vs Buffer TESTE) ---
 Lucro Simulado M√©dio REAL no Buffer Avaliado: $10.12
 MAE (Erro M√©dio Absoluto da Previs√£o M√©dia):  $2,659.65
----------------------------------------------------------------------


In [None]:
# --- Step 6: Generating specific price recommendations (FIXED) ---
print("\n--- Etapa 6: Gerando Recomenda√ß√µes de Pre√ßo Espec√≠ficas ---")

# Global variables required by this cell (from Step 4):
# state_features, observation_cols_map, category_state_features

def get_price_recommendation(**scenario_kwargs):
    """
    Print a price recommendation, with risk figures, for one customer scenario.

    Keyword arguments whose names match an entry of the global
    ``state_features`` override that feature's default; any other keyword is
    ignored when the scenario is built (it is still echoed in the printed
    scenario). Defaults come from ``df_original``: the mode for ``object``
    columns, the median otherwise, and ``0`` for features that are not columns
    of ``df_original``.

    All results are reported through ``print``; the function returns ``None``.
    """
    if (not agent_trained_successfully
            or 'observation_cols_map' not in globals()
            or not observation_cols_map):
        print("Erro: Agente n√£o treinado ou 'observation_cols_map' n√£o definido.")
        return

    # 1. Build the scenario row, starting from a default for every state feature
    scenario_row = {}
    for feature in state_features:
        if feature not in df_original.columns:
            scenario_row[feature] = 0
            continue
        column = df_original[feature]
        if column.dtype == 'object':
            scenario_row[feature] = column.mode()[0]
        else:
            scenario_row[feature] = column.median()

    # Apply only the overrides that name a known feature.
    overrides = {name: value for name, value in scenario_kwargs.items() if name in scenario_row}
    scenario_row.update(overrides)

    scenario_df = pd.DataFrame([scenario_row])

    # 2. Process the scenario (one-hot encoding and column alignment)
    try:
        scenario_obs, _, _, _, _ = process_df_for_d3rlpy(
            scenario_df,
            state_features,
            category_state_features,
            observation_cols_map=observation_cols_map,
            is_inference=True,  # tells the function not to look for 'Price_Action'
        )
        observation = scenario_obs.reshape(1, -1)

    except Exception as e:
        print(f"Erro ao processar o cen√°rio: {e}")
        return

    # 3. Predict the action (price) and its risk
    try:
        recommended_price = cql_pricer.predict(observation)[0]

        print(f"\nCen√°rio:")
        print(json.dumps(scenario_kwargs, indent=2))
        print(f"  => Pre√ßo Recomendado: ${recommended_price[0]:.2f}")

        # 4. Compute risk figures (VaR/CVaR) from the critic's quantiles
        predicted_quantiles = get_quantile_values(cql_pricer, observation, recommended_price)

        if predicted_quantiles is None or predicted_quantiles.size == 0:
            print("     (N√£o foi poss√≠vel calcular VaR/CVaR para esta recomenda√ß√£o)")
        else:
            mean_profit = np.mean(predicted_quantiles)
            var_5 = calculate_var(predicted_quantiles, alpha=0.05)
            cvar_5 = calculate_cvar(predicted_quantiles, alpha=0.05)
            print(f"     Lucro M√©dio Previsto: ${mean_profit:.2f}")
            print(f"     VaR (5%): ${var_5:.2f} | CVaR (5%): ${cvar_5:.2f}")
        print("-" * 30)

    except Exception as e:
        print(f"Erro durante a predi√ß√£o para o cen√°rio: {e}")

# --- Example recommendations ---
# Each entry is one set of feature overrides passed to get_price_recommendation.
if not agent_trained_successfully:
    print("Recomenda√ß√µes PULADAS. Agente n√£o treinado.")
else:
    print("Gerando recomenda√ß√µes de exemplo...")
    example_scenarios = (
        dict(
            SubscriptionType='Standard',
            Gender='Female',
            DeviceRegistered='Tablet',
            ViewingHoursPerWeek=40,
        ),
        dict(
            SubscriptionType='Basic',
            Gender='Male',
            DeviceRegistered='Mobile',
            AccountAge=5,  # low account age
        ),
        dict(
            SubscriptionType='Premium',
            ViewingHoursPerWeek=50,
            ContentDownloadsPerMonth=30,
            UserRating=4.5,
        ),
    )
    for example_scenario in example_scenarios:
        get_price_recommendation(**example_scenario)


--- Etapa 6: Gerando Recomenda√ß√µes de Pre√ßo Espec√≠ficas ---
Gerando recomenda√ß√µes de exemplo...

Cen√°rio:
{
  "SubscriptionType": "Standard",
  "Gender": "Female",
  "DeviceRegistered": "Tablet",
  "ViewingHoursPerWeek": 40
}
  => Pre√ßo Recomendado: $1.00
     Lucro M√©dio Previsto: $-2941.21
     VaR (5%): $-8269.86 | CVaR (5%): $-9912.32
------------------------------

Cen√°rio:
{
  "SubscriptionType": "Basic",
  "Gender": "Male",
  "DeviceRegistered": "Mobile",
  "AccountAge": 5
}
  => Pre√ßo Recomendado: $1.00
     Lucro M√©dio Previsto: $-2394.98
     VaR (5%): $-6443.26 | CVaR (5%): $-7693.88
------------------------------

Cen√°rio:
{
  "SubscriptionType": "Premium",
  "ViewingHoursPerWeek": 50,
  "ContentDownloadsPerMonth": 30,
  "UserRating": 4.5
}
  => Pre√ßo Recomendado: $1.00
     Lucro M√©dio Previsto: $-3246.75
     VaR (5%): $-9107.43 | CVaR (5%): $-10914.38
------------------------------


In [None]:
# --- Step 9: Comparison with Supervised Learning (SL regressor) ---
# Trains a LightGBM regressor on (observation features + logged price) to
# predict the simulated profit, then reports MAE and R^2 on the test split.
print("\n--- Etapa 9: Iniciando Compara√ß√£o com Supervised Learning (LGBM) ---")

if not df_rl_training.empty and 'observation_cols_map' in globals():

    print("Formatando dados para o modelo de SL...")

    # Re-process df_train_rl into the observation matrix
    df_train_sl_X, _, _, _, _ = process_df_for_d3rlpy(
        df_train_rl, state_features, category_state_features, observation_cols_map
    )
    df_train_sl_X = pd.DataFrame(df_train_sl_X, columns=observation_cols_map)
    # The frame above has a fresh 0..n-1 index. Assigning a Series would align
    # on index labels, so a split frame that kept its original index would have
    # prices attached to the wrong rows or turned into NaN. Assign by position.
    df_train_sl_X['Price_Action'] = df_train_rl['Price_Action'].to_numpy()

    y_train_sl = df_train_rl['Simulated_Profit_Reward']

    # Same for the test split
    df_test_sl_X, _, _, _, _ = process_df_for_d3rlpy(
        df_test_rl, state_features, category_state_features, observation_cols_map
    )
    df_test_sl_X = pd.DataFrame(df_test_sl_X, columns=observation_cols_map)
    # Positional assignment, for the same reason as in the training split.
    df_test_sl_X['Price_Action'] = df_test_rl['Price_Action'].to_numpy()

    y_test_sl = df_test_rl['Simulated_Profit_Reward']

    print(f"Dados SL divididos em {len(df_train_sl_X)} para treino e {len(df_test_sl_X)} para teste.")

    # 9.2. Train the LightGBM regressor
    print("\nTreinando o modelo LightGBM para prever o lucro...")

    lgbm_regressor = lgb.LGBMRegressor(random_state=42, n_estimators=200)
    lgbm_regressor.fit(df_train_sl_X, y_train_sl)
    print("Modelo SL (Regressor) treinado com sucesso!")

    # 9.3. Evaluate the regressor's accuracy
    print("\nAvaliando a precis√£o do modelo SL no conjunto de teste...")
    y_pred_sl = lgbm_regressor.predict(df_test_sl_X)

    mae_sl = mean_absolute_error(y_test_sl, y_pred_sl)
    r2_sl = r2_score(y_test_sl, y_pred_sl)

    print(f"----------- M√âTRICAS DE PRECIS√ÉO (Regress√£o do Lucro) -----------")
    print(f"Erro M√©dio Absoluto (MAE): ${mae_sl:,.2f}")
    print(f"R-quadrado (R¬≤): {r2_sl:.2%}")
    print("----------------------------------------------------------------")
    print("Nota: Isto mede o qu√£o bem o LGBM 'decorou' a fun√ß√£o de simula√ß√£o.")

else:
    # NOTE(review): the guard checks `df_rl_training`, while the body uses
    # `df_train_rl` / `df_test_rl`; confirm those are always derived from it.
    print("PULANDO Etapa 9: Dataset 'df_rl_training' est√° vazio ou mapa de colunas n√£o foi criado.")


--- Etapa 9: Iniciando Compara√ß√£o com Supervised Learning (LGBM) ---
Formatando dados para o modelo de SL...
Dados SL divididos em 38520 para treino e 9630 para teste.

Treinando o modelo LightGBM para prever o lucro...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011862 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1294
[LightGBM] [Info] Number of data points in the train set: 38520, number of used features: 43
[LightGBM] [Info] Start training from score 9.997657
Modelo SL (Regressor) treinado com sucesso!

Avaliando a precis√£o do modelo SL no conjunto de teste...
----------- M√âTRICAS DE PRECIS√ÉO (Regress√£o do Lucro) -----------
Erro M√©dio Absoluto (MAE): $0.01
R-quadrado (R¬≤): 100.00%
----------------------------------------------------------------
Nota: Isto mede o qu√£o bem o LGBM 'decorou' a fun√ß√£o de simula√ß√£o.


In [None]:
# --- Step 10: Saving the model and its companion artifacts ---
print("\n--- Etapa 10: Salvando o Modelo e Componentes ---")

if not agent_trained_successfully:
    # Nothing to persist when training did not complete.
    print("Salvamento PULADO. Agente n√£o foi treinado.")
else:
    try:
        # 1) The trained agent, via its save_model method.
        cql_pricer.save_model('modelo_rl_churn_pricer.pt')
        print("Modelo salvo como 'modelo_rl_churn_pricer.pt'")

        # 2) The observation column layout, as JSON.
        with open('colunas_observacao_churn.json', mode='w') as f:
            json.dump(observation_cols_map, fp=f)
        print("Colunas de observa√ß√£o salvas como 'colunas_observacao_churn.json'")

        # 3) The product tier configuration, as JSON.
        with open('config_tiers_churn.json', mode='w') as f:
            json.dump(product_tiers, fp=f)
        print("Configura√ß√£o de Tiers salva como 'config_tiers_churn.json'")

    except Exception as e:
        # Any failure aborts the remaining saves and is reported, not raised.
        print(f"Erro ao salvar arquivos: {e}")

print("\n--- FIM DO SCRIPT ---")


--- Etapa 10: Salvando o Modelo e Componentes ---
Modelo salvo como 'modelo_rl_churn_pricer.pt'
Colunas de observa√ß√£o salvas como 'colunas_observacao_churn.json'
Configura√ß√£o de Tiers salva como 'config_tiers_churn.json'

--- FIM DO SCRIPT ---
