In [5]:
#Importamos las librerías que vamos a usar
import os
from pathlib import Path
import json
import pickle
from dr import *
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt

# Reproducibility: seed the torch and numpy global RNGs.
torch.manual_seed(42)
np.random.seed(42)

# General configuration
DATASET = "netflix"
NUM_ITEMS = 752
NUM_GROUPS = 8
CONTEXT_LENGTH = 20
# Context length passed to the model constructor below.
# NOTE(review): this differs from CONTEXT_LENGTH above (25 vs 20). It is
# presumably dictated by the checkpoint being loaded -- confirm before
# unifying the two values.
MODEL_CONTEXT_LENGTH = 25

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Paths (resolved relative to the parent of the current working directory)
root = Path("..").resolve()
processed_path = root / "data" / "processed" / "trajectories_train.pkl"
test_path = root / "data" / "test_users" / "netflix8_test.json"

# Load the training trajectories.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open(processed_path, "rb") as f:
    train_trajectories = pickle.load(f)

# Load the test users.
with open(test_path, "r") as f:
    test_users = json.load(f)

# Instantiate the model with the hyper-parameters supplied alongside the
# checkpoint. The item/group counts reuse the configuration constants so the
# values live in a single place.
model = DecisionTransformer(
    num_items=NUM_ITEMS,
    num_groups=NUM_GROUPS,
    hidden_dim=512,
    n_layers=2,
    n_heads=4,
    context_length=MODEL_CONTEXT_LENGTH,
    max_timestep=200,
    dropout=0.1,
)

# Load the trained weights.
# The checkpoint path is relative to the current working directory; adjust it
# if the file lives in another folder.
ckpt_path = Path("final_decision_transformer_model3.pt")
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted
# source (or pass weights_only=True on PyTorch versions that support it).
state_dict = torch.load(ckpt_path, map_location=DEVICE)
# With the default strict=True this raises if the checkpoint keys do not match
# the architecture, and otherwise reports <All keys matched successfully>.
model.load_state_dict(state_dict)

# Prepare for inference: move to the target device and switch to eval mode.
model.to(DEVICE)
model.eval()

print("Modelo cargado desde:", ckpt_path)

Modelo cargado desde: final_decision_transformer_model3.pt


In [6]:
import torch
import numpy as np
from math import log2

def _build_eval_windows(test_data, context_len, target_return):
    """Slice every user's history into fixed-length evaluation windows.

    For each user and each position ``t >= context_len`` one sample is
    produced: the ``context_len`` items preceding ``t`` form the history and
    ``items[t]`` is the item to predict. Users with at most ``context_len``
    items contribute no samples.

    Returns:
        Four parallel lists: item-id histories, one return-to-go value per
        window, group ids and target item ids.
    """
    histories, rtg_values, group_ids, target_items = [], [], [], []

    for user in test_data:
        group = int(user['group'])
        items = user['items']
        ratings = user['ratings']

        for t in range(context_len, len(items)):
            window = slice(t - context_len, t)
            histories.append(items[window])
            # Without an explicit target, condition on the ratings actually
            # observed inside the window.
            if target_return is None:
                rtg_values.append(sum(ratings[window]))
            else:
                rtg_values.append(float(target_return))
            group_ids.append(group)
            target_items.append(items[t])

    return histories, rtg_values, group_ids, target_items


@torch.no_grad()
def evaluate_model_batched(
    model,
    test_data,
    device,
    target_return=None,
    k_list=(5, 10, 20),
    context_len=20,
    eval_batch_size=1024
):
    """
    Batched next-item evaluation (HR@K, NDCG@K, MRR) over sliding windows.

    Every user history is cut into windows of ``context_len`` items; the model
    is asked to score the item that follows each window. All three metrics are
    derived from a single rank per sample, so they always agree with each
    other. Ties are resolved pessimistically (items scoring the same as the
    target are counted as ranked above it), which keeps a model that emits
    constant or NaN scores from obtaining inflated metrics. When scores have
    no ties this equals the usual "1 + number of strictly better items" rank.

    Args:
        model: callable invoked as ``model(states, actions, rtg, timesteps,
            groups)``; its output is indexed as ``[:, -1, :]`` to obtain one
            score per item for each sample. It is switched to eval mode and
            left in that mode.
        test_data: iterable of dicts with keys ``'group'``, ``'items'`` and
            ``'ratings'``.
        device: torch.device on which the tensors are created.
        target_return: value used as return-to-go for every step; if None the
            sum of the ratings inside each window is used.
        k_list: iterable of cut-offs K. Values larger than the number of items
            are allowed (HR@K then counts every sample as a hit).
        context_len: number of history items fed to the model (>= 1).
        eval_batch_size: number of windows per forward pass (>= 1).

    Returns:
        dict mapping ``'HR@{k}'`` and ``'NDCG@{k}'`` for each k, plus
        ``'MRR'``, to Python floats. All values are 0.0 when no user is longer
        than ``context_len``.

    Raises:
        ValueError: if ``context_len`` or ``eval_batch_size`` is smaller
            than 1.
    """
    if context_len < 1:
        raise ValueError(f"context_len must be >= 1, got {context_len}")
    if eval_batch_size < 1:
        raise ValueError(f"eval_batch_size must be >= 1, got {eval_batch_size}")

    # Materialise once: k_list is iterated several times below.
    k_list = tuple(k_list)

    model.eval()

    # 1) Build the window dataset on the CPU.
    histories, rtg_values, group_ids, target_items = _build_eval_windows(
        test_data, context_len, target_return
    )

    # No valid sample: report zeros using the same key order as the normal path.
    if not histories:
        empty_result = {}
        for k in k_list:
            empty_result[f'HR@{k}'] = 0.0
            empty_result[f'NDCG@{k}'] = 0.0
        empty_result['MRR'] = 0.0
        return empty_result

    # 2) Move everything to the device once.
    states = torch.tensor(histories, dtype=torch.long, device=device)       # (N, context_len)
    actions = states.clone()                                                # actions mirror the states
    rtg = (
        torch.tensor(rtg_values, dtype=torch.float32, device=device)
        .view(-1, 1, 1)
        .repeat(1, context_len, 1)                                          # (N, context_len, 1), same value at every step
    )
    groups = torch.tensor(group_ids, dtype=torch.long, device=device)       # (N,)
    targets = torch.tensor(target_items, dtype=torch.long, device=device)   # (N,)

    num_samples = states.size(0)

    # Every window uses the positions 0..context_len-1 as timesteps.
    timesteps_single = torch.arange(context_len, dtype=torch.long, device=device).unsqueeze(0)

    # 3) Metric accumulators, kept on the device in float64.
    hr_sums = torch.zeros(len(k_list), device=device, dtype=torch.float64)
    ndcg_sums = torch.zeros(len(k_list), device=device, dtype=torch.float64)
    mrr_sum = torch.tensor(0.0, device=device, dtype=torch.float64)
    total = 0

    # 4) Mini-batched inference.
    for start in range(0, num_samples, eval_batch_size):
        end = min(start + eval_batch_size, num_samples)
        batch_size = end - start

        target_batch = targets[start:end]                       # (b,)
        ts_batch = timesteps_single.expand(batch_size, -1)      # (b, context_len)

        logits = model(states[start:end], actions[start:end], rtg[start:end], ts_batch, groups[start:end])
        preds = logits[:, -1, :]                                # (b, num_items)
        num_items = preds.size(1)

        row_idx = torch.arange(batch_size, device=device)
        target_scores = preds[row_idx, target_batch]            # (b,)

        # Pessimistic rank: everything that is not strictly worse than the
        # target is ranked at or above it. A NaN target score compares False
        # everywhere and therefore lands on the worst rank (num_items).
        worse_count = (preds < target_scores.unsqueeze(1)).sum(dim=1)
        ranks = num_items - worse_count                         # (b,), values in 1..num_items
        ranks_f = ranks.to(torch.float64)

        mrr_sum += (1.0 / ranks_f).sum()

        # Single relevant item => IDCG is 1 and NDCG@K equals the DCG gain
        # 1 / log2(rank + 1) whenever the target is inside the top K.
        gains = 1.0 / torch.log2(ranks_f + 1.0)
        for i, k in enumerate(k_list):
            in_top_k = (ranks <= k).to(torch.float64)
            hr_sums[i] += in_top_k.sum()
            ndcg_sums[i] += (gains * in_top_k).sum()

        total += batch_size

    # 5) Final averages, transferred to Python floats.
    total = float(total)
    result = {}
    for i, k in enumerate(k_list):
        result[f'HR@{k}'] = float((hr_sums[i] / total).item())
        result[f'NDCG@{k}'] = float((ndcg_sums[i] / total).item())
    result['MRR'] = float((mrr_sum / total).item())

    return result


In [8]:
# Score the loaded model on the test users, leaving K values, window length
# and batch size at the function's defaults.
evaluate_model_batched(model=model, test_data=test_users, device=DEVICE)

{'HR@5': 0.006830601092896175,
 'NDCG@5': 0.004585265277431963,
 'HR@10': 0.014343408469945355,
 'NDCG@10': 0.00696252257385052,
 'HR@20': 0.030364583333333334,
 'NDCG@20': 0.010931653772058382,
 'MRR': 0.010711217664413367}