In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!wget --no-check-certificate https://data.recsys.synerise.com/dataset/ubc_data/ubc_data.tar.gz

In [None]:
import tarfile

# Path to your tar.gz file
tar_path = "/kaggle/working/ubc_data.tar.gz"

# Extract files
with tarfile.open(tar_path, "r:gz") as tar:
    tar.extractall("ubc_data_extracted")  # creates folder with extracted files
    print("Files extracted to 'ubc_data_extracted'")
    print("Contents:", tar.getnames())  # list of files inside


In [None]:
"""
RecSys RL Recommender (Double-DQN) with Propensity Integration
---------------------------------------------------------------
This single-file pipeline:
- Loads Parquet event files and target propensity `.npy` arrays from an extracted folder (e.g., 'ubc_data_extracted')
- Performs exploration and cleaning
- Builds sessionized episodes
- Constructs stable feature vectors for (state, candidate)
- Integrates product propensity (popularity propensity) as a feature and supports re-ranking by propensity
- Implements a dense-reward environment
- Trains a Double + Dueling DQN policy (policy + target nets)
- Evaluates offline with HR@K, NDCG@K, and reward/session

Notes:
- Edit DATA_DIR to point to the folder where you extracted ubc_data.tar.gz (folder that contains the parquet files and target/ folder).
- This file assumes parquet and numpy files are present as described previously.
- Use USE_SYNTHETIC=True for a small in-memory test instead of reading files.

Author: ChatGPT (GPT-5 Thinking mini)
"""

from __future__ import annotations
import os
import math
import json
import time
from dataclasses import dataclass
from typing import Dict, List, Tuple, Optional, Any

import numpy as np
import pandas as pd
from collections import defaultdict, Counter, deque

import torch
import torch.nn as nn
import torch.nn.functional as F

# -----------------------------
# Config
# -----------------------------
USE_SYNTHETIC = False
DATA_DIR = "/kaggle/working/ubc_data_extracted"  # <- set this to your extracted folder path
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Embedding dims
EMB_DIM_ITEM = 128
EMB_DIM_QUERY = 128
EMB_DIM_URL = 64

# Sessionization
SESSION_GAP_MIN = 30
MAX_STEPS_PER_SESSION = 20

# Candidate gen
CANDIDATE_POOL_SIZE = 200

# Reward weights (dense reward)
W_BUY = 5.0
W_A2C = 2.0
W_RMC = 1.0
W_CAT = 0.5
W_PV = 0.2
W_QRY = 0.2
W_DIV = 0.1
W_REP = 0.2
TIME_DECAY_TAU_SEC = 60.0
GAMMA = 0.9

# Training
BATCH_SIZE = 128
REPLAY_CAP = 200_000
LR = 1e-3
EPSILON_START = 0.2
EPSILON_END = 0.05
EPSILON_DECAY_STEPS = 50_000
TARGET_UPDATE_FREQ = 2_000
TRAIN_STEPS = 50_000
EVAL_EVERY = 5_000
TOPK = 10

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# -----------------------------
# Utilities
# -----------------------------

def parse_ts(s: Any) -> pd.Timestamp:
    return pd.to_datetime(s, utc=True, errors='coerce')

class HashVectorizer:
    def __init__(self, dim: int, seed: int = 42):
        self.dim = dim
        self.seed = seed
    def _bytes(self, x: Any) -> bytes:
        if isinstance(x, bytes): return x
        if isinstance(x, (list, tuple)): return json.dumps(x, sort_keys=True).encode('utf-8')
        return str(x).encode('utf-8')
    def transform_one(self, x: Any) -> np.ndarray:
        b = self._bytes(x)
        rs = np.random.RandomState(abs(hash((b, self.seed))) % (2**32))
        v = rs.normal(0, 1.0, size=self.dim)
        v /= (np.linalg.norm(v) + 1e-8)
        return v.astype(np.float32)

item_vec = HashVectorizer(EMB_DIM_ITEM)
query_vec = HashVectorizer(EMB_DIM_QUERY)
url_vec = HashVectorizer(EMB_DIM_URL)

def decode_embedding(obj: Any, kind: str = "item") -> np.ndarray:
    if kind == "item": return item_vec.transform_one(obj)
    if kind == "query": return query_vec.transform_one(obj)
    if kind == "url": return url_vec.transform_one(obj)
    raise ValueError("unknown kind")

# -----------------------------
# Data loading
# -----------------------------

def load_parquets_and_targets(data_dir: str) -> Dict[str, Any]:
    """Load parquet event files and target propensity numpy arrays.
    Expects files described earlier.
    """
    if USE_SYNTHETIC:
        return make_synth_data_for_pipeline()

    def rp(fname):
        p = os.path.join(data_dir, fname)
        if not os.path.exists(p):
            raise FileNotFoundError(p)
        return pd.read_parquet(p)

    data = {}
    data['product_properties'] = rp('product_properties.parquet')
    data['product_buy'] = rp('product_buy.parquet')
    data['add_to_cart'] = rp('add_to_cart.parquet')
    data['remove_from_cart'] = rp('remove_from_cart.parquet')
    data['page_visit'] = rp('page_visit.parquet')
    data['search_query'] = rp('search_query.parquet')

    # load numpy targets if present
    targ_dir = os.path.join(data_dir, 'target')
    if os.path.isdir(targ_dir):
        def ln(p):
            f = os.path.join(p)
            if os.path.exists(f):
                return np.load(f, allow_pickle=True)
            return None
        data['propensity_sku'] = None
        data['propensity_category'] = None
        sku_p = os.path.join(targ_dir, 'popularity_propensity_sku.npy')
        cat_p = os.path.join(targ_dir, 'popularity_propensity_category.npy')
        if os.path.exists(sku_p):
            data['propensity_sku'] = np.load(sku_p, allow_pickle=True) if os.path.getsize(sku_p)>0 else None
        if os.path.exists(cat_p):
            data['propensity_category'] = np.load(cat_p, allow_pickle=True) if os.path.getsize(cat_p)>0 else None
    else:
        data['propensity_sku'] = None
        data['propensity_category'] = None

    return data

# -----------------------------
# (Optional) Synthetic data helper
# -----------------------------

def make_synth_data_for_pipeline():
    # For brevity return minimal synthetic datasets similar to earlier function
    n_users=200; n_items=1000
    rng = np.random.RandomState(RANDOM_SEED)
    prod = pd.DataFrame({'sku': np.arange(n_items), 'category': rng.randint(0,20,n_items), 'price': rng.randint(0,10,n_items), 'embedding': [f'pq_{i}' for i in range(n_items)]})
    rows_buy=[]; rows_a2c=[]; rows_rmc=[]; rows_pv=[]; rows_qry=[]
    base = pd.Timestamp('2025-01-01', tz='UTC')
    for u in range(n_users):
        t = base
        for s in range(rng.randint(3,10)):
            t += pd.Timedelta(minutes=rng.randint(1,60))
            if rng.rand()<0.5:
                sku = rng.randint(0,n_items)
                rows_a2c.append({'client_id':u,'timestamp':t,'sku':sku})
                if rng.rand()<0.5:
                    rows_buy.append({'client_id':u,'timestamp':t+pd.Timedelta(minutes=1),'sku':sku})
            if rng.rand()<0.3:
                rows_pv.append({'client_id':u,'timestamp':t,'url':rng.randint(0,200)})
            if rng.rand()<0.2:
                rows_qry.append({'client_id':u,'timestamp':t,'query':f'q_{rng.randint(0,50)}'})
    return {
        'product_properties': prod,
        'product_buy': pd.DataFrame(rows_buy),
        'add_to_cart': pd.DataFrame(rows_a2c),
        'remove_from_cart': pd.DataFrame(rows_rmc),
        'page_visit': pd.DataFrame(rows_pv),
        'search_query': pd.DataFrame(rows_qry),
        'propensity_sku': {i: float(0.01 + (i%50)/100.0) for i in range(prod.shape[0])},
        'propensity_category': {c: 0.05 + (c%10)/100.0 for c in range(20)}
    }

# -----------------------------
# Cleaning & sessionization
# -----------------------------

def clean_and_sessionize(data: Dict[str, Any]) -> pd.DataFrame:
    prod = data['product_properties'].copy()
    prod['sku'] = prod['sku'].astype('int64')
    prod['category'] = prod['category'].astype('int64')
    prod['price'] = prod['price'].astype('int64')
    data['product_properties'] = prod

    for k in ['product_buy','add_to_cart','remove_from_cart']:
        df = data[k].copy()
        df['client_id'] = df['client_id'].astype('int64')
        df['sku'] = df['sku'].astype('int64')
        df['timestamp'] = df['timestamp'].apply(parse_ts)
        data[k] = df.sort_values(['client_id','timestamp']).reset_index(drop=True)

    pv = data['page_visit'].copy()
    pv['client_id'] = pv['client_id'].astype('int64')
    pv['url'] = pv['url'].astype('int64')
    pv['timestamp'] = pv['timestamp'].apply(parse_ts)
    data['page_visit'] = pv.sort_values(['client_id','timestamp']).reset_index(drop=True)

    sq = data['search_query'].copy()
    sq['client_id'] = sq['client_id'].astype('int64')
    sq['timestamp'] = sq['timestamp'].apply(parse_ts)
    data['search_query'] = sq.sort_values(['client_id','timestamp']).reset_index(drop=True)

    # unify
    def add(df, etype):
        if df is None or df.shape[0]==0:
            return pd.DataFrame()
        df2 = df.copy()
        df2['event_type'] = etype
        return df2
    dfs = [add(data.get('product_buy'), 'buy'), add(data.get('add_to_cart'),'a2c'), add(data.get('remove_from_cart'),'rmc'), add(data.get('page_visit'),'pv'), add(data.get('search_query'),'qry')]
    allv = pd.concat(dfs, ignore_index=True, sort=False).sort_values(['client_id','timestamp']).reset_index(drop=True)

    gap = pd.Timedelta(minutes=SESSION_GAP_MIN)
    allv['prev_ts'] = allv.groupby('client_id')['timestamp'].shift(1)
    new_sess = (allv['prev_ts'].isna()) | ((allv['timestamp'] - allv['prev_ts']) > gap)
    allv['session_id'] = new_sess.groupby(allv['client_id']).cumsum()
    allv['step'] = allv.groupby(['client_id','session_id']).cumcount()
    allv = allv.drop(columns=['prev_ts'])
    return allv

# -----------------------------
# URL->category co-occurrence
# -----------------------------

def build_url_category_model(pv: pd.DataFrame, item_events: pd.DataFrame, prod: pd.DataFrame, window_min: int = 10) -> Dict[int, Dict[int, float]]:
    if pv.empty or item_events.empty:
        return {}
    window = pd.Timedelta(minutes=window_min)
    prod_cat = prod.set_index('sku')['category'].to_dict()
    counts = defaultdict(Counter)
    item_by_client = item_events.groupby('client_id')
    for client, gpv in pv.groupby('client_id'):
        if client not in item_by_client.groups: continue
        gi = item_by_client.get_group(client).reset_index(drop=True)
        i_idx = 0
        for _, row in gpv.iterrows():
            t = row['timestamp']
            while i_idx < len(gi) and gi.loc[i_idx, 'timestamp'] < t:
                i_idx += 1
            j = i_idx
            while j < len(gi) and gi.loc[j, 'timestamp'] <= t + window:
                sku = int(gi.loc[j, 'sku']) if 'sku' in gi.columns and not pd.isna(gi.loc[j,'sku']) else None
                if sku is not None and sku in prod_cat:
                    counts[int(row['url'])][int(prod_cat[sku])] += 1
                j += 1
    model = {}
    for url, c in counts.items():
        total = sum(c.values())
        if total==0: continue
        model[url] = {cat: cnt/total for cat,cnt in c.items()}
    return model

# -----------------------------
# Master items table
# -----------------------------

def build_item_table(prod: pd.DataFrame, propensity_sku: Optional[Dict[int,float]]=None, propensity_cat: Optional[Dict[int,float]]=None):
    items = prod[['sku','category','price','name']].copy()
    items['emb_vec'] = items['name'].apply(lambda x: decode_embedding(x,'item'))
    mat = np.vstack(items['emb_vec'].values)
    norms = np.linalg.norm(mat, axis=1, keepdims=True)+1e-8
    mat = mat / norms
    items['emb_vec'] = list(mat)

    if propensity_sku is not None:
        items['propensity_sku'] = items['sku'].map(lambda s: float(propensity_sku.get(int(s), 1e-3)))
    else:
        items['propensity_sku'] = 0.01
    if propensity_cat is not None:
        items['propensity_category'] = items['category'].map(lambda c: float(propensity_cat.get(int(c), 0.01)))
    else:
        items['propensity_category'] = 0.01
    return items

# -----------------------------
# User long-term embeddings
# -----------------------------

def build_user_long_term(allv: pd.DataFrame, items: pd.DataFrame) -> Dict[int, np.ndarray]:
    sku2vec = items.set_index('sku')['emb_vec'].to_dict()
    out = {}
    for uid, g in allv[allv['event_type'].isin(['buy','a2c'])].groupby('client_id'):
        vecs = [sku2vec[int(s)] for s in g['sku'].values if int(s) in sku2vec]
        if len(vecs)==0:
            out[int(uid)] = np.zeros(EMB_DIM_ITEM, dtype=np.float32)
        else:
            v = np.vstack(vecs).mean(axis=0)
            v /= (np.linalg.norm(v)+1e-8)
            out[int(uid)] = v.astype(np.float32)
    return out

# -----------------------------
# Candidate generator
# -----------------------------
class CandidateGenerator:
    def __init__(self, items: pd.DataFrame, popular_skus: np.ndarray):
        self.items = items
        self.popular = popular_skus
        self.item_mat = np.vstack(items['emb_vec'].values)
    def ann(self, q: np.ndarray, k: int) -> List[int]:
        qn = q / (np.linalg.norm(q)+1e-8)
        sims = (self.item_mat @ qn)
        idx = np.argpartition(-sims, kth=min(k, len(sims)-1))[:k]
        idx = idx[np.argsort(-sims[idx])]
        return [int(self.items.iloc[i]['sku']) for i in idx]
    def get_candidates(self, user_vec: np.ndarray, last_cat: Optional[int], exclude: set[int], k: int = CANDIDATE_POOL_SIZE) -> List[int]:
        cands = []
        if np.linalg.norm(user_vec) > 0:
            cands += self.ann(user_vec, k//2)
        for s in self.popular:
            if len(cands) >= k: break
            cands.append(int(s))
        out=[]; seen=set()
        for s in cands:
            if s in exclude or s in seen: continue
            seen.add(s); out.append(s)
            if len(out) >= k: break
        return out

# -----------------------------
# Feature builder (includes propensity)
# -----------------------------
FEATURE_DIM = 2*EMB_DIM_ITEM + 6  # user_lt + sess_vec + [sim_user, sim_sess, sim_q, div, propensity_sku, propensity_cat]
@dataclass
class SessionState:
    user_id: int
    session_id: int
    step: int
    user_lt: np.ndarray
    sess_vec: np.ndarray
    last_skus: deque
    last_cats: deque
    cart: Counter
    last_query: Optional[np.ndarray]
    last_url: Optional[int]
    last_recs: List[int]
    last_time: pd.Timestamp

class FeatureBuilder:
    def __init__(self, items: pd.DataFrame):
        self.sku2vec = items.set_index('sku')['emb_vec'].to_dict()
        self.sku2cat = items.set_index('sku')['category'].to_dict()
        self.sku2price = items.set_index('sku')['price'].to_dict()
        self.sku2prop = items.set_index('sku')['propensity_sku'].to_dict()
        self.cat2prop = items.set_index('sku')['propensity_category'].to_dict()
    def build(self, state: SessionState, candidates: List[int]) -> Tuple[np.ndarray, List[Dict[str,Any]]]:
        feats=[]; meta=[]
        for sku in candidates:
            ivec = self.sku2vec[sku]
            cat = self.sku2cat[sku]
            sim_user = cosine(state.user_lt, ivec)
            sim_sess = cosine(state.sess_vec, ivec)
            sim_q = cosine(state.last_query, ivec) if state.last_query is not None else 0.0
            div = 0.0
            if state.last_recs:
                div = float(min([1.0 - cosine(self.sku2vec[r], ivec) for r in state.last_recs]))
            p_sku = float(self.sku2prop.get(sku, 0.01))
            p_cat = float(self.cat2prop.get(sku, 0.01))
            f = np.concatenate([state.user_lt, state.sess_vec, np.array([sim_user, sim_sess, sim_q, div, p_sku, p_cat], dtype=np.float32)])
            feats.append(f)
            meta.append({'sku':sku,'cat':cat,'p_sku':p_sku,'p_cat':p_cat})
        X = np.vstack(feats).astype(np.float32)
        return X, meta

def cosine(a: np.ndarray, b: np.ndarray) -> float:
    if a is None or b is None: return 0.0
    na = np.linalg.norm(a); nb = np.linalg.norm(b)
    if na < 1e-8 or nb < 1e-8: return 0.0
    return float(np.dot(a,b)/(na*nb))

# -----------------------------
# Environment with dense rewards
# -----------------------------
class RLEnv:
    def __init__(self, allv: pd.DataFrame, items: pd.DataFrame, url_cat_model: Dict[int, Dict[int,float]], user_lt: Dict[int,np.ndarray]):
        self.allv = allv
        self.items = items
        self.url_cat_model = url_cat_model
        self.user_lt = user_lt
        self.sku2vec = items.set_index('sku')['emb_vec'].to_dict()
        self.sku2cat = items.set_index('sku')['category'].to_dict()
        self.sku2prop = items.set_index('sku')['propensity_sku'].to_dict()
        popular = allv[allv['event_type'].isin(['buy','a2c'])]['sku'].value_counts().index.values
        self.cgen = CandidateGenerator(items, popular)
        self.fbuilder = FeatureBuilder(items)
        self.sessions = list(allv.groupby(['client_id','session_id']).groups.keys())
        self.sessions_idx = 0
    def _init_state(self, client_id:int, session_id:int) -> SessionState:
        g = self.allv[(self.allv.client_id==client_id)&(self.allv.session_id==session_id)].reset_index(drop=True)
        t0 = g.loc[0,'timestamp']
        uvec = self.user_lt.get(int(client_id), np.zeros(EMB_DIM_ITEM, dtype=np.float32))
        st = SessionState(user_id=int(client_id), session_id=int(session_id), step=0, user_lt=uvec, sess_vec=np.zeros(EMB_DIM_ITEM,dtype=np.float32), last_skus=deque(maxlen=5), last_cats=deque(maxlen=5), cart=Counter(), last_query=None, last_url=None, last_recs=[], last_time=t0)
        st._events = g; st._cursor=0; st._done=False
        return st
    def _advance_once(self, state: SessionState):
        g = state._events
        if state._cursor >= len(g): state._done=True; return
        row = g.loc[state._cursor]; state._cursor += 1
        state.last_time = row['timestamp']
        et = row['event_type']
        if et in ('buy','a2c','rmc') and 'sku' in row and not pd.isna(row['sku']):
            sku = int(row['sku']);
            if sku in self.sku2vec:
                state.sess_vec = (state.sess_vec*0.7 + self.sku2vec[sku]*0.3).astype(np.float32)
                state.last_skus.append(sku); state.last_cats.append(int(self.sku2cat.get(sku, -1)))
            if et=='a2c': state.cart[sku]+=1
            if et=='rmc': state.cart[sku]-=1; state.cart.pop(sku,None)
        elif et=='qry':
            state.last_query = decode_embedding(row['query'],'query')
            state.sess_vec = (state.sess_vec*0.6 + state.last_query[:EMB_DIM_ITEM]*0.4).astype(np.float32)
        elif et=='pv':
            state.last_url = int(row['url'])
        state.step += 1
    def reset(self) -> SessionState:
        if not self.sessions: raise RuntimeError('no sessions')
        if self.sessions_idx >= len(self.sessions): self.sessions_idx = 0
        cid, sid = self.sessions[self.sessions_idx]; self.sessions_idx += 1
        st = self._init_state(cid, sid); self._advance_once(st); return st
    def candidate_features(self, state: SessionState):
        exclude = set(state.last_recs)
        last_cat = state.last_cats[-1] if state.last_cats else None
        cands = self.cgen.get_candidates(state.user_lt, last_cat, exclude, CANDIDATE_POOL_SIZE)
        X, meta = self.fbuilder.build(state, cands)
        return cands, X, meta
    def step(self, state: SessionState, action_sku: int) -> Tuple[SessionState, float, bool, Dict[str,Any]]:
        prev_time = state.last_time; prev_recs = list(state.last_recs); state.last_recs = (prev_recs + [action_sku])[-5:]
        g = state._events; reward=0.0
        if state._cursor < len(g):
            row = g.loc[state._cursor]; dt = (row['timestamp'] - prev_time).total_seconds(); credit = math.exp(-max(0.0,dt)/TIME_DECAY_TAU_SEC)
            if row['event_type'] in ('buy','a2c','rmc') and 'sku' in row and not pd.isna(row['sku']):
                sku = int(row['sku'])
                if row['event_type']=='buy' and sku==action_sku: reward += W_BUY*credit
                if row['event_type']=='a2c' and sku==action_sku: reward += W_A2C*credit
                if row['event_type']=='rmc' and sku==action_sku: reward -= W_RMC*credit
                act_cat = self.sku2cat.get(action_sku,None); ev_cat = self.sku2cat.get(sku,None)
                if act_cat is not None and ev_cat is not None and act_cat==ev_cat and row['event_type'] in ('buy','a2c'):
                    reward += W_CAT*credit
            elif row['event_type']=='pv':
                url = int(row['url']) if not pd.isna(row['url']) else None
                act_cat = self.sku2cat.get(action_sku,None)
                if url is not None and act_cat is not None and url in self.url_cat_model:
                    p = self.url_cat_model[url].get(act_cat,0.0); reward += W_PV * p * credit
            elif row['event_type']=='qry':
                qv = decode_embedding(row['query'],'query'); iv = self.sku2vec[action_sku]
                sim = max(0.0, cosine(qv, iv)); reward += W_QRY * sim * credit
        if prev_recs:
            iv = self.sku2vec[action_sku]; mind = min([1.0 - cosine(self.sku2vec[r], iv) for r in prev_recs])
            reward += W_DIV * mind
        if action_sku in prev_recs: reward -= W_REP
        self._advance_once(state)
        done = state._done or (state.step >= MAX_STEPS_PER_SESSION)
        return state, float(reward), done, {}

# -----------------------------
# Replay buffer
# -----------------------------
class ReplayBuffer:
    def __init__(self, cap=REPLAY_CAP): self.buf = deque(maxlen=cap)
    def push(self, s, a, r, ns, done): self.buf.append((s,a,r,ns,done))
    def sample(self, bs):
        idx = np.random.choice(len(self.buf), size=bs, replace=False)
        batch = [self.buf[i] for i in idx]
        s,a,r,ns,d = zip(*batch)
        return np.stack(s), np.array(a), np.array(r,dtype=np.float32), np.stack(ns), np.array(d,dtype=np.float32)
    def __len__(self): return len(self.buf)

# -----------------------------
# Model: Dueling MLP (per-candidate -> scalar Q)
# -----------------------------
class DuelingMLP(nn.Module):
    def __init__(self, in_dim:int, hidden:int=256):
        super().__init__();
        self.shared = nn.Sequential(nn.Linear(in_dim, hidden), nn.ReLU(), nn.Linear(hidden, hidden), nn.ReLU())
        self.V = nn.Linear(hidden,1); self.A = nn.Linear(hidden,1)
    def forward(self,x): h=self.shared(x); V=self.V(h); A=self.A(h); return V + (A - A.mean(dim=0,keepdim=True))

# -----------------------------
# Training and evaluation
# -----------------------------

def epsilon_by_step(step:int)->float:
    if step>=EPSILON_DECAY_STEPS: return EPSILON_END
    frac = step/EPSILON_DECAY_STEPS; return EPSILON_START + (EPSILON_END-EPSILON_START)*frac

def offline_eval(env:RLEnv, model:nn.Module, n_sessions:int=200, topk:int=TOPK)->Dict[str,float]:
    model.eval(); hits=0; ndcg=0.0; total=0; rew=0.0
    for _ in range(n_sessions):
        st = env.reset(); done=False
        while not done:
            cands,X,meta = env.candidate_features(st)
            with torch.no_grad(): q = model(torch.from_numpy(X).to(DEVICE)).squeeze(-1).cpu().numpy()
            order = np.argsort(-q); top = [cands[i] for i in order[:topk]]
            best = top[0] if len(top)>0 else cands[0]
            st, r, done, _ = env.step(st, best); rew += r
            g = st._events; cur = st._cursor-1
            if 0<=cur<len(g):
                row = g.loc[cur]
                if row['event_type'] in ('buy','a2c') and not pd.isna(row.get('sku', np.nan)):
                    sku = int(row['sku'])
                    total += 1
                    if sku in top:
                        hits += 1; rank = top.index(sku)+1; ndcg += 1.0/math.log2(rank+1)
    return {'HR@K': hits/max(1,total), 'NDCG@K': ndcg/max(1,total), 'Reward/session': rew/max(1,n_sessions)}

def train_loop(env:RLEnv):
    policy = DuelingMLP(FEATURE_DIM).to(DEVICE); target = DuelingMLP(FEATURE_DIM).to(DEVICE); target.load_state_dict(policy.state_dict())
    opt = torch.optim.Adam(policy.parameters(), lr=LR); buf = ReplayBuffer()
    step=0; st = env.reset()
    while step < TRAIN_STEPS:
        cands,X,meta = env.candidate_features(st)
        X_t = torch.from_numpy(X).to(DEVICE)
        eps = epsilon_by_step(step)
        with torch.no_grad(): qvals = policy(X_t).squeeze(-1).cpu().numpy()
        if np.random.rand() < eps: a_idx = np.random.randint(len(cands))
        else: a_idx = int(np.argmax(qvals))
        action_sku = cands[a_idx]
        st_next, reward, done, _ = env.step(st, action_sku)
        ncands,nX,nmeta = env.candidate_features(st_next)
        # store chosen feature row and next-state candidate matrix (fixed-size)
        s_row = X[a_idx]
        # pad/truncate nX to fixed CANDIDATE_POOL_SIZE
        def fix_stack(mat, target_c=CANDIDATE_POOL_SIZE):
            c = mat.shape[0]
            if c >= target_c: return mat[:target_c]
            pad = np.zeros((target_c-c, mat.shape[1]), dtype=np.float32)
            return np.vstack([mat, pad])
        ns_fixed = fix_stack(nX)
        buf.push(s_row, 0, reward, ns_fixed, float(done))
        if len(buf) >= BATCH_SIZE:
            s_b,a_b,r_b,ns_b,d_b = buf.sample(BATCH_SIZE)
            s_b = torch.from_numpy(s_b).to(DEVICE); r_b = torch.from_numpy(r_b).to(DEVICE); d_b = torch.from_numpy(d_b).to(DEVICE)
            ns_b_t = torch.from_numpy(ns_b.reshape(BATCH_SIZE, CANDIDATE_POOL_SIZE, FEATURE_DIM)).to(DEVICE)
            with torch.no_grad():
                q_next_policy = policy(ns_b_t.view(-1,FEATURE_DIM)).view(BATCH_SIZE, CANDIDATE_POOL_SIZE)
                a_star = torch.argmax(q_next_policy, dim=1, keepdim=True)
                q_next_target = target(ns_b_t.view(-1,FEATURE_DIM)).view(BATCH_SIZE, CANDIDATE_POOL_SIZE)
                q_next = torch.gather(q_next_target, 1, a_star).squeeze(1)
                y = r_b + (1.0 - d_b) * GAMMA * q_next
            q_curr = policy(s_b).squeeze(-1)
            loss = F.mse_loss(q_curr, y)
            opt.zero_grad(); loss.backward(); opt.step(); nn.utils.clip_grad_norm_(policy.parameters(), 5.0)
            if step % TARGET_UPDATE_FREQ == 0:
                target.load_state_dict(policy.state_dict())
        st = st_next
        if done: st = env.reset()
        if (step % EVAL_EVERY == 0) and step>0:
            metrics = offline_eval(env, policy, n_sessions=50, topk=TOPK)
            print(f"[step {step}] eps={eps:.3f} metrics={metrics}")
        step += 1
    return policy

# -----------------------------
# Main orchestration
# -----------------------------



In [None]:
data['product_properties']

In [6]:
data["product_properties"].columns

Index(['sku', 'category', 'price', 'name'], dtype='object')

#### SKU stands Stock Keeping Unit, is a unique alphanumeric code assigned to a product for inventory management purposes. It helps businesses track inventory, analyze sales, and manage stock levels effectively.

In [7]:
data["product_buy"].columns

Index(['client_id', 'timestamp', 'sku'], dtype='object')

In [8]:
data["add_to_cart"].columns

Index(['client_id', 'timestamp', 'sku'], dtype='object')

In [9]:
data["remove_from_cart"].columns

Index(['client_id', 'timestamp', 'sku'], dtype='object')

In [10]:
data["page_visit"].columns

Index(['client_id', 'timestamp', 'url'], dtype='object')

In [11]:
data["search_query"].columns

Index(['client_id', 'timestamp', 'query'], dtype='object')

What this function does (step-by-step)

Standardizes data types

Ensures sku, category, price, and client_id are integers.

Converts timestamp strings into proper datetime objects using parse_ts.

Sorts each event table

Orders rows by client_id then by timestamp.

This helps when grouping events into sessions.

Combines all events into one big table

Creates a column event_type with values like "buy", "a2c", "rmc", "pv", "qry".

Merges product_buy, add_to_cart, remove_from_cart, page_visit, and search_query into one dataframe (allv).

Creates user sessions

Defines a session gap of SESSION_GAP_MIN minutes (you’ll need to set this constant somewhere, e.g., SESSION_GAP_MIN = 30).

If a user’s next event is more than SESSION_GAP_MIN minutes after the last one, it starts a new session.

Adds session_id (per user) and step (step number within that session).

Returns a unified, sessionized dataframe

Each row is now:
client_id | timestamp | sku/url/query | event_type | session_id | step

**Given a URL, what’s the distribution of product categories people looked at within 10 minutes of visiting it?**

# Version - 2

In [1]:
# RL Recommender – End-to-End (No Propensity)
# -------------------------------------------------------------
# What this file includes
#   1) Data prep: clean + sessionize events (robust to missing cols)
#   2) Optional downsampling: top-N popular SKUs and top-N active users
#   3) Popularity model (global & per-category)
#   4) URL→Category model (fast-ish with NumPy + tqdm)
#   5) Candidate pool builder
#   6) RL dataset builder (offline transitions from sessions)
#   7) Dueling Double DQN agent + training loop
#   8) Simple evaluation: HR@K, NDCG@K
#
# Notes
# - This version intentionally DOES NOT use propensity files.
# - Assumes input `data` dict has keys: 'product_buy', 'add_to_cart',
#   'remove_from_cart', 'page_visit', 'search_query', 'product_properties'.
# - You can run pieces independently if your RAM is tight (page_visit is large).

from __future__ import annotations
import os
from typing import Dict, Any, List, Tuple, Optional
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim

# -----------------------------
# Data loading
# -----------------------------

def load_parquets_and_targets(data_dir: str) -> Dict[str, Any]:
    """Load parquet event files and target propensity numpy arrays.
    Expects files described earlier.
    """
    if USE_SYNTHETIC:
        return make_synth_data_for_pipeline()

    def rp(fname):
        p = os.path.join(data_dir, fname)
        if not os.path.exists(p):
            raise FileNotFoundError(p)
        return pd.read_parquet(p)

    data = {}
    data['product_properties'] = rp('product_properties.parquet')
    data['product_buy'] = rp('product_buy.parquet')
    data['add_to_cart'] = rp('add_to_cart.parquet')
    data['remove_from_cart'] = rp('remove_from_cart.parquet')
    data['page_visit'] = rp('page_visit.parquet')
    data['search_query'] = rp('search_query.parquet')

    # load numpy targets if present
    targ_dir = os.path.join(data_dir, 'target')
    if os.path.isdir(targ_dir):
        def ln(p):
            f = os.path.join(p)
            if os.path.exists(f):
                return np.load(f, allow_pickle=True)
            return None
        data['propensity_sku'] = None
        data['propensity_category'] = None
        sku_p = os.path.join(targ_dir, 'popularity_propensity_sku.npy')
        cat_p = os.path.join(targ_dir, 'popularity_propensity_category.npy')
        if os.path.exists(sku_p):
            data['propensity_sku'] = np.load(sku_p, allow_pickle=True) if os.path.getsize(sku_p)>0 else None
        if os.path.exists(cat_p):
            data['propensity_category'] = np.load(cat_p, allow_pickle=True) if os.path.getsize(cat_p)>0 else None
    else:
        data['propensity_sku'] = None
        data['propensity_category'] = None

    return data


# -----------------------------
# 1) CLEAN + SESSIONIZE
# -----------------------------

def clean_and_sessionize(data: Dict[str, Any], session_gap_min: int = 30) -> pd.DataFrame:
    """
    Build a single sessionized event table from mixed event DataFrames.
    Keeps only essential columns and is robust to missing columns/arrays.

    Returns columns:
      ['client_id','timestamp','sku','url','query','event_type','session_id','step']
    """
    event_map = {
        'product_buy': 'buy',
        'add_to_cart': 'a2c',
        'remove_from_cart': 'rmc',
        'page_visit': 'pv',
        'search_query': 'qry'
    }

    frames = []
    for key in ['product_buy','add_to_cart','remove_from_cart','page_visit','search_query']:
        val = data.get(key)
        if val is None or not isinstance(val, pd.DataFrame) or len(val) == 0:
            continue
        df = val.copy()

        # Ensure timestamp & client_id exist
        if 'timestamp' in df.columns:
            df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True, errors='coerce')
        else:
            df['timestamp'] = pd.NaT
        if 'client_id' not in df.columns:
            df['client_id'] = pd.NA

        # Normalize payload columns
        df['sku'] = df['sku'] if 'sku' in df.columns else pd.NA
        df['url'] = df['url'] if 'url' in df.columns else pd.NA
        df['query'] = df['query'] if 'query' in df.columns else pd.NA

        # Coerce client_id numeric
        df['client_id'] = pd.to_numeric(df['client_id'], errors='coerce')

        df['event_type'] = event_map.get(key, key)
        df = df[['client_id','timestamp','sku','url','query','event_type']]
        frames.append(df)

    if not frames:
        return pd.DataFrame(columns=['client_id','timestamp','sku','url','query','event_type','session_id','step'])

    allv = pd.concat(frames, ignore_index=True)
    allv.dropna(subset=['client_id','timestamp'], inplace=True)
    allv['client_id'] = allv['client_id'].astype('int64')

    allv.sort_values(['client_id','timestamp'], inplace=True)

    gap = pd.Timedelta(minutes=session_gap_min)
    is_break = allv.groupby('client_id')['timestamp'].diff().gt(gap)
    is_break = is_break.fillna(True)
    sess_num = is_break.groupby(allv['client_id']).cumsum().astype(int)
    allv['session_id'] = allv['client_id'].astype(str) + '_' + sess_num.astype(str)
    allv['step'] = allv.groupby('session_id').cumcount() + 1

    return allv.reset_index(drop=True)

# ----------------------------------
# 2) OPTIONAL DOWNSAMPLING FUNCTIONS
# ----------------------------------

def filter_top_products(data: dict, top_n: int = 30000) -> dict:
    """Keep only events for top-N popular SKUs (by counts in buy+a2c+remove)."""
    sku_counts = pd.Series(dtype=int)

    for key in ['product_buy', 'add_to_cart', 'remove_from_cart']:
        df = data.get(key)
        if isinstance(df, pd.DataFrame) and 'sku' in df.columns:
            sku_counts = sku_counts.add(df['sku'].value_counts(), fill_value=0)

    if sku_counts.empty:
        return data.copy()

    popular_skus = set(sku_counts.sort_values(ascending=False).head(top_n).index)

    out = {}
    for k, v in data.items():
        if isinstance(v, pd.DataFrame) and 'sku' in v.columns:
            out[k] = v[v['sku'].isin(popular_skus)]  # no .copy()
        else:
            out[k] = v
    return out


def filter_top_users(data: dict, top_n_users: int = 50000) -> dict:
    """Keep only top-N active users across all events (by total interactions)."""
    user_activity = pd.Series(dtype=int)

    for key in ['product_buy', 'add_to_cart', 'remove_from_cart', 'page_visit', 'search_query']:
        df = data.get(key)
        if isinstance(df, pd.DataFrame) and 'client_id' in df.columns:
            user_activity = user_activity.add(df['client_id'].value_counts(), fill_value=0)

    if user_activity.empty:
        return data.copy()

    top_users = set(user_activity.sort_values(ascending=False).head(top_n_users).index)

    out = {}
    for k, v in data.items():
        if isinstance(v, pd.DataFrame) and 'client_id' in v.columns:
            out[k] = v[v['client_id'].isin(top_users)]  # no .copy()
        else:
            out[k] = v
    return out



# -----------------------------
# 3) POPULARITY MODEL
# -----------------------------

def build_popularity_model(sessionized: pd.DataFrame, product_properties: Optional[pd.DataFrame] = None) -> pd.DataFrame:
    """Return df with sku, count, optional category/price from product_properties."""
    # Count interactions for items (buy + a2c prioritized; fall back to any sku event)
    item_df = sessionized[sessionized['sku'].notna()].copy()
    item_df['sku'] = item_df['sku'].astype('int64')
    weights = {'buy': 3.0, 'a2c': 1.5, 'rmc': -1.0, 'pv': 0.2, 'qry': 0.0}
    item_df['w'] = item_df['event_type'].map(weights).fillna(0.0)

    pop = (
        item_df.groupby('sku')['w']
        .sum()
        .reset_index()
        .rename(columns={'w':'score'})
        .sort_values('score', ascending=False)
        .reset_index(drop=True)
    )

    if isinstance(product_properties, pd.DataFrame) and not product_properties.empty:
        meta = product_properties[['sku','category','price']].drop_duplicates('sku')
        pop = pop.merge(meta, on='sku', how='left')
    return pop

# ---------------------------------------------
# 4) URL → CATEGORY MODEL (FAST W/ TQDM)
# ---------------------------------------------

def build_url_category_model_fast(
    pv: pd.DataFrame,
    item_events: pd.DataFrame,
    product_properties: pd.DataFrame,
    window_min: int = 10
) -> Dict[int, Dict[int, float]]:
    """
    For each URL, estimate distribution over product categories by
    looking at item events from the same client that occur within
    [pv_time, pv_time + window]. Faster than nested iterrows.
    """
    if pv is None or item_events is None or len(pv) == 0 or len(item_events) == 0:
        return {}

    # Map sku→category upfront
    prod_cat = product_properties.set_index('sku')['category'].to_dict()

    # Keep only necessary columns
    pv = pv[['client_id','timestamp','url']].copy()
    item_events = item_events[['client_id','timestamp','sku']].copy()

    # Ensure dtypes
    pv['timestamp'] = pd.to_datetime(pv['timestamp'], utc=True, errors='coerce')
    item_events['timestamp'] = pd.to_datetime(item_events['timestamp'], utc=True, errors='coerce')

    pv.dropna(subset=['client_id','timestamp','url'], inplace=True)
    item_events.dropna(subset=['client_id','timestamp','sku'], inplace=True)

    pv.sort_values(['client_id','timestamp'], inplace=True)
    item_events.sort_values(['client_id','timestamp'], inplace=True)

    window = pd.Timedelta(minutes=window_min)
    counts: Dict[int, Counter] = defaultdict(Counter)

    # Group once to avoid repeated boolean indexing
    item_groups = {cid: g for cid, g in item_events.groupby('client_id')}

    for cid, gpv in tqdm(pv.groupby('client_id'), desc='URL→Category: clients'):
        gi = item_groups.get(cid)
        if gi is None or gi.empty:
            continue
        gi_times = gi['timestamp'].to_numpy()
        gi_skus = gi['sku'].to_numpy()

        # two-pointer style scan per pageview timestamp using vectorized masking
        for t, url in zip(gpv['timestamp'].to_numpy(), gpv['url'].to_numpy()):
            # mask for [t, t+window]
            mask = (gi_times >= t) & (gi_times <= t + window)
            if not mask.any():
                continue
            for sku in gi_skus[mask]:
                if pd.isna(sku):
                    continue
                sku_int = int(sku)
                cat = prod_cat.get(sku_int)
                if cat is not None:
                    counts[int(url)][int(cat)] += 1

    # Normalize to probabilities per URL
    model: Dict[int, Dict[int, float]] = {}
    for url, c in counts.items():
        total = sum(c.values())
        if total <= 0:
            continue
        model[url] = {cat: cnt/total for cat, cnt in c.items()}
    return model

# ----------------------------------
# 5) CANDIDATE POOLS
# ----------------------------------

def build_candidate_pool(popularity_df: pd.DataFrame, top_k: int = 5000) -> np.ndarray:
    """Return array of top-K SKU ids to restrict action space."""
    skus = popularity_df['sku'].head(top_k).astype('int64').to_numpy()
    return skus

# ----------------------------------
# 6) RL DATASET (OFFLINE TRANSITIONS)
# ----------------------------------

def make_offline_transitions(
    sessionized: pd.DataFrame,
    max_hist: int,
    event_reward: Optional[Dict[str, float]] = None,
    candidate_skus: Optional[np.ndarray] = None,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Create (s, a, r, s_next, done) from sessionized logs.
    - State: last `max_hist` item ids (zero-padded), integers
    - Action: the *actual* next item sku (behavior policy)
    - Reward: mapped from that next event type
    - done: session end flag
    """
    if event_reward is None:
        event_reward = {'buy': 1.0, 'a2c': 0.5, 'rmc': -0.2, 'pv': 0.05, 'qry': 0.0}

    seq = sessionized.copy()
    seq = seq[seq['sku'].notna()].copy()
    seq['sku'] = seq['sku'].astype('int64')

    # Optionally restrict to candidates
    if candidate_skus is not None:
        cand_set = set(candidate_skus.tolist())
        seq = seq[seq['sku'].isin(cand_set)]

    states, actions, rewards, next_states, dones = [], [], [], [], []

    for sid, g in tqdm(seq.groupby('session_id'), desc='Building offline transitions'):
        # For this simple builder, we use only item events within the session
        g = g[['client_id','timestamp','sku','event_type','step']].sort_values('timestamp')
        hist: List[int] = []

        for i in range(len(g)-1):
            # build state from history up to i (exclusive)
            cur_sku = int(g.iloc[i]['sku'])
            hist.append(cur_sku)
            if len(hist) > max_hist:
                hist = hist[-max_hist:]
            state = np.zeros(max_hist, dtype=np.int64)
            state[-len(hist):] = np.array(hist, dtype=np.int64)

            # next event defines action & reward
            nxt = g.iloc[i+1]
            action = int(nxt['sku'])
            reward = float(event_reward.get(str(nxt['event_type']), 0.0))

            # build next_state (history including next action)
            hist_next = hist + [action]
            if len(hist_next) > max_hist:
                hist_next = hist_next[-max_hist:]
            next_state = np.zeros(max_hist, dtype=np.int64)
            next_state[-len(hist_next):] = np.array(hist_next, dtype=np.int64)

            done = (i+1 == len(g)-1)

            states.append(state)
            actions.append(action)
            rewards.append(reward)
            next_states.append(next_state)
            dones.append(done)

    if not states:
        return (np.zeros((0,max_hist),dtype=np.int64),
                np.zeros((0,),dtype=np.int64),
                np.zeros((0,),dtype=np.float32),
                np.zeros((0,max_hist),dtype=np.int64),
                np.zeros((0,),dtype=np.bool_))

    return (
        np.stack(states),
        np.array(actions, dtype=np.int64),
        np.array(rewards, dtype=np.float32),
        np.stack(next_states),
        np.array(dones, dtype=np.bool_)
    )

# ----------------------------------
# 7) DQN (Dueling + Double)
# ----------------------------------

class DuelingDQN(nn.Module):
    def __init__(self, num_items: int, emb_dim: int = 64, hist_len: int = 20, hidden: int = 256):
        super().__init__()
        self.num_items = num_items
        self.hist_len = hist_len
        self.item_emb = nn.Embedding(num_items + 1, emb_dim, padding_idx=0)
        self.backbone = nn.Sequential(
            nn.Linear(hist_len * emb_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
        )
        # Dueling streams
        self.V = nn.Sequential(nn.Linear(hidden, 128), nn.ReLU(), nn.Linear(128, 1))
        self.A = nn.Sequential(nn.Linear(hidden, 128), nn.ReLU(), nn.Linear(128, num_items))

    def forward(self, state_idx: torch.LongTensor) -> torch.Tensor:
        # state_idx: [B, hist_len] integer item ids (0 is padding)
        emb = self.item_emb(state_idx)               # [B, hist_len, emb]
        flat = emb.view(emb.size(0), -1)             # [B, hist_len*emb]
        h = self.backbone(flat)
        V = self.V(h)                                # [B, 1]
        A = self.A(h)                                # [B, num_items]
        Q = V + A - A.mean(dim=1, keepdim=True)
        return Q

# Utilities for mapping SKU ids ↔ action indices
class ActionSpace:
    def __init__(self, candidate_skus: np.ndarray):
        self.skus = candidate_skus.astype('int64')
        self.sku2idx = {int(s): i for i, s in enumerate(self.skus)}
    def to_index(self, sku_array: np.ndarray) -> np.ndarray:
        return np.array([self.sku2idx.get(int(s), -1) for s in sku_array], dtype=np.int64)

# Training step
@torch.no_grad()
def compute_td_target(target_net, next_states, rewards, dones, gamma):
    # Double DQN: action selection by online net, evaluation by target net
    # (handled in train loop)
    pass  # kept here for clarity; implemented inline in train()


def train_dqn_old(
    transitions: Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray],
    action_space: ActionSpace,
    hist_len: int = 20,
    emb_dim: int = 64,
    hidden: int = 256,
    gamma: float = 0.99,
    lr: float = 1e-3,
    batch_size: int = 1024,
    epochs: int = 5,
    target_sync: int = 1000,
    device: str = 'cuda' if torch.cuda.is_available() else 'cpu',
):
    states, actions_sku, rewards, next_states, dones = transitions
    # Map actions from SKU ids to action indices within candidate set
    idx_actions = action_space.to_index(actions_sku)
    valid_mask = idx_actions >= 0

    states = torch.from_numpy(states[valid_mask]).to(device)
    next_states = torch.from_numpy(next_states[valid_mask]).to(device)
    actions = torch.from_numpy(idx_actions[valid_mask]).long().to(device)
    rewards = torch.from_numpy(rewards[valid_mask]).float().to(device)
    dones = torch.from_numpy(dones[valid_mask]).bool().to(device)

    num_items = len(action_space.skus)

    online = DuelingDQN(num_items, emb_dim, hist_len, hidden).to(device)
    target = DuelingDQN(num_items, emb_dim, hist_len, hidden).to(device)
    target.load_state_dict(online.state_dict())
    target.eval()

    opt = optim.Adam(online.parameters(), lr=lr)
    mse = nn.MSELoss()

    N = states.size(0)
    idx = np.arange(N)
    step = 0

    for ep in range(epochs):
        np.random.shuffle(idx)
        for start in tqdm(range(0, N, batch_size), desc=f'Epoch {ep+1}/{epochs}'):
            step += 1
            batch = idx[start:start+batch_size]
            s = states[batch]
            a = actions[batch]
            r = rewards[batch]
            ns = next_states[batch]
            d = dones[batch]

            q = online(s)                           # [B, num_items]
            q_a = q.gather(1, a.unsqueeze(1)).squeeze(1)

            with torch.no_grad():
                # Double DQN
                next_q_online = online(ns)          # action selection
                best_a = next_q_online.argmax(dim=1, keepdim=True)
                next_q_target = target(ns)          # action evaluation
                next_q = next_q_target.gather(1, best_a).squeeze(1)
                y = r + (~d).float() * gamma * next_q

            loss = mse(q_a, y)
            opt.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(online.parameters(), 1.0)
            opt.step()

            if step % target_sync == 0:
                target.load_state_dict(online.state_dict())

    return online, target

# ----------------------------------
# 8) EVALUATION
# ----------------------------------

def rank_topk(q_values: torch.Tensor, k: int) -> np.ndarray:
    topk = torch.topk(q_values, k=k, dim=1).indices.detach().cpu().numpy()
    return topk


def eval_hr_ndcg(
    model: nn.Module,
    transitions: Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray],
    action_space: ActionSpace,
    hist_len: int = 20,
    k: int = 10,
    device: str = 'cuda' if torch.cuda.is_available() else 'cpu',
) -> Tuple[float, float]:
    states, actions_sku, _, _, _ = transitions
    idx_actions = action_space.to_index(actions_sku)
    valid = idx_actions >= 0

    states = torch.from_numpy(states[valid]).to(device)
    true_a = torch.from_numpy(idx_actions[valid]).long().to(device)

    with torch.no_grad():
        q = model(states)
        topk = torch.topk(q, k=k, dim=1).indices
        hit = (topk == true_a.unsqueeze(1)).any(dim=1).float().mean().item()

        # NDCG@K
        gains = torch.where(topk == true_a.unsqueeze(1), 1.0, 0.0)
        # positions 1..K → log2(1+pos)
        denom = torch.log2(torch.arange(2, k+2, device=gains.device).float())
        dcg = (gains / denom).sum(dim=1)
        idcg = torch.tensor([1.0/denom[0].item()], device=gains.device).repeat(len(dcg))
        ndcg = (dcg / idcg).mean().item()

    return hit, ndcg


In [2]:
# -----------------------------
# Config
# -----------------------------
USE_SYNTHETIC = False
DATA_DIR = "/kaggle/working/ubc_data_extracted"  # <- set this to your extracted folder path
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Embedding dims
EMB_DIM_ITEM = 128
EMB_DIM_QUERY = 128
EMB_DIM_URL = 64

# Sessionization
SESSION_GAP_MIN = 30
MAX_STEPS_PER_SESSION = 20

# Candidate gen
CANDIDATE_POOL_SIZE = 200

# Reward weights (dense reward)
W_BUY = 5.0
W_A2C = 2.0
W_RMC = 1.0
W_CAT = 0.5
W_PV = 0.2
W_QRY = 0.2
W_DIV = 0.1
W_REP = 0.2
TIME_DECAY_TAU_SEC = 60.0
GAMMA = 0.9

# Training
BATCH_SIZE = 128
REPLAY_CAP = 200_000
LR = 1e-3
EPSILON_START = 0.2
EPSILON_END = 0.05
EPSILON_DECAY_STEPS = 50_000
TARGET_UPDATE_FREQ = 2_000
TRAIN_STEPS = 50_000
EVAL_EVERY = 5_000
TOPK = 10

In [3]:
DATA_DIR = "/kaggle/working/ubc_data_extracted"

import time

st = time.time()
print('Loading data...')
data = load_parquets_and_targets(DATA_DIR)
en = time.time()
print((en-st)/60)

Loading data...
0.17840198278427125


In [4]:
for key, value in data.items():
    if isinstance(value, pd.DataFrame):
        print(f"{key}: shape={value.shape}")
    elif isinstance(value, np.ndarray):
        print(f"{key}: ndarray shape={value.shape}")
    else:
        print(f"{key}: type={type(value)}, value={value}")

product_properties: shape=(1260365, 4)
product_buy: shape=(1775394, 3)
add_to_cart: shape=(5674064, 3)
remove_from_cart: shape=(1937170, 3)
page_visit: shape=(156032014, 3)
search_query: shape=(10218831, 3)
propensity_sku: ndarray shape=(100,)
propensity_category: ndarray shape=(100,)


In [5]:
st = time.time()
data_small = filter_top_products(data, top_n=5000)
data_small = filter_top_users(data_small, top_n_users=100000)
en = time.time()
print((en-st)/60)
print("DONE !!!")

0.569500990708669
DONE !!!


In [6]:
for key, value in data_small.items():
    if isinstance(value, pd.DataFrame):
        print(f"{key}: shape={value.shape}")
    elif isinstance(value, np.ndarray):
        print(f"{key}: ndarray shape={value.shape}")
    else:
        print(f"{key}: type={type(value)}, value={value}")

product_properties: shape=(5000, 4)
product_buy: shape=(94748, 3)
add_to_cart: shape=(375782, 3)
remove_from_cart: shape=(173640, 3)
page_visit: shape=(33535779, 3)
search_query: shape=(4039900, 3)
propensity_sku: ndarray shape=(100,)
propensity_category: ndarray shape=(100,)


In [7]:
# 2) Sessionize
import time

st = time.time()
allv = clean_and_sessionize(data_small, session_gap_min=60)
en = time.time()
print((en-st)/60)

1.9646497964859009


In [8]:
# 3) Popularity + candidates

st = time.time()
pop = build_popularity_model(allv, data_small.get('product_properties'))
candidates = build_candidate_pool(pop, top_k=50000)
en = time.time()
print((en-st)/60)

0.01499934991200765


In [9]:
# 4) URL→Category model (optional, for features or heuristics)

item_events = pd.concat([
     data_small.get('product_buy', pd.DataFrame()),
     data_small.get('add_to_cart', pd.DataFrame())
 ], ignore_index=True)
url_cat = build_url_category_model_fast(data_small.get('page_visit', pd.DataFrame()), item_events, data_small['product_properties'], window_min=10)

URL→Category: clients: 100%|██████████| 99997/99997 [06:05<00:00, 273.71it/s]


In [10]:
# 5) Offline transitions for RL
#st = time.time()
trans = make_offline_transitions(allv, max_hist=20, candidate_skus=candidates)
#en = time.time()

#print((en-st)/60)

Building offline transitions: 100%|██████████| 207753/207753 [03:11<00:00, 1086.41it/s]


In [11]:
# Create action space mapping
action_space = ActionSpace(candidates)
print(f"Action space size: {len(action_space.skus)}")

# Get transitions
states, actions_sku, rewards, next_states, dones = trans

# Map actions to candidate space indices
actions_mapped = action_space.to_index(actions_sku)
valid_mask = actions_mapped >= 0  # Keep only valid actions

# Apply valid mask to all arrays
states_valid = states[valid_mask]
actions_valid = actions_mapped[valid_mask]
rewards_valid = rewards[valid_mask]
next_states_valid = next_states[valid_mask]
dones_valid = dones[valid_mask]

print(f"Valid transitions: {len(states_valid)}")

# Map states and next_states to candidate space
def map_state_to_candidates(state_array):
    """Map state array (containing SKU IDs) to candidate indices"""
    mapped = np.zeros_like(state_array, dtype=np.int64)
    for i in range(state_array.shape[0]):  # for each sequence
        for j in range(state_array.shape[1]):  # for each position in sequence
            sku = state_array[i, j]
            if sku == 0:  # padding
                mapped[i, j] = 0
            else:
                idx = action_space.sku2idx.get(int(sku), -1)
                if idx >= 0:
                    mapped[i, j] = idx + 1  # +1 to reserve 0 for padding
                else:
                    mapped[i, j] = 0  # unknown SKU -> padding
    return mapped

print("Mapping states...")
states_mapped = map_state_to_candidates(states_valid)

print("Mapping next_states...")
next_states_mapped = map_state_to_candidates(next_states_valid)

# Adjust actions to account for +1 offset (since 0 is padding in embedding)
actions_final = actions_valid + 1

print(f"Final ranges:")
print(f"States: {states_mapped.min()}-{states_mapped.max()}")
print(f"Actions: {actions_final.min()}-{actions_final.max()}")
print(f"Next states: {next_states_mapped.min()}-{next_states_mapped.max()}")

# Embedding size should be num_candidates + 1 (for padding)
num_items_embedding = len(action_space.skus) + 1
num_items_actions = len(action_space.skus)

print(f"Embedding size: {num_items_embedding}")
print(f"Action space size: {num_items_actions}")

# Create transitions tuple
trans_final = (states_mapped, actions_final, rewards_valid, next_states_mapped, dones_valid)

# Modified DQN class to handle the size mismatch
class DQN_Fixed(nn.Module):
    def __init__(self, num_items_embedding, num_items_actions, emb_dim=64, hidden=256):
        super().__init__()
        self.embedding = nn.Embedding(num_items_embedding, emb_dim, padding_idx=0)
        self.fc1 = nn.Linear(emb_dim, hidden)
        self.fc2 = nn.Linear(hidden, hidden)
        self.fc_out = nn.Linear(hidden, num_items_actions)  # Output only valid actions

    def forward(self, x):
        # x: [batch, hist_len]
        x = self.embedding(x)                 # [batch, hist_len, emb_dim]
        x = x.mean(dim=1)                     # aggregate history
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc_out(x)                 # [batch, num_items_actions]

# Training function with fixed sizes and tqdm progress bars
def train_dqn_tqdm(transitions, num_items_embedding, num_items_actions, hist_len=20, emb_dim=64, hidden=256,
                    gamma=0.99, lr=1e-3, batch_size=64, epochs=3, device="cpu"):
    
    states, actions, rewards, next_states, dones = transitions
    
    # Actions should be in range [1, num_items_actions] (since we added +1)
    # We need to convert them back to [0, num_items_actions-1] for gather operation
    actions = actions - 1  # Convert back to 0-indexed for gather
    
    # Initialize networks
    online = DQN_Fixed(num_items_embedding, num_items_actions, emb_dim, hidden).to(device)
    target = DQN_Fixed(num_items_embedding, num_items_actions, emb_dim, hidden).to(device)
    target.load_state_dict(online.state_dict())
    
    optimizer = optim.Adam(online.parameters(), lr=lr)
    loss_fn = nn.MSELoss()
    
    N = len(states)
    num_batches = (N + batch_size - 1) // batch_size  # Calculate total batches per epoch
    
    # Outer progress bar for epochs
    epoch_pbar = tqdm(range(epochs), desc="Training DQN", unit="epoch")
    
    for epoch in epoch_pbar:
        idx = np.random.permutation(N)
        total_loss = 0.0
        
        # Inner progress bar for batches within each epoch
        batch_pbar = tqdm(range(0, N, batch_size), 
                         desc=f"Epoch {epoch+1}/{epochs}", 
                         unit="batch",
                         leave=False)  # Don't leave the inner progress bar after completion
        
        for i in batch_pbar:
            batch_idx = idx[i:i+batch_size]
            
            b_states = torch.tensor(states[batch_idx], dtype=torch.long, device=device)
            b_actions = torch.tensor(actions[batch_idx], dtype=torch.long, device=device)
            b_rewards = torch.tensor(rewards[batch_idx], dtype=torch.float, device=device)
            b_next_states = torch.tensor(next_states[batch_idx], dtype=torch.long, device=device)
            b_dones = torch.tensor(dones[batch_idx], dtype=torch.float, device=device)
            
            # Q(s,a)
            q_values = online(b_states).gather(1, b_actions.unsqueeze(1)).squeeze(1)
            
            # target Q
            with torch.no_grad():
                max_next_q = target(b_next_states).max(1)[0]
                target_q = b_rewards + gamma * max_next_q * (1 - b_dones)
            
            # Loss + update
            loss = loss_fn(q_values, target_q)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            batch_loss = loss.item()
            total_loss += batch_loss
            
            # Update batch progress bar with current loss
            batch_pbar.set_postfix({
                'batch_loss': f'{batch_loss:.4f}',
                'avg_loss': f'{total_loss / ((i // batch_size) + 1):.4f}'
            })
        
        # Close batch progress bar
        batch_pbar.close()
        
        # Update epoch progress bar with epoch summary
        avg_epoch_loss = total_loss / num_batches
        epoch_pbar.set_postfix({
            'epoch_loss': f'{avg_epoch_loss:.4f}',
            'total_loss': f'{total_loss:.4f}'
        })
        
        # Update target network
        target.load_state_dict(online.state_dict())
    
    # Close epoch progress bar
    epoch_pbar.close()
    
    return online, target

# Training function with fixed sizes
def train_dqn_fixed(transitions, num_items_embedding, num_items_actions, hist_len=20, emb_dim=64, hidden=256,
                    gamma=0.99, lr=1e-3, batch_size=64, epochs=3, device="cpu"):

    states, actions, rewards, next_states, dones = transitions

    # Actions should be in range [1, num_items_actions] (since we added +1)
    # We need to convert them back to [0, num_items_actions-1] for gather operation
    actions = actions - 1  # Convert back to 0-indexed for gather

    # Initialize networks
    online = DQN_Fixed(num_items_embedding, num_items_actions, emb_dim, hidden).to(device)
    target = DQN_Fixed(num_items_embedding, num_items_actions, emb_dim, hidden).to(device)
    target.load_state_dict(online.state_dict())

    optimizer = optim.Adam(online.parameters(), lr=lr)
    loss_fn = nn.MSELoss()

    N = len(states)

    for epoch in range(epochs):
        idx = np.random.permutation(N)
        total_loss = 0.0

        for i in range(0, N, batch_size):
            batch_idx = idx[i:i+batch_size]

            b_states = torch.tensor(states[batch_idx], dtype=torch.long, device=device)
            b_actions = torch.tensor(actions[batch_idx], dtype=torch.long, device=device)
            b_rewards = torch.tensor(rewards[batch_idx], dtype=torch.float, device=device)
            b_next_states = torch.tensor(next_states[batch_idx], dtype=torch.long, device=device)
            b_dones = torch.tensor(dones[batch_idx], dtype=torch.float, device=device)

            # Q(s,a)
            q_values = online(b_states).gather(1, b_actions.unsqueeze(1)).squeeze(1)

            # target Q
            with torch.no_grad():
                max_next_q = target(b_next_states).max(1)[0]
                target_q = b_rewards + gamma * max_next_q * (1 - b_dones)

            # Loss + update
            loss = loss_fn(q_values, target_q)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Loss={total_loss:.4f}")

        # Update target network
        target.load_state_dict(online.state_dict())

    return online, target

from tqdm import tqdm

def train_dqn_fixed_tqdm(transitions, num_items_embedding, num_items_actions, hist_len=20, emb_dim=64, hidden=256,
                    gamma=0.99, lr=1e-3, batch_size=64, epochs=3, device="cpu"):

    states, actions, rewards, next_states, dones = transitions
    actions = actions - 1  # Convert back to 0-indexed for gather

    online = DQN_Fixed(num_items_embedding, num_items_actions, emb_dim, hidden).to(device)
    target = DQN_Fixed(num_items_embedding, num_items_actions, emb_dim, hidden).to(device)
    target.load_state_dict(online.state_dict())

    optimizer = optim.Adam(online.parameters(), lr=lr)
    loss_fn = nn.MSELoss()

    N = len(states)

    for epoch in tqdm(range(epochs), desc="Training Epochs"):
        idx = np.random.permutation(N)
        total_loss = 0.0

        for i in range(0, N, batch_size):
            batch_idx = idx[i:i+batch_size]

            b_states = torch.tensor(states[batch_idx], dtype=torch.long, device=device)
            b_actions = torch.tensor(actions[batch_idx], dtype=torch.long, device=device)
            b_rewards = torch.tensor(rewards[batch_idx], dtype=torch.float, device=device)
            b_next_states = torch.tensor(next_states[batch_idx], dtype=torch.long, device=device)
            b_dones = torch.tensor(dones[batch_idx], dtype=torch.float, device=device)

            q_values = online(b_states).gather(1, b_actions.unsqueeze(1)).squeeze(1)

            with torch.no_grad():
                max_next_q = target(b_next_states).max(1)[0]
                target_q = b_rewards + gamma * max_next_q * (1 - b_dones)

            loss = loss_fn(q_values, target_q)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        tqdm.write(f"Epoch {epoch+1}/{epochs}, Loss={total_loss:.4f}")
        target.load_state_dict(online.state_dict())

    return online, target


# Train the model
online, target = train_dqn_fixed_tqdm(
    trans_final, 
    num_items_embedding=num_items_embedding, 
    num_items_actions=num_items_actions,
    hist_len=20, 
    emb_dim=128, 
    hidden=2048, 
    epochs=3, 
    batch_size=64,
    device='cuda'
)

Action space size: 4996
Valid transitions: 436417
Mapping states...
Mapping next_states...
Final ranges:
States: 0-4996
Actions: 1-4996
Next states: 0-4996
Embedding size: 4997
Action space size: 4996


Training Epochs:  33%|███▎      | 1/3 [00:49<01:38, 49.07s/it]

Epoch 1/3, Loss=1355.9142


Training Epochs:  67%|██████▋   | 2/3 [01:36<00:48, 48.38s/it]

Epoch 2/3, Loss=2205.7069


Training Epochs: 100%|██████████| 3/3 [02:25<00:00, 48.46s/it]

Epoch 3/3, Loss=4657.2160





In [12]:
# Evaluation after training your model
print("Evaluating trained DQN model...")

# You need to create a modified ActionSpace class that works with your mapped data
class ActionSpaceEval:
    def __init__(self, candidate_skus: np.ndarray):
        self.skus = candidate_skus.astype('int64')
        self.sku2idx = {int(s): i for i, s in enumerate(self.skus)}
    
    def to_index(self, sku_array: np.ndarray) -> np.ndarray:
        # For evaluation, we need to handle the fact that our actions are already mapped
        # This assumes sku_array contains the original SKU IDs
        return np.array([self.sku2idx.get(int(s), -1) for s in sku_array], dtype=np.int64)

# Create action space for evaluation (using original candidate SKUs)
action_space_eval = ActionSpaceEval(candidates)

# Prepare evaluation data - use the same mapped transitions we used for training
# But we need to convert actions back to 0-indexed for evaluation
eval_transitions = (
    states_mapped,           # Mapped states (with padding)
    actions_final - 1,       # Convert back to 0-indexed actions for evaluation
    rewards_valid,           # Rewards
    next_states_mapped,      # Mapped next states
    dones_valid             # Done flags
)

# Evaluate at different K values
k_values = [20, 40, 60]

# Check what device your model is on and match it
model_device = next(online.parameters()).device
device = str(model_device)
print(f"Model is on device: {device}")

# Move model to CPU if needed (or keep on GPU)
if device.startswith('cuda') and not torch.cuda.is_available():
    print("CUDA not available, moving model to CPU...")
    online = online.cpu()
    device = 'cpu'
elif device == 'cpu' and torch.cuda.is_available():
    print("CUDA available, you can move model to GPU for faster evaluation...")
    # Uncomment next line if you want to use GPU
    # online = online.cuda()
    # device = 'cuda'

print(f"Evaluating on {len(eval_transitions[0])} test samples...")
print("-" * 50)

for k in k_values:
    hr, ndcg = eval_hr_ndcg(
        model=online,                    # Your trained model
        transitions=eval_transitions,    # Evaluation data
        action_space=action_space_eval,  # Action space mapping
        hist_len=20,                    # Same as training
        k=k,                            # Top-K for evaluation
        device=device
    )
    
    print(f"HR@{k:2d}:   {hr:.4f}")
    print(f"NDCG@{k:2d}: {ndcg:.4f}")
    print()

# Optional: More detailed analysis
print("Model Architecture Summary:")
print(f"- Embedding size: {num_items_embedding}")
print(f"- Action space size: {num_items_actions}")
print(f"- Total parameters: {sum(p.numel() for p in online.parameters()):,}")
print(f"- Device: {next(online.parameters()).device}")

# Optional: Check some predictions
print("\nSample Predictions:")
print("-" * 30)

# Take first few samples for inspection
sample_states = torch.tensor(states_mapped[:5], dtype=torch.long, device=device)
with torch.no_grad():
    sample_q = online(sample_states)
    top5_actions = torch.topk(sample_q, k=5, dim=1).indices
    
    for i in range(5):
        print(f"Sample {i+1}:")
        print(f"  Input state (last 5 items): {states_mapped[i][-5:]}")
        print(f"  True next action: {actions_final[i] - 1}")  # Convert back to 0-indexed
        print(f"  Top 5 predicted actions: {top5_actions[i].cpu().numpy()}")
        print()

# Optional: Action distribution analysis
print("Action Prediction Analysis:")
print("-" * 30)
with torch.no_grad():
    # Make sure tensors are on the same device as model
    analysis_states = torch.tensor(states_mapped[:1000], dtype=torch.long, device=device)
    all_q = online(analysis_states)
    top_actions = torch.topk(all_q, k=10, dim=1).indices
    
    # Count frequency of top predicted actions
    from collections import Counter
    action_counts = Counter(top_actions.flatten().cpu().numpy())
    
    print("Most frequently predicted actions (top 10):")
    for action_idx, count in action_counts.most_common(10):
        # Convert back to original SKU ID
        original_sku = candidates[action_idx] if action_idx < len(candidates) else "Unknown"
        print(f"  Action {action_idx} (SKU {original_sku}): {count} times")

Evaluating trained DQN model...
Model is on device: cuda:0
Evaluating on 436417 test samples...
--------------------------------------------------
HR@20:   0.0000
NDCG@20: 0.0000

HR@40:   0.0000
NDCG@40: 0.0000

HR@60:   0.0000
NDCG@60: 0.0000

Model Architecture Summary:
- Embedding size: 4997
- Action space size: 4996
- Total parameters: 15,336,964
- Device: cuda:0

Sample Predictions:
------------------------------
Sample 1:
  Input state (last 5 items): [   0    0    0    0 2892]
  True next action: 2891
  Top 5 predicted actions: [ 181  100  228 1232 2309]

Sample 2:
  Input state (last 5 items): [   0    0    0 2892 2892]
  True next action: 2891
  Top 5 predicted actions: [ 100  333 4888 3965  181]

Sample 3:
  Input state (last 5 items): [   0    0 2892 2892 2892]
  True next action: 2891
  Top 5 predicted actions: [4888 3965  333  100 1662]

Sample 4:
  Input state (last 5 items): [   0 2892 2892 2892 2892]
  True next action: 2891
  Top 5 predicted actions: [4888 3965  100 1

In [None]:
# 7) Evaluate
# hr, ndcg = eval_hr_ndcg(online, trans, action_space, hist_len=20, k=10)
# print(f"HR@10={hr:.4f}, NDCG@10={ndcg:.4f}")

In [None]:
# Diagnostic trio: GPU status, CPU forward sanity, guarded model->CUDA test
# Run this as one cell in your notebook.

import os, traceback, sys
import numpy as np
import torch

# Optional: If you want deterministic CUDA tracebacks, restart kernel and UNCOMMENT the next two lines
# BEFORE importing torch in a fresh kernel:
# import os
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

print("===== ENV & TORCH INFO =====")
print("python:", sys.version.splitlines()[0])
print("torch.__version__:", getattr(torch, "__version__", None))
print("cuda available:", torch.cuda.is_available())
print("cuda device count:", torch.cuda.device_count())
if torch.cuda.is_available():
    try:
        dev = torch.cuda.current_device()
        print("current device:", dev)
        try:
            print("device name:", torch.cuda.get_device_name(dev))
        except Exception as e:
            print("device name query failed:", e)
        try:
            print("allocated (bytes):", torch.cuda.memory_allocated(dev))
            print("reserved  (bytes):", torch.cuda.memory_reserved(dev))
        except Exception as e:
            print("memory query failed:", e)
    except Exception as e:
        print("cuda query failed:", e)
print("=============================\n")

# --- Parameters that must match your model shape ---
# Make sure these match how you constructed the network earlier.
# If you used different values, change them here before running.
num_items = len(action_space.skus) if 'action_space' in globals() else 500    # fallback guess
hist_len = 20
emb_dim = 64
hidden = 256

print("Using (num_items, hist_len, emb_dim, hidden) =", (num_items, hist_len, emb_dim, hidden))
print()

# --- Create the model on CPU and run a tiny forward pass ---
print("1) Creating model on CPU and running a tiny forward pass...")
try:
    # Import your model class if defined in another module; otherwise rely on DuelingDQN in this notebook.
    ModelClass = globals().get('DuelingDQN', None)
    if ModelClass is None:
        raise RuntimeError("DuelingDQN not found in globals() — define the class first or import it.")

    model = ModelClass(num_items, emb_dim, hist_len, hidden)   # CPU by default
    print("Model instance created successfully on CPU.")
    # Build a small dummy input batch (values must be valid indices for the model embedding).
    # If your embedding reserves pad index = num_items, allow values up to num_items.
    pad_limit = num_items  # allowed max index for states: either num_items (if pad reserved) or num_items-1
    dummy = np.zeros((4, hist_len), dtype=np.int64)            # small batch of zeros (padding)
    # optionally set some example indices within valid range:
    if pad_limit > 1:
        dummy[0, -1] = min(pad_limit - 1, 1)
        dummy[1, -2] = min(pad_limit - 1, 2)
    dummy_t = torch.from_numpy(dummy).long()
    out = model(dummy_t)
    print("Forward OK, q shape:", tuple(out.shape))
except Exception:
    print("Exception during CPU model creation / forward:")
    traceback.print_exc()
    raise

print("\n2) Attempting to move model to CUDA (if available) in guarded block...")
try:
    if torch.cuda.is_available():
        try:
            # sync & small free
            torch.cuda.synchronize()
        except Exception:
            pass

        try:
            model_cuda = model.to('cuda')
            # run a forward on GPU to ensure kernels launch
            dummy_gpu = dummy_t.to('cuda')
            out_gpu = model_cuda(dummy_gpu)
            print("Model moved to CUDA and forward passed. q shape (CUDA):", tuple(out_gpu.shape))
        except Exception as ex:
            print("Exception while moving model to CUDA or running forward on CUDA:")
            traceback.print_exc()
            # Helpful hints printed to the user
            print("\n--- HINTS ---")
            print("1) If you haven't, try restarting the kernel and setting:")
            print("   import os; os.environ['CUDA_LAUNCH_BLOCKING'] = '1'  # THEN import torch")
            print("2) If the above doesn't help, try running on CPU: set device='cpu' in your training call.")
            print("3) If model->cuda fails only after earlier CUDA errors, a kernel restart usually clears the CUDA context.")
            print("4) You may also have mismatched driver/CUDA versions or an out-of-memory condition.")
    else:
        print("CUDA not available; skipping model->CUDA test.")
except Exception:
    print("Unexpected exception in CUDA test:")
    traceback.print_exc()

print("\nAll diagnostics finished.")
