In [None]:
# Cell 1: Cài đặt (chạy 1 lần)
!pip install -q sentence-transformers wandb

In [None]:
# Cell 2: Imports, seed, device & W&B init
import os, random, math
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import LabelEncoder
from sentence_transformers import SentenceTransformer
import wandb

# --- Reproducibility: seed every random number generator used below ---
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(SEED)
del _seed_fn

# Ask cuDNN for deterministic kernels and disable its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Disable tokenizer parallelism to silence the corresponding warning.
os.environ.update(TOKENIZERS_PARALLELISM="false")

# Compute device: CUDA when available, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device={device}")

# Close any W&B run left open by a previous execution of this cell.
wandb.finish()

# Run-identification constants (not referenced elsewhere in the visible notebook).
Name = "NgocMinh"
Model_name = "LightFM"
Version = "1.0.0"

# SECURITY: the API key must never be hard-coded in the notebook. It is read
# from the WANDB_API_KEY environment variable (e.g. a Kaggle/Colab secret);
# when the variable is unset, `key=None` lets wandb fall back to its own
# credential lookup. The key that used to be committed here should be revoked.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

# Start the W&B run; `config` holds every hyper-parameter used by later cells.
wandb.init(
    project="hybrid-neumf-llm-newds",
    name="HybridNeuMF_NewDS",
    config={
        "mf_dim": 32,
        "mlp_layers": [64, 32, 16, 8],
        "llm_model": "all-MiniLM-L6-v2",
        "llm_dim": 384,
        "batch_size": 1024,
        "lr": 1e-3,
        "weight_decay": 1e-4,
        "epochs": 10,
        "K": 10
    }
)
cfg = wandb.config


In [None]:
# Cell 3: Load the rating / metadata tables and map raw ids to contiguous integers
train_df = pd.read_csv("/kaggle/input/databang/train_ratings.csv")   # user_id,item_id,rating,...
test_df  = pd.read_csv("/kaggle/input/databang/test_ratings.csv")
meta_df  = pd.read_csv("/kaggle/input/databang/filtered_metadata (2).csv")

# User encoder: fitted on every user_id that appears in train or test.
all_user_ids = pd.concat([train_df["user_id"], test_df["user_id"]], ignore_index=True)
user_enc = LabelEncoder().fit(all_user_ids)
for _frame in (train_df, test_df):
    _frame["uid"] = user_enc.transform(_frame["user_id"])

# Item encoder: fitted on every item_id that appears in train, test or metadata.
all_item_ids = pd.concat(
    [train_df["item_id"], test_df["item_id"], meta_df["item_id"]],
    ignore_index=True,
)
item_enc = LabelEncoder().fit(all_item_ids)
for _frame in (train_df, test_df, meta_df):
    _frame["iid"] = item_enc.transform(_frame["item_id"])

# Vocabulary sizes come from the encoders' class lists (not from .nunique()).
num_users, num_items = len(user_enc.classes_), len(item_enc.classes_)

print(f"✅ Encoded: {num_users} users, {num_items} items")



In [None]:
# Cell 4: Prepare one text field per item for the sentence encoder
# (description + features + categories, missing values treated as "").
#
# The guard makes the cell idempotent: after the first run "iid" is the index,
# not a column, so re-running the cell is a no-op instead of raising KeyError.
if "iid" in meta_df.columns:
    meta_df["text_input"] = (
        meta_df["description"].fillna("") + " " +
        meta_df["features"].fillna("")    + " " +
        meta_df["categories"].fillna("")
    )

    # Re-index so that row i holds the metadata of iid == i. Items without a
    # metadata row get "" in every column. Duplicate iid rows are dropped first
    # (keeping the first one) because `reindex` raises on duplicate labels.
    meta_df = (
        meta_df.drop_duplicates(subset="iid", keep="first")
        .set_index("iid")
        .reindex(range(num_items))
        .fillna("")
    )


In [None]:
# === Cell 4.5: Build one text embedding per user from their training reviews ===

# The sentence encoder is created here because this cell uses `llm` before the
# cell that originally defined it; without this line a fresh
# "Restart & Run All" fails with NameError.
llm = SentenceTransformer(cfg.llm_model)

# Step 1: join every non-null review text of a user into one string (train split only).
user_texts_partial = (
    train_df.groupby("user_id")["text"]
    .apply(lambda x: " ".join(x.dropna().astype(str)))
    .reset_index()
)

# Step 2: attach the integer uid produced by the user LabelEncoder.
user_texts_partial["uid"] = user_enc.transform(user_texts_partial["user_id"])

# Step 3: one text per uid in [0, num_users); users without any review get "".
user_text_dict = dict(zip(user_texts_partial["uid"], user_texts_partial["text"]))
user_text_array = [user_text_dict.get(uid, "") for uid in range(num_users)]

# Step 4: encode with the SentenceTransformer and move the result to the device.
user_llm_np = llm.encode(
    user_text_array,
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True,
)
user_llm_emb = torch.tensor(user_llm_np, dtype=torch.float32).to(device)

# Step 5: sanity-check the shape.
assert user_llm_emb.shape == (num_users, cfg.llm_dim)
print("✅ user_llm_emb shape:", user_llm_emb.shape)


In [None]:
# === Cell 4.6: Persist the computed embeddings as CPU tensors ===
# NOTE(review): in the visible notebook `item_llm_emb` is assigned in a later
# cell, so this cell needs that one to have been executed first.

save_dir = "/kaggle/working"  # or "./" when running a local notebook

torch.save(obj=item_llm_emb.cpu(), f=f"{save_dir}/item_llm_emb.pt")
torch.save(obj=user_llm_emb.cpu(), f=f"{save_dir}/user_llm_emb.pt")

print("✅ Đã lưu: item_llm_emb.pt và user_llm_emb.pt")


In [None]:
# === Reload the saved embeddings (avoids recomputing them on every run) ===
# Reads the two files written under `save_dir` and moves them to `device`.

item_llm_emb = torch.load(f=f"{save_dir}/item_llm_emb.pt").to(device=device)
user_llm_emb = torch.load(f=f"{save_dir}/user_llm_emb.pt").to(device=device)

print("✅ Đã load embedding vào RAM & đưa lên GPU")


In [None]:


# Create the sentence encoder (no explicit device argument, to avoid CUDA errors
# per the original author's note).
llm = SentenceTransformer(cfg.llm_model)

# One input text per item; the list length is expected to equal num_items.
item_texts = meta_df["text_input"].tolist()

# Encode all item texts; the result is requested as a numpy array.
encode_kwargs = dict(batch_size=64, show_progress_bar=True, convert_to_numpy=True)
item_emb_np = llm.encode(item_texts, **encode_kwargs)

# Guard against a mismatch between the metadata table and the configured dimension.
expected_shape = (num_items, cfg.llm_dim)
assert item_emb_np.shape == expected_shape

# Convert to a float32 torch tensor and move it to the selected device.
item_llm_emb = torch.tensor(item_emb_np, dtype=torch.float32).to(device)


In [None]:
# === Cell 6: Explicit-rating dataset (also returns the user text embedding) ===
class RatingDataset(Dataset):
    """Explicit-feedback dataset: one (user, item, rating) interaction per index.

    Each sample is the tuple ``(u, i, ll_user, ll_item, r)`` where ``u`` and
    ``i`` are int64 tensors of shape (1,), ``r`` is a float32 tensor of shape
    (1,), and ``ll_user`` / ``ll_item`` are the matching rows of the embedding
    tables with a leading axis of size 1 added.
    """

    def __init__(self, df, item_emb, user_emb):
        # Plain numpy columns for cheap per-sample access.
        self.uids = df["uid"].values
        self.iids = df["iid"].values
        self.ratings = df["rating"].values.astype(np.float32)
        # CPU copies of both embedding tables.
        self.item_emb = item_emb.cpu()
        self.user_emb = user_emb.cpu()

    def __len__(self):
        return len(self.uids)

    def __getitem__(self, idx):
        uid, iid = self.uids[idx], self.iids[idx]
        user_idx = torch.tensor([uid], dtype=torch.long)
        item_idx = torch.tensor([iid], dtype=torch.long)
        rating = torch.tensor([self.ratings[idx]], dtype=torch.float32)
        item_vec = self.item_emb[iid].unsqueeze(0)
        user_vec = self.user_emb[uid].unsqueeze(0)
        return user_idx, item_idx, user_vec, item_vec, rating

# Build the datasets and loaders for both splits (only the train loader shuffles).
train_ds, test_ds = (
    RatingDataset(split_df, item_llm_emb, user_llm_emb)
    for split_df in (train_df, test_df)
)

train_loader = DataLoader(train_ds, shuffle=True,  batch_size=cfg.batch_size, num_workers=0)
test_loader  = DataLoader(test_ds,  shuffle=False, batch_size=cfg.batch_size, num_workers=0)


In [None]:
# === Cell 7: HybridNeuMF extended with ll_user + ll_item text embeddings ===
class HybridNeuMF(nn.Module):
    """NeuMF-style scorer fused with precomputed user/item text embeddings.

    Three signals are concatenated and fed to a small prediction head:
    the element-wise product of the MF embeddings, the output of an MLP over
    the concatenated MLP embeddings, and the two text-embedding vectors.

    Args:
        n_users: number of rows in the user embedding tables.
        n_items: number of rows in the item embedding tables.
        mf_dim: width of the matrix-factorisation embeddings.
        mlp_layers: layer widths of the MLP tower; ``mlp_layers[0]`` is the
            width of the concatenated user+item MLP embeddings and must be even.
        llm_dim: width of each text-embedding vector.
        dropout: dropout probability applied before every MLP linear layer.
        fusion_hidden: hidden width of the prediction head.

    Raises:
        ValueError: if ``mlp_layers`` is empty or its first width is odd
            (the user and item halves could not be concatenated to that width).
    """

    def __init__(self, n_users, n_items, mf_dim, mlp_layers, llm_dim,
                 dropout=0.2, fusion_hidden=64):
        super().__init__()
        mlp_layers = list(mlp_layers)
        if not mlp_layers:
            raise ValueError("mlp_layers must contain at least one layer width")
        if mlp_layers[0] % 2 != 0:
            raise ValueError(
                f"mlp_layers[0] must be even (got {mlp_layers[0]}): it is split "
                "equally between the user and item MLP embeddings"
            )

        self.user_mf = nn.Embedding(n_users, mf_dim)
        self.item_mf = nn.Embedding(n_items, mf_dim)
        self.user_mlp = nn.Embedding(n_users, mlp_layers[0] // 2)
        self.item_mlp = nn.Embedding(n_items, mlp_layers[0] // 2)

        # Layer order (Dropout, Linear, ReLU) is kept so existing state_dict keys still match.
        blocks = []
        for d_in, d_out in zip(mlp_layers[:-1], mlp_layers[1:]):
            blocks += [nn.Dropout(dropout), nn.Linear(d_in, d_out), nn.ReLU()]
        self.mlp = nn.Sequential(*blocks)

        fusion_dim = mf_dim + mlp_layers[-1] + 2 * llm_dim
        self.predict = nn.Sequential(
            nn.Linear(fusion_dim, fusion_hidden),
            nn.ReLU(),
            nn.Linear(fusion_hidden, 1)
        )

    def forward(self, u, i, ll_user, ll_item):
        """Score user/item pairs.

        ``u`` and ``i`` may be shaped (B,) or (B, 1); ``ll_user`` and
        ``ll_item`` may be shaped (B, D) or (B, 1, D). Returns a (B, 1) tensor.
        """
        # Flatten explicitly instead of relying on squeeze(1), which silently
        # removes a real feature axis when an embedding width happens to be 1.
        u = u.reshape(-1)
        i = i.reshape(-1)
        batch = u.size(0)
        ll_user = ll_user.reshape(batch, ll_user.shape[-1])
        ll_item = ll_item.reshape(batch, ll_item.shape[-1])

        cf_vec = self.user_mf(u) * self.item_mf(i)
        mlp_vec = self.mlp(torch.cat([self.user_mlp(u), self.item_mlp(i)], dim=1))

        x = torch.cat([cf_vec, mlp_vec, ll_user, ll_item], dim=1)
        return self.predict(x)


In [None]:
# === Cell 7.5: Instantiate the model, the L2-regularised optimizer and the loss ===

model = HybridNeuMF(
    num_users,
    num_items,
    cfg.mf_dim,
    cfg.mlp_layers,
    cfg.llm_dim,
).to(device)

# weight_decay > 0 adds L2 regularisation on every parameter (e.g. 1e-4).
optimizer = torch.optim.Adam(
    params=model.parameters(),
    weight_decay=cfg.weight_decay,
    lr=cfg.lr,
)

# Squared-error objective on the model output.
criterion = nn.MSELoss()


In [None]:
# Cell 8: Metrics (Precision/Recall/NDCG/MAP/MRR + RMSE real)
import numpy as np

def precision_at_k(ranked, truth, k):
    """Number of distinct items among the first ``k`` of ``ranked`` that are in ``truth``, divided by ``k``."""
    top_k = set(ranked[:k])
    return len(top_k.intersection(truth)) / k

def recall_at_k(ranked, truth, k):
    """Distinct relevant items found in the first ``k`` of ``ranked``, divided by ``len(truth)`` (0.0 when ``truth`` is empty)."""
    if len(truth) == 0:
        return 0.0
    found = set(ranked[:k]).intersection(truth)
    return len(found) / len(truth)

def ndcg_at_k(ranked, truth, k):
    """Binary-relevance NDCG over the first ``k`` entries of ``ranked``.

    The ideal DCG assumes ``min(len(truth), k)`` hits at the top positions;
    0.0 is returned when that ideal value is zero.
    """
    dcg = 0.0
    for rank, item in enumerate(ranked[:k]):
        if item in truth:
            dcg += 1 / math.log2(rank + 2)

    idcg = 0.0
    for rank in range(min(len(truth), k)):
        idcg += 1 / math.log2(rank + 2)

    if idcg > 0:
        return dcg / idcg
    return 0.0

def map_at_k(ranked, truth, k):
    """Average precision over the first ``k`` entries, normalised by ``len(truth)`` (0.0 when ``truth`` is empty)."""
    hits = 0
    precision_sum = 0.0
    for position, item in enumerate(ranked[:k], start=1):
        if item in truth:
            hits += 1
            precision_sum += hits / position
    if len(truth) > 0:
        return precision_sum / len(truth)
    return 0.0

def mrr_at_k(ranked, truth, k):
    """Reciprocal rank of the first entry of ``ranked[:k]`` found in ``truth`` (0.0 if none)."""
    reciprocal_ranks = (
        1 / position
        for position, item in enumerate(ranked[:k], start=1)
        if item in truth
    )
    return next(reciprocal_ranks, 0.0)

def rmse_real(model, loader):
    """Root-mean-squared error between the raw model output and the labels in ``loader``.

    ``loader`` yields ``(u, i, ll_user, ll_item, r)`` batches; every tensor is
    moved to the global ``device`` before the forward pass. The model is put
    in eval mode and gradients are disabled.
    """
    model.eval()
    squared_error_total = 0.0
    count = 0
    with torch.no_grad():
        for batch in loader:
            u, i, ll_user, ll_item, r = (t.to(device) for t in batch)
            diff = model(u, i, ll_user, ll_item).view(-1) - r.view(-1)
            squared_error_total += (diff ** 2).sum().item()
            count += r.numel()
    return math.sqrt(squared_error_total / count)

@torch.no_grad()
def evaluate_full(model, train_df, test_df, K, loader):
    """Full-ranking evaluation plus RMSE.

    For every user in ``test_df`` all ``num_items`` items are scored, items the
    user interacted with in ``train_df`` are masked out, and Precision/Recall/
    NDCG/MAP/MRR at ``K`` are computed against the user's test items. The
    per-user values are averaged. RMSE is computed by ``rmse_real`` on ``loader``.

    Reads the notebook globals ``num_items``, ``item_llm_emb``,
    ``user_llm_emb`` and ``device``.

    Returns:
        dict with keys "Precision@K", "Recall@K", "NDCG@K", "MAP@K", "MRR@K", "RMSE".
    """
    model.eval()
    train_map = train_df.groupby("uid")["iid"].apply(set).to_dict()
    test_map  = test_df.groupby("uid")["iid"].apply(list).to_dict()

    # Loop-invariant inputs are built once instead of once per user.
    items = torch.arange(num_items, device=device)
    ll_item = item_llm_emb.to(device).unsqueeze(1)

    P, R, N, AP, MRR = [], [], [], [], []

    for u, truth in test_map.items():
        if not truth:
            continue

        # torch.full avoids materialising a Python list of num_items entries;
        # expand() broadcasts the user vector without copying it num_items times.
        users = torch.full((num_items,), int(u), dtype=torch.long, device=device)
        ll_user = user_llm_emb[u].to(device).reshape(1, 1, -1).expand(num_items, 1, -1)

        scores = model(users, items, ll_user, ll_item).view(-1).cpu().numpy()

        # Never recommend items already seen during training.
        seen = train_map.get(u)
        if seen:
            scores[list(seen)] = -np.inf

        ranked = np.argsort(-scores)
        P.append(precision_at_k(ranked, truth, K))
        R.append(recall_at_k(ranked, truth, K))
        N.append(ndcg_at_k(ranked, truth, K))
        AP.append(map_at_k(ranked, truth, K))
        MRR.append(mrr_at_k(ranked, truth, K))

    return {
        "Precision@K": np.mean(P),
        "Recall@K":    np.mean(R),
        "NDCG@K":      np.mean(N),
        "MAP@K":       np.mean(AP),
        "MRR@K":       np.mean(MRR),
        "RMSE":        rmse_real(model, loader)
    }



In [None]:
# Sanity check: the largest encoded id must be smaller than the size of the
# corresponding embedding table, otherwise the lookup would go out of range.
print("max uid in train_df =", train_df.uid.max())
print("num_users in model  =", model.user_mlp.weight.shape[0])

print("max iid in train_df =", train_df.iid.max())
print("num_items in model  =", model.item_mlp.weight.shape[0])


In [None]:
# === Cell 9: Training loop (explicit ratings) using ll_user + ll_item ===
results = []
for ep in range(1, cfg.epochs + 1):
    model.train()
    total_loss = 0.0

    progress = tqdm(train_loader, desc=f"Epoch {ep}")
    for batch in progress:
        # Move the whole batch to the compute device.
        u, i, ll_user, ll_item, r = (t.to(device) for t in batch)
        target = r.view(-1)

        optimizer.zero_grad()
        loss = criterion(model(u, i, ll_user, ll_item).view(-1), target)
        loss.backward()
        optimizer.step()
        # Weight the batch-mean loss by the batch size to get a sum over samples.
        total_loss += loss.item() * r.size(0)

    train_loss = total_loss / len(train_ds)

    # Ranking metrics + RMSE on the test split after every epoch.
    metrics = evaluate_full(model, train_df, test_df, cfg.K, test_loader)
    metrics.update({"Train Loss": train_loss, "epoch": ep})

    wandb.log(metrics)
    results.append(metrics)

    summary = "  ".join(f"{k}={v:.4f}" for k, v in metrics.items() if k != "epoch")
    print(f"Epoch {ep} — " + summary)
    

In [None]:
# Cell 10: Show the per-epoch metrics table, then save the model weights and the metrics
# One row per epoch, indexed by the "epoch" value stored in each metrics dict.
df_res = pd.DataFrame(results).set_index("epoch")
display(df_res)
# Persist the current model weights (state_dict only) and the metrics table as CSV.
torch.save(model.state_dict(), "/kaggle/working/hybrid_neumf_llm_newds.pth")
df_res.to_csv("/kaggle/working/hybrid_neumf_llm_newds_metrics.csv")
print("✅ Done.")


In [None]:
# === Cell 6-N: Implicit-feedback dataset with negative sampling ===
class ImplicitDataset(Dataset):
    """Positive interactions plus ``num_neg`` randomly sampled negatives each.

    Indexing returns a list of ``1 + num_neg`` samples for one positive
    ``(uid, iid)`` row: the positive (label 1.0) followed by the negatives
    (label 0.0). Every sample is the tuple
    ``(u, i, ll_user, ll_item, label)`` with shapes (1,), (1,), (1, D), (1, D), (1,).

    Args:
        df_pos: frame with integer "uid" and "iid" columns (observed interactions).
        num_items: size of the item id space; negatives are drawn from
            ``[0, num_items)``.
        item_emb, user_emb: embedding tables, moved to the global ``device``.
        num_neg: negatives sampled per positive.
    """

    def __init__(self, df_pos, num_items, item_emb, user_emb, num_neg=4):
        self.pos = df_pos[["uid", "iid"]].values
        self.user_pos_dict = df_pos.groupby("uid")["iid"].apply(set).to_dict()
        self.num_items = num_items
        self.num_neg = num_neg
        self.item_emb = item_emb.to(device)
        self.user_emb = user_emb.to(device)

    def __len__(self): return len(self.pos)

    def __getitem__(self, idx):
        u, i_pos = self.pos[idx]
        seen = self.user_pos_dict.get(u, set())

        # Rejection sampling only terminates if at least one unseen item exists.
        # A user who interacted with every item gets no negatives instead of
        # hanging forever (assumes all iids lie in [0, num_items) — TODO confirm).
        wanted_neg = self.num_neg if len(seen) < self.num_items else 0

        triplets = [(u, i_pos, 1.0)]
        while len(triplets) <= wanted_neg:
            i_neg = np.random.randint(0, self.num_items)
            if i_neg not in seen:
                triplets.append((u, i_neg, 0.0))

        return [
            (
                torch.tensor([uu]), torch.tensor([ii]),
                self.user_emb[uu].unsqueeze(0),
                self.item_emb[ii].unsqueeze(0),
                torch.tensor([label], dtype=torch.float32)
            )
            for (uu, ii, label) in triplets
        ]

def collate_triplets(batch):
    """Flatten a batch of per-positive sample lists and concatenate each field.

    ``batch`` is a list whose elements are lists of
    ``(u, i, ll_user, ll_item, label)`` tuples; the result is a 5-tuple of
    tensors, each the concatenation (along dim 0) of one field over all samples.
    """
    flat_samples = []
    for sample_group in batch:
        flat_samples.extend(sample_group)
    users, items, user_vecs, item_vecs, labels = zip(*flat_samples)
    return tuple(
        torch.cat(field) for field in (users, items, user_vecs, item_vecs, labels)
    )

# Rebinds `train_ds` / `train_loader` to the implicit-feedback dataset
# (4 sampled negatives per positive), batched with `collate_triplets`.
train_ds = ImplicitDataset(
    df_pos=train_df,
    num_items=num_items,
    item_emb=item_llm_emb,
    user_emb=user_llm_emb,
    num_neg=4,
)
train_loader = DataLoader(
    dataset=train_ds,
    batch_size=cfg.batch_size,
    shuffle=True,
    num_workers=0,
    collate_fn=collate_triplets,
)

In [None]:
# === Cell 8: Ranking metrics (precision@K, recall@K, ndcg@K, ...) ===
def precision_at_k(ranked, truth, k):
    """Distinct relevant items within the first ``k`` ranked entries, divided by ``k``."""
    relevant_in_top_k = set(ranked[:k]).intersection(truth)
    return len(relevant_in_top_k) / k

def recall_at_k(ranked, truth, k):
    """Share of ``truth`` recovered in the first ``k`` ranked entries (0.0 when ``truth`` is empty)."""
    if not truth:
        return 0.0
    recovered = set(ranked[:k]).intersection(truth)
    return len(recovered) / len(truth)

def ndcg_at_k(ranked, truth, k):
    """Binary-relevance NDCG at ``k``; returns 0.0 when the ideal DCG is zero."""
    def discount(rank):
        # Gain of a hit at zero-based position ``rank``.
        return 1 / math.log2(rank + 2)

    dcg = 0.0
    for rank, item in enumerate(ranked[:k]):
        if item in truth:
            dcg += discount(rank)

    idcg = 0.0
    for rank in range(min(len(truth), k)):
        idcg += discount(rank)

    return dcg / idcg if idcg > 0 else 0.0

def map_at_k(ranked, truth, k):
    """Average precision at ``k`` normalised by ``len(truth)`` (0.0 when ``truth`` is empty)."""
    hit_count = 0
    running_precision = 0.0
    for position, item in enumerate(ranked[:k], start=1):
        if item not in truth:
            continue
        hit_count += 1
        running_precision += hit_count / position
    if not truth:
        return 0.0
    return running_precision / len(truth)

def mrr_at_k(ranked, truth, k):
    """Reciprocal rank of the first relevant entry within ``ranked[:k]``; 0.0 if there is none."""
    position = 0
    for item in ranked[:k]:
        position += 1
        if item in truth:
            return 1 / position
    return 0.0

@torch.no_grad()
def evaluate_full(model, train_df, test_df, K, loader):
    """Full-ranking evaluation (ranking metrics only).

    For every user in ``test_df`` all ``num_items`` items are scored, items the
    user interacted with in ``train_df`` are masked out, and Precision/Recall/
    NDCG/MAP/MRR at ``K`` are computed against the user's test items, then
    averaged over users. ``loader`` is accepted for signature compatibility
    and is not used.

    Reads the notebook globals ``num_items``, ``item_llm_emb``,
    ``user_llm_emb`` and ``device``.

    Returns:
        dict with keys "Precision@K", "Recall@K", "NDCG@K", "MAP@K", "MRR@K".
    """
    model.eval()
    train_map = train_df.groupby("uid")["iid"].apply(set).to_dict()
    test_map = test_df.groupby("uid")["iid"].apply(list).to_dict()

    # Loop-invariant inputs are built once instead of once per user.
    items = torch.arange(num_items, device=device)
    ll_item = item_llm_emb.to(device).unsqueeze(1)

    P, R, N, AP, MRR = [], [], [], [], []
    for u, truth in test_map.items():
        if not truth:
            continue

        # torch.full avoids a Python list of num_items entries; expand()
        # broadcasts the user vector without copying it num_items times.
        users = torch.full((num_items,), int(u), dtype=torch.long, device=device)
        ll_user = user_llm_emb[u].to(device).reshape(1, 1, -1).expand(num_items, 1, -1)
        scores = model(users, items, ll_user, ll_item).view(-1).cpu().numpy()

        # Never recommend items already seen during training.
        seen = train_map.get(u)
        if seen:
            scores[list(seen)] = -np.inf

        ranked = np.argsort(-scores)
        P.append(precision_at_k(ranked, truth, K))
        R.append(recall_at_k(ranked, truth, K))
        N.append(ndcg_at_k(ranked, truth, K))
        AP.append(map_at_k(ranked, truth, K))
        MRR.append(mrr_at_k(ranked, truth, K))
    return {
        "Precision@K": np.mean(P),
        "Recall@K": np.mean(R),
        "NDCG@K": np.mean(N),
        "MAP@K": np.mean(AP),
        "MRR@K": np.mean(MRR),
    }

In [None]:
@torch.no_grad()
def compute_rmse(model, data_loader):
    """RMSE between ``sigmoid(model output)`` and the labels yielded by ``data_loader``.

    ``data_loader`` yields ``(u, i, ll_user, ll_item, r)`` batches; every
    tensor is moved to the global ``device``. Labels and predictions are both
    flattened to 1-D before the difference is taken, so labels shaped (B, 1)
    cannot silently broadcast against (B,) predictions into a (B, B) matrix.

    NOTE(review): the sigmoid assumes the model was trained to emit logits;
    the only loss visible in this notebook is nn.MSELoss on the raw output —
    confirm the two are meant to be combined.
    """
    model.eval()
    preds, targets = [], []
    for u, i, ll_user, ll_item, r in data_loader:
        u, i, ll_user, ll_item, r = u.to(device), i.to(device), ll_user.to(device), ll_item.to(device), r.to(device)
        logits = model(u, i, ll_user, ll_item).reshape(-1)
        preds.append(torch.sigmoid(logits).cpu())
        targets.append(r.reshape(-1).cpu())
    pred = torch.cat(preds)
    true = torch.cat(targets)
    rmse = torch.sqrt(torch.mean((pred - true) ** 2)).item()
    return rmse


In [None]:
# === Cell 9: Training loop (implicit feedback) with early stopping on NDCG@K ===
# NOTE(review): `model`, `optimizer` and `criterion` are whatever the earlier
# cells left in the kernel; in the visible notebook `criterion` is nn.MSELoss
# while `compute_rmse` applies a sigmoid to the output — confirm this pairing.

# Start from -inf so the first epoch always writes "best_model.pt", even when
# NDCG@K is exactly 0.0; later cells load that file unconditionally.
best_ndcg, patience, counter = float("-inf"), 3, 0
results = []

for ep in range(1, cfg.epochs + 1):
    model.train()
    total_loss, n_samples = 0.0, 0

    for u, i, ll_user, ll_item, r in tqdm(train_loader, desc=f"Epoch {ep}"):
        u, i, ll_user, ll_item, r = u.to(device), i.to(device), ll_user.to(device), ll_item.to(device), r.to(device)
        optimizer.zero_grad()
        pred = model(u, i, ll_user, ll_item).view(-1)
        loss = criterion(pred, r.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * r.numel()
        n_samples += r.numel()

    # Each dataset item expands into 1 positive + num_neg negatives, so the
    # mean must be taken over the samples actually seen, not len(dataset).
    train_loss = total_loss / max(n_samples, 1)
    metrics = evaluate_full(model, train_df, test_df, cfg.K, None)
    # RMSE here is measured on the training loader (freshly sampled negatives).
    rmse = compute_rmse(model, train_loader)
    metrics["RMSE"] = rmse

    metrics["Train Loss"] = train_loss
    metrics["epoch"] = ep
    wandb.log(metrics)
    results.append(metrics)

    print(f"Epoch {ep} — " + "  ".join(f"{k}={v:.4f}" for k, v in metrics.items() if k != "epoch"))

    # Early stopping: keep the checkpoint with the best NDCG@K, stop after
    # `patience` consecutive epochs without improvement.
    ndcg = metrics["NDCG@K"]
    if ndcg > best_ndcg:
        best_ndcg = ndcg
        counter = 0
        torch.save(model.state_dict(), "best_model.pt")
    else:
        counter += 1
        if counter >= patience:
            print(f"\n🛑 Early stopping at epoch {ep} — best NDCG@K = {best_ndcg:.4f}")
            break


In [None]:
# === Compute RMSE on train and test sets ===
@torch.no_grad()
def compute_rmse(model, data_loader):
    """RMSE between ``sigmoid(model output)`` and the labels yielded by ``data_loader``.

    ``data_loader`` yields ``(u, i, ll_user, ll_item, r)`` batches; every
    tensor is moved to the global ``device``. Labels and predictions are both
    flattened to 1-D before the difference is taken, so labels shaped (B, 1)
    cannot silently broadcast against (B,) predictions into a (B, B) matrix.

    NOTE(review): the sigmoid assumes the model was trained to emit logits;
    the only loss visible in this notebook is nn.MSELoss on the raw output —
    confirm the two are meant to be combined.
    """
    model.eval()
    preds, targets = [], []
    for u, i, ll_user, ll_item, r in data_loader:
        u, i, ll_user, ll_item, r = u.to(device), i.to(device), ll_user.to(device), ll_item.to(device), r.to(device)
        logits = model(u, i, ll_user, ll_item).reshape(-1)
        preds.append(torch.sigmoid(logits).cpu())
        targets.append(r.reshape(-1).cpu())
    pred = torch.cat(preds)
    true = torch.cat(targets)
    rmse = torch.sqrt(torch.mean((pred - true) ** 2)).item()
    return rmse

# Dataset used for the test split
class ExplicitDataset(Dataset):
    """Observed (uid, iid) pairs, each labelled 1.0.

    Each sample is ``(u, i, ll_user, ll_item, label)`` with shapes
    (1,), (1,), (1, D_user), (1, D_item), (1,). Both embedding tables are moved
    to the global ``device`` at construction time.
    """

    def __init__(self, df, user_emb, item_emb):
        self.pairs = df[["uid", "iid"]].values
        # Every row is an observed interaction, hence an all-ones label vector.
        self.labels = np.ones(len(df))
        self.user_emb = user_emb.to(device)
        self.item_emb = item_emb.to(device)

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        uid, iid = self.pairs[idx]
        user_idx = torch.tensor([uid])
        item_idx = torch.tensor([iid])
        user_vec = self.user_emb[uid].unsqueeze(0)
        item_vec = self.item_emb[iid].unsqueeze(0)
        label = torch.tensor([self.labels[idx]], dtype=torch.float32)
        return (user_idx, item_idx, user_vec, item_vec, label)

# Collate function for the explicit dataset
def collate_explicit(batch):
    """Concatenate each field of ``(u, i, ll_user, ll_item, label)`` samples along dim 0."""
    users, items, user_vecs, item_vecs, labels = zip(*batch)
    return tuple(
        torch.cat(field) for field in (users, items, user_vecs, item_vecs, labels)
    )

# Load the best checkpoint written by the early-stopping loop.
# map_location keeps the load working when the checkpoint was saved from a GPU
# run and this cell is executed on a CPU-only runtime (or the other way round).
model.load_state_dict(torch.load("best_model.pt", map_location=device))
model.eval()

# Build the DataLoader for the test split.
test_ds = ExplicitDataset(test_df, user_llm_emb, item_llm_emb)
test_loader = DataLoader(test_ds, batch_size=cfg.batch_size, shuffle=False, collate_fn=collate_explicit)

# RMSE on the train loader and on the test loader.
rmse_train = compute_rmse(model, train_loader)
rmse_test  = compute_rmse(model, test_loader)

# Report the results.
print(f"✅ RMSE on train set: {rmse_train:.4f}")
print(f"✅ RMSE on test set : {rmse_test:.4f}")


In [None]:
import torch
@torch.no_grad()
def compute_hr_at_k(model, train_df, test_df, item_llm_emb, user_llm_emb, K=10):
    """Hit ratio at ``K``: share of test users with at least one test item in their top ``K``.

    For each user in ``test_df`` every item is scored, items the user
    interacted with in ``train_df`` are masked out, and the user counts as a
    hit when any of their test items is among the ``K`` best-scored items.

    Args:
        model: scorer called as ``model(users, items, ll_user, ll_item)``.
        train_df, test_df: frames with integer "uid" and "iid" columns.
        item_llm_emb: item text embeddings; its first dimension defines the
            number of items that are ranked.
        user_llm_emb: user text embeddings, indexed by uid.
        K: cut-off of the ranking.

    Returns:
        Mean of the per-user hit indicators. Uses the global ``device``.
    """
    model.eval()
    num_items = item_llm_emb.shape[0]
    train_map = train_df.groupby("uid")["iid"].apply(set).to_dict()
    test_map = test_df.groupby("uid")["iid"].apply(list).to_dict()

    # Loop-invariant inputs are built once instead of once per user.
    items = torch.arange(num_items, device=device)
    ll_item = item_llm_emb.to(device).unsqueeze(1)

    HR = []
    for u, truth in test_map.items():
        if not truth:
            continue

        # torch.full avoids a Python list of num_items entries; expand()
        # broadcasts the user vector without copying it num_items times.
        users = torch.full((num_items,), int(u), dtype=torch.long, device=device)
        ll_user = user_llm_emb[u].to(device).reshape(1, 1, -1).expand(num_items, 1, -1)
        scores = model(users, items, ll_user, ll_item).view(-1).cpu().numpy()

        # Exclude items already seen during training.
        seen = train_map.get(u)
        if seen:
            scores[list(seen)] = -np.inf

        top_k = set(np.argsort(-scores)[:K].tolist())
        HR.append(int(not top_k.isdisjoint(truth)))

    return np.mean(HR)

# Evaluate HR@10 with the best checkpoint.
# map_location keeps the load working when the checkpoint was saved from a GPU
# run and this cell is executed on a CPU-only runtime (or the other way round).
model.load_state_dict(torch.load("best_model.pt", map_location=device))
model.eval()

hr10 = compute_hr_at_k(model, train_df, test_df, item_llm_emb, user_llm_emb, K=10)
print(f"✅ HR@10: {hr10:.4f}")
