In [208]:
import math, random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

# -----------------------------
# Device
# -----------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cpu


In [209]:
ratings = pd.read_csv("data/ratings.csv").drop(columns=["timestamp"])
ratings["label"] = (ratings["rating"] >= 4.0).astype(int)

uids = ratings["userId"].unique()
iids = ratings["movieId"].unique()
uid2idx = {u:i for i,u in enumerate(uids)}
iid2idx = {m:i for i,m in enumerate(iids)}
ratings["userId"] = ratings["userId"].map(uid2idx)
ratings["movieId"] = ratings["movieId"].map(iid2idx)

n_users = ratings["userId"].nunique()
n_items = ratings["movieId"].nunique()

In [210]:
ratings.head(5)

Unnamed: 0,userId,movieId,rating,label
0,0,0,4.0,1
1,0,1,4.0,1
2,0,2,4.0,1
3,0,3,5.0,1
4,0,4,5.0,1


In [211]:
# one positive per user in test
pos = ratings[ratings.label == 1]
test_idx = pos.groupby("userId", group_keys=False).apply(lambda x: x.sample(1, random_state=42)).index
test_df = ratings.loc[test_idx][["userId","movieId","label"]]
train_df = ratings.drop(test_idx)

train_ui = set(zip(train_df.userId.tolist(), train_df.movieId.tolist()))

print(f"Train size: {len(train_df)}, Test size: {len(test_df)}")

Train size: 100227, Test size: 609


  test_idx = pos.groupby("userId", group_keys=False).apply(lambda x: x.sample(1, random_state=42)).index


In [212]:
# -----------------------------
# 2) GMF model (implicit)
# -----------------------------
class GMF_Implicit(nn.Module):
    def __init__(self, n_users, n_items, k=32):
        super().__init__()
        self.user_emb = nn.Embedding(n_users, k)
        self.item_emb = nn.Embedding(n_items, k)
        nn.init.normal_(self.user_emb.weight, std=0.01)
        nn.init.normal_(self.item_emb.weight, std=0.01)

        # linear layer on top of elementwise product
        self.fc = nn.Linear(k, 1)
        nn.init.xavier_uniform_(self.fc.weight)
        nn.init.zeros_(self.fc.bias)

    def forward(self, u, i):
        z = self.user_emb(u) * self.item_emb(i)  # elementwise product
        return self.fc(z).view(-1)  # logits (before sigmoid)
        

In [213]:
# 3) Negative sampling (same as MF)
# -----------------------------
def sample_train_batch(train_df, n_items, train_ui, num_neg=4, users_subset=None):
    if users_subset is None:
        users = train_df[train_df.label==1]["userId"].unique().tolist()
    else:
        users = users_subset
    U,I,Y = [],[],[]
    for u in users:
        pos_items = train_df[(train_df.userId==u) & (train_df.label==1)].movieId.tolist()
        if not pos_items: continue
        ip = random.choice(pos_items)
        U.append(u); I.append(ip); Y.append(1.0)
        negs=0
        while negs<num_neg:
            j = random.randint(0, n_items-1)
            if (u,j) not in train_ui and j!=ip:
                U.append(u); I.append(j); Y.append(0.0)
                negs+=1
    return torch.LongTensor(U), torch.LongTensor(I), torch.FloatTensor(Y)

In [214]:
# -----------------------------
# 4) Evaluation (HR@K, NDCG@K)
# -----------------------------
def eval_hr_ndcg(model, test_df, train_ui, n_items, k=10, n_neg=100, device="cpu"):
    model.eval(); HR=0.0; N=0.0; cnt=0
    rng = np.random.default_rng(123)
    with torch.no_grad():
        for _, row in test_df.iterrows():
            u, ip = int(row.userId), int(row.movieId)
            seen = {i for (uu,i) in train_ui if uu==u}
            negs = []
            while len(negs)<n_neg:
                j = int(rng.integers(0,n_items))
                if j!=ip and j not in seen: negs.append(j)
            cands = [ip]+negs
            U = torch.LongTensor([u]*(1+n_neg)).to(device)
            I = torch.LongTensor(cands).to(device)
            scores = model(U,I).cpu().numpy()
            topk_idx = scores.argsort()[-k:][::-1]
            topk = [cands[t] for t in topk_idx]
            hit = 1.0 if ip in topk else 0.0; HR+=hit
            if hit:
                rank = topk.index(ip)+1
                N+=1.0/math.log2(rank+1)
            cnt+=1
    return HR/cnt, N/cnt

In [215]:
# -----------------------------
# 5) Train GMF implicit
# -----------------------------
model = GMF_Implicit(n_users, n_items, k=32).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=8e-4, weight_decay=1e-6)

epochs = 40
for e in range(1, epochs+1):
    model.train()
    U,I,Y = sample_train_batch(train_df, n_items, train_ui, num_neg=4)
    U,I,Y = U.to(device), I.to(device), Y.to(device)
    optimizer.zero_grad()
    loss = criterion(model(U,I), Y); loss.backward(); optimizer.step()

    if e==1 or e%2==0:
        hr, ndcg = eval_hr_ndcg(model, test_df, train_ui, n_items, k=10, n_neg=100, device=device)
        print(f"Epoch {e:02d} | Loss: {loss.item():.4f} | HR@10: {hr:.4f} | NDCG@10: {ndcg:.4f}")


Epoch 01 | Loss: 0.6931 | HR@10: 0.0936 | NDCG@10: 0.0462
Epoch 02 | Loss: 0.6929 | HR@10: 0.0969 | NDCG@10: 0.0478
Epoch 04 | Loss: 0.6924 | HR@10: 0.1002 | NDCG@10: 0.0490
Epoch 06 | Loss: 0.6919 | HR@10: 0.1034 | NDCG@10: 0.0497
Epoch 08 | Loss: 0.6915 | HR@10: 0.1051 | NDCG@10: 0.0521
Epoch 10 | Loss: 0.6910 | HR@10: 0.1051 | NDCG@10: 0.0520
Epoch 12 | Loss: 0.6905 | HR@10: 0.1100 | NDCG@10: 0.0531
Epoch 14 | Loss: 0.6900 | HR@10: 0.1166 | NDCG@10: 0.0556
Epoch 16 | Loss: 0.6896 | HR@10: 0.1166 | NDCG@10: 0.0561
Epoch 18 | Loss: 0.6891 | HR@10: 0.1232 | NDCG@10: 0.0595
Epoch 20 | Loss: 0.6886 | HR@10: 0.1314 | NDCG@10: 0.0626
Epoch 22 | Loss: 0.6881 | HR@10: 0.1314 | NDCG@10: 0.0631
Epoch 24 | Loss: 0.6877 | HR@10: 0.1379 | NDCG@10: 0.0682
Epoch 26 | Loss: 0.6872 | HR@10: 0.1544 | NDCG@10: 0.0743
Epoch 28 | Loss: 0.6867 | HR@10: 0.1511 | NDCG@10: 0.0779
Epoch 30 | Loss: 0.6863 | HR@10: 0.1511 | NDCG@10: 0.0803
Epoch 32 | Loss: 0.6858 | HR@10: 0.1626 | NDCG@10: 0.0854
Epoch 34 | Los