In [21]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import random
import math

In [22]:
# IMPLICIT FEEBACK
ratings = pd.read_csv("data/ratings.csv").drop(columns=["timestamp"])
ratings["label"] = (ratings["rating"] >= 4.0).astype(int)

In [23]:
ratings

Unnamed: 0,userId,movieId,rating,label
0,1,1,4.0,1
1,1,3,4.0,1
2,1,6,4.0,1
3,1,47,5.0,1
4,1,50,5.0,1
...,...,...,...,...
100831,610,166534,4.0,1
100832,610,168248,5.0,1
100833,610,168250,5.0,1
100834,610,168252,5.0,1


In [24]:

uids = ratings["userId"].unique()
iids = ratings["movieId"].unique()

uid2idx = {u:i for i,u in enumerate(uids)}
iid2idx = {m:i for i,m in enumerate(iids)}

ratings["userId"] = ratings["userId"].map(uid2idx)
ratings["movieId"] = ratings["movieId"].map(iid2idx)

n_users = ratings["userId"].nunique()
n_items = ratings["movieId"].nunique()

In [25]:
# one positive per user in test (only users with >=1 positive)
pos = ratings[ratings.label == 1]
test_idx = pos.groupby("userId", group_keys=False).apply(lambda x: x.sample(1, random_state=42)).index
test_df = ratings.loc[test_idx][["userId","movieId","label"]]
train_df = ratings.drop(test_idx)
train_ui = set(zip(train_df.userId.tolist(), train_df.movieId.tolist()))


  test_idx = pos.groupby("userId", group_keys=False).apply(lambda x: x.sample(1, random_state=42)).index


In [26]:
# ---- Model ----
class NeuMF(nn.Module):
    def __init__(self, n_users, n_items, k_gmf=32, k_mlp=32, mlp_layers=(64,32,16)):
        super().__init__()
        self.ug = nn.Embedding(n_users, k_gmf); nn.init.normal_(self.ug.weight, std=0.01)
        self.ig = nn.Embedding(n_items, k_gmf); nn.init.normal_(self.ig.weight, std=0.01)
        self.um = nn.Embedding(n_users, k_mlp); nn.init.normal_(self.um.weight, std=0.01)
        self.im = nn.Embedding(n_items, k_mlp); nn.init.normal_(self.im.weight, std=0.01)
        layers = []; d = 2*k_mlp
        for h in mlp_layers:
            layers += [nn.Linear(d, h), nn.ReLU()]; d = h
        self.mlp = nn.Sequential(*layers)
        self.fc = nn.Linear(k_gmf + mlp_layers[-1], 1)
        nn.init.xavier_uniform_(self.fc.weight); nn.init.zeros_(self.fc.bias)
    def forward(self, u, i):
        g = self.ug(u) * self.ig(i)
        m = self.mlp(torch.cat([self.um(u), self.im(i)], dim=1))
        return self.fc(torch.cat([g, m], dim=1)).view(-1)

In [27]:
# ---- Train sampling ----
def sample_train_batch(train_df, n_items, train_ui, num_neg=4, users_subset=None):
    if users_subset is None:
        users = train_df[train_df.label==1]["userId"].unique().tolist()
    else:
        users = users_subset
    U, I, Y = [], [], []
    for u in users:
        pos_items = train_df[(train_df.userId==u) & (train_df.label==1)].movieId.tolist()
        if not pos_items: continue
        ip = random.choice(pos_items)
        U.append(u); I.append(ip); Y.append(1.0)
        negs = 0
        while negs < num_neg:
            j = random.randint(0, n_items-1)
            if (u, j) not in train_ui and j != ip:
                U.append(u); I.append(j); Y.append(0.0); negs += 1
    return torch.LongTensor(U), torch.LongTensor(I), torch.FloatTensor(Y)

In [28]:
# ---- Eval (sampled negatives) ----
def eval_hr_ndcg(model, test_df, train_ui, n_items, k=10, n_neg=100, device="cpu"):
    model.eval(); HR=0.0; N=0.0; cnt=0
    rng = np.random.default_rng(123)
    with torch.no_grad():
        for _, row in test_df.iterrows():
            u, ip = int(row.userId), int(row.movieId)
            seen = {i for (uu,i) in train_ui if uu==u}
            negs = []
            while len(negs) < n_neg:
                j = int(rng.integers(0, n_items))
                if j != ip and j not in seen: negs.append(j)
            cands = [ip] + negs
            U = torch.LongTensor([u]*(1+n_neg)).to(device)
            I = torch.LongTensor(cands).to(device)
            scores = model(U, I).cpu().numpy()
            topk_idx = scores.argsort()[-k:][::-1]
            topk = [cands[t] for t in topk_idx]
            hit = 1.0 if ip in topk else 0.0; HR += hit
            if hit:
                rank = topk.index(ip) + 1
                N += 1.0 / math.log2(rank+1)
            cnt += 1
    return HR/cnt, N/cnt

In [29]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = NeuMF(n_users, n_items, k_gmf=32, k_mlp=32, mlp_layers=(64,32,16)).to(device)
opt = torch.optim.Adam(model.parameters(), lr=8e-4, weight_decay=1e-6)
crit = nn.BCEWithLogitsLoss()

epochs = 30
for e in range(1, epochs+1):
    model.train()
    # (optional) sample a subset of users each epoch for speed on larger data
    batch_users = None
    U,I,Y = sample_train_batch(train_df, n_items, train_ui, num_neg=4, users_subset=batch_users)
    U,I,Y = U.to(device), I.to(device), Y.to(device)
    opt.zero_grad()
    loss = crit(model(U,I), Y); loss.backward(); opt.step()

    if e==1 or e%2==0:
        hr, ndcg = eval_hr_ndcg(model, test_df, train_ui, n_items, k=10, n_neg=100, device=device)
        print(f"Epoch {e:02d} | BCE: {loss.item():.4f} | HR@10: {hr:.4f} | NDCG@10: {ndcg:.4f}")


Epoch 01 | BCE: 0.6807 | HR@10: 0.1626 | NDCG@10: 0.0814
Epoch 02 | BCE: 0.6795 | HR@10: 0.2053 | NDCG@10: 0.1072
Epoch 04 | BCE: 0.6770 | HR@10: 0.3350 | NDCG@10: 0.1911
Epoch 06 | BCE: 0.6745 | HR@10: 0.4647 | NDCG@10: 0.2788
Epoch 08 | BCE: 0.6720 | HR@10: 0.5255 | NDCG@10: 0.3406
Epoch 10 | BCE: 0.6695 | HR@10: 0.5764 | NDCG@10: 0.3862
Epoch 12 | BCE: 0.6670 | HR@10: 0.6010 | NDCG@10: 0.4069
Epoch 14 | BCE: 0.6644 | HR@10: 0.6125 | NDCG@10: 0.4190
Epoch 16 | BCE: 0.6617 | HR@10: 0.6338 | NDCG@10: 0.4320
Epoch 18 | BCE: 0.6590 | HR@10: 0.6486 | NDCG@10: 0.4445
Epoch 20 | BCE: 0.6562 | HR@10: 0.6585 | NDCG@10: 0.4514
Epoch 22 | BCE: 0.6533 | HR@10: 0.6798 | NDCG@10: 0.4598
Epoch 24 | BCE: 0.6505 | HR@10: 0.6897 | NDCG@10: 0.4659
Epoch 26 | BCE: 0.6473 | HR@10: 0.6979 | NDCG@10: 0.4722
Epoch 28 | BCE: 0.6442 | HR@10: 0.7011 | NDCG@10: 0.4724
Epoch 30 | BCE: 0.6409 | HR@10: 0.7094 | NDCG@10: 0.4787
