# 05 — Neural CF Template
Starter PyTorch template for teammate 2. It provides a dataset/dataloader with negative sampling, a simple MLP model, and an evaluation hook.


In [None]:
from typing import List

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from src import config
from src.evaluation import build_ground_truth, evaluate_topk

# Column names and data locations are read from src.config.
USER_COL, ITEM_COL = config.USER_COL, config.ITEM_COL

# Load the train and test interaction tables from the processed-data directory.
processed_dir = config.PROCESSED_DATA_DIR
train_df, test_df = map(
    pd.read_parquet,
    (
        processed_dir / "train_interactions.parquet",
        processed_dir / "test_interactions.parquet",
    ),
)


## Build integer ID mappings


In [None]:
# Give every distinct training user/item a contiguous integer index
# (0..n-1, following the order returned by ``unique()``).
users = train_df[USER_COL].unique()
items = train_df[ITEM_COL].unique()
user_to_idx = dict(zip(users, range(len(users))))
item_to_idx = dict(zip(items, range(len(items))))
# Inverse lookup used to turn model item indices back into raw item IDs.
idx_to_item = {position: raw_id for raw_id, position in item_to_idx.items()}

def encode_df(df: pd.DataFrame) -> pd.DataFrame:
    """Translate raw user/item IDs in *df* into the train-derived integer indices.

    Returns a frame with columns ``user_idx`` and ``item_idx`` that shares
    *df*'s index. IDs missing from ``user_to_idx`` / ``item_to_idx`` (i.e. not
    present in the training data) come out as NaN, because ``Series.map`` with
    a dict yields NaN for unmatched keys.
    """
    user_codes = df[USER_COL].map(user_to_idx)
    item_codes = df[ITEM_COL].map(item_to_idx)
    return pd.DataFrame({"user_idx": user_codes, "item_idx": item_codes})

# Encode both splits with the same train-derived mappings.
train_enc, test_enc = (encode_df(split_df) for split_df in (train_df, test_df))


## Dataset and DataLoader with negative sampling


In [None]:
class InteractionDataset(Dataset):
    """Observed (user, item) pairs, each expanded with sampled negative items.

    Every example consists of one positive interaction followed by ``num_neg``
    negatives drawn uniformly from ``range(num_items)``. Items the user has
    interacted with anywhere in ``interactions`` are rejected as negatives, so
    a known positive is never labelled 0.0.

    Args:
        interactions: Frame with ``user_idx`` and ``item_idx`` columns holding
            integer indices.
        num_items: Size of the item index space negatives are drawn from.
        num_neg: Number of negatives generated per positive pair.
    """

    def __init__(self, interactions: pd.DataFrame, num_items: int, num_neg: int = 4):
        self.interactions = interactions
        self.num_items = num_items
        self.num_neg = num_neg
        self.positive_pairs = list(zip(interactions["user_idx"], interactions["item_idx"]))
        # user index -> set of item indices that user interacted with; used to
        # keep observed positives out of the sampled negatives.
        self.user_positives = {}
        for user, item in self.positive_pairs:
            self.user_positives.setdefault(user, set()).add(item)

    def __len__(self):
        """Return the number of positive pairs (one example per pair)."""
        return len(self.positive_pairs)

    def __getitem__(self, idx):
        """Return ``(users, items, labels)`` tensors of length ``1 + num_neg``.

        Position 0 is the positive pair (label 1.0); the remaining positions
        repeat the user with sampled negative items (label 0.0).
        """
        user, item = self.positive_pairs[idx]
        seen = self.user_positives[user]
        # Rejection sampling can only terminate if at least one in-range item
        # is unseen; ``len(seen) < num_items`` guarantees that. Otherwise fall
        # back to unfiltered uniform sampling instead of looping forever.
        reject_seen = len(seen) < self.num_items

        # Seeding with idx makes the negatives of a given example reproducible;
        # as a consequence the same negatives are returned on every epoch.
        rng = np.random.default_rng(idx)
        negatives = []
        while len(negatives) < self.num_neg:
            candidate = int(rng.integers(0, self.num_items))
            if reject_seen and candidate in seen:
                continue
            negatives.append(candidate)

        users = [user] * (1 + self.num_neg)
        items = [item] + negatives
        labels = [1.0] + [0.0] * self.num_neg
        # Index tensors are created as int64, the dtype embedding lookups expect.
        return (
            torch.tensor(users, dtype=torch.long),
            torch.tensor(items, dtype=torch.long),
            torch.tensor(labels, dtype=torch.float32),
        )


## Model


In [None]:
class NeuralCF(nn.Module):
    """MLP-based collaborative filtering scorer.

    A user embedding and an item embedding are concatenated and passed through
    a three-layer MLP ending in a sigmoid, producing one score in (0, 1) per
    (user, item) pair.

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        embedding_dim: Width of each embedding vector.
    """

    def __init__(self, num_users: int, num_items: int, embedding_dim: int = 64):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        # Input width is doubled because user and item vectors are concatenated.
        scoring_layers = [
            nn.Linear(2 * embedding_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid(),
        ]
        self.mlp = nn.Sequential(*scoring_layers)

    def forward(self, user_ids, item_ids):
        """Score each (user, item) index pair; output drops the trailing size-1 axis."""
        pair_features = torch.cat(
            (self.user_embedding(user_ids), self.item_embedding(item_ids)),
            dim=-1,
        )
        scores = self.mlp(pair_features)
        return scores.squeeze(-1)


## Training loop (placeholder)


In [None]:
# Use the GPU when one is available; the model and every batch are moved to it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = NeuralCF(num_users=len(user_to_idx), num_items=len(item_to_idx)).to(device)
optim = torch.optim.Adam(model.parameters(), lr=1e-3)
# BCELoss expects probabilities in [0, 1] as predictions.
loss_fn = nn.BCELoss()

dataset = InteractionDataset(train_enc, num_items=len(item_to_idx), num_neg=4)
loader = DataLoader(dataset, batch_size=256, shuffle=True)

# TODO: tune epochs/negatives, add validation, early stopping.
for epoch in range(1):
    # Track the mean loss over the epoch; the last batch alone is a noisy
    # (and possibly smaller) sample and would be undefined for an empty loader.
    epoch_loss = 0.0
    num_batches = 0
    for users_batch, items_batch, labels_batch in loader:
        # Each example holds a positive plus its negatives; flatten the batch
        # so every (user, item, label) triple is scored independently.
        users_batch = users_batch.view(-1).to(device)
        items_batch = items_batch.view(-1).to(device)
        labels_batch = labels_batch.view(-1).to(device)

        preds = model(users_batch, items_batch)
        loss = loss_fn(preds, labels_batch)
        optim.zero_grad()
        loss.backward()
        optim.step()

        epoch_loss += loss.item()
        num_batches += 1

    if num_batches:
        print(f"Epoch {epoch} loss {epoch_loss / num_batches:.4f}")
    else:
        print(f"Epoch {epoch}: no training batches")


## Recommendation wrapper


In [None]:
def recommend(user_id: int, k: int) -> List[int]:
    """Return up to ``k`` raw item IDs, best model score first, for ``user_id``.

    Every item in ``item_to_idx`` is scored for the user in a single forward
    pass. A user missing from ``user_to_idx`` gets the first ``k`` keys of
    ``item_to_idx`` as a fallback. Items the user already interacted with are
    not filtered out here.
    """
    try:
        user_pos = user_to_idx[user_id]
    except KeyError:
        return list(item_to_idx)[:k]

    num_candidates = len(item_to_idx)
    candidate_items = torch.arange(num_candidates, device=device)
    # Pair the same user index with every candidate item.
    repeated_user = torch.tensor([user_pos], device=device).repeat(num_candidates)
    with torch.no_grad():
        scores = model(repeated_user, candidate_items).cpu().numpy()

    # argsort is ascending, so reverse it to rank the highest scores first.
    ranked = scores.argsort()[::-1]
    return [idx_to_item[pos] for pos in ranked[:k]]


## Evaluate with shared metrics


In [None]:
# NOTE(review): ground_truth is presumably a mapping of user -> held-out items;
# confirm against src.evaluation.build_ground_truth.
ground_truth = build_ground_truth(test_df, user_col=USER_COL, item_col=ITEM_COL)
users_eval = list(ground_truth.keys())


def _known_items_by_user(df):
    """Map each user in *df* to the set of items that user interacted with."""
    lookup = {}
    for user, item in zip(df[USER_COL], df[ITEM_COL]):
        lookup.setdefault(user, set()).add(item)
    return lookup


# Built once up front: filtering train_df inside recommend_wrapped would scan
# the whole training table for every evaluated user.
train_items_by_user = _known_items_by_user(train_df)
_NO_KNOWN_ITEMS = frozenset()


def recommend_wrapped(user, k):
    """Return up to ``k`` recommendations for ``user``, excluding training items.

    ``k + len(known)`` candidates are requested so that at least ``k`` remain
    after the user's training items are removed (when enough items exist).
    """
    known = train_items_by_user.get(user, _NO_KNOWN_ITEMS)
    recs = recommend(user, k + len(known))
    return [itm for itm in recs if itm not in known][:k]


# Might be slow; consider subsampling users for quick checks.
results = evaluate_topk(ground_truth, recommend_wrapped, users_eval, ks=[5, 10])
results


TODO: improve negative sampling, add validation split, and move training code into reusable functions if needed.
