In [1]:
from typing import Tuple, Callable

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
import os

Make sure that results are reproducible by using a seed.

In [2]:
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

In [4]:
DATA_DIR = r"C:\Users\loris\OneDrive\ETH\Group Project"


def read_data_df():
    df = pd.read_csv(os.path.join(DATA_DIR, "train_ratings.csv"))
    df[["sid", "pid"]] = df["sid_pid"].str.split("_", expand=True)
    df["sid"] = df["sid"].astype(int)
    df["pid"] = df["pid"].astype(int)
    df["rating"] = df["rating"].astype(float)
    return train_test_split(df[["sid", "pid", "rating"]], test_size=0.25, random_state=SEED)


def read_data_matrix(df: pd.DataFrame) -> np.ndarray:
    """Returns matrix view of the training data, where columns are scientists (sid) and
    rows are papers (pid)."""

    return df.pivot(index="sid", columns="pid", values="rating").values


def evaluate(valid_df: pd.DataFrame, pred_fn: Callable[[np.ndarray, np.ndarray], np.ndarray]) -> float:
    """
    Inputs:
        valid_df: Validation data, returned from read_data_df for example.
        pred_fn: Function that takes in arrays of sid and pid and outputs their rating predictions.

    Outputs: Validation RMSE
    """
    
    preds = pred_fn(valid_df["sid"].values, valid_df["pid"].values)
    return root_mean_squared_error(valid_df["rating"].values, preds)


def make_submission(pred_fn: Callable[[np.ndarray, np.ndarray], np.ndarray], filename: os.PathLike):
    """Makes a submission CSV file that can be submitted to kaggle.

    Inputs:
        pred_fn: Function that takes in arrays of sid and pid and outputs a score.
        filename: File to save the submission to.
    """
    
    df = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))

    # Get sids and pids
    sid_pid = df["sid_pid"].str.split("_", expand=True)
    sids = sid_pid[0]
    pids = sid_pid[1]
    sids = sids.astype(int).values
    pids = pids.astype(int).values
    
    df["rating"] = pred_fn(sids, pids)
    df.to_csv(filename, index=False)


def impute_values(mat: np.ndarray) -> np.ndarray:
    return np.nan_to_num(mat, nan=3.0)



train_df, valid_df = read_data_df()
train_mat = read_data_matrix(train_df)
train_mat = impute_values(train_mat)

In [None]:
class SVDpp(nn.Module):
    def __init__(self, num_scientists: int = 10000, num_papers: int = 10000, emb_dim: int = 32, s2p: dict = dict(), global_mean: torch.float32 = 3.82):
        super().__init__()
        self.emb_dim = emb_dim
        self.s2p = s2p

        # embeddings for scientists and papers
        self.scientist_factors = nn.Embedding(num_scientists, emb_dim)
        self.paper_factors = nn.Embedding(num_papers, emb_dim)
        self.scientist_bias = nn.Embedding(num_scientists, 1)
        self.paper_bias = nn.Embedding(num_papers, 1)
        self.implicit_factors = nn.Embedding(num_papers, emb_dim)

        # global average rating - TODO: maybe come up with smth better
        self.global_bias = nn.Parameter(torch.tensor([global_mean]), requires_grad=False)

        # init weights - TODO: not tuned rn
        nn.init.normal_(self.scientist_factors.weight, std=0.1)
        nn.init.normal_(self.paper_factors.weight, std=0.1)
        nn.init.normal_(self.implicit_factors.weight, std=0.1)
        nn.init.constant_(self.scientist_bias.weight, 0.0)
        nn.init.constant_(self.paper_bias.weight, 0.0)

    def forward(self, scientist_ids, paper_ids):
        # latent factors and biases for current batch
        scientist_embeddings = self.scientist_factors(scientist_ids)
        paper_embeddings = self.paper_factors(paper_ids)
        # squeeze to remove extra dim
        scientist_biases = self.scientist_bias(scientist_ids).squeeze()
        paper_biases = self.paper_bias(paper_ids).squeeze()

        papers = [self.s2p.get(k, []) for k in scientist_ids]

        implicit_embeds = []
        for sp in papers:
            if len(sp) > 0:
                y_j = self.implicit_factors(torch.tensor(sp, device=scientist_ids.device))
                sum_yj = y_j.sum(dim=0)
                norm_yj = sum_yj / torch.sqrt(torch.tensor(len(sp), dtype=torch.float, device=scientist_ids.device))
            else:
                norm_yj = torch.zeros_like(scientist_embeddings[0])
            implicit_embeds.append(norm_yj)
        y_u = torch.stack(implicit_embeds)

        # dot product for interaction
        interaction = ((scientist_embeddings + y_u)  * paper_embeddings).sum(dim=1)

        # predict ratings
        predicted_ratings = interaction + scientist_biases + paper_biases + self.global_bias
        return predicted_ratings

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using: {device}")

Using: cpu


In [6]:
#NEW
def get_embeddings(model, sid, pid):
    return model.scientist_emb(sid), model.paper_emb(pid)

class TripletDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.df = df
        self.triplets = self._generate_triplets()

    def _generate_triplets(self):
        triplets = []
        grouped = self.df.groupby("sid")
        for sid, group in grouped:
            pos = group[group["rating"] >= 4]["pid"].values
            neg = group[group["rating"] <= 2]["pid"].values
            if len(pos) > 0 and len(neg) > 0:
                for p in pos:
                    for n in neg:
                        triplets.append((sid, p, n))
        return triplets

    def __len__(self):
        return len(self.triplets)

    def __getitem__(self, idx):
        sid, pid_pos, pid_neg = self.triplets[idx]
        return torch.tensor(sid), torch.tensor(pid_pos), torch.tensor(pid_neg)


In [None]:
#NEW
triplet_dataset = TripletDataset(train_df)
triplet_loader = torch.utils.data.DataLoader(triplet_dataset, batch_size=64, shuffle=True)

triplet_loss_fn = nn.TripletMarginLoss(margin=1.0)


In [9]:
class EmbeddingDotProductModel(nn.Module):
    def __init__(self, num_scientists: int, num_papers: int, dim: int):
        super().__init__()

        # Assign to each scientist and paper an embedding
        self.scientist_emb = nn.Embedding(num_scientists, dim)
        self.paper_emb = nn.Embedding(num_papers, dim)
        
    def forward(self, sid: torch.Tensor, pid: torch.Tensor) -> torch.Tensor:
        """
        Inputs:
            sid: [B,], int
            pid: [B,], int
        
        Outputs: [B,], float
        """

        # Per-pair dot product
        return torch.sum(self.scientist_emb(sid) * self.paper_emb(pid), dim=-1)

Set $d=32$.

In [10]:
# Define model (10k scientists, 1k papers, 32-dimensional embeddings) and optimizer
model = EmbeddingDotProductModel(10_000, 1_000, 32).to(device)
optim = torch.optim.Adam(model.parameters(), lr=1e-3)

In [14]:
def get_dataset(df: pd.DataFrame) -> torch.utils.data.Dataset:
    """Conversion from pandas data frame to torch dataset."""
    
    sids = torch.from_numpy(df["sid"].to_numpy())
    pids = torch.from_numpy(df["pid"].to_numpy())
    ratings = torch.from_numpy(df["rating"].to_numpy()).float()
    return torch.utils.data.TensorDataset(sids, pids, ratings)

In [11]:
# Load wishlist data
wishlist_df = pd.read_csv(os.path.join(DATA_DIR, "train_tbr.csv"))

# No need to split any columns — they're already 'sid' and 'pid'
wishlist_df["sid"] = wishlist_df["sid"].astype(int)
wishlist_df["pid"] = wishlist_df["pid"].astype(int)

# Assign soft rating
wishlist_df["rating"] = 3.5

# Load and clean the original ratings
train_df = pd.read_csv(os.path.join(DATA_DIR, "train_ratings.csv"))
train_df[["sid", "pid"]] = train_df["sid_pid"].str.split("_", expand=True)
train_df["sid"] = train_df["sid"].astype(int)
train_df["pid"] = train_df["pid"].astype(int)
train_df["rating"] = train_df["rating"].astype(float)

# Combine both into one training set
augmented_train_df = pd.concat([train_df[["sid", "pid", "rating"]], wishlist_df], ignore_index=True)


In [15]:
train_dataset = get_dataset(train_df)
valid_dataset = get_dataset(valid_df)
train_loader = torch.utils.data.DataLoader(get_dataset(augmented_train_df), batch_size=64, shuffle=True)
#train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=64, shuffle=False)

Training loop, which we run for 5 epochs.

In [None]:
train_loader = torch.utils.data.DataLoader(get_dataset(augmented_train_df), batch_size=64, shuffle=True)

NUM_EPOCHS = 5
for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    total = 0
    for sid, pid, rating in train_loader:
        sid, pid, rating = sid.to(device), pid.to(device), rating.to(device)
        pred = model(sid, pid)
        loss = F.mse_loss(pred, rating)
            # --- Contrastive loss step ---
    for sid, pid_pos, pid_neg in triplet_loader:
        sid = sid.to(device)
        pid_pos = pid_pos.to(device)
        pid_neg = pid_neg.to(device)

        anchor = model.scientist_emb(sid)
        positive = model.paper_emb(pid_pos)
        negative = model.paper_emb(pid_neg)

        contrastive = triplet_loss_fn(anchor, positive, negative)

        optim.zero_grad()
        contrastive.backward()
        optim.step()

        total_loss += loss.item() * len(sid)
        total += len(sid)
    print(f"[Epoch {epoch+1}] Train RMSE: {(total_loss / total)**0.5:.3f}")


[Epoch 1] Train RMSE: 5.268
[Epoch 2] Train RMSE: 1.935
[Epoch 3] Train RMSE: 0.872
[Epoch 4] Train RMSE: 0.854
[Epoch 5] Train RMSE: 0.842


In [17]:
#NEW
model.eval()
def pred_fn(sids, pids):
    with torch.no_grad():
        sids = torch.tensor(sids).to(device)
        pids = torch.tensor(pids).to(device)
        return model(sids, pids).clamp(1, 5).cpu().numpy()

val_rmse = evaluate(valid_df, pred_fn)
print(f"Validation RMSE: {val_rmse:.3f}")

make_submission(pred_fn, "dot_product_contrastive_submission.csv")


Validation RMSE: 0.891


As we can see, this method already provides an improvement on the validation dataset over the naive SVD method.

In [None]:
pred_fn = lambda sids, pids: model(torch.from_numpy(sids).to(device), torch.from_numpy(pids).to(device)).clamp(1, 5).cpu().numpy()

# Evaluate on validation data
with torch.no_grad():
    val_score = evaluate(valid_df, pred_fn)

print(f"Validation RMSE: {val_score:.3f}")

In [None]:
with torch.no_grad():
    make_submission(pred_fn, "learned_embedding_submission.csv")