In [1]:
!pip install nbimporter

[0m

In [2]:
!pip install ipynb

[0m

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from pathlib import Path
import pickle

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
import nbimporter

In [6]:
# Import architectures
from recommenders_architecture import *


  from .autonotebook import tqdm as notebook_tqdm


### Load data

In [7]:
# ======= Load Pairwise Training Data =======
current_dir = Path.cwd()

pairwise_data_train_path= current_dir.parent / "data" / "pairwise"/"pairwise_train.csv"
train = pd.read_csv(pairwise_data_train_path)
pairwise_data_val_path= current_dir.parent / "data" / "pairwise"/"pairwise_val.csv"
val = pd.read_csv(pairwise_data_val_path)
# ======= Load Item Metadata (1027-dim vectors) =======
encoded_dir = current_dir.parent / "data" / "encoded"


In [10]:
model_path = current_dir.parent / "models" / "Yahlly_10_3_NCF_with_Metadata_biases_2_0.9221274085422783.pth"

model = torch.load(model_path, map_location=device)  # Load the entire model object
model.eval()  # Set to evaluation mode

NCFWithMetadata(
  (user_embedding_gmf): Embedding(1096901, 24)
  (item_embedding_gmf): Embedding(198771, 24)
  (user_embedding_mlp): Embedding(1096901, 24)
  (item_embedding_mlp): Embedding(198771, 24)
  (item_proj_gmf): Linear(in_features=280, out_features=24, bias=True)
  (item_proj_mlp): Linear(in_features=280, out_features=24, bias=True)
  (gmf_layer): Linear(in_features=24, out_features=1, bias=True)
  (mlp): Sequential(
    (0): Linear(in_features=48, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2549127797233314, inplace=False)
    (3): Linear(in_features=256, out_features=128, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2549127797233314, inplace=False)
  )
  (final_layer): Linear(in_features=129, out_features=1, bias=True)
)

In [16]:
df

Unnamed: 0,user_id,item1_id,item2_id,label,timestamp,rating
0,0,0,13349,0,1349041740000,5.0
1,0,22959,1,1,1370958618000,1.0
2,0,97562,2,2,1440038761000,5.0
3,0,23003,3,3,1483320893000,3.0
4,0,16177,4,4,1490800837000,5.0
...,...,...,...,...,...,...
9127372,1096899,45300,92761,92761,1692552496934,5.0
9127373,1096900,183765,86867,183765,1600792118191,1.0
9127374,1096900,155119,99585,155119,1615811081145,1.0
9127375,1096900,25515,75800,25515,1693494834857,4.0


In [30]:
item_metadata[0].shape

torch.Size([256])

In [16]:

# # ======= Configurations =======
# EMBEDDING_DIM = 128  # User embedding size
# ITEM_FEATURE_DIM = item_metadata[0].shape # Length of item metadata vector (text+image)
# BATCH_SIZE = 512
# EPOCHS = 10
# LR = 0.0001  # Learning rate
# VAL_SPLIT = 0.1


# ======= Configurations =======
EMBEDDING_DIM = 24  # User embedding size
#ITEM_FEATURE_DIM = 3075# item_metadata[0].shape # Length of item metadata vector (text+image)
ITEM_FEATURE_DIM = 256 # After autoencoder
BATCH_SIZE = 512
EPOCHS = 10
LR = 0.00001  # Learning rate
VAL_SPLIT = 0.1

In [14]:
# ======= Custom Dataset Class =======
class PairwiseDataset(Dataset):
    def __init__(self, dataframe):
        self.users = dataframe["user_id"].values
        self.item1 = dataframe["item1_id"].values
        self.item2 = dataframe["item2_id"].values
        self.labels = dataframe["label"].values

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return (
            self.users[idx],
            self.item1[idx],
            self.item2[idx],
            self.labels[idx],
        )


In [39]:
train

Unnamed: 0,user_id,item1_id,item2_id,label,timestamp,rating
0,0,0,13349,0,1349041740000,5.0
1,0,22959,1,1,1370958618000,1.0
2,0,97562,2,2,1440038761000,5.0
3,0,23003,3,3,1483320893000,3.0
4,1,179127,5,5,1600753653091,5.0
...,...,...,...,...,...,...
8030471,1096899,26803,32852,32852,1692552324736,5.0
8030472,1096899,177842,10643,10643,1692552357767,5.0
8030473,1096900,183765,86867,183765,1600792118191,1.0
8030474,1096900,155119,99585,155119,1615811081145,1.0


In [41]:
val

Unnamed: 0,user_id,item1_id,item2_id,label,timestamp,rating
0,0,16177,4,4,1490800837000,5.0
1,1,10,174536,10,1676601720832,2.0
2,2,42860,16,16,1588626339041,5.0
3,3,20877,29,29,1605455790941,5.0
4,4,17870,41,41,1638039645551,5.0
...,...,...,...,...,...,...
1096896,1096896,197,11404,11404,1693892929945,5.0
1096897,1096897,32215,161020,161020,1617640776113,5.0
1096898,1096898,9974,33337,9974,1691348903005,5.0
1096899,1096899,45300,92761,92761,1692552496934,5.0


In [17]:
train_loader = DataLoader(PairwiseDataset(train), batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(PairwiseDataset(val), batch_size=BATCH_SIZE, shuffle=False)

In [18]:
num_users = 1096901
num_items = 198771


In [25]:

# ======= Initialize Model, Loss, Optimizer =======
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.eval()
correct = 0
total = 0
val_loss = 0
for user_ids, item1_ids, item2_ids, labels in tqdm(train_loader):
    user_ids, item1_ids, item2_ids, labels = (
        user_ids.to(device),
        item1_ids.to(device),
        item2_ids.to(device),
        labels.to(device),
    )
    score1 = model(user_ids, item1_ids)
    score2 = model(user_ids, item2_ids)
    
    # Determine the correct positive and negative scores based on labels
    labels_binary = (labels == item1_ids).float()
    #print(labels_binary)
    pos_scores = torch.where(labels_binary == 1, score1, score2)
    neg_scores = torch.where(labels_binary == 1, score2, score1)
    #print(pos_scores)
    # Check if the model correctly ranked the positive item higher
    predictions = pos_scores > neg_scores
    correct += predictions.sum().item()
    total += predictions.shape[0]
   
val_accuracy = correct / total
val_loss=val_loss/len(val_loader)
print(f"Val Accuracy = {val_accuracy:.4f}")


100%|██████████| 15685/15685 [25:07<00:00, 10.40it/s]

Val Accuracy = 0.4560



