In [1]:
!pip install nbimporter
!pip install ipynb

Collecting nbimporter
  Downloading nbimporter-0.3.4-py3-none-any.whl.metadata (252 bytes)
Downloading nbimporter-0.3.4-py3-none-any.whl (4.9 kB)
Installing collected packages: nbimporter
Successfully installed nbimporter-0.3.4
[0m

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from pathlib import Path
import pickle
import nbimporter

In [3]:
# Import architectures
from recommenders_architecture import *

  from .autonotebook import tqdm as notebook_tqdm


### Load data

In [5]:
# ======= Load Pairwise Training Data =======
# All paths are resolved relative to the parent of the current working directory.
current_dir = Path.cwd()

pairwise_data_path = current_dir.parent.joinpath(
    "data", "pairwise", "pairwise_training_data_no_timestamp.csv"
)
df = pd.read_csv(pairwise_data_path)

# ======= Load Item Metadata (1027-dim vectors) =======
# Directory layout
models_dir = current_dir.parent.joinpath("models")
encoded_dir = current_dir.parent.joinpath("data", "encoded")

# Encoded item features
encoded_text_file = encoded_dir.joinpath("embedding_dict_with_price_longformer_idx.pt")
encoded_images_file = encoded_dir.joinpath("images_encodings.pkl")
encoded_metadata_text_image_file = encoded_dir.joinpath("item_metadata_text_image.pt")
auto_encoder_metadata_file = encoded_dir.joinpath("compressed_all_data_encodings_256.pkl")

# Pretrained user-embedding checkpoint
user_embed_model_path = models_dir.joinpath(
    "Yahlly_Optuna_23_2_MFBiases_0_0.9449033475350068.pth"
)

text_embeddings = torch.load(encoded_text_file)

# pickle.load can execute arbitrary code: only read files from a trusted source.
with encoded_images_file.open("rb") as f:
    images_embeddings = pickle.load(f)


In [32]:

# ======= Configurations =======
# --- model dimensions ---
EMBEDDING_DIM = 24        # size of the user embedding (and of the item tower output)
ITEM_FEATURE_DIM = 3075   # length of one item metadata vector (text + image)
# ITEM_FEATURE_DIM = 256  # alternative: after the autoencoder

# --- optimisation ---
BATCH_SIZE = 512
EPOCHS = 10
LR = 1e-5                 # learning rate
VAL_SPLIT = 0.1           # fraction of the pairwise rows held out for validation

In [25]:
# Prefer the GPU when one is visible to PyTorch; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

### Optional: use the autoencoder-compressed item encodings instead of the long text embeddings

In [13]:
# with open(auto_encoder_metadata_file, 'rb') as f:
#     item_metadata = pickle.load(f)

In [92]:

# item_embeddings_tensor = torch.zeros(len(item_metadata), item_metadata[0].size(0))
# for idx, (item_id, embed) in enumerate(item_metadata.items()):
#     item_embeddings_tensor[idx] = embed


In [93]:
# auto_encoder_metadata_file_pt_out = encoded_dir / "compressed_all_data_encodings_256.pt"

In [94]:
# torch.save(item_embeddings_tensor,auto_encoder_metadata_file_pt_out )

### Load pretrained embeddings from MF model

In [26]:
# Training user-item rating table (with index columns), read from the data folder.
user_item_file_path = current_dir.parent.joinpath(
    "data", "data_and_test_files", "user_item_rating_table_train_with_idx.csv"
)
df2 = pd.read_csv(user_item_file_path)

In [29]:
# Checkpoint of the pretrained matrix-factorisation model (a whole pickled module,
# not just a state_dict).
model_path = current_dir.parent.joinpath(
    "models", "Yahlly_24_2_MF_Frozen_Biases_18_0.934416908145054.pth"
)

# NOTE(review): unpickling a full module needs its class to be importable here
# (presumably provided by the `recommenders_architecture` star import — confirm),
# and recent PyTorch releases may require an explicit `weights_only=False`.
model = torch.load(model_path, map_location=device)
# Switch to evaluation mode; as the last expression this also displays the module.
model.eval()

MFWithBiasesFreeze(
  (user_bias): Embedding(1096901, 1)
  (item_bias): Embedding(198771, 1)
  (user_embedding): Embedding(1096901, 24)
  (item_embedding): Embedding(198771, 24)
)

In [30]:
# Keep handles to the pretrained MF components before `model` is rebound later on.
initial_user_embed, initial_item_embed = model.user_embedding, model.item_embedding
initial_user_bias, initial_item_bias = model.user_bias, model.item_bias
initial_global_bias = model.global_bias

In [34]:
# Display the pretrained item embedding layer to check its (num_items, dim) size.
initial_item_embed

Embedding(198771, 24)

### Concat image and text embeddings

##### Build the concatenated text + image embedding for each item

In [47]:
# import torch

# # ======= Create Combined Item Metadata =======
# item_metadata = {}

# for item_id in text_embeddings.keys():
#     text_embed = text_embeddings[item_id]  # (1027,)
#     image_embed = images_embeddings.get(item_id, torch.zeros(2048))  # (2048,) default to zeros if missing

#     # Concatenate along the feature dimension
#     combined_embed = torch.cat([text_embed, image_embed], dim=0)  # (3075,)
#     item_metadata[item_id] = combined_embed

# # Save the combined metadata dictionary
# metadata_save_path = encoded_dir / "item_metadata_text_image.pt"
# torch.save(item_metadata, metadata_save_path)

# print(f"✅ Item metadata saved to {metadata_save_path} with {len(item_metadata)} items.")


✅ Item metadata saved to /storage/yahlly/RecSys/data/encoded/item_metadata_text_image.pt with 198771 items.


### Load item_metadata (text+image embeddings)

In [10]:
# Combined text+image item features; the path constant is defined alongside `encoded_dir`.
item_metadata = torch.load(encoded_metadata_text_image_file)

#### Convert the item metadata dictionary into a single tensor

In [15]:
# Stack the per-item feature vectors into one (num_items, feature_dim) tensor.
# The model looks rows up by item id (`item_embeddings_tensor[item_ids]`), so each
# vector is written at the row given by its id rather than by its position in the
# dictionary's iteration order; the two only coincide when the dict happens to be
# ordered 0..N-1.
# Assumes the keys are the integer ids 0..len(item_metadata)-1 — an id outside that
# range raises an IndexError here instead of silently misaligning rows.
item_embeddings_tensor = torch.zeros(len(item_metadata), item_metadata[0].size(0))
for item_id, embed in item_metadata.items():
    item_embeddings_tensor[item_id] = embed


In [16]:
# Move the full item feature table to the compute device so the model can index it there.
item_embeddings_tensor=item_embeddings_tensor.to(device)

### Dataset class (fed to the data loaders)

In [18]:
# ======= Custom Dataset Class =======
class PairwiseDataset(Dataset):
    """Row-wise view over a pairwise-comparison dataframe.

    The dataframe must provide the columns ``user_id``, ``item1_id``,
    ``item2_id`` and ``label``. Each sample is the tuple
    ``(user, item1, item2, label)`` taken from one row.
    """

    # (attribute name, dataframe column) pairs cached as arrays at construction.
    _COLUMNS = (
        ("users", "user_id"),
        ("item1", "item1_id"),
        ("item2", "item2_id"),
        ("labels", "label"),
    )

    def __init__(self, dataframe):
        for attr_name, column in self._COLUMNS:
            setattr(self, attr_name, dataframe[column].values)

    def __len__(self):
        # One sample per dataframe row.
        return len(self.users)

    def __getitem__(self, idx):
        columns = (self.users, self.item1, self.item2, self.labels)
        return tuple(column[idx] for column in columns)


In [19]:
# ======= Two-Tower Model (User & Item Networks) =======
class TwoTowerModelPretrainedUserEmbeddings(nn.Module):
    """Two-tower model whose user tower starts from pretrained embeddings.

    User tower: an ``nn.Embedding`` initialised with the weights of the
    notebook-level ``initial_user_embed`` layer.
    Item tower: a two-layer MLP that projects an item's metadata vector (looked
    up in the notebook-level ``item_embeddings_tensor``) to ``embedding_dim``.

    Args:
        num_users: Number of rows of the user embedding table; must match the
            shape of ``initial_user_embed.weight`` for the copy to succeed.
        num_items: Accepted for interface compatibility; not used by this model.
        embedding_dim: Size of the user embedding and of the item tower output.
        item_metadata_dim: Length of one item metadata vector.
    """

    def __init__(self, num_users, num_items, embedding_dim, item_metadata_dim):
        # Zero-argument super(): the previous `super(TwoTowerModel, self)` named a
        # different class and fails unless this class derives from it.
        super().__init__()

        # User Tower (Embedding), initialised from the pretrained user embeddings
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        with torch.no_grad():
            self.user_embedding.weight.copy_(initial_user_embed.weight)

        # Item Tower (Using Item Metadata)
        self.item_fc = nn.Sequential(
            nn.Linear(item_metadata_dim, 512),
            nn.ReLU(),
            nn.Linear(512, embedding_dim),
        )

    def forward(self, user_ids, item1_ids, item2_ids):
        """Embed a batch of (user, item1, item2) id triples.

        Returns:
            Tuple ``(user_embed, item1_embed, item2_embed)``, each of shape
            (batch, embedding_dim).
        """
        # Put every index tensor on the device of the table it indexes, so the
        # caller does not have to pre-move them.
        user_ids = user_ids.to(self.user_embedding.weight.device)
        item1_ids = item1_ids.to(item_embeddings_tensor.device)
        item2_ids = item2_ids.to(item_embeddings_tensor.device)

        user_embed = self.user_embedding(user_ids)  # (batch, embedding_dim)
        item1_embed = self.item_fc(item_embeddings_tensor[item1_ids])  # (batch, embedding_dim)
        item2_embed = self.item_fc(item_embeddings_tensor[item2_ids])  # (batch, embedding_dim)

        return user_embed, item1_embed, item2_embed

# ======= Pairwise BPR Loss =======


class BPRLoss(nn.Module):
    """Bayesian Personalized Ranking loss over (user, item1, item2) triples."""

    def __init__(self):
        super().__init__()

    def forward(self, user_embed, item1_ids, item1_embed, item2_ids, item2_embed, labels):
        """
        Compute Bayesian Personalized Ranking (BPR) loss.

        Args:
        - user_embed: Tensor of shape (batch_size, embed_dim), user embeddings.
        - item1_ids: Tensor of shape (batch_size,), IDs of item1.
        - item1_embed: Tensor of shape (batch_size, embed_dim), embeddings for item1.
        - item2_ids: Tensor of shape (batch_size,), IDs of item2 (kept for
          interface symmetry; not read by the computation).
        - item2_embed: Tensor of shape (batch_size, embed_dim), embeddings for item2.
        - labels: Tensor of shape (batch_size,), IDs of the correct (positive) item.

        Returns:
        - loss: Scalar tensor, mean of -log(sigmoid(pos_score - neg_score)).
        """
        # item1 is the positive item exactly when the label equals its id;
        # otherwise item2 is treated as the positive one.
        item1_is_positive = labels == item1_ids

        # Dot-product affinity between the user and each candidate item
        score1 = (user_embed * item1_embed).sum(dim=1)
        score2 = (user_embed * item2_embed).sum(dim=1)

        pos_score = torch.where(item1_is_positive, score1, score2)
        neg_score = torch.where(item1_is_positive, score2, score1)

        # logsigmoid is the numerically stable form of log(sigmoid(x)): the naive
        # composition underflows to log(0) = -inf for large negative margins.
        loss = -nn.functional.logsigmoid(pos_score - neg_score).mean()
        return loss


In [20]:
# Preview the first rows of the pairwise training data.
df.head()

Unnamed: 0,user_id,item1_id,item2_id,label
0,0,50787,0,0
1,0,1,33482,1
2,0,35983,2,2
3,0,3,125581,3
4,0,4,60296,4


In [21]:
# Hold out the last VAL_SPLIT fraction of rows for validation.
# An explicit split index is used because `df[:-val_size]` turns into `df[:0]`
# (an empty training set) when val_size is 0.
# NOTE(review): the split is positional, not shuffled; if the rows are grouped by
# user, validation may contain users never seen in training — confirm this is intended.
val_size = int(len(df) * VAL_SPLIT)
split_at = len(df) - val_size
train_df, val_df = df.iloc[:split_at], df.iloc[split_at:]

# ======= Dataloaders =======
train_loader = DataLoader(PairwiseDataset(train_df), batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(PairwiseDataset(val_df), batch_size=BATCH_SIZE, shuffle=False)


In [22]:
# Table sizes; they match the Embedding shapes printed for the pretrained MF model.
num_users, num_items = 1_096_901, 198_771


In [33]:

# ======= Initialize Model, Loss, Optimizer =======
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE: this rebinds `model`, replacing the pretrained MF model loaded earlier in
# the notebook; cells that read attributes of that MF model must run before this one.
model = TwoTowerModelPretrainedUserEmbeddings(num_users, num_items, EMBEDDING_DIM, ITEM_FEATURE_DIM).to(device)
criterion = BPRLoss()
optimizer = optim.Adam(model.parameters(), lr=LR)

# ======= Training & Validation =======
log_file = "cold_training_log.txt"

print("🚀 Training Model...")
# Explicit UTF-8: the log lines contain emoji, which a non-UTF-8 default locale
# encoding cannot write.
with open(log_file, "w", encoding="utf-8") as log:
    log.write("🚀 Training Model...\n")
    for epoch in range(EPOCHS):
        # ----- Training -----
        model.train()
        train_loss = 0

        for user_ids, item1_ids, item2_ids, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
            user_ids, item1_ids, item2_ids, labels = (
                user_ids.to(device),
                item1_ids.to(device),
                item2_ids.to(device),
                labels.to(device),
            )

            # Forward pass and loss
            user_embed, item1_embed, item2_embed = model(user_ids, item1_ids, item2_ids)
            loss = criterion(user_embed, item1_ids, item1_embed, item2_ids, item2_embed, labels)
            train_loss += loss.item()

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        train_loss = train_loss / len(train_loader)

        # ----- Validation -----
        model.eval()
        correct = 0
        total = 0
        val_loss = 0
        # No gradients are needed for evaluation; disabling autograd avoids
        # building a graph for every validation batch.
        with torch.no_grad():
            for user_ids, item1_ids, item2_ids, labels in val_loader:
                user_ids, item1_ids, item2_ids, labels = (
                    user_ids.to(device),
                    item1_ids.to(device),
                    item2_ids.to(device),
                    labels.to(device),
                )
                user_embed, item1_embed, item2_embed = model(user_ids, item1_ids, item2_ids)

                score1 = (user_embed * item1_embed).sum(dim=1)  # Score for item1
                score2 = (user_embed * item2_embed).sum(dim=1)  # Score for item2

                # item1 is the positive item when the label equals its id
                item1_is_positive = labels == item1_ids
                pos_scores = torch.where(item1_is_positive, score1, score2)
                neg_scores = torch.where(item1_is_positive, score2, score1)

                loss = criterion(user_embed, item1_ids, item1_embed, item2_ids, item2_embed, labels)
                val_loss += loss.item()

                # A pair counts as correct when the positive item is ranked strictly higher
                predictions = pos_scores > neg_scores
                correct += predictions.sum().item()
                total += predictions.shape[0]

        val_accuracy = correct / total
        val_loss = val_loss / len(val_loader)
        print(f"Epoch {epoch+1}: Train Loss = {train_loss:.4f},Val Loss = {val_loss:.4f}, Val Accuracy = {val_accuracy:.4f}")
        log.write(f"Epoch {epoch+1}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}, Val Accuracy = {val_accuracy:.4f}\n")


# ======= Save Model =======
#torch.save(model.state_dict(), "trained_model.pth")
print("✅ Model Training Complete!")
with open(log_file, "a", encoding="utf-8") as log:
    log.write("✅ Model Training Complete!\n")

🚀 Training Model...


Epoch 1/10: 100%|██████████| 16045/16045 [01:24<00:00, 189.33it/s]


Epoch 1: Train Loss = 0.6932,Val Loss = 0.6932, Val Accuracy = 0.4996


Epoch 2/10: 100%|██████████| 16045/16045 [01:24<00:00, 190.09it/s]


Epoch 2: Train Loss = 0.6918,Val Loss = 0.6937, Val Accuracy = 0.4993


Epoch 3/10: 100%|██████████| 16045/16045 [01:23<00:00, 192.15it/s]


Epoch 3: Train Loss = 0.6824,Val Loss = 0.6992, Val Accuracy = 0.4993


Epoch 4/10: 100%|██████████| 16045/16045 [01:24<00:00, 189.68it/s]


Epoch 4: Train Loss = 0.6592,Val Loss = 0.7113, Val Accuracy = 0.4992


Epoch 5/10: 100%|██████████| 16045/16045 [01:24<00:00, 189.37it/s]


Epoch 5: Train Loss = 0.6278,Val Loss = 0.7233, Val Accuracy = 0.4992


Epoch 6/10: 100%|██████████| 16045/16045 [01:24<00:00, 190.88it/s]


Epoch 6: Train Loss = 0.5960,Val Loss = 0.7331, Val Accuracy = 0.4989


Epoch 7/10: 100%|██████████| 16045/16045 [01:23<00:00, 192.11it/s]


Epoch 7: Train Loss = 0.5661,Val Loss = 0.7404, Val Accuracy = 0.4990


Epoch 8/10: 100%|██████████| 16045/16045 [01:23<00:00, 191.90it/s]


Epoch 8: Train Loss = 0.5382,Val Loss = 0.7466, Val Accuracy = 0.4990


Epoch 9/10: 100%|██████████| 16045/16045 [01:23<00:00, 192.85it/s]


Epoch 9: Train Loss = 0.5124,Val Loss = 0.7516, Val Accuracy = 0.4991


Epoch 10/10: 100%|██████████| 16045/16045 [01:24<00:00, 190.87it/s]


Epoch 10: Train Loss = 0.4884,Val Loss = 0.7556, Val Accuracy = 0.4989
✅ Model Training Complete!
