In [1]:
!pip install nbimporter

Collecting nbimporter
  Downloading nbimporter-0.3.4-py3-none-any.whl.metadata (252 bytes)
Downloading nbimporter-0.3.4-py3-none-any.whl (4.9 kB)
Installing collected packages: nbimporter
Successfully installed nbimporter-0.3.4
[0m

In [2]:
!pip install ipynb

Collecting ipynb
  Downloading ipynb-0.5.1-py3-none-any.whl.metadata (303 bytes)
Downloading ipynb-0.5.1-py3-none-any.whl (6.9 kB)
Installing collected packages: ipynb
Successfully installed ipynb-0.5.1
[0m

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from pathlib import Path
import pickle

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
import nbimporter

In [5]:
# Import architectures
from recommenders_architecture import *


  from .autonotebook import tqdm as notebook_tqdm


### Load data

In [38]:
# ======= Load Pairwise Training Data =======
current_dir = Path.cwd()

pairwise_data_path= current_dir.parent / "data" / "pairwise"/"pairwise_data.csv"
df = pd.read_csv(pairwise_data_path)
pairwise_data_train_path= current_dir.parent / "data" / "pairwise"/"pairwise_train.csv"
train = pd.read_csv(pairwise_data_train_path)
pairwise_data_val_path= current_dir.parent / "data" / "pairwise"/"pairwise_val.csv"
val = pd.read_csv(pairwise_data_val_path)
# ======= Load Item Metadata (1027-dim vectors) =======
encoded_dir = current_dir.parent / "data" / "encoded"
encoded_text_file = encoded_dir / "embedding_dict_with_price_longformer_idx.pt"
encoded_images_file = encoded_dir / "images_encodings.pkl"
encoded_metadata_text_image_file = encoded_dir / "item_metadata_text_image.pt"
text_embeddings = torch.load(encoded_text_file)

with open(encoded_images_file, 'rb') as f:
    images_embeddings = pickle.load(f)


In [13]:
text_embeddings[131488].shape

torch.Size([1027])

In [14]:
images_embeddings[90788].shape

torch.Size([2048])

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [9]:
user_item_file_path = current_dir.parent / "data" / "data_and_test_files" / "user_item_rating_table_train_with_idx.csv"
df2 = pd.read_csv(user_item_file_path)

In [12]:
model_path = current_dir.parent / "models" / "Yahlly_24_2_MF_Frozen_Biases_18_0.934416908145054.pth"

model = torch.load(model_path, map_location=device)  # Load the entire model object
model.eval()  # Set to evaluation mode

MFWithBiasesFreeze(
  (user_bias): Embedding(1096901, 1)
  (item_bias): Embedding(198771, 1)
  (user_embedding): Embedding(1096901, 24)
  (item_embedding): Embedding(198771, 24)
)

In [13]:
initial_user_embed = model.user_embedding
initial_item_embed = model.item_embedding
initial_user_bias = model.user_bias
initial_item_bias = model.item_bias
initial_global_bias = model.global_bias

In [14]:
auto_encoder_metadata_file= encoded_dir / "compressed_all_data_encodings_256.pt"

In [29]:
item_metadata = torch.load( auto_encoder_metadata_file)

In [16]:
df

Unnamed: 0,user_id,item1_id,item2_id,label,timestamp,rating
0,0,0,13349,0,1349041740000,5.0
1,0,22959,1,1,1370958618000,1.0
2,0,97562,2,2,1440038761000,5.0
3,0,23003,3,3,1483320893000,3.0
4,0,16177,4,4,1490800837000,5.0
...,...,...,...,...,...,...
9127372,1096899,45300,92761,92761,1692552496934,5.0
9127373,1096900,183765,86867,183765,1600792118191,1.0
9127374,1096900,155119,99585,155119,1615811081145,1.0
9127375,1096900,25515,75800,25515,1693494834857,4.0


### Concat image and text embeddings

In [13]:
import torch

# ======= Create Combined Item Metadata =======
item_metadata = {}

for item_id in text_embeddings.keys():
    text_embed = text_embeddings[item_id]  # (1027,)
    image_embed = images_embeddings.get(item_id, torch.zeros(2048))  # (2048,) default to zeros if missing

    # Concatenate along the feature dimension
    combined_embed = torch.cat([text_embed, image_embed], dim=0)  # (3075,)
    item_metadata[item_id] = combined_embed

# Save the combined metadata dictionary
metadata_save_path = encoded_dir / "item_metadata_text_image.pt"
torch.save(item_metadata, metadata_save_path)

print(f"✅ Item metadata saved to {metadata_save_path} with {len(item_metadata)} items.")


  images_embeddings = {k: torch.tensor(v) for k, v in images_embeddings.items()}


✅ Item metadata saved to /storage/yahlly/RecSys/data/encoded/item_metadata_text_image.pt with shape torch.Size([198771, 3075])


In [30]:
item_metadata[0].shape

torch.Size([256])

### Load item_metadata (text+image embeddings)

In [17]:
item_metadata = torch.load(encoded_text_file)

In [27]:
item_metadata.shape

AttributeError: 'dict' object has no attribute 'shape'

In [31]:

# # ======= Configurations =======
# EMBEDDING_DIM = 128  # User embedding size
# ITEM_FEATURE_DIM = item_metadata[0].shape # Length of item metadata vector (text+image)
# BATCH_SIZE = 512
# EPOCHS = 10
# LR = 0.0001  # Learning rate
# VAL_SPLIT = 0.1


# ======= Configurations =======
EMBEDDING_DIM = 24  # User embedding size
#ITEM_FEATURE_DIM = 3075# item_metadata[0].shape # Length of item metadata vector (text+image)
ITEM_FEATURE_DIM = 256 # After autoencoder
BATCH_SIZE = 512
EPOCHS = 10
LR = 0.00001  # Learning rate
VAL_SPLIT = 0.1

In [32]:
# ======= Custom Dataset Class =======
class PairwiseDataset(Dataset):
    def __init__(self, dataframe):
        self.users = dataframe["user_id"].values
        self.item1 = dataframe["item1_id"].values
        self.item2 = dataframe["item2_id"].values
        self.labels = dataframe["label"].values

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return (
            self.users[idx],
            self.item1[idx],
            self.item2[idx],
            self.labels[idx],
        )


In [33]:
item_metadata=item_metadata.to(device)

In [34]:
# ======= Two-Tower Model (User & Item Networks) =======

class TwoTowerModelPrevEmbedInit(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim, item_metadata_dim):
        super(TwoTowerModel, self).__init__()
        
        # User Tower (Embedding)
        self.user_embedding = nn.Embedding(num_users, embedding_dim)  # LOAD PRETRAINED USER EMBEDDINGS
        self.item_embedding = nn.Embedding(num_items, embedding_dim)  # LOAD PRETRAINED ITEM EMBEDDINGS
        self.user_embedding.weight.data.copy_(initial_user_embed.weight.data)
        self.item_embedding.weight.data.copy_(initial_item_embed.weight.data)
        # Item Tower (Using Item Metadata)
        self.item_fc = nn.Sequential(
            nn.Linear(item_metadata_dim, 512),
            nn.ReLU(),
            nn.Linear(512, embedding_dim),
        )

        # Second-Level Item Embedding Combination
        self.item_fc2 = nn.Sequential(
            nn.Linear(2 * embedding_dim, 512),  # Concatenating two embedding sources
            nn.ReLU(),
            nn.Linear(512, embedding_dim),
        )
    
    def forward(self, user_ids, item1_ids, item2_ids):
        # User embedding
        user_ids=user_ids.to(device)
        item1_ids=item1_ids.to(device)
        item2_ids=item2_ids.to(device)
        user_embed = self.user_embedding(user_ids)  # (batch, embedding_dim)

        # Item metadata-based embedding
        item1_meta_embed = self.item_fc(item_metadata[item1_ids])  # (batch, embedding_dim)
        item2_meta_embed = self.item_fc(item_metadata[item2_ids])  # (batch, embedding_dim)

        # Item ID-based embedding (pretrained)
        item1_id_embed = self.item_embedding(item1_ids)  # (batch, embedding_dim)
        item2_id_embed = self.item_embedding(item2_ids)  # (batch, embedding_dim)

        # Concatenate metadata-based and ID-based embeddings
        item1_combined = torch.cat([item1_meta_embed, item1_id_embed], dim=1)  # (batch, 2*embedding_dim)
        item2_combined = torch.cat([item2_meta_embed, item2_id_embed], dim=1)  # (batch, 2*embedding_dim)

        # Second-Level Representation Learning
        item1_embed_level2 = self.item_fc2(item1_combined)  # (batch, embedding_dim)
        item2_embed_level2 = self.item_fc2(item2_combined)  # (batch, embedding_dim)

        return user_embed, item1_embed_level2, item2_embed_level2



class BPRLoss(nn.Module):
    def __init__(self):
        super(BPRLoss, self).__init__()

    def forward(self, user_embed, item1_ids, item1_embed, item2_ids, item2_embed, labels):
        """
        Compute Bayesian Personalized Ranking (BPR) loss.

        Args:
        - user_embed: Tensor of shape (batch_size, embed_dim), user embeddings.
        - item1_ids: Tensor of shape (batch_size,), IDs of item1.
        - item1_embed: Tensor of shape (batch_size, embed_dim), embeddings for item1.
        - item2_ids: Tensor of shape (batch_size,), IDs of item2.
        - item2_embed: Tensor of shape (batch_size, embed_dim), embeddings for item2.
        - labels: Tensor of shape (batch_size,), IDs of the correct (positive) item.

        Returns:
        - loss: Computed BPR loss.
        """
        # Convert labels to binary: 1 if item1 is the positive item, else 0
        labels_binary = (labels == item1_ids).float()

        # Compute scores
        score1 = (user_embed * item1_embed).sum(dim=1)  # Affinity score for item1
        score2 = (user_embed * item2_embed).sum(dim=1)  # Affinity score for item2

        # Assign correct positive and negative scores based on labels_binary
        pos_score = torch.where(labels_binary == 1, score1, score2)
        neg_score = torch.where(labels_binary == 1, score2, score1)

        # Compute BPR loss
        loss = -torch.log(torch.sigmoid(pos_score - neg_score)).mean()
        return loss


In [40]:
# ### MODEL WITH BIASES

# import torch
# import torch.nn as nn

# class TwoTowerModel(nn.Module):
#     def __init__(self, num_users, num_items, embedding_dim, item_metadata_dim):
#         super(TwoTowerModel, self).__init__()
        
#         # User Tower (Embedding)
#         self.user_embedding = nn.Embedding(num_users, embedding_dim)  # LOAD PRETRAINED USER EMBEDDINGS
#         self.item_embedding = nn.Embedding(num_items, embedding_dim)  # LOAD PRETRAINED ITEM EMBEDDINGS

#         # Bias terms
#         self.user_bias = nn.Embedding(num_users, 1)  # Bias for each user
#         self.item_bias = nn.Embedding(num_items, 1)  # Bias for each item

#         # Item Tower (Using Item Metadata)
#         self.item_fc = nn.Sequential(
#             nn.Linear(item_metadata_dim, 512),
#             nn.ReLU(),
#             nn.Linear(512, embedding_dim),
#         )

#         # Second-Level Item Embedding Combination
#         self.item_fc2 = nn.Sequential(
#             nn.Linear(2 * embedding_dim, 512),  # Concatenating two embedding sources
#             nn.ReLU(),
#             nn.Linear(512, embedding_dim),
#         )
    
#     def forward(self, user_ids, item1_ids, item2_ids, item_metadata):
#         # User embedding and bias
#         user_embed = self.user_embedding(user_ids)  # (batch, embedding_dim)
#         user_bias = self.user_bias(user_ids)  # (batch, 1)

#         # Item metadata-based embedding and bias
#         item1_meta_embed = self.item_fc(item_metadata[item1_ids])  # (batch, embedding_dim)
#         item2_meta_embed = self.item_fc(item_metadata[item2_ids])  # (batch, embedding_dim)

#         item1_meta_bias = self.item_bias(item1_ids)  # (batch, 1)
#         item2_meta_bias = self.item_bias(item2_ids)  # (batch, 1)

#         # Item ID-based embedding (pretrained) and bias
#         item1_id_embed = self.item_embedding(item1_ids)  # (batch, embedding_dim)
#         item2_id_embed = self.item_embedding(item2_ids)  # (batch, embedding_dim)

#         item1_id_bias = self.item_bias(item1_ids)  # (batch, 1)
#         item2_id_bias = self.item_bias(item2_ids)  # (batch, 1)

#         # Concatenate metadata-based and ID-based embeddings
#         item1_combined = torch.cat([item1_meta_embed, item1_id_embed], dim=1)  # (batch, 2*embedding_dim)
#         item2_combined = torch.cat([item2_meta_embed, item2_id_embed], dim=1)  # (batch, 2*embedding_dim)

#         # Second-Level Representation Learning
#         item1_embed_level2 = self.item_fc2(item1_combined)  # (batch, embedding_dim)
#         item2_embed_level2 = self.item_fc2(item2_combined)  # (batch, embedding_dim)

#         # Adding biases to the final embeddings (optional)
#         item1_embed_level2 += item1_meta_bias + item1_id_bias  # (batch, embedding_dim)
#         item2_embed_level2 += item2_meta_bias + item2_id_bias  # (batch, embedding_dim)
#         user_embed += user_bias  # (batch, embedding_dim)

#         return user_embed, item1_embed_level2, item2_embed_level2


In [41]:
# val_size = int(len(df) * VAL_SPLIT)
# train_df, val_df = df[:-val_size], df[-val_size:] #= df[:-val_size], df[-val_size:]
# # ======= Dataloaders =======
# train_loader = DataLoader(PairwiseDataset(train_df), batch_size=BATCH_SIZE, shuffle=True)
# val_loader = DataLoader(PairwiseDataset(val_df), batch_size=BATCH_SIZE, shuffle=False)


In [39]:
train

Unnamed: 0,user_id,item1_id,item2_id,label,timestamp,rating
0,0,0,13349,0,1349041740000,5.0
1,0,22959,1,1,1370958618000,1.0
2,0,97562,2,2,1440038761000,5.0
3,0,23003,3,3,1483320893000,3.0
4,1,179127,5,5,1600753653091,5.0
...,...,...,...,...,...,...
8030471,1096899,26803,32852,32852,1692552324736,5.0
8030472,1096899,177842,10643,10643,1692552357767,5.0
8030473,1096900,183765,86867,183765,1600792118191,1.0
8030474,1096900,155119,99585,155119,1615811081145,1.0


In [41]:
val

Unnamed: 0,user_id,item1_id,item2_id,label,timestamp,rating
0,0,16177,4,4,1490800837000,5.0
1,1,10,174536,10,1676601720832,2.0
2,2,42860,16,16,1588626339041,5.0
3,3,20877,29,29,1605455790941,5.0
4,4,17870,41,41,1638039645551,5.0
...,...,...,...,...,...,...
1096896,1096896,197,11404,11404,1693892929945,5.0
1096897,1096897,32215,161020,161020,1617640776113,5.0
1096898,1096898,9974,33337,9974,1691348903005,5.0
1096899,1096899,45300,92761,92761,1692552496934,5.0


In [40]:
train_loader = DataLoader(PairwiseDataset(train), batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(PairwiseDataset(val), batch_size=BATCH_SIZE, shuffle=False)

In [23]:
num_users = 1096901
num_items = 198771


In [42]:

# ======= Initialize Model, Loss, Optimizer =======
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TwoTowerModel(num_users, num_items, EMBEDDING_DIM, ITEM_FEATURE_DIM).to(device)
criterion = BPRLoss()
optimizer = optim.Adam(model.parameters(), lr=LR)

# ======= Training Loop =======
# ======= Training & Validation =======
log_file = "warm_training_log.txt"

print("🚀 Training Model...")
with open(log_file, "w") as log:
    log.write("🚀 Training Model...\n")
    for epoch in range(EPOCHS):
        model.train()
        train_loss = 0
    
        for user_ids, item1_ids, item2_ids,labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
            user_ids, item1_ids, item2_ids,labels = user_ids.to(device), item1_ids.to(device), item2_ids.to(device), labels.to(device)
    
            # Forward Pass
            user_embed, item1_embed, item2_embed = model(user_ids, item1_ids, item2_ids)
            #print(item1_embed==item2_embed)
            # Compute Loss
            # print(item1_embed==item2_embed)
            loss = criterion(user_embed,item1_ids, item1_embed,item2_ids, item2_embed, labels)
            train_loss += loss.item()
            
            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        train_loss = train_loss /len(train_loader)
        # ======= Validation =======
        model.eval()
        correct = 0
        total = 0
        val_loss = 0
        for user_ids, item1_ids, item2_ids, labels in val_loader:
            user_ids, item1_ids, item2_ids, labels = (
                user_ids.to(device),
                item1_ids.to(device),
                item2_ids.to(device),
                labels.to(device),
            )
            user_embed, item1_embed, item2_embed = model(user_ids, item1_ids, item2_ids)
            #print((item1_embed==item2_embed).all())
            score1 = (user_embed * item1_embed).sum(dim=1)  # Score for item1
            score2 = (user_embed * item2_embed).sum(dim=1)  # Score for item2
    
            # Determine the correct positive and negative scores based on labels
            labels_binary = (labels == item1_ids).float()
            #print(labels_binary)
            pos_scores = torch.where(labels_binary == 1, score1, score2)
            neg_scores = torch.where(labels_binary == 1, score2, score1)
            #print(pos_scores)
            # Check if the model correctly ranked the positive item higher
            loss = criterion(user_embed,item1_ids, item1_embed,item2_ids, item2_embed, labels)
            val_loss += loss.item()
            predictions = pos_scores > neg_scores
    
            correct += predictions.sum().item()
            total += predictions.shape[0]
    
        val_accuracy = correct / total
        val_loss=val_loss/len(val_loader)
        print(f"Epoch {epoch+1}: Train Loss = {train_loss:.4f},Val Loss = {val_loss:.4f}, Val Accuracy = {val_accuracy:.4f}")
        log.write(f"Epoch {epoch+1}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}, Val Accuracy = {val_accuracy:.4f}\n")



# ======= Save Model =======
#torch.save(model.state_dict(), "trained_model.pth")
print("✅ Model Training Complete!")
with open(log_file, "a") as log:
    log.write("✅ Model Training Complete!\n")

🚀 Training Model...


Epoch 1/10: 100%|██████████| 15685/15685 [01:42<00:00, 153.45it/s]


Epoch 1: Train Loss = 0.6931,Val Loss = 0.6931, Val Accuracy = 0.4991


Epoch 2/10: 100%|██████████| 15685/15685 [01:41<00:00, 154.66it/s]


Epoch 2: Train Loss = 0.6928,Val Loss = 0.6935, Val Accuracy = 0.4997


Epoch 3/10: 100%|██████████| 15685/15685 [01:41<00:00, 154.09it/s]


Epoch 3: Train Loss = 0.6855,Val Loss = 0.7002, Val Accuracy = 0.4995


Epoch 4/10: 100%|██████████| 15685/15685 [01:42<00:00, 153.32it/s]


Epoch 4: Train Loss = 0.6699,Val Loss = 0.7147, Val Accuracy = 0.4995


Epoch 5/10: 100%|██████████| 15685/15685 [01:41<00:00, 154.58it/s]


Epoch 5: Train Loss = 0.6537,Val Loss = 0.7300, Val Accuracy = 0.4994


Epoch 6/10: 100%|██████████| 15685/15685 [01:41<00:00, 153.95it/s]


Epoch 6: Train Loss = 0.6401,Val Loss = 0.7449, Val Accuracy = 0.4994


Epoch 7/10: 100%|██████████| 15685/15685 [01:42<00:00, 153.76it/s]


Epoch 7: Train Loss = 0.6289,Val Loss = 0.7578, Val Accuracy = 0.4993


Epoch 8/10: 100%|██████████| 15685/15685 [01:42<00:00, 153.12it/s]


Epoch 8: Train Loss = 0.6196,Val Loss = 0.7687, Val Accuracy = 0.4993


Epoch 9/10: 100%|██████████| 15685/15685 [01:41<00:00, 154.64it/s]


Epoch 9: Train Loss = 0.6113,Val Loss = 0.7797, Val Accuracy = 0.4995


Epoch 10/10: 100%|██████████| 15685/15685 [01:41<00:00, 154.16it/s]


Epoch 10: Train Loss = 0.6034,Val Loss = 0.7908, Val Accuracy = 0.4995
✅ Model Training Complete!
