In [1]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import torch
import pickle
from scipy.sparse import load_npz

import nbimporter
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import optuna
import wandb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
current_dir = Path.cwd()
encoded_dir = current_dir.parent / "data" / "encoded"
user_item_dir = current_dir.parent / "data" / "pre_process"
save_dir = current_dir.parent / "models" 

all_encoded_data = encoded_dir / "item_metadata_text_image.pt"
images_embeddings = encoded_dir / "images_encodings.pkl"

In [4]:
text_embeddings = torch.load(all_encoded_data)
text_embeddings

{131488: tensor([-0.0118, -0.0987, -0.2878,  ...,  0.4795,  0.0573,  0.1697],
        device='cuda:0'),
 49667: tensor([-0.0573, -0.1541, -0.4666,  ...,  0.8726,  0.3153,  0.4160],
        device='cuda:0'),
 13509: tensor([-0.0399, -0.1226, -0.3553,  ...,  0.0951,  0.4708,  0.4165],
        device='cuda:0'),
 98102: tensor([-0.0154, -0.0735, -0.3964,  ...,  0.0055,  0.1146,  0.1761],
        device='cuda:0'),
 44993: tensor([ 0.0026, -0.0701, -0.3936,  ...,  0.4951,  0.8374,  0.2038],
        device='cuda:0'),
 1965: tensor([ 0.0068, -0.0461, -0.3463,  ...,  0.0868,  0.2815,  1.1967],
        device='cuda:0'),
 92911: tensor([-0.0280, -0.1065, -0.4042,  ...,  0.0997,  0.2180,  0.1084],
        device='cuda:0'),
 143124: tensor([ 0.0432, -0.0803, -0.3857,  ...,  0.5067,  1.7926,  0.4334],
        device='cuda:0'),
 111847: tensor([ 0.0402, -0.0127, -0.5208,  ...,  0.1677,  0.0676,  0.2142],
        device='cuda:0'),
 4596: tensor([ 2.6416e-03, -9.5867e-02, -3.2790e-01,  ...,  6.5514e-01

In [None]:
# We had also built an AE only for the images
with open(images_embeddings, 'rb') as f:
    images_embeddings = pickle.load(f)
images_embeddings

In [5]:
len(text_embeddings[90788])

3075

In [6]:
len(text_embeddings)

198771

# Train the Auto encoder

In [17]:
latent_dim = 128
num_epochs = 70
input_dim = 3075

## ================== 1. Data Preparation ==================

In [19]:
class EmbeddingsDataset(Dataset):
    def __init__(self, embeddings_dict):
        self.item_ids = list(embeddings_dict.keys())
        self.embeddings = torch.stack([embeddings_dict[item_id] for item_id in self.item_ids])
    
    def __len__(self):
        return len(self.embeddings)
    
    def __getitem__(self, idx):
        return self.embeddings[idx]

dataset = EmbeddingsDataset(text_embeddings)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)

## ============= 2. Define Autoencoder Model ===============

In [20]:
class Autoencoder(nn.Module):
    def __init__(self, input_dim=input_dim, latent_dim=latent_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, latent_dim),
            nn.ReLU())
        
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 1024),
            nn.ReLU(),
            nn.Linear(1024, input_dim))
    
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [21]:
# Initialize model
model = Autoencoder().to(device)

## ================== 3. Training Setup ==================

In [22]:
wandb.init(project="autoencoder_training", name=f"all_data_embeddings_{latent_dim}")
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
best_test_loss = float("inf")
best_model_state = None

## ============ 4. Training Loop with W&B Logging ===============

In [23]:
for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0
    total_train_samples = 0
    for batch in train_dataloader:
        batch = batch.to(device)
        optimizer.zero_grad()
        reconstructed = model(batch)
        loss = criterion(reconstructed, batch)
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item() * batch.size(0)
        total_train_samples += batch.size(0)
    
    avg_train_loss = total_train_loss / total_train_samples
    
    # Evaluate on test set
    model.eval()
    total_test_loss = 0
    total_test_samples = 0
    with torch.no_grad():
        for batch in test_dataloader:
            batch = batch.to(device)
            reconstructed = model(batch)
            loss = criterion(reconstructed, batch)
            total_test_loss += loss.item() * batch.size(0)
            total_test_samples += batch.size(0)
    
    avg_test_loss = total_test_loss / total_test_samples
    
    # Track best model
    if avg_test_loss < best_test_loss:
        best_test_loss = avg_test_loss
        best_model_state = model.state_dict()
    
    wandb.log({"Train Loss": avg_train_loss, "Test Loss": avg_test_loss})
    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Test Loss: {avg_test_loss:.4f}")

wandb.finish()

Epoch [1/70], Train Loss: 0.0546, Test Loss: 0.0332
Epoch [2/70], Train Loss: 0.0291, Test Loss: 0.0264
Epoch [3/70], Train Loss: 0.0256, Test Loss: 0.0247
Epoch [4/70], Train Loss: 0.0242, Test Loss: 0.0237
Epoch [5/70], Train Loss: 0.0233, Test Loss: 0.0232
Epoch [6/70], Train Loss: 0.0228, Test Loss: 0.0227
Epoch [7/70], Train Loss: 0.0224, Test Loss: 0.0223
Epoch [8/70], Train Loss: 0.0221, Test Loss: 0.0223
Epoch [9/70], Train Loss: 0.0218, Test Loss: 0.0219
Epoch [10/70], Train Loss: 0.0216, Test Loss: 0.0218
Epoch [11/70], Train Loss: 0.0215, Test Loss: 0.0216
Epoch [12/70], Train Loss: 0.0214, Test Loss: 0.0214
Epoch [13/70], Train Loss: 0.0212, Test Loss: 0.0218
Epoch [14/70], Train Loss: 0.0212, Test Loss: 0.0214
Epoch [15/70], Train Loss: 0.0211, Test Loss: 0.0214
Epoch [16/70], Train Loss: 0.0210, Test Loss: 0.0211
Epoch [17/70], Train Loss: 0.0209, Test Loss: 0.0214
Epoch [18/70], Train Loss: 0.0209, Test Loss: 0.0210
Epoch [19/70], Train Loss: 0.0208, Test Loss: 0.0212
Ep

0,1
Test Loss,█▄▃▂▂▂▂▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Train Loss,█▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
Test Loss,0.02027
Train Loss,0.01998


## ============= 5. Save Compressed Representations ================

In [24]:
# Load best model
model.load_state_dict(best_model_state)
model.eval()
compressed_representations = {}
with torch.no_grad():
    for item_id, embedding in text_embeddings.items():
        embedding = embedding.to(device)
        compressed_representations[item_id] = model.encoder(embedding.unsqueeze(0)).cpu().squeeze(0)

# Save compressed embeddings
compressed_embeddings_path = encoded_dir / f"compressed_all_data_encodings_{latent_dim}.pkl"
with open(compressed_embeddings_path, 'wb') as f:
    pickle.dump(compressed_representations, f)

print(f"Compressed embeddings saved at {compressed_embeddings_path}")

Compressed embeddings saved at /storage/yahlly/RecSys/data/encoded/compressed_all_data_encodings_128.pkl
