In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from pathlib import Path
from data_loader import prepare_ml_pipeline
from matrix_factor import BiasedMF, train_mf
from heater import train_heater, save_heater_embeddings
from item_debias.main import load_data, train
from dataclasses import dataclass



In [2]:
def setup_pipeline(data_path: str = "MovieLens1M", 
                  batch_size: int = 1024,
                  device: str = 'cuda' if torch.cuda.is_available() else 'cpu',
                  *,
                  embedding_dim: int = 100,
                  reg: float = 0.0001,
                  lr: float = 0.001,
                  base_epochs: int = 1,
                  heater_epochs: int = 1):
    """Run the full pipeline: load data, train the base MF model, train Heater, save embeddings.

    Side effect: sets the global torch default dtype to ``torch.float32``
    (this happens before the data directory check).

    Args:
        data_path: Dataset directory. It must already exist; it is forwarded to
            ``prepare_ml_pipeline`` and is also the directory the Heater
            embeddings are saved into.
        batch_size: Batch size forwarded to ``prepare_ml_pipeline`` and
            ``train_heater``.
        device: Torch device string. The default is computed once, when the
            function is defined ('cuda' if available at that moment, else 'cpu').
        embedding_dim: ``embedding_dim`` passed to ``BiasedMF``.
        reg: ``reg`` passed to ``BiasedMF``.
        lr: ``lr`` passed to ``train_mf``.
        base_epochs: ``num_epochs`` passed to ``train_mf``.
        heater_epochs: ``num_epochs`` passed to ``train_heater``.

    Returns:
        dict with keys ``'ml_data'``, ``'base_model'`` (the value returned by
        ``train_mf``), ``'heater_model'`` and ``'loaders'`` (a
        ``(train_loader, valid_loader, test_loader)`` tuple).

    Raises:
        FileNotFoundError: if ``data_path`` does not exist.
        Exception: any error raised by a pipeline stage is printed and then
            re-raised unchanged.
    """
    
    # Set default dtype
    torch.set_default_dtype(torch.float32)
    
    # 1. Ensure data path exists
    data_dir = Path(data_path)
    if not data_dir.exists():
        raise FileNotFoundError(f"Data directory {data_path} not found")

    try:
        # 2. Load and prepare data
        print("Loading data...")
        ml_data, train_loader, valid_loader, test_loader = prepare_ml_pipeline(
            data_path=data_path,
            batch_size=batch_size
        )

        # 3. Train base model
        print("Training base model...")
        base_model = BiasedMF(
            num_users=ml_data.n_users,
            num_items=ml_data.n_items,
            embedding_dim=embedding_dim,
            reg=reg
        ).to(device)
        
        trained_base = train_mf(
            model=base_model,
            train_loader=train_loader,
            num_epochs=base_epochs,
            lr=lr,
            device=device
        )

        # 4. Train Heater
        print("Training Heater model...")
        heater = train_heater(
            ml_data=ml_data,
            base_model=trained_base,
            batch_size=batch_size,
            num_epochs=heater_epochs,
            device=device
        )

        # 5. Save embeddings
        print("Saving Heater embeddings...")
        # The directory was verified above; mkdir is kept as a cheap guard in
        # case it was removed while training was running.
        data_dir.mkdir(parents=True, exist_ok=True)
        save_heater_embeddings(
            heater=heater,
            ml_data=ml_data,
            base_model=trained_base,
            save_path=str(data_dir),
            device=device
        )

        return {
            'ml_data': ml_data,
            'base_model': trained_base,
            'heater_model': heater,
            'loaders': (train_loader, valid_loader, test_loader)
        }

    except Exception as e:
        # Surface the failure in the notebook output, then let the original
        # exception (with its traceback) propagate to the caller.
        print(f"Error in pipeline: {str(e)}")
        raise

In [3]:
def evaluate_model(model: nn.Module,
                  data_loader: torch.utils.data.DataLoader,
                  device: str) -> float:
    """
    Compute the mean squared error of ``model`` over all ratings in ``data_loader``.

    Each batch's mean loss is weighted by the number of ratings in that batch,
    so a smaller final batch does not skew the result (a plain average of
    per-batch means would over-weight it).

    Args:
        model: Module called as ``model(users, items)``; the returned object
            must expose the predictions as ``.preds``.
        data_loader: Iterable yielding ``(users, items, ratings)`` tensor
            triples. ``len()`` is not required.
        device: Device each batch is moved to before the forward pass.

    Returns:
        The MSE averaged over every rating seen.

    Raises:
        ValueError: if the loader yields no ratings at all (previously this
            surfaced as an unexplained ``ZeroDivisionError``).

    Note:
        The model is switched to eval mode and is left in eval mode on return.
    """
    model.eval()
    criterion = nn.MSELoss()
    weighted_loss_sum = 0.0
    n_ratings = 0
    
    with torch.no_grad():
        for users, items, ratings in data_loader:
            batch_count = ratings.numel()
            if batch_count == 0:
                # The mean loss of an empty batch is NaN and would poison the total.
                continue

            users = users.to(device)
            items = items.to(device)
            ratings = ratings.to(device)
            
            output = model(users, items)
            loss = criterion(output.preds, ratings)
            # criterion returns the batch mean; scale it back to a batch sum.
            weighted_loss_sum += loss.item() * batch_count
            n_ratings += batch_count
    
    if n_ratings == 0:
        raise ValueError("data_loader yielded no ratings; cannot compute a mean loss")

    return weighted_loss_sum / n_ratings




In [4]:

# Seed every RNG this notebook touches from a single constant so that a
# fresh-kernel "Run All" starts from the same random state each time.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)

# Run the whole pipeline with its default arguments; `data` holds the dict
# returned by setup_pipeline (ml_data, base_model, heater_model, loaders).
data = setup_pipeline()


Loading data...
Dataset loaded: 700146 train, 100021 validation, 200042 test
Sparsity: 0.04468
Training base model...
Training Heater model...
Saving Heater embeddings...
Saved embeddings to MovieLens1M
