In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
from dataclasses import dataclass

from data_loader import prepare_ml_pipeline, MovieLensData, MovieLensDataset
from matrix_factor import BiasedMF, train_mf
from debiasing import train_debiasing_model, DebiasingModel
from heater import train_heater, save_heater_embeddings
from evaluator import RecommenderEvaluator



In [2]:
def run_evaluation_pipeline(device: str = 'cuda' if torch.cuda.is_available() else 'cpu') -> RecommenderEvaluator:
    """Train the base MF and HEATER models, then report ranking and bias metrics.

    The pipeline runs these stages in order:

    1. Build the data loaders with ``prepare_ml_pipeline(cold_start=True)``.
    2. Train a ``BiasedMF`` model for 20 epochs and evaluate it on the test loader.
    3. Train HEATER on top of the base model for 50 epochs and evaluate it,
       printing both the general and the cold-start NDCG values.
    4. Run the evaluator's popularity-bias analysis for the top-20 recommendations.
    5. Save the comparison plot to ``evaluation_results.png``.

    Args:
        device: Device string the base model is moved to and that is forwarded
            to the evaluator calls. The default is computed once, when the
            function is defined: ``'cuda'`` if CUDA is available, else ``'cpu'``.

    Returns:
        The ``RecommenderEvaluator`` created for the loaded data, after all
        evaluation and plotting calls have been made on it.
    """
    # Cut-offs at which NDCG is reported for every model
    k_values = [5, 10, 20, 50]

    # 1. Load and prepare data
    print("Loading data...")
    # valid_loader is returned by the data pipeline but is not used below
    ml_data, train_loader, valid_loader, test_loader = prepare_ml_pipeline(cold_start=True)
    evaluator = RecommenderEvaluator(ml_data)

    # 2. Train and evaluate base MF model
    print("\nTraining base MF model...")
    base_model = BiasedMF(ml_data.n_users, ml_data.n_items).to(device)
    # NOTE(review): train_mf is not given `device`; presumably it works on the
    # device the model already lives on -- confirm against matrix_factor.train_mf.
    base_model = train_mf(base_model, train_loader, num_epochs=20)

    print("\nEvaluating base MF model...")
    base_metrics = evaluator.evaluate_base_mf(
        model=base_model,
        data_loader=test_loader,
        k_values=k_values,
        device=device
    )

    print("\nBase MF Results:")
    for k in k_values:
        print(f"NDCG@{k}: {base_metrics[f'ndcg@{k}']:.4f}")

    # 3. Train and evaluate HEATER model
    print("\nTraining HEATER model...")
    heater = train_heater(ml_data, base_model, num_epochs=50)

    print("\nEvaluating HEATER model...")
    heater_metrics = evaluator.evaluate_heater(
        heater=heater,
        base_model=base_model,
        k_values=k_values,
        device=device,
        batch_size=128
    )

    print("\nHEATER Results:")
    print("General Performance:")
    for k in k_values:
        print(f"NDCG@{k}: {heater_metrics[f'ndcg@{k}']:.4f}")

    print("\nCold-Start Performance:")
    for k in k_values:
        print(f"Cold-Start NDCG@{k}: {heater_metrics[f'cold_ndcg@{k}']:.4f}")

    # 4. Analyze popularity bias
    print("\nAnalyzing popularity bias...")
    bias_metrics = evaluator.analyze_popularity_bias(
        heater=heater,
        base_model=base_model,
        k=20,  # number of recommendations for bias analysis
        device=device
    )

    print("\nBias Analysis Results:")
    bias_labels = {
        'gini_coefficient': 'Gini Coefficient (lower is more equal)',
        'long_tail_coverage': 'Long-Tail Coverage',
        'popularity_correlation': 'Popularity Correlation'
    }
    for metric, value in bias_metrics.items():
        # Fall back to the raw key so a metric without a pretty label is still
        # reported instead of aborting the run with a KeyError after training.
        print(f"{bias_labels.get(metric, metric)}: {value:.4f}")

    # 5. Plot comparisons
    print("\nGenerating comparison plots...")
    evaluator.plot_performance_comparison(
        k_values=k_values,
        save_path='evaluation_results.png'
    )
    print("Plots saved as 'evaluation_results.png'")

    return evaluator


In [None]:
def run_evaluation_pipeline(device: str = 'cuda' if torch.cuda.is_available() else 'cpu'):
    """Complete evaluation pipeline including debiasing.

    Stages, in order:

    1. Build the data loaders with ``prepare_ml_pipeline(cold_start=True)``.
    2. Train a ``BiasedMF`` base model for a single epoch and evaluate it on
       the test loader.
    3. Train HEATER on top of the base model for a single epoch and evaluate it.
    4. Print the user/item counts and the largest user/item indices found in
       the training data, then train and evaluate the debiasing model with
       HEATER as its base model.
    5. Run the evaluator's popularity-bias analysis.
    6. Save the comparison plot to ``evaluation_results.png``.

    NOTE(review): this notebook also defines a function with the same name in
    an earlier cell; whichever definition was executed last is the one that a
    call to ``run_evaluation_pipeline()`` resolves to.

    NOTE(review): the saved output of the cell that calls this pipeline ends
    with ``IndexError: index 669 is out of bounds for dimension 0 with size
    100`` right after HEATER training. The failing frame is not visible in the
    notebook, so the cause still needs to be tracked down.

    Args:
        device: Device string the base model is moved to and that is forwarded
            to the training/evaluation helpers. The default is computed once,
            when the function is defined: ``'cuda'`` if CUDA is available,
            else ``'cpu'``.

    Returns:
        The ``RecommenderEvaluator`` created for the loaded data, after all
        evaluation and plotting calls have been made on it.
    """

    # Define evaluation parameters
    k_values = [5, 10, 20, 50]
    # Keyword arguments unpacked into train_debiasing_model below; their exact
    # meaning is defined by debiasing.train_debiasing_model.
    debiasing_params = {
        'model_select': [100],
        'alpha': 4.0,
        'batch_size': 50,
        'num_epochs': 1,
        'reg': 1e-5
    }

    # 1. Load and prepare data
    print("Loading data...")
    # valid_loader is returned by the data pipeline but is not used below
    ml_data, train_loader, valid_loader, test_loader = prepare_ml_pipeline(cold_start=True)
    evaluator = RecommenderEvaluator(ml_data)

    # 2. Train and evaluate base model
    print("\nTraining base model...")
    base_model = BiasedMF(ml_data.n_users, ml_data.n_items).to(device)
    # NOTE(review): train_mf is not given `device`; presumably it works on the
    # device the model already lives on -- confirm against matrix_factor.train_mf.
    base_model = train_mf(base_model, train_loader, num_epochs=1)

    print("\nEvaluating base model...")
    base_metrics = evaluator.evaluate_base_mf(
        model=base_model,
        data_loader=test_loader,
        k_values=k_values,
        device=device
    )

    print("\nBase Model Results:")
    for k in k_values:
        print(f"NDCG@{k}: {base_metrics[f'ndcg@{k}']:.4f}")

    # 3. Train and evaluate HEATER model
    print("\nTraining HEATER model...")
    heater = train_heater(ml_data, base_model, num_epochs=1)

    print("\nEvaluating HEATER model...")
    heater_metrics = evaluator.evaluate_heater(
        heater=heater,
        base_model=base_model,
        k_values=k_values,
        device=device
    )

    print("\nHEATER Results:")
    for k in k_values:
        print(f"NDCG@{k}: {heater_metrics[f'ndcg@{k}']:.4f}")

    # Sanity check printed before the debiasing step: the largest indices in
    # the training data can be compared by eye against the declared counts.
    print("\nValidating data dimensions:")
    print(f"Number of users: {ml_data.n_users}")
    print(f"Number of items: {ml_data.n_items}")
    print(f"Max user index in training data: {ml_data.train_data['user_idx'].max()}")
    print(f"Max item index in training data: {ml_data.train_data['item_idx'].max()}")

    # 4. Train and evaluate debiasing model
    print("\nTraining debiasing model...")
    debiasing_model = train_debiasing_model(
        base_model=heater,
        ml_data=ml_data,
        **debiasing_params,
        device=device
    )

    print("\nEvaluating debiased model...")
    debiased_metrics = evaluator.evaluate_debiased(
        base_model=heater,  # or base_model
        debiasing_model=debiasing_model,
        k_values=k_values,
        device=device
    )

    print("\nDebiased Model Results:")
    for k in k_values:
        print(f"NDCG@{k}: {debiased_metrics[f'ndcg@{k}']:.4f}")

    # 5. Analyze popularity bias
    print("\nAnalyzing popularity bias...")
    # Forward the device like every other evaluator call in this pipeline, so
    # the analysis does not silently fall back to the evaluator's own default.
    bias_metrics = evaluator.analyze_popularity_bias(heater, base_model, device=device)
    print("\nBias Analysis Results:")
    for metric, value in bias_metrics.items():
        print(f"{metric}: {value:.4f}")

    # 6. Plot comparisons
    print("\nGenerating comparison plots...")
    evaluator.plot_performance_comparison(
        k_values=k_values,
        save_path='evaluation_results.png'
    )

    return evaluator

In [4]:
# Run the whole pipeline on the default device ('cuda' when available, else 'cpu').
# The returned object is the evaluator built inside the pipeline, kept so its
# results can be inspected in later cells.
test = run_evaluation_pipeline()

Loading data...
Dataset loaded with cold_start=True:
Train: 692958 interactions
Valid: 94109 interactions
Test: 213142 interactions
Cold-start statistics:
Valid items not in train: 370
Test items not in train: 741
Identified 741 cold-start items out of 741 test items

Training base model...

Evaluating base model...

Base Model Results:
NDCG@5: 0.7673
NDCG@10: 0.7692
NDCG@20: 0.7743
NDCG@50: 0.7627

Training HEATER model...
Starting HEATER training...
Training data size: 692958
Batch size: 1024
Number of epochs: 1
Device: cpu
Epoch 1/1 - Batch 0/677 - Current Loss: 0.8617
Epoch 1/1 - Batch 100/677 - Current Loss: 0.2009
Epoch 1/1 - Batch 200/677 - Current Loss: 0.1009
Epoch 1/1 - Batch 300/677 - Current Loss: 0.0874
Epoch 1/1 - Batch 400/677 - Current Loss: 0.0794
Epoch 1/1 - Batch 500/677 - Current Loss: 0.0737
Epoch 1/1 - Batch 600/677 - Current Loss: 0.0695

Epoch 1/1 Summary:
Average Loss: 0.1408
Learning Rate: 0.000905
New best model saved!

Loaded best model state from training



IndexError: index 669 is out of bounds for dimension 0 with size 100