In [1]:
import torch
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
from collections import defaultdict
from tqdm import tqdm
import scipy

from data_loader import prepare_ml_pipeline
from matrix_factor import BiasedMF, train_mf
from debiasing import train_debiasing_model
from dropoutnet import train_dropoutnet
from evaluator import ndcg_calc_sampled, ndcg_calc_dropout_sampled, ndcg_calc_debiased_sampled, evaluate_split
import evaluator as ev



In [2]:
def analyze_debiasing_transformation(model, R, cold_items=None):
    """
    Analyze how the debiasing model transforms ratings.

    Prints distribution statistics for the original (and, when a model is
    given, the debiased) score matrix and returns them in a dict.

    Args:
        model: Debiasing model, or None for a pre-debiasing analysis. When
            given it is switched to eval mode (and left in eval mode) and
            called as ``model(R, is_training=False)``; the ``.preds``
            attribute of the result is used as the debiased matrix.
        R: Input rating matrix as a torch tensor (items x users).
        cold_items: Optional selector for cold items. Either a boolean mask
            over items or a collection of integer item indices; torch
            tensors (on any device), numpy arrays, lists and sets are
            accepted.

    Returns:
        dict with:
            'original_stats': mean, std and the 25/50/75 percentiles of R.
            'debiased_stats': mean and std of the debiased matrix
                (only when ``model`` is given).
            'cold_stats': original mean/std over cold-item rows (only when
                ``cold_items`` is given); with a model it also holds the
                debiased mean/std, 'mean_change' and 'rank_changes', a flat
                list (user-major) of ``orig_percentile - deb_percentile``
                per (user, cold item) pair. Positive means the item moved
                towards the top of that user's ranking.
    """
    with torch.no_grad():
        # numpy needs host memory; detach in case R still carries autograd state
        R_orig = R.detach().cpu().numpy()

        # If we have a debiasing model, get transformed ratings
        if model is not None:
            model.eval()
            R_deb = model(R, is_training=False).preds.detach().cpu().numpy()
        else:
            R_deb = R_orig

    # Overall distribution
    print("\nRating Distribution:")
    print(f"Original - Mean: {R_orig.mean():.4f}, Std: {R_orig.std():.4f}")
    if model is not None:
        print(f"Debiased - Mean: {R_deb.mean():.4f}, Std: {R_deb.std():.4f}")

    # Analyze by rating strength: split on the quartiles of the ORIGINAL scores
    orig_percentiles = np.percentile(R_orig, [25, 50, 75])
    print("\nRating Strength Analysis:")
    print("Original Quartiles:", orig_percentiles)

    low_mask = R_orig <= orig_percentiles[0]
    mid_mask = (R_orig > orig_percentiles[0]) & (R_orig <= orig_percentiles[2])
    high_mask = R_orig > orig_percentiles[2]

    print("\nRating Distribution by Strength:")
    print(f"Low ratings: mean={R_orig[low_mask].mean():.4f}")
    print(f"Mid ratings: mean={R_orig[mid_mask].mean():.4f}")
    print(f"High ratings: mean={R_orig[high_mask].mean():.4f}")

    if model is not None:
        print("\nTransformation by Rating Strength:")
        print(f"Low ratings: {R_orig[low_mask].mean():.4f} -> {R_deb[low_mask].mean():.4f}")
        print(f"Mid ratings: {R_orig[mid_mask].mean():.4f} -> {R_deb[mid_mask].mean():.4f}")
        print(f"High ratings: {R_orig[high_mask].mean():.4f} -> {R_deb[high_mask].mean():.4f}")

    # Analyze cold items if provided
    cold_rank_changes = None
    if cold_items is not None:
        cold_indices = _cold_item_indices(cold_items)

        print("\nCold Item Analysis:")
        cold_orig = R_orig[cold_indices]
        cold_deb = R_deb[cold_indices]
        print(f"Original - Mean: {cold_orig.mean():.4f}, Std: {cold_orig.std():.4f}")

        if model is not None:
            print(f"Debiased - Mean: {cold_deb.mean():.4f}, Std: {cold_deb.std():.4f}")

            # Per-user rank percentile (0 = top of the list) of every cold item,
            # before and after debiasing; both arrays are (n_users, n_cold).
            pct_before = _cold_item_rank_percentiles(R_orig, cold_indices)
            pct_after = _cold_item_rank_percentiles(R_deb, cold_indices)
            rank_deltas = (pct_before - pct_after).ravel()
            cold_rank_changes = rank_deltas.tolist()

            print("\nCold Item Rank Changes:")
            if rank_deltas.size:
                improved_count = int((rank_deltas > 0).sum())
                print(f"Average rank improvement: {rank_deltas.mean():.4f}")
                print(f"Items with improved ranks: {improved_count} / {rank_deltas.size}")
                print(f"Average percentile before: {1 - pct_before.mean():.4f}")
                print(f"Average percentile after: {1 - pct_after.mean():.4f}")
            else:
                print("No cold items to rank.")

    # Return statistics
    stats = {
        'original_stats': {
            'mean': R_orig.mean(),
            'std': R_orig.std(),
            'percentiles': orig_percentiles
        }
    }

    if model is not None:
        stats['debiased_stats'] = {
            'mean': R_deb.mean(),
            'std': R_deb.std()
        }

    if cold_items is not None:
        stats['cold_stats'] = {
            'original_mean': cold_orig.mean(),
            'original_std': cold_orig.std()
        }
        if model is not None:
            stats['cold_stats'].update({
                'debiased_mean': cold_deb.mean(),
                'debiased_std': cold_deb.std(),
                'mean_change': cold_deb.mean() - cold_orig.mean(),
                'rank_changes': cold_rank_changes
            })

    return stats


def _cold_item_indices(cold_items):
    """Normalize a cold-item selector (boolean mask or integer indices) to a 1-D index array."""
    if torch.is_tensor(cold_items):
        # a CUDA tensor cannot be used to index a numpy array
        cold_items = cold_items.detach().cpu().numpy()
    elif isinstance(cold_items, (set, frozenset)):
        cold_items = sorted(cold_items)
    selector = np.asarray(cold_items)
    if selector.dtype == bool:
        return np.flatnonzero(selector)
    return selector.astype(np.int64, copy=False).ravel()


def _cold_item_rank_percentiles(scores, cold_indices):
    """Return an (n_users, n_cold) array with each cold item's rank / n_items per user (0 = best)."""
    n_items, n_users = scores.shape
    positions = np.arange(n_items)
    percentiles = np.empty((n_users, len(cold_indices)), dtype=np.float64)
    for user_idx in range(n_users):
        # Descending order of scores; stable so ties resolve deterministically.
        order = np.argsort(-scores[:, user_idx], kind='stable')
        # Invert the permutation once instead of searching it per item.
        ranks = np.empty(n_items, dtype=np.int64)
        ranks[order] = positions
        percentiles[user_idx] = ranks[cold_indices] / n_items
    return percentiles

def check_debiasing(dropout_model, original_mf, debiasing_model, ml_data, device):
    """Run debiasing transformation analysis.

    Builds the full (items x users) score matrix from the DropoutNet encodings
    plus the matrix-factorization bias terms, then passes it together with a
    cold-item mask to ``analyze_debiasing_transformation``.

    Args:
        dropout_model: Model exposing ``encode(u_emb, i_emb, u_content, i_content)``
            that returns encoded user and item representations.
        original_mf: Matrix-factorization model exposing ``get_embeddings()``,
            ``user_bias``, ``item_bias`` and ``global_bias``.
        debiasing_model: Model forwarded to ``analyze_debiasing_transformation``.
        ml_data: Dataset object providing ``train_data``, ``test_data``,
            ``n_users``, ``n_items``, ``user_content`` and ``item_content``.
        device: Torch device on which the score matrix is computed.

    Returns:
        The statistics dict produced by ``analyze_debiasing_transformation``.
    """
    print("\nAnalyzing Rating Distributions...")

    # Get cold items. The mask is a host-side numpy array because the analysis
    # indexes numpy arrays with it; a tensor living on a CUDA device cannot be
    # converted implicitly and would raise there.
    cold_items, _ = ev.get_item_split(ml_data.train_data, ml_data.test_data)
    cold_mask = np.zeros(ml_data.n_items, dtype=bool)
    cold_mask[np.array(list(cold_items), dtype=np.int64)] = True

    # Get base predictions
    with torch.no_grad():
        # Get embeddings and content
        u_emb, i_emb = original_mf.get_embeddings()
        u_emb = u_emb.to(device)
        i_emb = i_emb.to(device)

        # Content features are optional; None is passed through to encode().
        u_content = (torch.tensor(ml_data.user_content, dtype=torch.float32).to(device)
                     if ml_data.user_content is not None else None)
        i_content = (torch.tensor(ml_data.item_content, dtype=torch.float32).to(device)
                     if ml_data.item_content is not None else None)

        # Get DropoutNet predictions
        u_encoded, i_encoded = dropout_model.encode(u_emb, i_emb, u_content, i_content)
        R = torch.mm(i_encoded, u_encoded.t())  # rows = items, columns = users

        # Add bias terms: user bias broadcasts across rows, item bias across columns
        all_users = torch.arange(ml_data.n_users, device=device)
        all_items = torch.arange(ml_data.n_items, device=device)
        user_biases = original_mf.user_bias(all_users).squeeze()
        item_biases = original_mf.item_bias(all_items).squeeze()
        R += user_biases.unsqueeze(0) + item_biases.unsqueeze(1) + original_mf.global_bias

    # Analyze distributions
    stats = analyze_debiasing_transformation(debiasing_model, R, cold_mask)

    return stats

In [3]:
def normalize_ndcg_by_positives(ndcg_scores, n_positives, avg_positives):
    """Rescale NDCG scores by the ratio of average to group-specific positive counts.

    Args:
        ndcg_scores: Iterable of NDCG values (e.g. one per cutoff k).
        n_positives: Number of positives for the group being normalized.
        avg_positives: Reference (average) number of positives.

    Returns:
        A list with every score multiplied by ``avg_positives / n_positives``.
        The factor is below 1 for groups with more positives than average and
        above 1 for groups with fewer.
    """
    scale = avg_positives / n_positives
    rescaled = []
    for value in ndcg_scores:
        rescaled.append(value * scale)
    return rescaled

def analyze_positive_distribution(ml_data, cold_users, warm_users, positive_threshold=4):
    """
    Analyze the distribution of positive ratings between train and test sets
    for both cold and warm users.

    A rating counts as positive when it is ``>= positive_threshold``.

    Args:
        ml_data: Object with ``train_data`` and ``test_data`` DataFrames, each
            having 'user_idx' and 'rating' columns.
        cold_users: Collection of cold user indices.
        warm_users: Collection of warm user indices.
        positive_threshold: Minimum rating treated as positive (default 4).

    Returns:
        dict with keys 'warm' and 'cold', each mapping to a dict holding the
        'train', 'test' and 'total' positive counts and 'test_ratio'
        (test / total). Ratios are NaN when the denominator is zero, e.g.
        when a group is empty.
    """
    print("\nPositive Ratings Distribution Analysis:")
    print("-" * 40)

    groups = (
        ("Warm Users:", 'warm', warm_users),
        ("\nCold Users:", 'cold', cold_users),
    )

    summary = {}
    for heading, key, users in groups:
        train_positives = _count_positive_ratings(ml_data.train_data, users, positive_threshold)
        test_positives = _count_positive_ratings(ml_data.test_data, users, positive_threshold)
        total_positives = train_positives + test_positives
        test_ratio = _safe_ratio(test_positives, total_positives)

        print(heading)
        print(f"  Train positives: {train_positives}")
        print(f"  Test positives: {test_positives}")
        print(f"  Total positives: {total_positives}")
        print(f"  Ratio in test: {test_ratio:.2%}")
        print(f"  Average per user: {_safe_ratio(total_positives, len(users)):.2f}")

        summary[key] = {
            'train': train_positives,
            'test': test_positives,
            'total': total_positives,
            'test_ratio': test_ratio
        }

    return summary


def _count_positive_ratings(interactions, users, threshold):
    """Count rows of ``interactions`` that belong to ``users`` and have rating >= threshold."""
    is_positive = interactions['user_idx'].isin(users) & (interactions['rating'] >= threshold)
    return int(is_positive.sum())


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    if not denominator:
        return float('nan')
    return numerator / denominator


def run_evaluation_pipeline(device: str = 'cuda' if torch.cuda.is_available() else 'cpu'):
    """Complete evaluation pipeline with DropoutNet and debiasing.

    Stages, each printing its own metrics:
        1. Load the cold-start data split and derive cold/warm user and item sets.
        2. Train the biased matrix-factorization base model and evaluate it.
        3. Train DropoutNet on top of the base model and evaluate it.
        4. Train the debiasing model on top of DropoutNet and evaluate it.

    Args:
        device: Torch device string. The default is resolved once, when the
            function is defined, to 'cuda' if available and otherwise 'cpu'.

    Returns:
        dict with keys 'base', 'dropout' and 'debiased', each holding the
        object returned by ``evaluate_split`` for that model.
    """
    # Define evaluation parameters
    k_values = [15, 30]

    dropoutnet_params = {
        'model_select': [800, 400],
        'rank_out': 200,
        'dropout_rate': 0.5,
        'batch_size': 1000,
        'n_scores_per_user': 2500,
        'data_batch_size': 1000,
        'max_data_per_step': 50000,
        'num_epochs': 1,
        'learning_rate': 0.001
    }

    # Params from https://github.com/Zziwei/Fairness-in-Cold-Start-Recommendation/blob/main/Scale/main.py
    debiasing_params = {
        'model_select': [100],
        'alpha': 1000.0,
        'batch_size': 50,
        'num_epochs': 1,
        'reg': 0.000001
    }

    # 1. Load and prepare data
    print("Loading data...")
    ml_data, train_loader, valid_loader, test_loader = prepare_ml_pipeline(cold_start=True)

    # Get cold/warm user and item splits
    cold_users, warm_users = ev.get_user_split(ml_data.train_data, ml_data.test_data)
    cold_items, warm_items = ev.get_item_split(ml_data.train_data, ml_data.test_data)

    # Number of positive (rating >= 4) test interactions per user group.
    # These must be summed: len() of the boolean mask would count every test
    # row of the group regardless of rating. They are only needed for the
    # optional normalized-NDCG report (normalize_ndcg_by_positives), which is
    # currently not printed.
    test_data = ml_data.test_data
    cold_positives = int((test_data.loc[test_data['user_idx'].isin(cold_users), 'rating'] >= 4).sum())
    warm_positives = int((test_data.loc[test_data['user_idx'].isin(warm_users), 'rating'] >= 4).sum())
    avg_positives = (cold_positives + warm_positives) / 2

    # 2. Train and evaluate base model
    print("\nTraining base model...")
    base_model = BiasedMF(ml_data.n_users, ml_data.n_items).to(device)
    base_model = train_mf(
        model=base_model,
        train_loader=train_loader,
        val_loader=valid_loader,
        ml_data=ml_data,
        num_epochs=1,
        lr=.01
    )

    # Only the NDCG list is reported; the two remaining return values are unused.
    base_ndcgs, _, _ = ndcg_calc_sampled(base_model, test_loader, ml_data, k_values=k_values)
    final_mdg, mdg_anal = ev.mdg_calc_base(base_model, test_loader, ml_data)
    print(f"Base NDCGS: {base_ndcgs}")
    _print_mdg_summary(mdg_anal)
    base_cold_warm = ev.analyze_mdg_with_splits(final_mdg, cold_items, warm_items)
    print("\nBase Model MDG Analysis:")
    ev.print_mdg_analysis(base_cold_warm)

    print("\nEvaluating base model...")
    base_metrics = evaluate_split(
        eval_model=base_model,
        test_loader=test_loader,
        ml_data=ml_data,
        cold_users=cold_users,
        warm_users=warm_users,
        k_values=k_values,
        evaluation_func=ndcg_calc_sampled
    )
    _print_split_results("Base Model Results:", "Base NDCGs", base_ndcgs, base_metrics, k_values)

    # 3. Train and evaluate DropoutNet model
    print("\nTraining DropoutNet model...")
    dropoutnet = train_dropoutnet(
        ml_data=ml_data,
        base_model=base_model,
        val_loader=valid_loader,
        test_loader=test_loader,
        **dropoutnet_params,
        device=device
    )

    print("\nEvaluating DropoutNet model...")
    # Three values are returned; only the first is used here. The MDG values
    # come from mdg_calc_dropout below.
    dropout_ndcgs, _, _ = ndcg_calc_dropout_sampled(base_model, dropoutnet, test_loader, ml_data, k_values=k_values)
    drop_mdg, drop_mdg_anal = ev.mdg_calc_dropout(dropoutnet, base_model, test_loader, ml_data)
    print(f"Dropout NDCGs {dropout_ndcgs}")
    _print_mdg_summary(drop_mdg_anal)
    # Analyze cold vs warm for dropout model
    dropout_cold_warm = ev.analyze_mdg_with_splits(drop_mdg, cold_items, warm_items)

    print("\nDropoutNet Model MDG Analysis:")
    ev.print_mdg_analysis(dropout_cold_warm)

    print("\nEvaluating DropoutNet model...")
    dropout_metrics = evaluate_split(
        eval_model=dropoutnet,  # model under evaluation
        test_loader=test_loader,
        ml_data=ml_data,
        cold_users=cold_users,
        warm_users=warm_users,
        k_values=k_values,
        evaluation_func=ndcg_calc_dropout_sampled,
        base_model=base_model  # extra kwarg; presumably forwarded to evaluation_func - verify in evaluate_split
    )
    _print_split_results("DropoutNet Results:", "Dropout NDCGs", dropout_ndcgs, dropout_metrics, k_values)

    # 4. Train and evaluate debiasing model
    print("\nTraining debiasing model...")
    debiasing_model = train_debiasing_model(
        base_model=dropoutnet,
        original_mf=base_model,
        ml_data=ml_data,
        **debiasing_params,
        device=device
    )

    print("\nEvaluating debiased model...")
    debiased_ndcgs, _, _ = ndcg_calc_debiased_sampled(
        dropoutnet, base_model, debiasing_model, test_loader, ml_data, k_values=k_values
    )
    debiased_mdg, debiased_mdg_anal = ev.mdg_calc_debiased(dropoutnet, base_model, debiasing_model, test_loader, ml_data)
    print(f"Debiased NDCGs {debiased_ndcgs}")
    _print_mdg_summary(debiased_mdg_anal)
    # Analyze cold vs warm for debiased model
    debiased_cold_warm = ev.analyze_mdg_with_splits(debiased_mdg, cold_items, warm_items)

    print("\nDebiased Model MDG Analysis:")
    ev.print_mdg_analysis(debiased_cold_warm)

    print("\nEvaluating debiased model...")
    debiased_metrics = evaluate_split(
        eval_model=debiasing_model,  # model under evaluation
        test_loader=test_loader,
        ml_data=ml_data,
        cold_users=cold_users,
        warm_users=warm_users,
        k_values=k_values,
        evaluation_func=ndcg_calc_debiased_sampled,
        prior_model=dropoutnet,
        original_mf=base_model
    )
    _print_split_results("Debiased Model Results:", "Debiased NDCGs", debiased_ndcgs, debiased_metrics, k_values)

    return {
        'base': base_metrics,
        'dropout': dropout_metrics,
        'debiased': debiased_metrics
    }


def _print_mdg_summary(mdg_analysis):
    """Print the overall, bottom-10/20 and top-10 mean MDG from an MDG analysis dict."""
    print(
        f"Final MDG: {mdg_analysis['all']['mean']}, "
        f"min10: {mdg_analysis['bottom_10']['mean']}, "
        f"min20: {mdg_analysis['bottom_20']['mean']}, "
        f"top10: {mdg_analysis['top_10']['mean']}"
    )


def _print_split_results(title, overall_label, overall_ndcgs, metrics, k_values):
    """Print overall NDCGs followed by the cold-user and warm-user NDCGs of one model."""
    print(f"\n{title}")
    print(f"{overall_label}: {overall_ndcgs}")
    print(f"Cold Users (n={metrics.n_cold_users}):")
    print(f"  Original NDCG@{k_values}: {metrics.cold_users.ndcg}")
    print(f"Warm Users (n={metrics.n_warm_users}):")
    print(f"  Original NDCG@{k_values}: {metrics.warm_users.ndcg}")
   

In [4]:
# Run the full pipeline on the default device. The returned dict holds the
# evaluate_split results under the keys 'base', 'dropout' and 'debiased'.
test = run_evaluation_pipeline()

Loading data...
Dataset loaded with cold_start=True:
Train: 692958 interactions
Valid: 94109 interactions
Test: 213142 interactions
Cold-start statistics:
Valid items not in train: 370
Test items not in train: 741
Found 0 cold users and 6037 warm users in test set
Cold user ratio: 0.00%

Item Split Analysis:
Total items in test set: 741
Found 741 cold items and 0 warm items
Cold item ratio: 100.00%

Detailed Statistics:
Total unique items in training: 2595
Total unique items in test: 741
Items in test but not in training: 741
Items with insufficient interactions: 0

Interaction Statistics:
Average interactions per cold item: 287.64
Average interactions per warm item: nan

Training base model...
Epoch 1 - Avg Loss: 1.7384 - Avg Train NDCG: 0.6926 - Avg Test NDCG: 0.0755
Epoch 1 - Avg Test Prec: 0.0469 - Avg Test Rec: 0.11290561857830284
Base NDCGS: [0.1102781017372421, 0.1328697018896751]
Final MDG: 0.22637007809200388, min10: 0.13184730397512892, min20: 0.16022255082586745, top10: 0.31

  Uin = torch.tensor(u_emb_expanded[batch_u_idx], device=device)
  Vin = torch.tensor(i_emb_expanded[batch_i_idx], device=device)


Epoch 1/1: Average Loss = 0.0494

Training completed!

Evaluating DropoutNet model...
Dropout NDCGs [0.05212612850917899, 0.08856699570013318]
Final MDG: 0.08902576936848584, min10: 0.008736681148282802, min20: 0.019503406188051967, top10: 0.19193894259437877

Debiased Model MDG Analysis:

MDG Analysis Summary:
--------------------------------------------------

Coverage Statistics:
Total items with MDG scores: 741
Cold items with scores: 741
Warm items with scores: 0
Cold items missing scores: 0
Warm items missing scores: 0

OVERALL Items:
Number of items: 741
Mean MDG: 0.0890
Median MDG: 0.0837
Std Dev: 0.0546

COLD Items:
Number of items: 741
Mean MDG: 0.0890
Median MDG: 0.0837
Std Dev: 0.0546

Evaluating DropoutNet model...
No cold users found, skipping evaluation
Evaluating warm users...
Evaluating all users...

DropoutNet Results:
Dropout NDCGs: [0.05212612850917899, 0.08856699570013318]
Cold Users (n=0):
  Original NDCG@[15, 30]: [0.0, 0.0]
Warm Users (n=6037):
  Original NDCG@[