In [None]:
import pandas as pd
import numpy as np
import torch
import pickle
import os
import json
import gc
from torch.distributions import Bernoulli
from torch.optim import LBFGS
from tqdm import tqdm
from scipy.stats import pearsonr
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import Manager
import multiprocessing as mp

import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from tueplots import bundles
# NOTE(review): the value returned by this call is discarded. The bundle is only
# applied where it is passed to plt.rc_context (see visualize_response_matrix), so
# this line presumably has no effect on the global plot style — confirm against the
# tueplots documentation.
bundles.icml2024()

from torchmetrics import AUROC
# Shared binary-AUROC metric object; compute_auc reads it as a module-level global.
auroc = AUROC(task="binary")

import warnings
# Silences every warning category for the rest of the session, including
# deprecation and numerical warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

# Seeds torch's RNG only; NumPy is seeded separately where the train/test split is drawn.
torch.manual_seed(0)

# Device preference order: CUDA, then Apple MPS, then CPU.
device = "cuda:0" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

def visualize_response_matrix(results, value, filename):
    """Draw a {-1, 0, 1} response matrix as a three-colour image and save it.

    Columns are grouped by the "scenario" level of ``results.columns``: a dashed
    vertical line is drawn wherever the scenario changes between neighbouring
    columns, and the scenario name is written above the centre of its run of
    columns (names closer than 100 columns to the previously drawn one are
    skipped to avoid overlap).

    Args:
        results: Frame whose columns expose a "scenario" level and whose index
            provides the y-axis tick labels.
        value: 2-D array handed to ``matshow``; -1 is drawn white, 0 red, 1 blue.
        filename: Destination passed to ``plt.savefig``.
    """
    scenario_labels = list(results.columns.get_level_values("scenario"))
    n_cols = len(scenario_labels)

    # Column index at which each run of identical scenario labels begins.
    run_starts = [0] + [
        col for col in range(1, n_cols)
        if scenario_labels[col] != scenario_labels[col - 1]
    ]
    run_stops = run_starts[1:] + [n_cols]

    run_names = [scenario_labels[start] for start in run_starts]
    # Centre of a run spanning columns start .. stop-1.
    run_centres = [(start + stop - 1) / 2.0 for start, stop in zip(run_starts, run_stops)]
    # A separator sits half a column before the first column of every later run.
    separators = [start - 0.5 for start in run_starts[1:]]

    # Three discrete colour bins centred on -1, 0 and 1.
    palette = mcolors.ListedColormap(["white", "red", "blue"])
    level_norm = mcolors.BoundaryNorm([-1.5, -0.5, 0.5, 1.5], palette.N)

    label_gap = 100  # minimum horizontal distance (in columns) between drawn labels

    with plt.rc_context(bundles.icml2024(usetex=True, family="serif")):
        fig, ax = plt.subplots(figsize=(20, 10))
        image = ax.matshow(value, aspect="auto", cmap=palette, norm=level_norm)

        for x_sep in separators:
            ax.axvline(x=x_sep, color="black", linewidth=0.25, linestyle="--", alpha=0.5)

        previous_label_x = -float("inf")
        for label, centre in zip(run_names, run_centres):
            if centre - previous_label_x < label_gap:
                continue
            ax.text(centre, -5, label, ha='center', va='bottom', rotation=90, fontsize=3)
            previous_label_x = centre

        # One y tick per row, labelled with the row's index entry.
        ax.set_yticks(range(len(results.index)))
        ax.set_yticklabels(results.index, fontsize=3)

        colorbar = plt.colorbar(image)
        colorbar.set_ticks([-1, 0, 1])
        colorbar.set_ticklabels(["-1", "0", "1"])
        plt.savefig(filename, dpi=600, bbox_inches="tight")
        plt.close()

def trainer(parameters, optim, closure, n_iter=100, verbose=True):
    """Repeatedly call ``optim.step(closure)`` until convergence or ``n_iter`` steps.

    After every step except the first, three diagnostics are computed against the
    state before that step:

    * ``d_loss``: previous loss minus current loss (signed),
    * ``d_parameter``: sum over parameters of the L2 distance moved,
    * ``grad_norm``: sum over parameters of the L2 norm of ``.grad``.

    The loop stops early once all three are below ``1e-5``.

    Args:
        parameters: Iterable of the tensors being optimised. One-shot iterators
            (for example the generator returned by ``model.parameters()``) are
            materialised into a list once, because the parameters are walked
            several times per step.
        optim: Optimizer whose ``step(closure)`` returns the loss tensor.
        closure: Callable forwarded to ``optim.step``.
        n_iter: Maximum number of optimizer steps.
        verbose: When true, wrap the loop in a tqdm bar showing the diagnostics.

    Returns:
        The parameters. This is the very object passed in when it can be iterated
        repeatedly, otherwise the list the one-shot iterator was materialised into.
    """
    # An iterator returns itself from iter(); re-iterable containers do not.
    # Without this, a generator is exhausted by the first snapshot and the
    # movement/gradient checks below would trivially sum to zero.
    params = list(parameters) if iter(parameters) is parameters else parameters

    pbar = tqdm(range(n_iter)) if verbose else range(n_iter)
    for iteration in pbar:
        if iteration > 0:
            # Detached snapshots: they are only used for diagnostics and must not
            # keep the previous step's autograd graph alive.
            previous_parameters = [p.detach().clone() for p in params]
            previous_loss = loss.detach().clone()

        loss = optim.step(closure)

        if iteration > 0:
            with torch.no_grad():
                d_loss = (previous_loss - loss).item()
                d_parameters = sum(
                    torch.norm(prev - curr, p=2).item()
                    for prev, curr in zip(previous_parameters, params)
                )
                grad_norm = sum(
                    torch.norm(p.grad, p=2).item() for p in params if p.grad is not None
                )
            if verbose:
                pbar.set_postfix({"grad_norm": grad_norm, "d_parameter": d_parameters, "d_loss": d_loss})

            if d_loss < 1e-5 and d_parameters < 1e-5 and grad_norm < 1e-5:
                break
    return params

def compute_auc(probs, data, train_idtor, test_idtor):
    """Compute binary AUROC separately on the train and the test entries.

    Args:
        probs: Tensor of predicted scores, same shape as ``data``.
        data: Tensor of ground-truth labels. The selected labels are cast to
            ``long`` before scoring because the torchmetrics binary AUROC is
            documented to take an integer target.
        train_idtor: Indicator tensor; entries that are true after ``.bool()``
            belong to the training split.
        test_idtor: Indicator tensor for the test split, same convention.

    Returns:
        ``(train_auc, test_auc)`` as returned by the module-level ``auroc`` metric.
    """
    scores = []
    for indicator in (train_idtor, test_idtor):
        mask = indicator.bool()
        scores.append(auroc(probs[mask], data[mask].long()))
        # Calling the metric object also stores the batch in its internal state;
        # reset so repeated evaluations neither mix splits nor pile up memory.
        auroc.reset()

    train_auc, test_auc = scores
    print(f"train auc: {train_auc}")
    print(f"test auc: {test_auc}")

    return train_auc, test_auc

def compute_cttcorr(probs, data, train_idtor, test_idtor):
    """Pearson correlation between per-row mean prediction and per-row mean label.

    For each split, entries outside the split are replaced by NaN, the remaining
    entries are averaged along dim 1 with ``torch.nanmean``, rows whose mean is
    NaN in either quantity are dropped, and ``pearsonr`` is applied to what is left.

    Args:
        probs: Float tensor of predictions, same shape as ``data``.
        data: Float tensor of labels.
        train_idtor: Indicator tensor; true-after-``.bool()`` entries form the
            training split.
        test_idtor: Indicator tensor for the test split, same convention.

    Returns:
        ``(train_cttcorr, test_cttcorr)``.
    """

    def _row_mean_correlation(indicator):
        # Blank out everything that is not part of this split.
        outside_split = ~indicator.bool()
        split_probs = probs.masked_fill(outside_split, float('nan'))
        split_labels = data.masked_fill(outside_split, float('nan'))

        prob_means = torch.nanmean(split_probs, dim=1).detach().cpu().numpy()
        label_means = torch.nanmean(split_labels, dim=1).detach().cpu().numpy()

        # Keep only rows where both means are defined.
        usable = ~(np.isnan(prob_means) | np.isnan(label_means))
        return pearsonr(prob_means[usable], label_means[usable]).statistic

    train_cttcorr = _row_mean_correlation(train_idtor)
    test_cttcorr = _row_mean_correlation(test_idtor)

    print(f"train cttcorr: {train_cttcorr}")
    print(f"test cttcorr: {test_cttcorr}")

    return train_cttcorr, test_cttcorr

In [7]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from scipy.special import expit # Numerically stable sigmoid function
from scipy.stats import pearsonr

# --- PyTorch and metric imports ---
import torch
from torchmetrics import AUROC
# Rebinds the module-level `auroc` that compute_auc reads to a fresh binary-AUROC
# metric, replacing the instance created in the first cell.
auroc = AUROC(task="binary")
# --- End of imports ---
# ===================================================================
# == Step 1: Load Data and Create Train/Test Split
# ===================================================================
print("Loading response matrix...")
# NOTE(review): unpickling can execute arbitrary code; only load this file when it
# comes from a trusted source. The path is relative to the notebook's working directory.
resmat = pd.read_pickle("../data/resmat_2000.pkl")

Loading response matrix...


In [8]:
# Find the (row, col) positions of every observed (non-missing) response.
non_nan_indices = np.argwhere(resmat.notna().values)

# Shuffle the observed positions reproducibly.
np.random.seed(42)
np.random.shuffle(non_nan_indices)

# Hold out 20% of the observed entries as the test split.
test_size = int(len(non_nan_indices) * 0.20)
test_indices = non_nan_indices[:test_size]
train_indices = non_nan_indices[test_size:]

# Create the training matrix by hiding the test entries.
# The mask is applied to an explicit NumPy copy and the frame is rebuilt from it:
# writing through `DataFrame.values` only takes effect when pandas happens to hand
# back a writable view (it silently does nothing for mixed dtypes and fails on a
# read-only array under Copy-on-Write).
train_values = resmat.to_numpy(dtype=float, copy=True)
train_values[test_indices[:, 0], test_indices[:, 1]] = np.nan
train_resmat = pd.DataFrame(train_values, index=resmat.index, columns=resmat.columns)
# Matrix dimensions; not used below, kept in case later cells read these names.
test_rows, test_cols = train_resmat.values.shape[0], train_resmat.values.shape[1]

# Guard against a mask that did not take effect: only train entries may remain observed.
assert int(train_resmat.notna().values.sum()) == len(train_indices), \
    "held-out test entries are still visible in the training matrix"

print(f"Split data into {len(train_indices)} train samples and {len(test_indices)} test samples.")

# Impute the training data for SVD (missing and held-out entries become 0).
imputed_train_resmat = train_resmat.fillna(0)

# ===================================================================
# == Step 2: Train SVD Model and Get Predictions
# ===================================================================
OPTIMAL_K = 8
print(f"\nTraining SVD model with k={OPTIMAL_K}...")

svd = TruncatedSVD(n_components=OPTIMAL_K, random_state=42)
svd.fit(imputed_train_resmat)

# Reconstruct the full matrix from its rank-k projection and squash it with a sigmoid.
# NOTE(review): expit is monotone, so AUC is unaffected by it, but the reconstruction
# is not a logit; treat the result as a score rather than a calibrated probability.
reconstructed_matrix = svd.inverse_transform(svd.transform(imputed_train_resmat))
probs_matrix_np = expit(reconstructed_matrix)

# ===================================================================
# == Step 3: Prepare Data for Evaluation (NumPy and PyTorch)
# ===================================================================
# Ground truth as a NumPy array; missing entries become 0 but are excluded by the masks below.
data_np = resmat.fillna(0).values

# Boolean membership masks for the two splits.
train_idtor_np = np.zeros_like(data_np, dtype=bool)
test_idtor_np = np.zeros_like(data_np, dtype=bool)
train_idtor_np[train_indices[:, 0], train_indices[:, 1]] = True
test_idtor_np[test_indices[:, 0], test_indices[:, 1]] = True

# Convert the NumPy arrays to PyTorch tensors for the evaluation helpers.
print("\nConverting results to PyTorch Tensors for evaluation...")
# NOTE(review): this rebinds `device` and, unlike the first cell, never selects MPS.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

probs_tensor = torch.tensor(probs_matrix_np, dtype=torch.float32, device=device)
data_tensor = torch.tensor(data_np, dtype=torch.float32, device=device)
train_idtor_tensor = torch.tensor(train_idtor_np, dtype=torch.int, device=device)
test_idtor_tensor = torch.tensor(test_idtor_np, dtype=torch.int, device=device)

print("\n--- Running Final Evaluations ---")
compute_auc(probs_tensor, data_tensor, train_idtor_tensor, test_idtor_tensor)
compute_cttcorr(probs_tensor, data_tensor, train_idtor_tensor, test_idtor_tensor)

Split data into 910536 train samples and 227634 test samples.

Training SVD model with k=8...

Converting results to PyTorch Tensors for evaluation...

--- Running Final Evaluations ---
train auc: 0.9263309240341187
test auc: 0.8913356065750122
train cttcorr: 0.9637015461921692
test cttcorr: 0.9530106782913208


(np.float32(0.96370155), np.float32(0.9530107))