In [2]:
import pandas as pd
import numpy as np
import json

def _json_safe(value):
    """Return ``None`` for a missing float (NaN), otherwise *value* unchanged.

    ``json.dump`` writes NaN as the bare token ``NaN``, which is not valid
    JSON and is rejected by strict parsers, so missing values are mapped to
    ``null`` instead.
    """
    if isinstance(value, float) and pd.isna(value):
        return None
    return value


def _compute_centroids(df_embeddings, targets):
    """Return ``{int(target): centroid}`` for every non-missing target value.

    Args:
        df_embeddings: Feature matrix, one row per slide.
        targets: Series of class labels, row-aligned with ``df_embeddings``
            by position.

    Returns:
        Dict mapping each integer target to the per-column mean of its
        embeddings, stored as a plain list so it can be JSON-serialized.
    """
    centroids = {}

    # dropna(): a missing target is not a class; int(nan) would raise.
    for target_class in targets.dropna().unique():
        # Positional boolean mask: the two frames are paired by row order,
        # so selection must not depend on the metadata's index labels.
        mask = (targets == target_class).to_numpy()
        class_embeddings = df_embeddings.iloc[mask]

        # Mean along axis 0 gives one value per embedding dimension.
        centroid_vector = class_embeddings.mean(axis=0)
        centroids[int(target_class)] = [
            _json_safe(component) for component in centroid_vector.tolist()
        ]

        print(f" - Target {target_class}: Centroid calculated from {len(class_embeddings)} samples.")

    return centroids


def main():
    """Attach a per-class centroid embedding to every slide and save as JSON.

    Reads a header-less TSV of embeddings and a TSV of slide metadata (which
    must contain a ``targets`` column), pairs them by row order, computes the
    mean embedding of each target class, and writes one JSON record per slide
    consisting of its metadata plus ``cluster_centroid_embedding``.

    Missing metadata values are written as ``null``. Slides whose target is
    missing are kept in the output with a ``null`` centroid. If an input file
    or the ``targets`` column is missing, an error is printed and the function
    returns without writing anything.

    Returns:
        None.
    """
    # 1. Configuration
    embeddings_file = '../data/ERBB2_validation_embeddings_stratify.tsv'
    metadata_file = '../data/ERBB2_validation_metadata_stratify.tsv'
    output_file = '../data/ERBB2_slide_centroid_data.json'

    print("Loading data...")

    # 2. Load Data
    # Embeddings: no header, just the raw feature matrix.
    try:
        df_embeddings = pd.read_csv(embeddings_file, sep='\t', header=None)
    except FileNotFoundError:
        print(f"Error: Could not find {embeddings_file}")
        return
    print(f"Embeddings loaded: {df_embeddings.shape}")

    # Metadata: has a header row.
    try:
        df_metadata = pd.read_csv(metadata_file, sep='\t')
    except FileNotFoundError:
        print(f"Error: Could not find {metadata_file}")
        return
    print(f"Metadata loaded: {df_metadata.shape}")

    # 3. Validation
    if 'targets' not in df_metadata.columns:
        print(f"Error: Column 'targets' not found in {metadata_file}")
        return

    # Rows are paired by position; on a mismatch only the common prefix is
    # processed so the script can still produce output.
    if len(df_embeddings) != len(df_metadata):
        print(f"WARNING: Row count mismatch! Embeddings: {len(df_embeddings)}, Metadata: {len(df_metadata)}")
        min_len = min(len(df_embeddings), len(df_metadata))
        df_embeddings = df_embeddings.iloc[:min_len]
        df_metadata = df_metadata.iloc[:min_len]

    # 4. Supervised Clustering (Calculate Centroids)
    print("Calculating centroids based on targets...")
    centroids = _compute_centroids(df_embeddings, df_metadata['targets'])

    # 5. Generate Output Data
    print("Constructing JSON output...")
    output_data = []
    missing_targets = 0

    # to_dict(orient='records') keeps each column's own dtype; iterrows()
    # would coerce every row to a single dtype (e.g. ints become floats).
    for record in df_metadata.to_dict(orient='records'):
        slide_data = {key: _json_safe(value) for key, value in record.items()}

        target_val = slide_data['targets']
        if target_val is None:
            # No class label, hence no centroid to attach.
            missing_targets += 1
            slide_centroid = None
        else:
            # Every slide of a class shares that class's centroid.
            slide_centroid = centroids.get(int(target_val))

        slide_data['cluster_centroid_embedding'] = slide_centroid
        output_data.append(slide_data)

    if missing_targets:
        print(f"WARNING: {missing_targets} slides have no target; their centroid is null.")

    # 6. Save to JSON
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=4)

    print(f"Success! Processed {len(output_data)} slides.")
    print(f"Output saved to: {output_file}")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Loading data...
Embeddings loaded: (173, 768)
Metadata loaded: (173, 4)
Calculating centroids based on targets...
 - Target 0: Centroid calculated from 137 samples.
 - Target 1: Centroid calculated from 36 samples.
Constructing JSON output...
Success! Processed 173 slides.
Output saved to: ../data/ERBB2_slide_centroid_data.json
