In [4]:
import numpy as np
import argparse
import json
from scipy.spatial.distance import pdist, squareform

In [5]:
import numpy as np
import argparse
import json
import sys

def _pairwise_subset_dists(dist_matrix, members):
    """Return ``dist_matrix[a, b]`` for every pair where ``a`` precedes ``b`` in ``members``."""
    return [
        dist_matrix[a, b]
        for i, a in enumerate(members)
        for b in members[i + 1:]
    ]


def generate_subset(n_classes, dist_matrix_path="./cifar100_kl_div_matrix.npy", class_names_path="./cifar100_class_names.npy"):
    """Greedily pick ``n_classes`` classes whose pairwise distances have low variance.

    A random class is chosen as the seed. Each following step adds the
    not-yet-selected class that gives the smallest variance over all pairwise
    distances of the enlarged subset.

    Args:
        n_classes: Number of classes to select; must be greater than 1.
        dist_matrix_path: Path to a ``.npy`` file holding the pairwise distance
            matrix, indexed as ``matrix[class_a, class_b]``.
            NOTE(review): assumed to be square with one row/column per class
            name -- confirm against the file that produces it.
        class_names_path: Path to a ``.npy`` file of class names. Only its
            length is used, to determine how many classes exist.

    Returns:
        A dict with keys ``"classes"`` (list of selected class indices),
        ``"max_dist"``, ``"avg_dist"`` and ``"variance_dist"`` (statistics of
        the pairwise distances inside the selected subset).

    Raises:
        AssertionError: If ``n_classes`` is not greater than 1.
        ValueError: If ``n_classes`` exceeds the number of available classes,
            or if no candidate yields a comparable (non-NaN) variance.
    """
    assert n_classes > 1, "Number of classes must be greater than 1."

    # Load the distance matrix and class names
    kls_cf_matrix = np.load(dist_matrix_path)
    class_names = np.load(class_names_path)
    n_available = class_names.shape[0]

    # Without this guard the search below would run out of candidates and
    # fall through with its -1 sentinel, which silently indexes the LAST class.
    if n_classes > n_available:
        raise ValueError(
            f"Cannot select {n_classes} classes: only {n_available} are available."
        )

    # Initialize a random starting class
    rand_class_ind = np.random.randint(0, n_available)
    selected_classes = {rand_class_ind}

    # Iteratively select classes to minimize variance in pairwise distances
    for _ in range(n_classes - 1):
        # Distances inside the current subset; empty while only the seed is selected.
        subset_dists = _pairwise_subset_dists(kls_cf_matrix, list(selected_classes))

        min_variance = float("inf")
        best_class = -1

        # NOTE(review): on the first step every candidate produces a single
        # distance (variance 0), so the lowest-index unselected class always
        # wins the tie. Confirm this is intended.
        for candidate_class in range(n_available):
            if candidate_class in selected_classes:
                continue

            # Pairwise distances of the subset if this candidate were added
            new_dists = subset_dists + [
                kls_cf_matrix[selected_class, candidate_class]
                for selected_class in selected_classes
            ]
            new_variance = np.var(new_dists)

            # Strict '<' keeps the first (lowest-index) candidate on ties
            if new_variance < min_variance:
                min_variance = new_variance
                best_class = candidate_class

        # Only reachable when every candidate variance was NaN / not comparable.
        if best_class < 0:
            raise ValueError(
                "No candidate class produced a comparable variance; "
                "check the distance matrix for NaN or infinite entries."
            )

        selected_classes.add(best_class)

    selected_classes = list(selected_classes)

    # Statistics over the final subset's pairwise distances
    subset_dists = _pairwise_subset_dists(kls_cf_matrix, selected_classes)

    return {
        "classes": selected_classes,
        "max_dist": max(subset_dists),
        "avg_dist": np.mean(subset_dists),
        "variance_dist": np.var(subset_dists),
    }

def generate_multiple_subsets(n_subsets, n_classes, dist_matrix_path, class_names_path):
    """Build ``n_subsets`` independent subsets and make them JSON-serializable.

    Each subset comes from one call to ``generate_subset`` with the given
    ``n_classes`` and file paths. The class indices are cast to ``int`` and
    the three distance statistics to ``float``.

    Returns:
        A list with one result dict per generated subset.
    """

    def _as_native(raw):
        # Swap numpy scalars for plain Python numbers; key order is preserved.
        return {
            **raw,
            "classes": list(map(int, raw["classes"])),
            "max_dist": float(raw["max_dist"]),
            "avg_dist": float(raw["avg_dist"]),
            "variance_dist": float(raw["variance_dist"]),
        }

    return [
        _as_native(generate_subset(n_classes, dist_matrix_path, class_names_path))
        for _ in range(n_subsets)
    ]


if __name__ == "__main__":
    # A Jupyter kernel is started through ipykernel_launcher.py, so argv[0]
    # tells us whether real command-line arguments are available.
    launched_from_kernel = sys.argv[0].endswith("ipykernel_launcher.py")

    if not launched_from_kernel:
        # Regular script invocation: read settings from the command line
        parser = argparse.ArgumentParser(
            description="Generate multiple subsets of CIFAR-100 classes with equal distances."
        )
        parser.add_argument(
            "--n_subsets", type=int, required=True,
            help="Number of subsets to generate.",
        )
        parser.add_argument(
            "--n_classes", type=int, required=True,
            help="Number of classes in each subset.",
        )
        parser.add_argument(
            "--dist_matrix_path", type=str, default="./cifar100_kl_div_matrix.npy",
            help="Path to KL divergence matrix.",
        )
        parser.add_argument(
            "--class_names_path", type=str, default="./cifar100_class_names.npy",
            help="Path to class names file.",
        )
        parser.add_argument(
            "--output_path", type=str, required=True,
            help="Path to save the generated subsets.",
        )
        args = parser.parse_args()

        n_subsets, n_classes = args.n_subsets, args.n_classes
        dist_matrix_path, class_names_path = args.dist_matrix_path, args.class_names_path
        output_path = args.output_path
    else:
        # Interactive session: fall back to fixed defaults
        n_subsets, n_classes = 10, 5
        dist_matrix_path = "./cifar100_kl_div_matrix.npy"
        class_names_path = "./cifar100_class_names.npy"
        output_path = "./subsets.json"

    # Build the requested subsets
    subsets = generate_multiple_subsets(
        n_subsets,
        n_classes,
        dist_matrix_path,
        class_names_path,
    )

    # Persist the result as indented JSON
    with open(output_path, "w") as f:
        f.write(json.dumps(subsets, indent=4))

    print(f"Generated {n_subsets} subsets and saved to {output_path}")

Generated 10 subsets and saved to ./subsets.json
