In [2]:
import numpy as np

In [1]:
# Directory holding every input/output file this notebook touches.
# NOTE(review): absolute, machine-specific path (looks like a Colab Google
# Drive mount) — presumably Drive must be mounted before this cell; confirm.
data_dir = "/content/drive/MyDrive/FinalProject/mock_data/"
# Change the working directory so the relative "./..." loads and saves in
# later cells resolve inside data_dir.
%cd {data_dir}

/content/drive/MyDrive/FinalProject/mock_data


In [3]:
# Load the candidate report indices for each image (one row per image).
# np.load on an .npz archive returns a lazy NpzFile that keeps the file open;
# the context manager closes it once the 'indexes' array has been read.
with np.load("./top_10_reports_for_images_1_500.npz") as archive:
    similar_reports_inds = archive['indexes']
similar_reports_inds.shape

(500, 10)

In [4]:
# Spot-check: first candidate report index for the first image.
similar_reports_inds[0, 0]

np.int64(2086)

In [5]:
# Read the full report-embedding matrix from disk (one row per report).
report_embeddings = np.load(file="./cxr_report_embeddings_1_10000.npy")

In [8]:
# Keep only rows 500 onward of the embedding matrix (result: (9500, 128)).
# Slice a fresh load rather than the live variable so that re-running this
# cell cannot drop a further 500 rows each time.
# NOTE(review): after this slice, row k is original row k + 500 — confirm the
# values in similar_reports_inds are offsets into the sliced matrix.
report_embeddings = np.load("./cxr_report_embeddings_1_10000.npy")[500:, :]

In [9]:
# Sanity check on the sliced embedding matrix: (n_reports, embedding_dim).
np.shape(report_embeddings)

(9500, 128)

In [10]:
# Number of images = number of rows in the candidate-index array (500 here).
# Derived from the data instead of hard-coded so it cannot drift out of sync.
num_images = similar_reports_inds.shape[0]

In [15]:
# Output buffer: one row per image, 3 selected report indices per row.
# Pre-filled with -1 so rows that were never written are easy to spot.
diverse_inds = np.full(shape=(num_images, 3), fill_value=-1)
np.shape(diverse_inds)

(500, 3)

In [16]:
from sklearn.metrics.pairwise import euclidean_distances

# Greedy farthest-point (max-min) selection of 3 reports per image:
# start from the first candidate, then repeatedly add the candidate whose
# closest already-chosen report is farthest away (Euclidean distance).
for img in range(num_images):
    candidates = similar_reports_inds[img, :]
    candidate_vecs = report_embeddings[candidates, :]

    # Seed the selection with the first candidate in the list.
    chosen = [candidates[0]]
    # Positions within this image's candidate list that are still selectable.
    pool = list(range(1, len(candidates)))

    while len(chosen) < 3 and pool:
        # Rows: candidates still in the pool; columns: reports chosen so far.
        pairwise = euclidean_distances(candidate_vecs[pool], report_embeddings[chosen])
        nearest_chosen = pairwise.min(axis=1)
        # argmax returns the first maximum, so ties go to the earlier candidate.
        pick = pool.pop(int(nearest_chosen.argmax()))
        chosen.append(candidates[pick])

    diverse_inds[img, :] = chosen[:3]

In [22]:
# Candidate report indices for the second image, for comparison with its picks.
similar_reports_inds[1, :]

array([5942, 5345, 6687, 5226, 1667, 4751, 1386, 7997, 4817, 3728])

In [20]:
# Display the selected report indices: one row per image, 3 indices per row.
diverse_inds

array([[2086, 7645, 1917],
       [5942, 3728, 1667],
       [4549, 3483, 4119],
       ...,
       [7689, 1478, 5413],
       [6526, 5957,  180],
       [5662, 8300, 7253]])

In [23]:
import numpy as np

# Persist the max-min (greedy) selections as a .npy file in the working directory.
np.save(
    file='./top_3_reports_for_images_1_500_maximally_diverse_selection.npy',
    arr=diverse_inds,
)

### K-Means Clustering

In [24]:
import numpy as np
from sklearn.cluster import KMeans

# Pick 3 diverse reports per image by clustering that image's candidates into
# 3 groups and keeping the report nearest to each cluster centroid.
#
# Inputs (defined in earlier cells):
#   similar_reports_inds.shape = (500, 10)  # Indices of top 10 similar reports for 500 images
#   report_embeddings.shape = (9500, 128)   # All report embeddings
# Output:
#   diverse_inds: (num_images, 3) integer array; NOTE this rebinds the name
#   used for the earlier selection result.

num_images = similar_reports_inds.shape[0]
diverse_inds = np.zeros((num_images, 3), dtype=int)  # Store 3 diverse indices per image

# Seeded generator for the fallback fill below. The global np.random state is
# never seeded in this notebook, so using it would make reruns differ even
# though KMeans itself is seeded.
fallback_rng = np.random.default_rng(42)

for i in range(num_images):
    # Get top 10 similar report indices for the i-th image
    top10_inds = similar_reports_inds[i, :]  # shape (10,)
    top10_embeddings = report_embeddings[top10_inds, :]  # shape (10, 128)

    # Cluster into 3 groups
    kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
    kmeans.fit(top10_embeddings)

    # For each cluster, pick the closest report to the centroid
    selected_inds = []
    for cluster_id in range(3):
        # Get indices of reports in this cluster
        cluster_mask = (kmeans.labels_ == cluster_id)
        cluster_embeddings = top10_embeddings[cluster_mask]
        cluster_original_inds = top10_inds[cluster_mask]

        if len(cluster_embeddings) == 0:
            continue  # empty cluster: leave the slot for the fallback fill below

        # Find the report closest to the centroid
        centroid = kmeans.cluster_centers_[cluster_id]
        distances = np.linalg.norm(cluster_embeddings - centroid, axis=1)
        closest_idx_in_cluster = np.argmin(distances)
        selected_inds.append(cluster_original_inds[closest_idx_in_cluster])

    # If fewer than 3 clusters produced a pick, fill the remaining slots with
    # (reproducibly) random candidates that are not selected yet.
    while len(selected_inds) < 3:
        remaining_inds = [ind for ind in top10_inds if ind not in selected_inds]
        if not remaining_inds:
            # Same exception type the random draw would raise on an empty
            # pool, but with a message that identifies the offending image.
            raise ValueError(f"image {i}: fewer than 3 distinct candidate reports")
        selected_inds.append(fallback_rng.choice(remaining_inds))

    diverse_inds[i, :] = selected_inds[:3]

# diverse_inds.shape = (500, 3)

In [27]:
# Candidate report indices for the second image, for comparison with its picks.
similar_reports_inds[1, :]

array([5942, 5345, 6687, 5226, 1667, 4751, 1386, 7997, 4817, 3728])

In [25]:
# Display the selected report indices: one row per image, 3 indices per row.
diverse_inds

array([[8329, 4595, 7645],
       [5345, 4751, 5942],
       [3483, 4119, 4549],
       ...,
       [8687, 1478, 7150],
       [7651, 8651, 5957],
       [3210, 3273, 8300]])

In [28]:
import numpy as np

# Persist the k-means selections as a .npy file in the working directory.
np.save(
    file='./top_3_reports_for_images_1_500_k_means_clustering.npy',
    arr=diverse_inds,
)

In [29]:
# Read the saved k-means selections back and check the array shape.
loaded_diverse_inds = np.load(file='./top_3_reports_for_images_1_500_k_means_clustering.npy')
np.shape(loaded_diverse_inds)

(500, 3)