In [1]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Directory that holds the embedding files read below and receives the saved results.
# NOTE(review): this is a Colab Google Drive path; no drive-mount call is visible
# in this notebook, so the drive is presumably mounted beforehand -- confirm.
data_dir = '/content/drive/MyDrive/FinalProject/mock_data'

In [3]:
# Make data_dir the working directory; the relative "./..." paths used by the
# load and save cells below are resolved against it.
%cd {data_dir}

/content/drive/MyDrive/FinalProject/mock_data


In [4]:
# Load the precomputed embeddings from the current working directory.
# report_embeddings: one row per report; shape (10000, 128) in the recorded run.
# image_embeddings: indexed as (image, patch, feature); shape (500, 32, 128)
#   in the recorded run.
report_embeddings = np.load("./cxr_report_embeddings_1_10000.npy")

# An .npz archive keeps its underlying file open until it is closed, so read the
# array inside a context manager rather than leaving the handle to the garbage
# collector. The array is fully materialised on access and stays valid afterwards.
with np.load("./cxr_img_embeddings_1_500.npz") as image_archive:
    image_embeddings = image_archive['embeddings']

In [5]:
# Show both embedding shapes, one per line, as a quick sanity check after loading.
print(report_embeddings.shape, image_embeddings.shape, sep="\n")

(10000, 128)
(500, 32, 128)


In [8]:
# Number of leading report rows excluded from the retrieval pool.
# NOTE(review): presumably these are the reports paired with the 500 images and
# are held out on purpose -- confirm.
N_LEADING_REPORTS_DROPPED = 500

# `report_embeddings` is rebound to a slice of itself, so naively re-running this
# cell would drop another 500 rows every time. Keep a handle on the unsliced
# matrix and always slice from that. The handle is refreshed whenever
# `report_embeddings` no longer shares memory with it, i.e. after the loading
# cell has been re-run and produced a fresh array.
if "report_embeddings_full" not in globals() or not np.may_share_memory(
    report_embeddings, report_embeddings_full
):
    report_embeddings_full = report_embeddings
report_embeddings = report_embeddings_full[N_LEADING_REPORTS_DROPPED:, :]
report_embeddings.shape

(9500, 128)

In [9]:
# For every image, find the 10 reports most similar to it.
#
# An image is a stack of patch embeddings. A report's score for an image is the
# highest cosine similarity between the report and any one of the image's patches.
#
# NOTE(review): the stored indexes are row positions in `report_embeddings` as it
# exists when this cell runs. An earlier cell drops the first 500 rows, so index i
# here corresponds to row i + 500 of the file on disk -- confirm that consumers
# of the saved indexes account for this offset.
top_10_indexes = []
top_10_scores = []

for img_embedding in image_embeddings:
    # One batched call scores all patches of this image against all reports and
    # returns a (patches, reports) matrix. Calling cosine_similarity once per
    # patch gave the same values (up to floating-point rounding) but re-normalised
    # the whole report matrix on every call. Scores are kept as float64, matching
    # the buffer the per-patch version filled.
    similarity_scores = cosine_similarity(img_embedding, report_embeddings).astype(
        np.float64, copy=False
    )

    # Collapse the patch axis: keep each report's best-matching patch score.
    max_similarity_scores = similarity_scores.max(axis=0)

    # argsort is ascending, so take the last 10 and reverse for best-first order.
    top_10_idx = np.argsort(max_similarity_scores)[-10:][::-1]
    top_10_scr = max_similarity_scores[top_10_idx]

    # One entry per image, in the same order as image_embeddings.
    top_10_indexes.append(top_10_idx)
    top_10_scores.append(top_10_scr)

In [11]:
# Sanity check: there should be one top-10 entry per image.
len(top_10_indexes)

500

In [10]:
# Peek at the retrieved report indexes for the first three images (best match first).
top_10_indexes[:3]

[array([2086, 7645, 6105, 4524, 5226,  544, 4595, 1917, 7256, 8329]),
 array([5942, 5345, 6687, 5226, 1667, 4751, 1386, 7997, 4817, 3728]),
 array([4549,  480, 2130, 5688, 3414, 4119, 6585, 3483, 1588, 4648])]

In [12]:
# Peek at the matching similarity scores for the first three images (descending).
top_10_scores[:3]

[array([0.92905563, 0.89844728, 0.89056641, 0.88970786, 0.8832463 ,
        0.87821293, 0.87604094, 0.87498873, 0.87305534, 0.87067485]),
 array([0.90287203, 0.88904583, 0.88482773, 0.88218457, 0.88064742,
        0.8799184 , 0.87950754, 0.87479502, 0.87451625, 0.87451577]),
 array([0.91311383, 0.89008033, 0.88571119, 0.88426602, 0.8822819 ,
        0.8757565 , 0.87259221, 0.86866546, 0.86835843, 0.86793244])]

In [13]:
# Count how many distinct reports appear in at least one image's top-10 list.
# dict.fromkeys de-duplicates while keeping first-seen order and uses hashed
# lookups, replacing the quadratic `ind not in unique_list` scan over a list.
unique_list = list(dict.fromkeys(ind for inds in top_10_indexes for ind in inds))
print(len(unique_list))

1680


In [14]:
# Turn the per-image result lists into arrays (one row per image) so they can be
# written to disk together.
top_10_scores = np.array(top_10_scores)
top_10_indexes = np.array(top_10_indexes)

# Write both arrays to one .npz archive under the keys "indexes" and "scores".
np.savez(
    './top_10_reports_for_images_1_500.npz',
    indexes=top_10_indexes,
    scores=top_10_scores,
)