In [1]:
import pandas as pd
import numpy as np

from huggingface_hub import hf_hub_download
# Hugging Face Hub repository id for the CXR Foundation model.
# NOTE(review): not referenced by any of the cells visible here — confirm it is still needed.
HF_REPO_ID = "google/cxr-foundation"


In [2]:
# Mount Google Drive at /content/drive so the data paths used below resolve.
# `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Load precomputed embeddings for images and clinical features (from Google Drive)

In [3]:
EMBEDDINGS_NPZ_FILE_PATH = '/content/drive/MyDrive/FinalProject/mock_data/cxr_img_embeddings_1_500.npz'

# Read the image-embedding array. The context manager closes the .npz archive
# once the array has been read into memory, so no file handle is left open.
with np.load(EMBEDDINGS_NPZ_FILE_PATH) as image_embeddings_file:
  embeddings = image_embeddings_file['embeddings']

# One entry per image: the id is simply the row position in the array.
image_ids = list(range(len(embeddings)))
embedding_list = list(embeddings)

# Tabular view pairing each positional image id with its embedding vector.
image_embeddings_df = pd.DataFrame({ 'image_id': image_ids, 'embeddings': embedding_list })

TEXT_EMBEDDINGS_NPZ_FILE_PATH = "/content/drive/MyDrive/FinalProject/mock_data/clinical_features_cxr_embeddings.npz"

# Read text embeddings: every key in the archive is a query string and the
# stored array is that query's embedding. Using `with` guarantees the archive
# is closed even if building the DataFrame raises.
with np.load(TEXT_EMBEDDINGS_NPZ_FILE_PATH) as text_embeddings_file:
  text_embeddings_queries = list(text_embeddings_file.keys())
  text_embeddings_df = pd.DataFrame(
      [(query, text_embeddings_file[query]) for query in text_embeddings_queries],
      columns=['query', 'embeddings']
  )

In [4]:
# Preview the first rows of the text-embedding table (columns: query, embeddings).
text_embeddings_df.head()

Unnamed: 0,query,embeddings
0,Enlarged Cardiomediastinum,"[-0.06839325, 0.016583007, -0.043305214, 0.006..."
1,enlarged cardiomediastinum,"[-0.06839325, 0.016583007, -0.043305214, 0.006..."
2,widened cardiomediastinum,"[-0.18843561, -0.03300752, -0.11934988, -0.053..."
3,no acute cardiopulmonary process,"[-0.05057713, -0.061462577, -0.039064746, -0.1..."
4,cardiomediastinal silhouette is normal,"[0.13671014, -0.014474957, 0.12604094, -0.1434..."


In [5]:
# Show the row(s) whose query text is exactly 'no pleural effusion'.
text_embeddings_df.loc[text_embeddings_df['query'].eq('no pleural effusion')]

Unnamed: 0,query,embeddings
254,no pleural effusion,"[0.097669154, -0.1268722, -0.0133012, -0.16463..."


In [6]:
# Number of image embeddings that were loaded.
len(embedding_list)

500

In [7]:
# @title Similarity and Zero-shot Classification Functions

import numpy as np

def compute_image_text_similarity(image_emb, txt_emb):
  """Return the best cosine similarity between an image and a text embedding.

  The image embedding is viewed as a stack of row vectors, each as wide as the
  text embedding (a 4096-element image embedding with a 128-element text
  embedding gives 32 rows of 128 values). Every row is compared to the text
  embedding with cosine similarity and the largest value is returned.

  Args:
    image_emb: array-like whose total size is a multiple of the text
      embedding length.
    txt_emb: 1-D array-like text embedding.

  Returns:
    The maximum row-wise cosine similarity as a NumPy scalar. A zero-norm row
    or text vector yields nan, because the cosine is undefined there.

  Raises:
    ValueError: if `image_emb` cannot be reshaped into rows of
      `len(txt_emb)` values.
  """
  txt_vec = np.asarray(txt_emb)
  # Row width comes from the text embedding instead of a hard-coded (32, 128).
  tokens = np.reshape(image_emb, (-1, txt_vec.shape[-1]))
  # Cosine similarity of every row against the text vector in one shot.
  row_norms = np.linalg.norm(tokens, axis=1)
  cosine = (tokens @ txt_vec) / (row_norms * np.linalg.norm(txt_vec))
  return np.max(cosine)

def extract_clinical_features(image_emb, features_emb, features):
  """Pick one phrase per clinical category for a single image.

  Each phrase in `features` is scored against the image via
  `compute_image_text_similarity`, using the embedding at the same position
  in `features_emb`. For heart size, pneumothorax and airspace disease, the
  finding phrase wins when its score is at least that of the normal phrase.
  For pleural effusion, "small" is first compared with "no"; only if "small"
  wins is it then compared with "large".

  Args:
    image_emb: embedding of one image.
    features_emb: sequence of text embeddings, aligned with `features`.
    features: phrase strings; must contain the nine phrases looked up below.

  Returns:
    A list of four phrases, in the order: heart size, pneumothorax,
    airspace, pleural effusion.

  Raises:
    KeyError: if one of the hard-coded phrases is missing from `features`.
  """
  # Similarity score for every candidate phrase, keyed by the phrase text.
  scores = {
      features[pos]: compute_image_text_similarity(image_emb, emb)
      for pos, emb in enumerate(features_emb)
  }

  def prefer(finding, fallback):
    # Ties go to `finding` because the comparison is >=.
    return finding if scores[finding] >= scores[fallback] else fallback

  chosen = [
      prefer('Cardiomegaly', 'heart size is normal'),
      prefer('small pneumothorax', 'no pneumothorax'),
      prefer('Airspace Opacity', 'no evidence of airspace disease'),
  ]

  # NOTE(review): when "small" scores below "no", the "large" score is never
  # consulted, so large > no > small still reports "no pleural effusion".
  # Confirm that this is intended.
  effusion = prefer('small pleural effusion', 'no pleural effusion')
  if effusion == 'small pleural effusion':
    effusion = prefer('large pleural effusion', 'small pleural effusion')
  chosen.append(effusion)

  return chosen

In [8]:
# Candidate phrases scored against each image. extract_clinical_features looks
# these exact strings up by name, so the spelling must not change.
features = [
    "Cardiomegaly",
    "heart size is normal",
    "small pneumothorax",
    "no pneumothorax",
    "Airspace Opacity",
    "no evidence of airspace disease",
    "large pleural effusion",
    "small pleural effusion",
    "no pleural effusion",
]


In [9]:
# Fetch the precomputed text embedding for each phrase, in the same order as
# `features`. Takes the first matching row; raises IndexError if a phrase has
# no row in text_embeddings_df.
feature_embeddings = [
    text_embeddings_df.loc[text_embeddings_df['query'] == feature, 'embeddings'].iloc[0]
    for feature in features
]

In [10]:
# Run the zero-shot feature extraction once per image embedding; the result
# keeps the same order as embedding_list.
features_for_images = [
    extract_clinical_features(img_emb, feature_embeddings, features)
    for img_emb in embedding_list
]

In [11]:
# Sanity check: there should be one feature list per image embedding.
len(features_for_images)

500

In [12]:
# Inspect the extracted features for the first three images.
features_for_images[:3]

[['heart size is normal',
  'no pneumothorax',
  'no evidence of airspace disease',
  'small pleural effusion'],
 ['Cardiomegaly',
  'no pneumothorax',
  'Airspace Opacity',
  'large pleural effusion'],
 ['heart size is normal',
  'no pneumothorax',
  'Airspace Opacity',
  'no pleural effusion']]

In [13]:
# Switch the working directory to the mock-data folder on Drive so that the
# relative './...' paths used by the following cells resolve there.
data_dir = "/content/drive/MyDrive/FinalProject/mock_data/"
%cd {data_dir}

/content/drive/MyDrive/FinalProject/mock_data


In [14]:
import json

# Persist the per-image feature lists as JSON in the current working directory.
with open('./clinical_features_for_images_1_500.json', 'w') as out_file:
    out_file.write(json.dumps(features_for_images))

In [None]:
# Read a previously saved feature list back from JSON and print it.
# NOTE(review): this opens the `0_64` file, not the `1_500` file written by the
# preceding cell — confirm which file is intended.
with open('./clinical_features_for_images_0_64.json', 'r') as in_file:
    loaded_list = json.loads(in_file.read())

print(loaded_list)

[['heart size is normal', 'no pneumothorax', 'no evidence of airspace disease', 'large left pleural effusion'], ['Cardiomegaly', 'no pneumothorax', 'Airspace Opacity', 'large right pleural effusion'], ['heart size is normal', 'no pneumothorax', 'Airspace Opacity', 'no pleural effusion'], ['Cardiomegaly', 'no pneumothorax', 'Airspace Opacity', 'large left pleural effusion'], ['Cardiomegaly', 'no pneumothorax', 'Airspace Opacity', 'large left pleural effusion'], ['Cardiomegaly', 'no pneumothorax', 'Airspace Opacity', 'large bilateral pleural effusions'], ['Cardiomegaly', 'no pneumothorax', 'no evidence of airspace disease', 'no pleural effusion'], ['heart size is normal', 'no pneumothorax', 'no evidence of airspace disease', 'no pleural effusion'], ['Cardiomegaly', 'no pneumothorax', 'Airspace Opacity', 'large left pleural effusion'], ['heart size is normal', 'no pneumothorax', 'Airspace Opacity', 'large left pleural effusion'], ['Cardiomegaly', 'no pneumothorax', 'Airspace Opacity', 'la

In [None]:
# Number of per-image feature lists read from the JSON file.
len(loaded_list)

65