In [None]:
import pandas as pd

# Build the image/report pairing table: MIMIC-CXR split metadata joined with
# the extracted report text.
d1 = pd.read_csv('/data/MIMIC-CXR/mimic-cxr-2.0.0-split.csv')
d2 = pd.read_csv('/data/MIMIC-CXR/mimic_finding+impression2.csv')

# NOTE(review): the join key is `subject_id` only, so every image of a subject
# is paired with every report row of that subject, and the de-duplication below
# keeps whichever pairing comes first. If d2 also carries a study identifier,
# joining on it as well is probably what is intended -- confirm against d2.
d3 = (
    d1.merge(d2, on='subject_id', how='left')
    # keep one row per image
    .drop_duplicates(subset=['dicom_id'])
    # turn the DICOM id into the image file name
    .assign(dicom_id=lambda frame: frame['dicom_id'].apply(lambda x: x + '.jpg'))
    .drop(columns=['impression'])
    # discard rows with any missing field left after the join
    .dropna()
)
# d3.to_csv('CXR_paths_for_images_and_text.csv', index=False)


In [None]:
# image retrieval from text
import pandas as pd
import numpy as np
from PIL import Image, ImageFile
import os
import torchvision.transforms as transforms
import yaml
from tqdm import tqdm
import torch
from transformers import AutoTokenizer

from train import SimCLR
from models.model import ModelCLR
from dataloader.dataset_wrapper import DataSetWrapper


# Deterministic preprocessing for evaluation.
# The previous pipeline applied training-time augmentations (random resized
# crop, random horizontal flip, random grayscale) to every image, so each run
# embedded a different random view and the retrieval / classification metrics
# were not reproducible. At inference only the fixed resize, tensor conversion
# and normalisation are kept.
data_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
])

# Load the text-retrieval candidate images together with one class label each.
candidate = pd.read_csv('../convirt/text-retrieval/candidate.csv')
query = pd.read_csv('../convirt/text-retrieval/query.csv')
# The class names are the distinct values of the query's `Variable` column;
# the candidate table is indexed with them as column names below.
features = query['Variable'].unique().tolist()
img_root_dir = '/data/CheXpert/'

# Address rows by position throughout (the first column holds the image path),
# so the loop does not rely on the frame having a default 0..N-1 index.
candidate_paths = candidate.iloc[:, 0].tolist()
candidate_flags = candidate[features].to_numpy().astype(int)

all_images = []
all_labels = []
for row_pos, rel_path in enumerate(candidate_paths):
    img_name = os.path.join(img_root_dir, rel_path)
    # The context manager releases the file handle as soon as the RGB copy exists.
    with Image.open(img_name) as raw_image:
        image = raw_image.convert('RGB')
    all_images.append(data_transforms(image))

    positive_cols = np.flatnonzero(candidate_flags[row_pos] == 1)
    if positive_cols.size == 0:
        # Previously this surfaced as a bare IndexError with no row context.
        raise ValueError(
            "candidate row {} ({}) has no positive label among {}".format(
                row_pos, rel_path, features))
    # When several columns are flagged, the first one (in `features` order) wins.
    all_labels.append(features[positive_cols[0]])


# =================== get image embeddings ===================
# The config is a local project file; FullLoader is kept so any richer YAML
# types it contains still load. The handle is closed once parsing finishes.
with open("config.yaml", "r") as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)
# Name of the training run whose checkpoint should be restored.
config['fine_tune_from'] = 'Apr16_10-42-55_sunlab-serv-03.cs.illinois.edu'
dataset = DataSetWrapper(config['batch_size'], **config['dataset'])
simclr = SimCLR(dataset, config)
model = ModelCLR(**config["model"]).to("cpu")
model = simclr._load_pre_trained_weights(model)
# Nothing visible in this notebook switches the network to inference mode, so
# do it here: otherwise dropout stays active and batch-norm layers normalise
# with (and update from) single-image batches while embeddings are extracted.
model.eval()

image_embeddings = []
# No gradients are needed for feature extraction; without no_grad every stored
# embedding keeps its whole autograd graph alive.
with torch.no_grad():
    for image in tqdm(all_images):
        # image_encoder returns a sequence; element 1 is the one used as the embedding.
        image_embeddings.append(model.image_encoder(image.unsqueeze(0))[1])
image_embeddings = torch.stack(image_embeddings)

# =================== get text embeddings ===================
tokenizer = AutoTokenizer.from_pretrained(config['model']['bert_base_model'])
text_embeddings = []
# Inference only: skip autograd bookkeeping so the collected embeddings do not
# each hold on to a computation graph.
with torch.no_grad():
    for text in tqdm(query["Text"].values):
        # One query at a time, wrapped in a list so the tokenizer returns a batch of one.
        tokens = tokenizer([text],
                           return_tensors="pt",
                           padding=True,
                           truncation=config['truncation'])
        text_embeddings.append(model.text_encoder(tokens))
# Concatenate the per-query outputs along the first (batch) axis.
text_embeddings = torch.cat(text_embeddings, dim=0)


In [None]:
# Text -> image retrieval: rank every candidate image for each text query and
# report precision@k against the candidate labels.
print(text_embeddings.shape, image_embeddings.shape)
# L2-normalise both sides so the matrix product really is cosine similarity
# (the raw dot product also ranks by embedding norm). This is a no-op if the
# encoders already emit unit vectors. Assumes both tensors are 2-D with the
# feature axis last -- TODO confirm against the model.
text_unit = torch.nn.functional.normalize(text_embeddings, dim=-1)
image_unit = torch.nn.functional.normalize(image_embeddings, dim=-1)
cos = torch.matmul(text_unit, image_unit.T).detach().numpy()
print(cos.shape)

# Candidate indices per query, most similar first.
cos_sorted = cos.argsort(axis=1)[:, ::-1]
text_labels = query['Variable'].values
p_a_k5 = []
p_a_k10 = []
p_a_k50 = []

# Evaluate every query row instead of a hard-coded first 40.
for i in range(len(text_labels)):
    preds = cos_sorted[i]
    # True where the retrieved candidate carries the query's label.
    hits = np.array([all_labels[pred] == text_labels[i] for pred in preds])
    p_a_k5.append(hits[:5].sum() / 5)
    p_a_k10.append(hits[:10].sum() / 10)
    p_a_k50.append(hits[:50].sum() / 50)
    print("p@5 {} p@10 {} p@50 {}".format(p_a_k5[-1], p_a_k10[-1], p_a_k50[-1]))

# Mean over queries (the summary label previously read "p@p@50").
print("p@5 {} p@10 {} p@50 {}".format(np.mean(p_a_k5), np.mean(p_a_k10), np.mean(p_a_k50)))

In [None]:
# image retrieval from images
# NOTE(review): `candidate` is reloaded from the image-retrieval folder here but
# is never embedded in this cell; `image_embeddings` and `all_labels` still hold
# whatever an earlier cell computed. That is only correct if both candidate
# files list the same images in the same order -- confirm.
candidate = pd.read_csv('../convirt/image-retrieval/candidate.csv')
query = pd.read_csv('../convirt/image-retrieval/query.csv')
features = query['Variable'].unique().tolist()
img_root_dir = '/data/CheXpert/'

query_images = []
# The second column of the query table holds the image path.
for rel_path in query.iloc[:, 1].tolist():
    img_name = os.path.join(img_root_dir, rel_path)
    # Close the file handle as soon as the RGB copy has been made.
    with Image.open(img_name) as raw_image:
        image = raw_image.convert('RGB')
    query_images.append(data_transforms(image))


# =================== get image embeddings ===================
query_image_embeddings = []
# Inference only: without no_grad each stored embedding retains its autograd graph.
# The model is used in whatever train/eval mode the cell that built it left it in.
with torch.no_grad():
    for image in tqdm(query_images):
        # image_encoder returns a sequence; element 1 is the one used as the embedding.
        query_image_embeddings.append(model.image_encoder(image.unsqueeze(0))[1])
query_image_embeddings = torch.stack(query_image_embeddings)

In [None]:
# Image -> image retrieval: rank every candidate image for each query image and
# report precision@k against the candidate labels.
print(query_image_embeddings.shape, image_embeddings.shape)
# L2-normalise both sides so the matrix product really is cosine similarity
# (the raw dot product also ranks by embedding norm). This is a no-op if the
# encoder already emits unit vectors. Assumes both tensors are 2-D with the
# feature axis last -- TODO confirm against the model.
query_unit = torch.nn.functional.normalize(query_image_embeddings, dim=-1)
image_unit = torch.nn.functional.normalize(image_embeddings, dim=-1)
cos = torch.matmul(query_unit, image_unit.T).detach().numpy()
print(cos.shape)

# Candidate indices per query, most similar first.
cos_sorted = cos.argsort(axis=1)[:, ::-1]
# Despite the name, these are the labels of the query images.
text_labels = query['Variable'].values
p_a_k5 = []
p_a_k10 = []
p_a_k50 = []

# Evaluate every query row instead of a hard-coded first 40.
for i in range(len(text_labels)):
    preds = cos_sorted[i]
    # True where the retrieved candidate carries the query's label.
    hits = np.array([all_labels[pred] == text_labels[i] for pred in preds])
    p_a_k5.append(hits[:5].sum() / 5)
    p_a_k10.append(hits[:10].sum() / 10)
    p_a_k50.append(hits[:50].sum() / 50)
    print("p@5 {} p@10 {} p@50 {}".format(p_a_k5[-1], p_a_k10[-1], p_a_k50[-1]))

# Mean over queries (the summary label previously read "p@p@50").
print("p@5 {} p@10 {} p@50 {}".format(np.mean(p_a_k5), np.mean(p_a_k10), np.mean(p_a_k50)))

In [None]:
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import roc_auc_score, RocCurveDisplay, auc, roc_curve

def auc_scorer(estimator, X, y):
    """Score a fitted probabilistic classifier by ROC AUC.

    Has the ``(estimator, X, y)`` signature expected of a scikit-learn
    ``scoring`` callable.

    Args:
        estimator: fitted classifier exposing ``predict_proba``.
        X: feature matrix to score.
        y: true labels for ``X``.

    Returns:
        The value returned by ``roc_auc_score``: one-vs-rest AUC for
        multiclass targets, plain AUC for binary targets.
    """
    y_pred = estimator.predict_proba(X)
    if getattr(y_pred, "ndim", None) == 2 and y_pred.shape[1] == 2:
        # Binary target: roc_auc_score wants a 1-D score for the class with the
        # greater label (the second probability column), not the full matrix.
        return roc_auc_score(y, y_pred[:, 1])
    return roc_auc_score(y, y_pred, multi_class='ovr')


# Linear probe: how well does a logistic regression separate the classes when
# given only the frozen image embeddings?
X = image_embeddings.detach().numpy()
y = np.array(all_labels)
classifier = LogisticRegression(random_state=0, max_iter=1000)

# 5-fold cross-validated one-vs-rest AUC; the mean over folds is the cell output.
fold_aucs = cross_val_score(classifier, X, y, scoring=auc_scorer, cv=5)
fold_aucs.mean()