## Step 0: Mounting Google Drive and Importing Libraries

In [1]:
# Mount Google Drive so the project folder is reachable under /content/drive
from google.colab import drive
drive.mount('/content/drive')

# Switch the working directory to the project folder on Drive
%cd /content/drive/MyDrive/multimodal-xray-agent

# List the folder contents as a quick check that the mount and path are correct
!ls

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/multimodal-xray-agent
app	      data	  LICENSE  notebooks	   README.md	     scripts
chexpert.zip  deployment  models   PROJECT_LOG.md  requirements.txt  src


In [2]:
import os, uuid, json, torch
from tqdm import tqdm
from pathlib import Path
from PIL import Image
from tqdm import tqdm
import matplotlib.pyplot as plt
import torchvision.transforms as T
from open_clip import create_model_from_pretrained, get_tokenizer

In [2]:
!pip install open_clip_torch -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m125.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m99.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m62.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m42.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Step 1: Verifying GPU and Environment

In [3]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise CPU
has_gpu = torch.cuda.is_available()
device = torch.device("cuda" if has_gpu else "cpu")

if has_gpu:
    device_name = torch.cuda.get_device_name(0)
    print(f"GPU detected: {device_name}")
else:
    print("GPU not detected. Falling back to CPU.")

print(f"Running on device: {device}")

GPU detected: NVIDIA L4
Running on device: cuda


## Step 2: Setting Up Paths

In [14]:
ROOT_DIR = "/content/drive/MyDrive/multimodal-xray-agent"


def _in_project(relative_path):
    """Return `relative_path` joined onto ROOT_DIR."""
    return os.path.join(ROOT_DIR, relative_path)


# Sample image folders, one per source dataset
IMG_DIR_CHEXPERT = _in_project("data/images_sample/chexpert")
IMG_DIR_CHEST14 = _in_project("data/images_sample/chest14")

# Index folder, and the JSONL file inside it where the image metadata is stored
INDEX_OUT_DIR = _in_project("data/indexes")
META_OUT_PATH = _in_project("data/indexes/image_metadata.jsonl")

# Create the index folder up front; nothing happens if it already exists
os.makedirs(INDEX_OUT_DIR, exist_ok=True)

## Step 3: Load Vision Encoder: BiomedCLIP (OpenCLIP ViT-B/16)

We use `microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224` as our vision encoder.
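This model uses OpenCLIP's ViT-B/16 architecture as its image encoder and PubMedBERT as its text encoder, and was pretrained on PMC-15M, a dataset of about 15 million figure–caption pairs extracted from PubMed Central articles.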

Key details:
- Loaded via `open_clip.create_model_from_pretrained(...)`
- Preprocessing pipeline returned automatically (resize -> center crop -> normalize)
- `.encode_image()` returns 512-D `float32` embeddings
- Used for inference only: the model is put in `eval()` mode and images are encoded under `torch.no_grad()`, so no weights are updated
- Compatible with FAISS indexing and dual-modality retrieval

Why not `transformers.CLIPModel`?
- BiomedCLIP is not available through Hugging Face's `transformers` API. It is distributed as an OpenCLIP checkpoint and must be loaded directly with the `open_clip_torch` package.

Model card on Hugging Face: https://huggingface.co/microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224#model-use

In [5]:
# Architecture label for the vision encoder (presumably the OpenCLIP name for ViT-B/16).
# NOTE(review): not referenced by any later cell visible in this notebook; confirm it is still needed.
model_name = "ViT-B-16"

In [6]:
# Hugging Face Hub identifier of the BiomedCLIP checkpoint, including the "hf-hub:" prefix
# that is part of the string handed to open_clip when loading the model.
hf_repo = "hf-hub:microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224"

In [7]:
# Load the model and its matching preprocessing pipeline from the Hugging Face Hub.
# Reuses `hf_repo` (defined in the cell above) so the repository id is written in one place only.
model, preprocess = create_model_from_pretrained(hf_repo)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


open_clip_pytorch_model.bin:   0%|          | 0.00/784M [00:00<?, ?B/s]

open_clip_config.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

In [8]:
# Move the weights onto the selected device, then switch the module to evaluation mode
model = model.to(device)
model = model.eval()

In [9]:
# Confirm where the weights live by inspecting the first parameter tensor
first_param = next(model.parameters())
print("Device:", first_param.device)

Device: cuda:0


In [10]:
# Sanity check: push one blank 224x224 RGB image through the image encoder and
# report the embedding shape. Gradients are disabled because this is inference only,
# matching how the embedding loop calls the encoder.
dummy_image = Image.new("RGB", (224, 224))
dummy_batch = preprocess(dummy_image).unsqueeze(0).to(device)
with torch.no_grad():
    dummy_embedding = model.encode_image(dummy_batch)
print("Output shape (dummy):", dummy_embedding.shape)

Output shape (dummy): torch.Size([1, 512])


## Step 4: Load Image Metadata and Compute Embeddings

In [15]:
# Output artifacts of this notebook, both written into INDEX_OUT_DIR:
# the stacked embedding tensor and the list of UUIDs that goes with it
EMBEDDING_OUT_PATH, UUIDS_OUT_PATH = (
    os.path.join(INDEX_OUT_DIR, filename)
    for filename in ("image_embeddings.pt", "image_uuids.json")
)

In [16]:
# Load the image metadata: one JSON object per line (JSONL).
# Blank lines are skipped so a stray empty line does not abort the load with a JSONDecodeError,
# and the encoding is fixed to UTF-8 instead of depending on the platform default.
with open(META_OUT_PATH, "r", encoding="utf-8") as f:
    image_metadata = [json.loads(line) for line in f if line.strip()]

In [17]:
# Accumulators filled by the embedding loop. No transform is built here:
# images are transformed with the `preprocess` callable returned when the model was loaded.
all_embeddings, all_uuids = [], []

In [18]:
# Embed every image listed in the metadata, one image per forward pass.
#
# Reads:  image_metadata (dicts with "path" and "uuid"), ROOT_DIR, preprocess, model, device
# Writes: all_embeddings (one (1, dim) NumPy array per image) and all_uuids, kept index-aligned
#
# UUIDs that are already in all_uuids are skipped, so the cell can be re-run after an
# interruption (or a transient Drive I/O error) without appending duplicate rows.
# This also means a UUID that occurs twice in the metadata is embedded only once.
embedded_uuids = set(all_uuids)

for entry in tqdm(image_metadata, desc="Embedding images"):
    img_path = os.path.join(ROOT_DIR, entry["path"])  # Absolute path from Drive root
    try:
        # Read the UUID before any work so a malformed entry cannot leave the two lists misaligned
        image_uuid = entry["uuid"]
        if image_uuid in embedded_uuids:
            continue
        # The context manager closes the underlying file as soon as the pixels are converted
        with Image.open(img_path) as raw_img:
            img = raw_img.convert("RGB")
        img_tensor = preprocess(img).unsqueeze(0).to(device)
        with torch.no_grad():
            emb = model.encode_image(img_tensor).cpu().numpy()
    except Exception as e:
        # Best effort: report the failing image and carry on with the rest
        print(f"[ERROR] {img_path}: {e}")
        continue
    # Append to both lists only after everything succeeded, keeping them index-aligned
    all_embeddings.append(emb)
    all_uuids.append(image_uuid)
    embedded_uuids.add(image_uuid)

Embedding images:   0%|          | 1/335534 [02:00<11210:07:04, 120.28s/it]

[ERROR] /content/drive/MyDrive/multimodal-xray-agent/data/images_sample/chexpert/patient39807_study1_view1_frontal.png: [Errno 5] Input/output error: '/content/drive/MyDrive/multimodal-xray-agent/data/images_sample/chexpert/patient39807_study1_view1_frontal.png'


Embedding images:   0%|          | 1/335534 [02:45<15406:22:45, 165.30s/it]


KeyboardInterrupt: 

In [None]:
# Stack the per-image embeddings into one (num_images, dim) tensor and save it.
# torch.cat replaces np.vstack because NumPy is never imported in this notebook, so the
# original call raised NameError; torch.as_tensor accepts NumPy arrays and tensors alike.
if not all_embeddings:
    raise RuntimeError("No embeddings were computed; run the embedding loop before saving.")
embeddings = torch.cat([torch.as_tensor(emb) for emb in all_embeddings], dim=0)
torch.save(embeddings, EMBEDDING_OUT_PATH)


In [None]:
# Serialize the collected UUID list to a JSON array and write it to UUIDS_OUT_PATH
uuids_json = json.dumps(all_uuids)
with open(UUIDS_OUT_PATH, "w") as out_file:
    out_file.write(uuids_json)

In [None]:
# Final confirmation: number of UUIDs collected and the path the embedding tensor was written to
print(f"Saved {len(all_uuids)} image embeddings to {EMBEDDING_OUT_PATH}")