## Step 0: Mounting Google Drive and Importing Libraries

In [1]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/multimodal-xray-agent

!ls

Mounted at /content/drive
/content/drive/MyDrive/multimodal-xray-agent
app	      data	  LICENSE  notebooks	   README.md	     scripts
chexpert.zip  deployment  models   PROJECT_LOG.md  requirements.txt  src


In [2]:
import os, shutil
from pathlib import Path
from PIL import Image
import numpy as np
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import torch
from torchvision import transforms
import itertools, pprint
import glob
import random

In [3]:
from src.chexpert_preprocessing import process_one

## Step 1: Verifying GPU and Environment

In [4]:
# Device-agnostic setup
if torch.cuda.is_available():
    device_name = torch.cuda.get_device_name(0)
    device = torch.device("cuda")
    print(f"GPU detected: {device_name}")
else:
    device = torch.device("cpu")
    print("GPU not detected. Falling back to CPU.")

print(f"Running on device: {device}")

GPU detected: NVIDIA L4
Running on device: cuda


## Step 2: Loading CheXpert Data to Local SSD

In [7]:
!cp "./data/images_sample/chexpert_flat.zip" /content/

In [8]:
!unzip -q /content/chexpert_flat.zip -d /content

In [10]:
!find /content/chexpert_flat -type f | head -n 10

/content/chexpert_flat/patient36133_study1_view1_frontal.jpg
/content/chexpert_flat/patient24375_study8_view2_lateral.jpg
/content/chexpert_flat/patient28388_study17_view2_frontal.jpg
/content/chexpert_flat/patient19409_study2_view1_frontal.jpg
/content/chexpert_flat/patient18073_study7_view2_lateral.jpg
/content/chexpert_flat/patient08465_study1_view1_frontal.jpg
/content/chexpert_flat/patient06287_study1_view2_lateral.jpg
/content/chexpert_flat/patient39457_study1_view1_frontal.jpg
/content/chexpert_flat/patient01714_study1_view2_lateral.jpg
/content/chexpert_flat/patient56675_study1_view1_frontal.jpg


## Step 3: Defining Paths

In [11]:
IN_DIR  = Path("/content/chexpert_flat").resolve()

In [12]:
OUT_DIR = Path("./data/images_sample/chexpert").resolve()

In [13]:
OUT_DIR.mkdir(parents=True, exist_ok=True)

## Step 4: Define Transforms

In [14]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),          # [0,1]  float32
])

## Step 5: Launch Parallel Processing (Preprocessing)

In [23]:
with ThreadPoolExecutor(max_workers=4) as pool:
    results = list(tqdm(pool.map(process_one, image_paths), total=len(image_paths)))

print(f"\n Preprocessed {sum(results):,} / {len(image_paths):,} images into {OUT_DIR}")

  8%|▊         | 18650/223416 [01:33<19:11, 177.90it/s]

[ERROR] /content/chexpert_flat/._view1_frontal.jpg: cannot identify image file '/content/chexpert_flat/._view1_frontal.jpg'


 37%|███▋      | 82938/223416 [06:54<10:47, 217.03it/s]

[ERROR] /content/chexpert_flat/patient00001_study1_._view1_frontal.jpg: cannot identify image file '/content/chexpert_flat/patient00001_study1_._view1_frontal.jpg'


100%|██████████| 223416/223416 [18:54<00:00, 196.89it/s]


 Preprocessed 223,414 / 223,416 images into /content/drive/MyDrive/multimodal-xray-agent/data/images_sample/chexpert





In [24]:
img_count = 0
for root, _, files in os.walk(OUT_DIR):
    img_count += sum(f.lower().endswith(".png") for f in files)

print(f"Total images found in chexpert folder: {img_count}")

Total images found in chexpert folder: 223414


In [25]:
sample_path = "./data/images_sample/chest14_raw/images-224/images-224"

## Step 6: Verifying ChestX-ray14 Dataset

In [30]:
!cp "./data/images_sample/chest14.zip" /content/

In [31]:
!unzip -q /content/chest14.zip -d /content

In [35]:
!find /content/images-224 -type f | head -n 5

/content/images-224/images-224/00000001_000.png
/content/images-224/images-224/00000001_001.png
/content/images-224/images-224/00000001_002.png
/content/images-224/images-224/00000002_000.png
/content/images-224/images-224/00000003_000.png


In [37]:
# Set path
CHEST14_PATH = "/content/images-224/images-224"

In [38]:
sample_files = [f for f in os.listdir(CHEST14_PATH) if f.endswith(".png")]

In [39]:
sample_subset = random.sample(sample_files, 5)

In [40]:
# Check grayscale status
for fname in sample_subset:
    fpath = os.path.join(CHEST14_PATH, fname)
    try:
        img = Image.open(fpath)
        arr = np.array(img)

        print(f"\n {fname}")
        print(f" Mode: {img.mode}")                     # "L" = grayscale, "RGB" = color
        print(f" Shape: {arr.shape}")                  # Should be (224, 224) if grayscale
        print(f" Unique channels: {arr.ndim}")         # 2D = grayscale, 3D = RGB

    except Exception as e:
        print(f"[ERROR] {fname}: {e}")


 00017824_002.png
 Mode: L
 Shape: (224, 224)
 Unique channels: 2

 00021245_001.png
 Mode: L
 Shape: (224, 224)
 Unique channels: 2

 00002182_007.png
 Mode: L
 Shape: (224, 224)
 Unique channels: 2

 00001837_003.png
 Mode: L
 Shape: (224, 224)
 Unique channels: 2

 00009239_007.png
 Mode: L
 Shape: (224, 224)
 Unique channels: 2


### Note: Looks like the images are already in a format that is that is fit for indexing!  