In [3]:
# Create the local staging directory for the downloaded archives
# (-p: also creates missing parents and does not fail if it already exists)
!mkdir -p /home/jupyter/cloud_direct

# Copy every object under the BUSI prefix of the GCS bucket into the staging
# directory (-m: parallel transfer, -r: recurse into sub-prefixes)
!gsutil -m cp -r gs://pons_aws_data/backup/datasets/BUSI/* /home/jupyter/cloud_direct/

# Verify the download: list the staged files with human-readable sizes
!ls -lh /home/jupyter/cloud_direct

Copying gs://pons_aws_data/backup/datasets/BUSI/fold1.zip...
Copying gs://pons_aws_data/backup/datasets/BUSI/fold2.zip...                    
Copying gs://pons_aws_data/backup/datasets/BUSI/fold3.zip...                    
\ [3/3 files][  1.4 GiB/  1.4 GiB] 100% Done  79.9 MiB/s ETA 00:00:00           
Operation completed over 3 objects/1.4 GiB.                                      
total 1.5G
-rw-r--r-- 1 jupyter jupyter 487M Aug 19 12:44 fold1.zip
-rw-r--r-- 1 jupyter jupyter 487M Aug 19 12:44 fold2.zip
-rw-r--r-- 1 jupyter jupyter 487M Aug 19 12:44 fold3.zip


In [4]:
import zipfile
import os
# Fold archives that were copied into /home/jupyter/cloud_direct.
folds = ["fold1.zip", "fold2.zip", "fold3.zip"]

for fold in folds:
    # Destination is /home/jupyter/<archive name with ".zip" removed>;
    # the source archive lives in the cloud_direct staging directory.
    extract_dir = f"/home/jupyter/{fold.replace('.zip', '')}"
    zip_path = f"/home/jupyter/cloud_direct/{fold}"

    # Make sure the destination exists before unpacking into it.
    os.makedirs(extract_dir, exist_ok=True)
    with zipfile.ZipFile(zip_path, mode='r') as zip_ref:
        zip_ref.extractall(path=extract_dir)

    print(f"Extracted {fold} to {extract_dir}")


Extracted fold1.zip to /home/jupyter/fold1
Extracted fold2.zip to /home/jupyter/fold2
Extracted fold3.zip to /home/jupyter/fold3


## Data Handling

In [8]:
import os
import pandas as pd
import numpy as np
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms

# ==== Set Base Directory ====
# Root of the fold-1 data. The path is nested ("fold1/fold1"): the outer
# folder is the extraction directory, the inner one is the folder used as
# the dataset root by the path joins below.
base_dir = "/home/jupyter/fold1/fold1"
modalities = ["bmode", "enhanced", "improved"]  # Sub-directory names, one per image modality to process

# ==== Extract Image File Paths and Labels ====
def load_image_paths_and_labels(base_path, label_name):
    """Collect the PNG files directly inside ``base_path`` and tag each with a label.

    Args:
        base_path: Directory to scan. Only its immediate entries are
            considered; sub-directories are not traversed.
        label_name: Value attached to every image found. It is stored
            unchanged in each returned tuple.

    Returns:
        A list of ``(image_path, label_name)`` tuples, one per entry whose
        name ends in ``.png`` (case-insensitive), ordered by file name.
        ``image_path`` is ``base_path`` joined with the file name. The list
        is empty when the directory contains no matching entries.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``base_path`` cannot be
            listed (for example ``FileNotFoundError`` if it does not exist).
    """
    # os.listdir returns entries in arbitrary order; sorting makes the result
    # (and any DataFrame built from it) reproducible across runs and machines.
    image_files = sorted(
        f for f in os.listdir(base_path) if f.lower().endswith(".png")
    )
    return [(os.path.join(base_path, img_name), label_name) for img_name in image_files]

# ==== Process Each Modality Separately ====
# (class sub-directory, integer label) pairs, in the order their rows are
# concatenated: benign (0) first, then malignant (1).
class_labels = (("benign", 0), ("malignant", 1))

for modality in modalities:
    for split in ("train", "val", "test"):
        # Gather (image_path, label) records from
        # <base_dir>/<split>/<modality>/<class> for both classes.
        # The records are only concatenated here -- they are NOT shuffled.
        records = []
        for class_name, class_label in class_labels:
            class_dir = os.path.join(base_dir, split, modality, class_name)
            records.extend(load_image_paths_and_labels(class_dir, label_name=class_label))

        # Publish the frame under the notebook-level name df_<split>_<modality>
        # (e.g. df_train_bmode) so code that refers to those names keeps working.
        globals()[f"df_{split}_{modality}"] = pd.DataFrame(records, columns=["image_path", "label"])

    # Print shapes
    print(f"\nModality: {modality}")
    print("Train:", globals()[f"df_train_{modality}"].shape)
    print("Val:", globals()[f"df_val_{modality}"].shape)
    print("Test:", globals()[f"df_test_{modality}"].shape)



Modality: bmode
Train: (517, 2)
Val: (90, 2)
Test: (40, 2)

Modality: enhanced
Train: (517, 2)
Val: (90, 2)
Test: (40, 2)

Modality: improved
Train: (517, 2)
Val: (90, 2)
Test: (40, 2)
