In [5]:
# scripts/prepare_data.py
# ============================================
# Data Preparation Script (VS Code)
# ============================================

import os
import random
import pandas as pd
from sklearn.model_selection import train_test_split

# ------------------------------------------------
# CONFIG
# ------------------------------------------------
# Single seed shared by the stdlib RNG and by every train_test_split call
# in this script (passed there as random_state).
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Input locations. All paths are relative, so the script must be run from
# the project root (the directory that contains "data/").
NIH_IMAGE_DIR = "data/NIH/images"

# TB dataset name -> dataset root. Each root is expected to contain a
# "Normal" and a "TB" sub-folder (see prepare_tb_dataset).
TB_DATASETS = {
    "Shenzhen": "data/Shenzhen",
    "Montgomery": "data/Montgomery",
}

# Directory that receives the generated split CSV files.
OUTPUT_DIR = "data"

# ------------------------------------------------
# 1. Prepare NIH Dataset (MAE - Unsupervised)
# ------------------------------------------------
def prepare_nih_dataset():
    """Split the NIH images into train/val and write the split to CSV.

    Collects every ``.png`` / ``.jpg`` / ``.jpeg`` file (case-insensitive)
    found directly inside ``NIH_IMAGE_DIR``, holds out 10% of them as the
    validation set, and writes one row per image, with the columns
    ``image_path`` and ``split``, to ``<OUTPUT_DIR>/nih_splits.csv``.
    No labels are written: the CSV only lists paths and split names.

    Raises:
        FileNotFoundError: If ``NIH_IMAGE_DIR`` does not exist
            (propagated from ``os.listdir``).
        ValueError: If ``NIH_IMAGE_DIR`` contains no image files.
    """
    print("\nPreparing NIH dataset (for MAE)...")

    image_exts = (".png", ".jpg", ".jpeg")

    # os.listdir() returns entries in arbitrary, filesystem-dependent order.
    # Sorting makes the seeded split below identical on every machine.
    images = sorted(
        f for f in os.listdir(NIH_IMAGE_DIR)
        if f.lower().endswith(image_exts)
    )

    print(f"Total NIH images found: {len(images)}")

    if not images:
        # Fail with a message that names the folder instead of letting
        # train_test_split complain about an empty input.
        raise ValueError(f"No images found in: {NIH_IMAGE_DIR}")

    df = pd.DataFrame({
        "image_path": [os.path.join(NIH_IMAGE_DIR, img) for img in images]
    })

    train_df, val_df = train_test_split(
        df,
        test_size=0.1,
        random_state=RANDOM_SEED
    )

    # .assign() returns new frames, so the "split" column is never written
    # into a slice of df (avoids pandas' SettingWithCopyWarning).
    nih_splits = pd.concat([
        train_df.assign(split="train"),
        val_df.assign(split="val"),
    ])

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    csv_path = os.path.join(OUTPUT_DIR, "nih_splits.csv")
    nih_splits.to_csv(csv_path, index=False)

    print(f"NIH split saved to: {csv_path}")
    print("Train:", len(train_df), "Val:", len(val_df))


# ------------------------------------------------
# 2. Prepare TB Dataset (ViT - Supervised)
# ------------------------------------------------
def prepare_tb_dataset():
    """Build a labelled, stratified train/val/test split of the TB datasets.

    For every entry in ``TB_DATASETS`` the sub-folders ``Normal`` (label 0)
    and ``TB`` (label 1) are scanned for ``.png`` / ``.jpg`` / ``.jpeg``
    files (case-insensitive). The images of all datasets are pooled and split
    70% / 15% / 15% into train / val / test, stratified on the label only
    (not on the source dataset). The result is written to
    ``<OUTPUT_DIR>/tb_labels_with_splits.csv`` with the columns
    ``image_path``, ``label``, ``dataset`` and ``split``.

    Raises:
        FileNotFoundError: If a ``Normal`` or ``TB`` folder is missing for
            any dataset.
        ValueError: If no image files are found in any of the folders.
    """
    print("\nPreparing TB datasets (for ViT)...")

    image_exts = (".png", ".jpg", ".jpeg")
    class_labels = (("Normal", 0), ("TB", 1))
    records = []

    for dataset_name, base_path in TB_DATASETS.items():
        for label_name, label_value in class_labels:
            class_dir = os.path.join(base_path, label_name)

            # isdir (not exists): a regular file with that name is not a
            # usable class folder either.
            if not os.path.isdir(class_dir):
                raise FileNotFoundError(f"Missing folder: {class_dir}")

            # os.listdir() order is arbitrary; sort so the seeded splits
            # below come out the same on every machine.
            records.extend(
                {
                    "image_path": os.path.join(class_dir, img),
                    "label": label_value,
                    "dataset": dataset_name,
                }
                for img in sorted(os.listdir(class_dir))
                if img.lower().endswith(image_exts)
            )

    print("Total TB images:", len(records))

    if not records:
        # An empty DataFrame has no "label" column, which would surface as
        # an unhelpful KeyError in the stratified split.
        raise ValueError(
            f"No TB images found under: {', '.join(TB_DATASETS.values())}"
        )

    df = pd.DataFrame(records)

    # Stratified split: 70% train, remaining 30% halved into val and test.
    train_df, temp_df = train_test_split(
        df,
        test_size=0.3,
        stratify=df["label"],
        random_state=RANDOM_SEED
    )

    val_df, test_df = train_test_split(
        temp_df,
        test_size=0.5,
        stratify=temp_df["label"],
        random_state=RANDOM_SEED
    )

    # .assign() returns new frames, so the "split" column is never written
    # into a slice of df (avoids pandas' SettingWithCopyWarning).
    tb_splits = pd.concat([
        train_df.assign(split="train"),
        val_df.assign(split="val"),
        test_df.assign(split="test"),
    ])

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    csv_path = os.path.join(OUTPUT_DIR, "tb_labels_with_splits.csv")
    tb_splits.to_csv(csv_path, index=False)

    print(f"TB splits saved to: {csv_path}")
    print("Train:", len(train_df))
    print("Val  :", len(val_df))
    print("Test :", len(test_df))


# ------------------------------------------------
# MAIN
# ------------------------------------------------
if __name__ == "__main__":
    # Run the preparation steps in order: NIH split first, then TB splits.
    for prepare_step in (prepare_nih_dataset, prepare_tb_dataset):
        prepare_step()
    print("\n✅ DATA PREPARATION COMPLETED SUCCESSFULLY")



Preparing NIH dataset (for MAE)...
Total NIH images found: 14999
NIH split saved to: data\nih_splits.csv
Train: 13499 Val: 1500

Preparing TB datasets (for ViT)...
Total TB images: 800
TB splits saved to: data\tb_labels_with_splits.csv
Train: 560
Val  : 120
Test : 120

✅ DATA PREPARATION COMPLETED SUCCESSFULLY
