# Title: Data Preparation for Multimodal Pneumonia Detection
# Description: Standardizes NIH, CheXpert, and RSNA datasets into unified CSVs.

In [None]:

import pandas as pd
from pathlib import Path

from src import config


# Define base directories for each dataset

In [None]:
# Resolve each dataset's root folder from the project configuration.
nih_base, chex_base, rsna_base = map(
    Path, (config.NIH_PATH, config.CHEXPERT_PATH, config.RSNA_PATH)
)

# Read the per-dataset metadata tables. The file names below are examples;
# replace them with the actual metadata CSV paths for your setup.
nih = pd.read_csv(nih_base.joinpath("metadata_nih.csv"))
chex = pd.read_csv(chex_base.joinpath("metadata_chexpert.csv"))
rsna = pd.read_csv(rsna_base.joinpath("metadata_rsna.csv"))

# Standardize label (1 = pneumonia, 0 = no pneumonia)
def standardize_labels(df, label_col, positive_terms=("PNEUMONIA",)):
    """Add a binary ``label`` column derived from a text finding column.

    Every value in ``df[label_col]`` is converted with ``str()``, stripped of
    surrounding whitespace and upper-cased. A row is labelled 1 when the
    result equals one of ``positive_terms`` and 0 otherwise. The terms are
    normalised the same way, so matching is case-insensitive on both sides.
    Missing values are stringified like any other value and are therefore
    labelled 0 with the default terms.

    Args:
        df: DataFrame to annotate. It is modified in place.
        label_col: Name of the column holding the finding text.
        positive_terms: Iterable of strings (or a single string) that count
            as a positive finding. Defaults to ``("PNEUMONIA",)``.

    Returns:
        The same ``df`` object, now carrying an int64 ``label`` column.

    Raises:
        KeyError: If ``label_col`` is not a column of ``df``.
    """
    if isinstance(positive_terms, str):
        # A bare string would otherwise be iterated character by character
        # when the lookup set is built below.
        positive_terms = (positive_terms,)

    # Normalise the terms once so membership tests are O(1) and do not depend
    # on the caller's letter case.
    terms = frozenset(str(term).strip().upper() for term in positive_terms)

    flags = [int(str(value).strip().upper() in terms) for value in df[label_col]]
    # An explicit dtype keeps the column integer-typed even for an empty frame.
    df["label"] = pd.Series(flags, index=df.index, dtype="int64")
    return df

# Derive the binary "label" column for every dataset from its "Finding" column.
nih, chex, rsna = (
    standardize_labels(frame, "Finding") for frame in (nih, chex, rsna)
)

# Record, per dataset, the "images" folder under that dataset's root.
nih["base_dir"], chex["base_dir"], rsna["base_dir"] = (
    str(root / "images") for root in (nih_base, chex_base, rsna_base)
)

# Keep only the configured metadata features plus the columns added above
# and the image identifier.
metadata_cols = config.METADATA_FEATURES + ["label", "base_dir", "image_id"]
nih, chex, rsna = (frame[metadata_cols] for frame in (nih, chex, rsna))

# Stack NIH and CheXpert into one table, shuffle every row reproducibly with
# the configured seed, and renumber the index. RSNA is not part of this table.
combined = (
    pd.concat([nih, chex], axis=0)
    .sample(frac=1, random_state=config.SEED)
    .reset_index(drop=True)
)


# Save processed CSVs

In [None]:
# Make sure the destination folder exists (relative to the working directory).
output_dir = Path("data/processed")
output_dir.mkdir(parents=True, exist_ok=True)

# Write one CSV per dataset plus the shuffled combined table, without the index.
nih.to_csv(output_dir.joinpath("nih_processed.csv"), index=False)
chex.to_csv(output_dir.joinpath("chexpert_processed.csv"), index=False)
rsna.to_csv(output_dir.joinpath("rsna_processed.csv"), index=False)
combined.to_csv(output_dir.joinpath("combined_train.csv"), index=False)

# Report completion and the absolute location of the written files.
print(
    "Data preparation complete.",
    f"Processed files saved to: {output_dir.resolve()}",
    sep="\n",
)