In [1]:
import os, json, yaml, random, sys, pickle
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.metrics import accuracy_score, f1_score

import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

# ----------------------------- JSON-SAFE CONVERSION -----------------------------
def to_py(obj):
    """Recursively convert numpy/tensorflow values to plain Python for json/yaml dumps.

    Args:
        obj: Any value. Dicts, lists and tuples are walked recursively.

    Returns:
        A structure where dicts stay dicts (numpy-scalar keys are unwrapped to
        native Python scalars), lists and tuples become lists, numpy scalars
        become Python scalars, numpy arrays become nested lists, and TensorFlow
        tensors become ``tensor.numpy().tolist()``. Any other value is returned
        unchanged.
    """
    import numpy as _np
    try:
        import tensorflow as _tf
    except Exception:
        # Best effort: without a usable TensorFlow, tensors are not special-cased.
        _tf = None

    if isinstance(obj, dict):
        # json.dump rejects numpy scalar keys such as np.int64, so keys are
        # unwrapped as well as values.
        return {
            (k.item() if isinstance(k, _np.generic) else k): to_py(v)
            for k, v in obj.items()
        }
    if isinstance(obj, (list, tuple)):
        return [to_py(v) for v in obj]
    if isinstance(obj, _np.generic):
        return obj.item()
    if isinstance(obj, _np.ndarray):
        return obj.tolist()
    if _tf is not None and isinstance(obj, _tf.Tensor):
        return obj.numpy().tolist()
    return obj

# ----------------------------- USER PATHS (exact, as given) -----------------------------
BASE_DIR = Path(r"C:\Users\NXTWAVE\Downloads\COVID Radiography Detection")
ARTIFACTS = BASE_DIR / "artifacts"
ARTIFACTS.mkdir(parents=True, exist_ok=True)

# Root of the extracted dataset; every class has an "images" and a "masks" folder below it.
_DATASET_ROOT = r"C:\Users\NXTWAVE\Downloads\COVID Radiography Detection\archive\COVID-19_Radiography_Dataset"

# Class label -> {"images": Path, "masks": Path}. The paths are assembled with a
# literal backslash so the resulting strings are identical on every platform.
DATA_PATHS = {
    label: {
        kind: Path("\\".join((_DATASET_ROOT, label, kind)))
        for kind in ("images", "masks")
    }
    for label in ("Viral Pneumonia", "Normal", "Lung_Opacity", "COVID")
}

# ----------------------------- CONFIG (will also be saved as YAML) -----------------------------
# Backbone settings, nested under CFG["model"].
_MODEL_CFG = {
    "backbone": "DenseNet121",
    "train_base": False,
    "lr": 2e-4,
}

# String form of every dataset folder so the config can be dumped as-is.
_PATHS_CFG = {
    label: {kind: str(folder) for kind, folder in dirs.items()}
    for label, dirs in DATA_PATHS.items()
}

# Run configuration; key order is kept because it is later dumped with sort_keys=False.
CFG = dict(
    seed=42,
    image_size=224,
    batch_size=24,
    epochs=15,
    val_split=0.1,   # fraction of the train+val remainder, taken after the test split
    test_split=0.1,  # fraction of the full dataset
    augment=True,
    model=_MODEL_CFG,
    paths=_PATHS_CFG,
    artifacts_dir=str(ARTIFACTS),
)

# ----------------------------- SEEDING -----------------------------
def seed_everything(seed=42):
    """Seed Python's ``random``, NumPy and TensorFlow with the same value.

    Args:
        seed: Integer handed to each of the three random number generators.
    """
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)

seed_everything(CFG["seed"])

# ----------------------------- SCAN FILES -----------------------------
def list_images_in(folder: Path):
    """Return the image files found anywhere below *folder*.

    Args:
        folder: Directory to search recursively.

    Returns:
        A sorted list of path strings for regular files whose suffix
        (case-insensitive) is .jpg, .jpeg, .png or .bmp. Empty when *folder*
        does not exist.
    """
    if not folder.exists():
        return []
    exts = {".jpg", ".jpeg", ".png", ".bmp"}
    # rglob order depends on the filesystem; sorting keeps the downstream
    # seeded shuffle and splits reproducible from one machine to the next.
    # is_file() skips directories that happen to carry an image-like suffix.
    return sorted(
        str(p)
        for p in folder.rglob("*")
        if p.is_file() and p.suffix.lower() in exts
    )

# One record per image file, labelled with the class whose folder it came from.
rows = [
    {"path": img_path, "label": label}
    for label, dirs in DATA_PATHS.items()
    for img_path in list_images_in(dirs["images"])
]

if not rows:
    print("[ERROR] No images found in the provided paths.")
    sys.exit(1)

# Shuffle once with the configured seed, then renumber the index from zero.
shuffled = pd.DataFrame(rows).sample(frac=1.0, random_state=CFG["seed"])
df = shuffled.reset_index(drop=True)
class_counts = df["label"].value_counts().to_dict()
print(f"[INFO] Found {len(df)} images across classes:", class_counts)

# ----------------------------- SPLITS -----------------------------
def _stratified_holdout(frame, fraction):
    """Split *frame* into (remainder, holdout), stratified on its "label" column."""
    return train_test_split(
        frame,
        test_size=fraction,
        stratify=frame["label"],
        random_state=CFG["seed"],
    )

# Stage 1: hold the test set out of the full frame.
df_trainval, df_test = _stratified_holdout(df, CFG["test_split"])
# Stage 2: take the validation set out of what is left.
df_train, df_val = _stratified_holdout(df_trainval, CFG["val_split"])

n_train, n_val, n_test = len(df_train), len(df_val), len(df_test)
print(f"[SPLIT] train={n_train}  val={n_val}  test={n_test}")

# ----------------------------- LABEL ENCODER -----------------------------
# Fit on the full label column so every split shares one label -> id mapping.
le = LabelEncoder().fit(df["label"])

# Work on copies so adding the "y" column does not touch the frames returned by the splitter.
df_train, df_val, df_test = df_train.copy(), df_val.copy(), df_test.copy()
for split_frame in (df_train, df_val, df_test):
    split_frame["y"] = le.transform(split_frame["label"])

class_names = list(le.classes_)
num_classes = len(class_names)
print(f"[INFO] Classes ({num_classes}): {class_names}")

# Persist the fitted encoder.
(ARTIFACTS / "label_encoder.pkl").write_bytes(pickle.dumps(le))

# Persist the class name -> integer id mapping.
class_indices = dict(zip(class_names, range(num_classes)))
(ARTIFACTS / "class_indices.json").write_text(json.dumps(to_py(class_indices), indent=2))

# ----------------------------- TF.DATA PIPELINES -----------------------------
IMG_SIZE = CFG["image_size"]
AUTO = tf.data.AUTOTUNE

def decode_image(path):
    """Load the image file at *path* as a float32 tensor of size IMG_SIZE x IMG_SIZE.

    The file is decoded to 3 channels (animations are not expanded), converted
    to float32 in [0, 1] and then resized.
    """
    raw_bytes = tf.io.read_file(path)
    pixels = tf.io.decode_image(raw_bytes, channels=3, expand_animations=False)
    scaled = tf.image.convert_image_dtype(pixels, tf.float32)  # [0,1]
    return tf.image.resize(scaled, (IMG_SIZE, IMG_SIZE))

def augment(img):
    """Apply light random augmentation: horizontal flip, brightness, contrast.

    The perturbations are deliberately small (brightness max_delta 0.05,
    contrast bounds 0.95 and 1.05) to suit chest X-rays.
    """
    flipped = tf.image.random_flip_left_right(img)
    brightened = tf.image.random_brightness(flipped, max_delta=0.05)
    return tf.image.random_contrast(brightened, 0.95, 1.05)

def preprocess(path, y, training=False):
    """Turn one (path, label) pair into a (normalized image, label) pair.

    Args:
        path: Image file path.
        y: Label, passed through untouched.
        training: When true and CFG["augment"] is set, the image is augmented.

    Returns:
        Tuple of the normalized image tensor and *y*.
    """
    image = decode_image(path)
    use_augmentation = training and CFG["augment"]
    if use_augmentation:
        image = augment(image)
    # Per-channel ImageNet mean / std normalization.
    channel_mean = tf.constant([0.485, 0.456, 0.406])
    channel_std = tf.constant([0.229, 0.224, 0.225])
    return (image - channel_mean) / channel_std, y

def make_ds(paths, ys, training=False, batch_size=32):
    """Build a batched, prefetched tf.data pipeline over (path, label) pairs.

    Args:
        paths: Sequence of image file paths.
        ys: Sequence of labels aligned with *paths*.
        training: When true, shuffle across the whole dataset (reshuffled each
            iteration) and let preprocess() augment.
        batch_size: Number of examples per batch.

    Returns:
        A tf.data.Dataset yielding (image batch, label batch).
    """
    def _load(p, y):
        return preprocess(p, y, training=training)

    ds = tf.data.Dataset.from_tensor_slices((paths, ys))
    if training:
        ds = ds.shuffle(
            buffer_size=len(paths),
            seed=CFG["seed"],
            reshuffle_each_iteration=True,
        )
    return ds.map(_load, num_parallel_calls=AUTO).batch(batch_size).prefetch(AUTO)

def _dataset_from(frame, training):
    """Build the pipeline for one split frame using the configured batch size."""
    return make_ds(
        frame["path"].values,
        frame["y"].values,
        training=training,
        batch_size=CFG["batch_size"],
    )

train_ds = _dataset_from(df_train, True)
val_ds   = _dataset_from(df_val, False)
test_ds  = _dataset_from(df_test, False)

# ----------------------------- MODEL -----------------------------
def build_model(num_classes: int, train_base: bool = False, lr: float = 2e-4):
    """Build and compile a DenseNet121 classifier with a softmax head.

    Args:
        num_classes: Number of units in the softmax output layer.
        train_base: Whether the ImageNet-initialized backbone is trainable.
        lr: Learning rate for the Adam optimizer.

    Returns:
        A compiled Keras model using sparse categorical cross-entropy and
        reporting accuracy.
    """
    input_shape = (IMG_SIZE, IMG_SIZE, 3)
    backbone = tf.keras.applications.DenseNet121(
        include_top=False,
        weights="imagenet",
        input_shape=input_shape,
    )
    backbone.trainable = train_base  # frozen unless asked otherwise

    inputs = layers.Input(shape=input_shape)
    # The backbone is always called with training=False, whatever `trainable` is.
    features = backbone(inputs, training=False)
    pooled = layers.GlobalAveragePooling2D()(features)
    dropped = layers.Dropout(0.2)(pooled)
    outputs = layers.Dense(num_classes, activation="softmax")(dropped)

    model = models.Model(inputs, outputs)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return model

_model_cfg = CFG["model"]
model = build_model(num_classes, train_base=_model_cfg["train_base"], lr=_model_cfg["lr"])
model.summary()

# ----------------------------- TRAIN -----------------------------
ckpt_path = ARTIFACTS / "model.h5"
callbacks = [
    # Stop once val_accuracy has not improved for 4 epochs.
    EarlyStopping(monitor="val_accuracy", patience=4, restore_best_weights=True),
    # Halve the learning rate after 2 epochs without val_loss improvement.
    ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=2, min_lr=1e-6),
    # Keep only the epoch with the best val_accuracy on disk.
    ModelCheckpoint(filepath=str(ckpt_path), monitor="val_accuracy", save_best_only=True, save_weights_only=False),
]

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=CFG["epochs"],
    callbacks=callbacks,
    verbose=1,
)

# Save history (JSON + CSV) — JSON-safe
hist_py = to_py(history.history)
with open(ARTIFACTS / "history.json", "w") as f:
    json.dump(hist_py, f, indent=2)
pd.DataFrame(hist_py).to_csv(ARTIFACTS / "history.csv", index=False)

# Ensure the best model is the one that is kept and evaluated.
# ModelCheckpoint has already written the best epoch to ckpt_path. Re-saving the
# in-memory model here would overwrite it with last-epoch weights whenever
# training ran to the final epoch and EarlyStopping did not restore the best
# weights (NOTE(review): that behaviour varies with the Keras version — confirm).
# Reloading the checkpoint makes file and in-memory model agree either way.
if ckpt_path.exists():
    model.load_weights(str(ckpt_path))
else:
    # No checkpoint was written (e.g. no epoch completed); keep what we have.
    model.save(ckpt_path)

# ----------------------------- EVALUATION -----------------------------
def _predict_split(dataset, frame):
    """Predict on *dataset* and score against the "y" column of *frame*.

    Returns (probabilities, predicted ids, true ids, accuracy, macro F1).
    """
    probs = model.predict(dataset, verbose=0)
    preds = np.argmax(probs, axis=1)
    truth = frame["y"].values
    acc = accuracy_score(truth, preds)
    macro_f1 = f1_score(truth, preds, average="macro")
    return probs, preds, truth, acc, macro_f1

# Validation metrics
val_probs, val_preds, val_true, val_acc, val_f1 = _predict_split(val_ds, df_val)

# Test metrics
test_probs, test_preds, test_true, test_acc, test_f1 = _predict_split(test_ds, df_test)

# One-vs-rest AUROC (gracefully handle missing classes)
def safe_ovr_auc(y_true, probs, n_classes):
    """Compute one-vs-rest ROC AUC per class plus their NaN-ignoring mean.

    Args:
        y_true: Integer class ids.
        probs: Array of predicted probabilities indexed as [sample, class].
        n_classes: Number of classes used to one-hot encode *y_true*.

    Returns:
        Tuple ``(per_class, macro)``: a dict mapping each name in the
        module-level ``class_names`` to its AUC (NaN where the score is
        undefined), and the mean of the defined scores (NaN if none are).
    """
    y_true_oh = tf.keras.utils.to_categorical(y_true, num_classes=n_classes)
    out = {}
    for i, name in enumerate(class_names):
        try:
            auc = roc_auc_score(y_true_oh[:, i], probs[:, i])
        except ValueError:
            # Only the "score is undefined" failure is tolerated here (e.g. a
            # class missing from y_true); other errors are left to surface.
            auc = float("nan")
        out[name] = float(auc)
    defined = [v for v in out.values() if not np.isnan(v)]
    # Averaging an empty list would only emit a warning and yield NaN anyway.
    macro = float(np.mean(defined)) if defined else float("nan")
    return out, macro

val_auc_per_class, val_auc_macro = safe_ovr_auc(val_true, val_probs, num_classes)
test_auc_per_class, test_auc_macro = safe_ovr_auc(test_true, test_probs, num_classes)

# Classification report & confusion matrix (test)
# The label ids are pinned so both outputs always cover every class. Without
# this, a class absent from the test split makes the report fail on the
# target_names length mismatch and shrinks the matrix below len(class_names).
label_ids = list(range(num_classes))
report = classification_report(
    test_true, test_preds, labels=label_ids, target_names=class_names, output_dict=True
)
cm = confusion_matrix(test_true, test_preds, labels=label_ids).tolist()

metrics = {
    "datetime": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "classes": class_names,
    "val": {
        "accuracy": float(val_acc),
        "f1_macro": float(val_f1),
        "auc_macro_ovr": float(val_auc_macro),
        "auc_per_class": val_auc_per_class,
    },
    "test": {
        "accuracy": float(test_acc),
        "f1_macro": float(test_f1),
        "auc_macro_ovr": float(test_auc_macro),
        "auc_per_class": test_auc_per_class,
        "classification_report": report,
        "confusion_matrix": cm,
    },
}

with open(ARTIFACTS / "metrics.json", "w") as f:
    json.dump(to_py(metrics), f, indent=2)

# The matrix is also written on its own, next to the class names that label its axes.
with open(ARTIFACTS / "confusion_matrix.json", "w") as f:
    json.dump(to_py({"labels": class_names, "matrix": cm}), f, indent=2)

# Save predictions for test set
test_paths = df_test["path"].values
prob_columns = [f"prob_{name}" for name in class_names]

# One CSV row per test image: path, true/predicted class names and one
# probability column per class.
pred_rows = [
    {
        "path": img_path,
        "true_label": class_names[int(true_idx)],
        "pred_label": class_names[int(pred_idx)],
        **dict(zip(prob_columns, map(float, prob_row))),
    }
    for img_path, true_idx, pred_idx, prob_row in zip(
        test_paths, test_true, test_preds, test_probs
    )
]
pd.DataFrame(pred_rows).to_csv(ARTIFACTS / "predictions_test.csv", index=False)

# ----------------------------- SAVE CONFIG -----------------------------
# Every file this run writes into the artifacts directory, keyed by a short name.
_artifact_files = {
    "model_h5": ckpt_path,
    "label_encoder_pkl": ARTIFACTS / "label_encoder.pkl",
    "class_indices_json": ARTIFACTS / "class_indices.json",
    "metrics_json": ARTIFACTS / "metrics.json",
    "history_json": ARTIFACTS / "history.json",
    "history_csv": ARTIFACTS / "history.csv",
    "predictions_csv": ARTIFACTS / "predictions_test.csv",
    "confusion_matrix_json": ARTIFACTS / "confusion_matrix.json",
}
_split_frames = (("train", df_train), ("val", df_val), ("test", df_test))

# The run configuration extended with class names, split sizes and artifact locations.
CFG_SAVE = dict(CFG)
CFG_SAVE["classes"] = class_names
CFG_SAVE["splits"] = {split: int(len(frame)) for split, frame in _split_frames}
CFG_SAVE["artifacts"] = {key: str(path) for key, path in _artifact_files.items()}

with open(ARTIFACTS / "config.yaml", "w") as f:
    yaml.safe_dump(to_py(CFG_SAVE), f, sort_keys=False)

print("\n[DONE] Artifacts saved in:", ARTIFACTS)
for k, v in CFG_SAVE["artifacts"].items():
    print(f"  - {k}: {v}")



[INFO] Found 21165 images across classes: {'Normal': 10192, 'Lung_Opacity': 6012, 'COVID': 3616, 'Viral Pneumonia': 1345}
[SPLIT] train=17143  val=1905  test=2117
[INFO] Classes (4): ['COVID', 'Lung_Opacity', 'Normal', 'Viral Pneumonia']


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 densenet121 (Functional)    (None, 7, 7, 1024)        7037504   
                                                                 
 global_average_pooling2d (  (None, 1024)              0         
 GlobalAveragePooling2D)                                         
                                                                 
 dropout (Dropout)           (None, 1024)              0         
                                                                 
 dense (Dense)    

  saving_api.save_model(


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15

[DONE] Artifacts saved in: C:\Users\NXTWAVE\Downloads\COVID Radiography Detection\artifacts
  - model_h5: C:\Users\NXTWAVE\Downloads\COVID Radiography Detection\artifacts\model.h5
  - label_encoder_pkl: C:\Users\NXTWAVE\Downloads\COVID Radiography Detection\artifacts\label_encoder.pkl
  - class_indices_json: C:\Users\NXTWAVE\Downloads\COVID Radiography Detection\artifacts\class_indices.json
  - metrics_json: C:\Users\NXTWAVE\Downloads\COVID Radiography Detection\artifacts\metrics.json
  - history_json: C:\Users\NXTWAVE\Downloads\COVID Radiography Detection\artifacts\history.json
  - history_csv: C:\Users\NXTWAVE\Downloads\COVID Radiography Detection\artifacts\history.csv
  - predictions_csv: C:\Users\NXTWAVE\Downloads\COVID Radiography Detection\artifacts\predictions_test.csv
  - confusion_matrix_json: C:\Users\NXTWAVE\Download