> # **Load the required Dependencies**

****

In [1]:
import os
import warnings
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

# Suppress warnings & TensorFlow logs
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Suppress TF INFO/WARN/ERROR
warnings.filterwarnings("ignore", category=FutureWarning)
np.seterr(all="ignore")

import tensorflow as tf

# Disable XLA JIT if not needed (optional)
tf.config.optimizer.set_jit(False)

E0000 00:00:1758994583.751668      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758994583.833578      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


> # **Kaggle Dataset Setup**

In [2]:
# If running in Kaggle, dataset CSV
DATA_DIR = "/kaggle/input/data"
CSV_PATH = "/kaggle/input/data/Data_Entry_2017.csv"

import os
import pandas as pd
from glob import glob

data = pd.read_csv(CSV_PATH)

# Find all PNG files in all subfolders of /kaggle/input
all_image_paths = glob("/kaggle/input/**/images*/**/*.png", recursive=True)
print("Total image files found:", len(all_image_paths))

name_to_path = {os.path.basename(p): p for p in all_image_paths}
data["path"] = data["Image Index"].map(name_to_path.get)

# Remove rows where path wasn't found
data = data.dropna(subset=["path"]).reset_index(drop=True)

# Convert age column safely
data["Patient Age"] = pd.to_numeric(data["Patient Age"], errors="coerce").astype("Int64")

print("Rows after filtering:", data.shape[0])
data.sample(3)

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/data/Data_Entry_2017.csv'

> # **Basic EDA**

In [None]:
print("\nData_Entry_2017.csv\n")
data.info()

In [None]:
data.isnull().sum()

In [None]:
data.nunique()

In [None]:
# ---- Split multi-label strings into individual diseases ----
all_diseases = []
for labels in data['Finding Labels']:
    for disease in labels.split('|'):
        all_diseases.append(disease.strip())

disease_series = pd.Series(all_diseases)
counts = disease_series.value_counts()

print("\n=== Class Distribution (Counts) ===\n")
for i, (disease, count) in enumerate(counts.items(), start=1):
    print(f"{i}. {disease} : {count}")

percentages = disease_series.value_counts(normalize=True) * 100

print("\n=== Class Distribution (Percentages) ===\n")
for i, (disease, pct) in enumerate(percentages.items(), start=1):
    print(f"{i}. {disease} : {pct:.2f}%")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Split multi-label into separate diseases
all_labels = data['Finding Labels'].str.split('|').explode()
disease_counts = all_labels.value_counts()
disease_percent = (disease_counts / len(data)) * 100

# Bar plot
plt.figure(figsize=(10,6))
sns.barplot(y=disease_counts.index, x=disease_counts.values, palette='viridis')
plt.xlabel("Count")
plt.ylabel("Disease")
plt.title("Disease Occurrence Count")
plt.show()

# Top-5 & rare
print("Top 5 diseases:\n", disease_counts.head(5))
print("\nRare diseases (count < 500):\n", disease_counts[disease_counts < 500])

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Correct bins and labels
bins = [0, 2, 12, 18, 30, 50, 80, 120]  
labels = ['Infant', 'Child', 'Teen', 'Above 18', 'Adult', 'Mid-Age', 'Senior']

# Create Age Group column
data['Age Group'] = pd.cut(data['Patient Age'], bins=bins, labels=labels, right=False)

# Countplot
plt.figure(figsize=(12,6))
sns.countplot(data=data, x='Age Group', order=labels, hue='Patient Gender')
plt.title("Age Group vs Gender Distribution")
plt.xlabel("Age Group")
plt.ylabel("Count")
plt.legend(title="Gender")
plt.show()

In [None]:
# Co-occurrence heatmap
import itertools
from collections import Counter

diseases = sorted(set(all_labels))
co_matrix = pd.DataFrame(0, index=diseases, columns=diseases)

for labels in data['Finding Labels']:
    lbl_list = labels.split('|')
    for a, b in itertools.combinations(lbl_list, 2):
        co_matrix.loc[a, b] += 1
        co_matrix.loc[b, a] += 1

plt.figure(figsize=(8,6))
sns.heatmap(co_matrix, cmap="Reds")
plt.title("Disease Co-occurrence Heatmap")
plt.show()

In [None]:
# Gender split
gender_counts = data['Patient Gender'].value_counts()
plt.figure(figsize=(8,6))
sns.barplot(x=gender_counts.index, y=gender_counts.values, palette='coolwarm')
plt.title("Gender Distribution")
plt.show()

> # **Data Splitting**

**- Prevents cheating (overfitting):**
If the model saw the same images in both training and testing, it would just memorize answers, not truly learn.

**- Checks generalization:**
Validation and test sets tell us if the model can handle new data it has never seen before.

**- Hyperparameter tuning:**
Validation helps us pick the best model settings (like learning rate, batch size).

In [None]:
# --- Step 1: Filter dataset for frontal X-rays only ---
frontal_df = data[data['View Position'].isin(['PA', 'AP'])].copy()

print(f"Total frontal images: {len(frontal_df)}")
print(f"Unique patients after filtering: {frontal_df['Patient ID'].nunique()}")

# --- Step 2: Patient-level split (to avoid leakage!) ---
from sklearn.model_selection import train_test_split

unique_patients = frontal_df['Patient ID'].unique()

train_patients, valid_patients = train_test_split(
    unique_patients,
    test_size=0.2,
    random_state=42,
    shuffle=True
)

In [None]:
# Assign splits based on patient ID
train_df = frontal_df[frontal_df['Patient ID'].isin(train_patients)]
valid_df = frontal_df[frontal_df['Patient ID'].isin(valid_patients)]

print("Train size:", len(train_df), "Validation size:", len(valid_df))
print("Unique train patients:", train_df['Patient ID'].nunique())
print("Unique val patients:", valid_df['Patient ID'].nunique())

> # **Data-Set Preprocessing**

In [None]:
data['Finding Labels'] = data['Finding Labels'].map(lambda x: x.replace('No Finding', ''))
from itertools import chain
all_labels = np.unique(list(chain(*data['Finding Labels'].map(lambda x: x.split('|')).tolist())))
all_labels = [x for x in all_labels if len(x)>0]
print('All Labels ({}): {}'.format(len(all_labels), all_labels))
for c_label in all_labels:
    if len(c_label)>1: # leave out empty labels
        data[c_label] = data['Finding Labels'].map(lambda finding: 1.0 if c_label in finding else 0)
data.sample(3)

In [None]:
# Extract disease classes
all_labels = sorted(set([c for sublist in data['Finding Labels'].str.split('|') for c in sublist]))
print("Disease Labels:", all_labels)

# Create multi-hot encoded targets
for label in all_labels:
    data[label] = data['Finding Labels'].apply(lambda x: 1 if label in x else 0)

# Optional: remove "No Finding" column from targets
if "No Finding" in all_labels:
    all_labels.remove("No Finding")

print("\nTargets shape:", data[all_labels].shape)
data[all_labels].head(3)

**Step 1: Grayscale Conversion + Normalization**

In [None]:
def preprocess_xray_image(image, target_size=(224, 224), apply_equalize=True, 
                          clahe_clip=2.0, clahe_tile=(8, 8)):
    if len(image.shape) == 3:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    
    # Resize
    image = cv2.resize(image, target_size, interpolation=cv2.INTER_AREA)
    
    # Histogram equalization
    if apply_equalize:
        image = cv2.equalizeHist(image)
    
    # CLAHE for local contrast
    clahe = cv2.createCLAHE(clipLimit=clahe_clip, tileGridSize=clahe_tile)
    image = clahe.apply(image)
    
    # Normalize
    image = image.astype(np.float32) / 255.0
    
    """
    # After CLAHE and normalize
    image = tf.image.rgb_to_grayscale(image)
    image = tf.image.grayscale_to_rgb(image)  
    
    """
    
    return image

****Step 2: Batch Preprocessing (CPU side)****

In [None]:
def preprocess_xray_batch(images, target_size=(224, 224)):
    """
    Preprocesses a batch of grayscale images (numpy arrays).
    Returns (N, H, W, 1).
    """
    processed = []
    for img in images:
        pre = preprocess_xray_image(img, target_size=target_size)
        processed.append(pre)
    return np.expand_dims(np.stack(processed, axis=0), -1)

**Step 3: CLAHE inside tf.data pipeline**

In [None]:
def apply_clahe_tf(image):
    """Applies CLAHE using OpenCV inside tf.data via tf.py_function."""
    def _clahe_fn(img_np):
        img_np = img_np.squeeze().astype(np.uint8)
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        img_np = clahe.apply(img_np)
        return img_np.astype(np.float32) / 255.0
    
    clahe_img = tf.py_function(_clahe_fn, [image], tf.float32)
    clahe_img.set_shape((image.shape[0], image.shape[1]))  # restore shape
    clahe_img = tf.expand_dims(clahe_img, axis=-1)  # channel
    return clahe_img

**Step 4: Albumentations Augmentations (CPU)**

In [None]:
import albumentations as A
import numpy as np

IMG_SIZE = 224   # standard input for DenseNet121
BATCH_SIZE = 32
AUTOTUNE = tf.data.AUTOTUNE

# --- Albumentations setup ---
albumentation_transform = A.Compose([
    A.HorizontalFlip(p=0.5),
       A.Affine(
        scale=(0.9, 1.1),  # Equivalent to scale_limit=(-0.1, 0.1)
        rotate=(-10, 10),  # Equivalent to rotate_limit=(-10, 10)
        translate_percent={'x': (-0.05, 0.05), 'y': (-0.05, 0.05)}, # Equivalent to shift_limit=0.05
        p=0.5
    ),
    A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.1, rotate_limit=10, p=0.5),
    A.RandomBrightnessContrast(brightness_limit=0.1, contrast_limit=0.1, p=0.5),
    A.CLAHE(clip_limit=2.0, tile_grid_size=(8,8), p=0.3),
    A.CropAndPad(percent=(-0.05, 0.1), p=0.3),
    A.GaussNoise(p=0.3),
])

def apply_albumentations(image):
    """Apply Albumentations (expects numpy)."""
    image = np.array(image)
    augmented = albumentation_transform(image=image)
    return augmented["image"].astype(np.float32)

**Step 5: Preprocess + Augmentation**

In [None]:
import tensorflow as tf
from tensorflow.keras import layers

IMG_SIZE = 224
BATCH_SIZE = 32
AUTOTUNE = tf.data.AUTOTUNE

# Keras augmentations (GPU-friendly)
keras_augmentation = tf.keras.Sequential([
    layers.RandomFlip("horizontal"),             # left/right flip only
    layers.RandomRotation(0.05),                 # ~±10 degrees
    layers.RandomZoom(0.1),                      # zoom in/out 10%
    layers.RandomTranslation(0.05, 0.05),        # shift up to 5%
    layers.RandomContrast(0.1),                  # contrast ±10%
    layers.Lambda(lambda x: tf.image.random_brightness(x, max_delta=0.05))
])

def preprocess_and_augment(path, label, training=False):
    # Load image
    image = tf.io.read_file(path)
    image = tf.image.decode_png(image, channels=3)
    image = tf.image.resize(image, (224, 224))
    image = tf.cast(image, tf.float32) / 255.0

    # Augment only if training
    if training:
        image = tf.image.random_flip_left_right(image)
        image = tf.image.random_brightness(image, max_delta=0.1)
        image = tf.image.random_contrast(image, 0.9, 1.1)

    return image, label

**Step 6: Dataset Check Functions**

In [None]:
def check_dataset_balance(df, label_col="disease_vec"):
    """
    Quick class balance check.
    """
    counts = df[label_col].sum(axis=0) if isinstance(df[label_col].iloc[0], (np.ndarray, list)) else df[label_col].value_counts()
    print("Class distribution:\n", counts)

def ensure_patient_split(df, patient_col="Patient ID", test_size=0.2):
    """
    Splits dataset ensuring patients don't leak between train/val.
    """
    from sklearn.model_selection import GroupShuffleSplit
    gss = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=42)
    train_idx, val_idx = next(gss.split(df, groups=df[patient_col]))
    return df.iloc[train_idx], df.iloc[val_idx]

**Step 7: Encoding to Full dataset**

In [None]:
import numpy as np

def encode_labels(label_string):
    """Convert 'Finding Labels' string -> multi-hot vector"""
    labels = label_string.split("|")
    vec = np.zeros(len(all_labels), dtype="float32")
    for l in labels:
        if l in all_labels:
            vec[all_labels.index(l)] = 1.0
    return vec

# Apply encoding to full dataset
data["label_vec"] = data["Finding Labels"].apply(encode_labels)

# Now split into train/valid again (patient-level if you want)
from sklearn.model_selection import train_test_split

train_df, valid_df = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    shuffle=True
)

print("Train size:", len(train_df))
print("Validation size:", len(valid_df))
print("Example label_vec shape:", train_df["label_vec"].iloc[0].shape)

In [None]:
import tensorflow as tf

# --- Constants ---
AUTOTUNE = tf.data.AUTOTUNE
BATCH_SIZE = 32   # or adjust as per GPU memory

# --- Dataset builder ---
def make_dataset(df, training=True):
    paths = df["path"].values
    labels = np.stack(df["label_vec"].values).astype("float32")

    ds = tf.data.Dataset.from_tensor_slices((paths, labels))
    if training:
        ds = ds.shuffle(buffer_size=1024)

    ds = ds.map(lambda x, y: preprocess_and_augment(x, y, training=training),
                num_parallel_calls=AUTOTUNE)
    ds = ds.batch(BATCH_SIZE).prefetch(AUTOTUNE)
    return ds

# --- Create datasets ---
train_ds = make_dataset(train_df, training=True)
val_ds   = make_dataset(valid_df, training=False)

# --- Quick sanity check ---
for img, lbl in train_ds.take(1):
    print(img.shape, lbl.shape)

> # **Model Training**

In [None]:
import tensorflow as tf
from tensorflow.keras.applications import DenseNet121
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy

# Load base DenseNet121
base_model = DenseNet121(weights="imagenet", include_top=False, input_shape=(224,224,3))

# Freeze base initially
base_model.trainable = False

# Build classifier head
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = BatchNormalization()(x)               # stabilizes training
x = Dropout(0.5)(x)                       # strong dropout for overfitting
x = Dense(512, activation="relu")(x)      # extra dense layer
x = BatchNormalization()(x)
x = Dropout(0.3)(x)
output = Dense(len(all_labels), activation="sigmoid")(x)

model = Model(inputs=base_model.input, outputs=output)

# Compile with label smoothing + AUC metric
model.compile(
    optimizer=Adam(learning_rate=1e-4),
    loss=BinaryCrossentropy(label_smoothing=0.05),  # ✅ prevents overconfident outputs
    metrics=["accuracy", tf.keras.metrics.AUC(name="auc"), tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
)

model.summary()

# --- Callbacks ---
callbacks = [
    tf.keras.callbacks.ModelCheckpoint("best_model.h5", monitor="val_auc", save_best_only=True, mode="max"),
    tf.keras.callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.3, patience=3, verbose=1, min_lr=1e-7),
    tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=7, restore_best_weights=True)
]

# --- Training Warm-up (only top layers) ---
history1 = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=5,
    callbacks=callbacks
)

# --- Fine-tuning (unfreeze deeper layers) ---
base_model.trainable = True
for layer in base_model.layers[:-50]:   # keep early layers frozen
    layer.trainable = False

model.compile(
    optimizer=Adam(learning_rate=1e-5),
    loss=BinaryCrossentropy(label_smoothing=0.05),
    metrics=["accuracy", tf.keras.metrics.AUC(name="auc"), tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
)

history2 = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=20,
    callbacks=callbacks
)

> # **Generate Grad-CAM and Draw Boxes**

In [None]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.models import Model

def get_gradcam(model, img_array, class_index, last_conv_layer='conv5_block16_concat'):
    """
    Generate Grad-CAM heatmap
    """
    grad_model = Model([model.inputs], [model.get_layer(last_conv_layer).output, model.output])
    
    with tf.GradientTape() as tape:
        conv_outputs, predictions = grad_model(img_array)
        loss = predictions[:, class_index]
    
    grads = tape.gradient(loss, conv_outputs)
    pooled_grads = tf.reduce_mean(grads, axis=(0, 1, 2))
    
    conv_outputs = conv_outputs[0]
    heatmap = tf.reduce_sum(tf.multiply(pooled_grads, conv_outputs), axis=-1)
    heatmap = np.maximum(heatmap, 0)
    heatmap /= np.max(heatmap) + 1e-8
    heatmap = cv2.resize(heatmap.numpy(), (img_array.shape[2], img_array.shape[1]))
    return heatmap

In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # Force CPU if needed

import cv2
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.applications import DenseNet121

# ------------------ Load Model ------------------ #
model = DenseNet121(weights=None, input_shape=(224, 224, 3), classes=2)

# ------------------ Grad-CAM ------------------ #
def get_gradcam(model, img_array, class_index=0, last_conv_layer='conv5_block16_concat'):
    grad_model = Model(model.input, 
                       [model.get_layer(last_conv_layer).output, model.output])

    with tf.GradientTape() as tape:
        conv_outputs, predictions = grad_model(img_array)
        loss = predictions[:, class_index]

    grads = tape.gradient(loss, conv_outputs)
    pooled_grads = tf.reduce_mean(grads, axis=(0, 1, 2))

    conv_outputs = conv_outputs[0]  # (H, W, C)
    heatmap = tf.reduce_sum(tf.multiply(pooled_grads, conv_outputs), axis=-1).numpy()

    heatmap = np.maximum(heatmap, 0)
    heatmap /= np.max(heatmap) + 1e-8
    return cv2.resize(heatmap, (img_array.shape[2], img_array.shape[1]))

# ------------------ Overlay Heatmap + Boxes ------------------ #
def overlay_heatmap(img, heatmap, alpha=0.4, threshold=0.5):
    heatmap_colored = cv2.applyColorMap(np.uint8(255 * heatmap), cv2.COLORMAP_JET)
    overlayed = cv2.addWeighted(heatmap_colored, alpha, img, 1 - alpha, 0)

    binary_map = np.uint8(heatmap > threshold) * 255
    contours, _ = cv2.findContours(binary_map, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        cv2.rectangle(overlayed, (x, y), (x + w, y + h), (0, 0, 255), 2)

    return overlayed

# ------------------ Run Example ------------------ #
img_paths = [
    "/kaggle/input/data/images_001/images/00000003_003.png",
    "/kaggle/input/data/images_001/images/00000005_001.png"
    # removed the broken 00000009_006.png
]

plt.figure(figsize=(12, 6))
for i, img_path in enumerate(img_paths, 1):
    img = cv2.imread(img_path)
    if img is None:
        print(f"Could not load image: {img_path}")
        continue

    img_resized = cv2.resize(img, (224, 224))
    img_array = np.expand_dims(img_resized / 255.0, axis=0)

    heatmap = get_gradcam(model, img_array)
    result = overlay_heatmap(img_resized, heatmap)

    plt.subplot(1, len(img_paths), i)
    plt.imshow(cv2.cvtColor(result, cv2.COLOR_BGR2RGB))
    plt.axis("off")
    plt.title(f"Image {i}")

plt.show()