# Chest X-Ray Pneumonia — Colab Setup

This notebook demonstrates the end-to-end workflow for detecting pneumonia in chest X-ray images. It includes the following steps:

1. **Dataset Download**: Using KaggleHub to fetch the Chest X-Ray Pneumonia dataset.
2. **Data Preparation**: Preprocessing and creating TensorFlow datasets for training, validation, and testing.
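3. **Model Training**: Training a small convolutional neural network (CNN) baseline to classify images as Normal or Pneumonia. The model cell in this notebook defines a plain CNN; transfer learning with a pretrained backbone such as EfficientNetB0 is a possible next step.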
4. **Evaluation**: Measuring the model's performance on the test dataset.
5. **Inference**: Using the trained model to predict the class of new X-ray images.

> **Note**: If KaggleHub authentication fails, a fallback Kaggle CLI method is provided. Ensure your credentials are secure and not committed to source control.

In [1]:
# Install required packages (Colab)
!pip -q install kagglehub tensorflow pillow numpy --upgrade

import os
from getpass import getpass

# Configure KaggleHub auth: paste your Kaggle API token when prompted
# Your token stays in the Colab session memory; it won't be committed.
os.environ["KAGGLE_API_TOKEN"] = getpass("Paste Kaggle API token: ")
print("KAGGLE_API_TOKEN set:", "***" if os.environ.get("KAGGLE_API_TOKEN") else "MISSING")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m620.7/620.7 MB[0m [31m854.7 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m89.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.6/16.6 MB[0m [31m77.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m99.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-text 2.19.0 requires tensorflow<2.20,>=2.19.0, but you have tensorflow 2.20.0 which is incompatible.
opencv-python-headless 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 2.3.5

In [None]:
# Fallback: Kaggle CLI (if KaggleHub fails)
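# 1) The code below writes /root/.kaggle/kaggle.json containing your Kaggle username and API key.
#    Replace <username> and <key> with your own values. Do NOT commit that file, or a notebook that contains them.
# 2) Uncomment and run this cell only if the KaggleHub download fails.
# 3) This fallback unzips into ./data and does not define train_dir / val_dir / test_dir;
#    set them to the unzipped train/val/test folders before running the later cells.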

# %%
# import json, os, pathlib
# pathlib.Path("/root/.kaggle").mkdir(parents=True, exist_ok=True)
# with open("/root/.kaggle/kaggle.json", "w") as f:
#     json.dump({"username": "<username>", "key": "<key>"}, f)
# os.chmod("/root/.kaggle/kaggle.json", 0o600)
# !pip -q install kaggle
# !kaggle datasets download -d paultimothymooney/chest-xray-pneumonia -p data
# !unzip -q data/chest-xray-pneumonia.zip -d data
# %ls -la data


In [2]:
# Download dataset via KaggleHub
import kagglehub

path = kagglehub.dataset_download("paultimothymooney/chest-xray-pneumonia")
print("Path to dataset files:", path)

# Try to locate 'chest_xray' root inside the downloaded path
import os
from pathlib import Path

# Every later cell needs all three split folders.
SPLIT_NAMES = ("train", "val", "test")


def _has_all_splits(folder):
    """Return True when `folder` contains train/, val/ and test/ sub-directories."""
    return all((folder / split).is_dir() for split in SPLIT_NAMES)


base = Path(path)
# Common layout in this dataset: a folder named 'chest_xray' with subfolders train/val/test
# rglob() yields matches in no guaranteed order and may find several folders with that
# name (e.g. a nested copy, or one inside a `__MACOSX` metadata tree if present).
# Sort shallowest-first and keep only folders that really contain the three splits.
candidates = sorted(
    (p for p in base.rglob("chest_xray") if p.is_dir() and "__MACOSX" not in p.parts),
    key=lambda p: (len(p.parts), str(p)),
)
usable_roots = [p for p in candidates if _has_all_splits(p)]
if usable_roots:
    chest_xray_root = usable_roots[0]
else:
    chest_xray_root = base  # fallback: assume the splits sit directly under the download path

print("Using chest_xray_root:", chest_xray_root)
train_dir = chest_xray_root / "train"
val_dir = chest_xray_root / "val"
test_dir = chest_xray_root / "test"
print("train:", train_dir)
print("val:", val_dir)
print("test:", test_dir)

# Fail here with a clear message instead of letting a later cell crash on a missing folder.
missing_splits = [name for name in SPLIT_NAMES if not (chest_xray_root / name).is_dir()]
if missing_splits:
    raise FileNotFoundError(
        f"Missing split folder(s) {missing_splits} under {chest_xray_root}. "
        "Check the download, or use the Kaggle CLI fallback cell."
    )

Using Colab cache for faster access to the 'chest-xray-pneumonia' dataset.
Path to dataset files: /kaggle/input/chest-xray-pneumonia
Using chest_xray_root: /kaggle/input/chest-xray-pneumonia/chest_xray
train: /kaggle/input/chest-xray-pneumonia/chest_xray/train
val: /kaggle/input/chest-xray-pneumonia/chest_xray/val
test: /kaggle/input/chest-xray-pneumonia/chest_xray/test


# Exploratory Data Analysis (EDA)

We inspect class distribution and visualize sample images to understand dataset characteristics and potential imbalance.

In [None]:
# Class distribution and sample visualization
import os
from pathlib import Path
import matplotlib.pyplot as plt

# Count only real image files. A bare "*.*" glob also matches hidden files such as
# ".DS_Store" and any sub-directory with a dot in its name, which would inflate the counts.
# The extension list mirrors the formats Keras' image_dataset_from_directory accepts.
IMAGE_SUFFIXES = {".bmp", ".gif", ".jpeg", ".jpg", ".png"}

# One entry per class folder, in alphabetical order.
classes = sorted(d.name for d in Path(train_dir).iterdir() if d.is_dir())
counts = [
    sum(
        1
        for f in (Path(train_dir) / cls).iterdir()
        if f.is_file() and f.suffix.lower() in IMAGE_SUFFIXES
    )
    for cls in classes
]

print({"classes": classes, "train_counts": counts})

# Show grid of sample images
def show_samples(root, cls, n=8):
    """Plot up to `n` images of one class in a 4-column grid.

    Args:
        root: Directory that contains one sub-folder per class.
        cls: Name of the class sub-folder to sample from; also used as each subplot title.
        n: Maximum number of images to show.

    Returns:
        None. The figure is rendered with `plt.show()`. If the folder holds no image
        files, a message is printed and nothing is plotted.
    """
    # Imported locally so the function also works when this cell runs before the cell
    # that imports TensorFlow for the rest of the notebook (fresh kernel, Run All).
    import tensorflow as tf

    # Skip non-image entries (e.g. ".DS_Store") that load_img cannot decode, and sort
    # so the same files are shown on every run.
    image_suffixes = {".bmp", ".gif", ".jpeg", ".jpg", ".png"}
    files = sorted(
        f
        for f in (Path(root) / cls).iterdir()
        if f.is_file() and f.suffix.lower() in image_suffixes
    )[:n]
    if not files:
        # Zero rows would give a zero-height figure; report and stop instead.
        print(f"No images found for class {cls!r} under {root}")
        return

    cols = 4
    rows = (len(files) + cols - 1) // cols  # ceiling division
    plt.figure(figsize=(12, 3*rows))
    for i, fp in enumerate(files):
        img = tf.keras.utils.load_img(fp)
        plt.subplot(rows, cols, i+1)
        plt.imshow(img, cmap='gray')
        plt.axis('off')
        plt.title(cls)
    plt.tight_layout()
    plt.show()

# Preview the first two class folders (fewer if the dataset has fewer classes).
for preview_cls in classes[:2]:
    show_samples(train_dir, preview_cls, n=8)

In [3]:
# Build TensorFlow datasets
import tensorflow as tf
IMG_SIZE = (224, 224)
BATCH_SIZE = 32
SEED = 42  # fixes the training shuffle so runs are repeatable


def load_split(directory, shuffle):
    """Build a batched image dataset with binary labels from a class-per-folder directory.

    Args:
        directory: Folder containing one sub-folder per class.
        shuffle: Whether to shuffle the files. Use True for training; False keeps
            validation/test in a stable order.

    Returns:
        A `tf.data.Dataset` of (images, labels) batches, with images resized to
        IMG_SIZE and BATCH_SIZE items per batch.
    """
    # tf.keras.utils is the current home of this loader; the original call went
    # through the older tf.keras.preprocessing namespace.
    return tf.keras.utils.image_dataset_from_directory(
        str(directory),
        image_size=IMG_SIZE,
        batch_size=BATCH_SIZE,
        label_mode='binary',
        shuffle=shuffle,
        seed=SEED,
    )


train_ds = load_split(train_dir, shuffle=True)
# Evaluation splits gain nothing from shuffling; a fixed order also allows
# predictions to be matched back to files later.
val_ds = load_split(val_dir, shuffle=False)
test_ds = load_split(test_dir, shuffle=False)

# Normalize to [0,1]
def norm(x, y):
    """Cast the image batch `x` to float32 and divide by 255; return `y` unchanged."""
    scaled = tf.cast(x, tf.float32) / 255.0
    return scaled, y

# Apply `norm` to each split and prefetch so input preparation overlaps with model execution.
train_ds, val_ds, test_ds = (
    split.map(norm).prefetch(tf.data.AUTOTUNE)
    for split in (train_ds, val_ds, test_ds)
)


Found 5216 files belonging to 2 classes.
Found 16 files belonging to 2 classes.
Found 624 files belonging to 2 classes.


In [4]:
# Define a simple CNN model
from tensorflow.keras import layers, models, utils

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and dropout
# masks are repeatable between runs.
utils.set_random_seed(42)

model = models.Sequential([
    # Derive the input shape from IMG_SIZE so the model cannot drift out of sync
    # with the size the dataset pipeline resizes images to.
    layers.Input(shape=(*IMG_SIZE, 3)),
    # Three conv/pool stages with increasing filter counts.
    layers.Conv2D(32, 3, activation='relu'),
    layers.MaxPooling2D(),
    layers.Conv2D(64, 3, activation='relu'),
    layers.MaxPooling2D(),
    layers.Conv2D(128, 3, activation='relu'),
    layers.MaxPooling2D(),
    # Classifier head: one sigmoid unit, matching label_mode='binary'.
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid')
])

# Keep exactly one metric: the evaluation cell unpacks (loss, accuracy).
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [5]:
# Train the model
EPOCHS = 5
history = model.fit(
    train_ds,
    epochs=EPOCHS,
    validation_data=val_ds,
)

# Evaluate
# evaluate() yields the loss followed by the single compiled metric (accuracy).
test_loss, test_acc = model.evaluate(test_ds)
test_summary = {"test_loss": float(test_loss), "test_acc": float(test_acc)}
print(test_summary)

Epoch 1/5
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m557s[0m 3s/step - accuracy: 0.8093 - loss: 0.5227 - val_accuracy: 0.7500 - val_loss: 0.4150
Epoch 2/5
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m558s[0m 3s/step - accuracy: 0.9490 - loss: 0.1382 - val_accuracy: 0.9375 - val_loss: 0.2328
Epoch 3/5
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m566s[0m 3s/step - accuracy: 0.9615 - loss: 0.1073 - val_accuracy: 0.9375 - val_loss: 0.1586
Epoch 4/5
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m565s[0m 3s/step - accuracy: 0.9749 - loss: 0.0710 - val_accuracy: 0.8750 - val_loss: 0.2428
Epoch 5/5
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m561s[0m 3s/step - accuracy: 0.9755 - loss: 0.0664 - val_accuracy: 0.8750 - val_loss: 0.2195
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 1s/step - accuracy: 0.7272 - loss: 1.4202
{'test_loss': 1.5631732940673828, 'test_acc': 0.7211538553237915}


In [6]:
# Save the model (Colab)
# NOTE(review): ".h5" is Keras' legacy HDF5 format and newer Keras versions recommend
# ".keras". The filename is kept so anything that loads pneumonia_model.h5 keeps working.
MODEL_PATH = "pneumonia_model.h5"
model.save(MODEL_PATH)
print(f"Saved: {MODEL_PATH}")

# If running in Colab, download the file
try:
    from google.colab import files
except ImportError:
    # Not running in Colab: the saved file is already on the local disk.
    files = None

if files is not None:
    try:
        files.download(MODEL_PATH)
    except Exception as exc:
        # The download stays best-effort, but the failure is reported instead of hidden.
        print(f"Could not start browser download of {MODEL_PATH}: {exc}")




Saved: pneumonia_model.h5


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>