In [2]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# 1. Parameters & Paths
# --- Dataset locations ---
CSV_PATH = "/kaggle/input/xray-dataset/xray_dataset/chest_xray.csv"   # metadata CSV
IMAGES_DIR = "/kaggle/input/xray-dataset/xray_dataset/images"         # directory with the X-ray image files

# --- Input geometry ---
IMG_SIZE = 224      # images are resized to IMG_SIZE x IMG_SIZE
PATCH_SIZE = 16     # side length of the square patches (e.g. 16x16)

# --- Task / training setup ---
NUM_CLASSES = 2     # binary classification: 'No Finding' vs 'With findings'
BATCH_SIZE = 32
EPOCHS = 5          # number of training epochs

# 2. Load CSV and Prepare Dataset
df = pd.read_csv(CSV_PATH)


def _binary_label(finding):
    """Return 0 when the finding string is "No Finding" (ignoring case and
    surrounding whitespace), otherwise 1."""
    if finding.strip().lower() == "no finding":
        return 0
    return 1


# Binary target derived from the "Finding Labels" column.
df["label"] = df["Finding Labels"].apply(_binary_label)

# First cut: 70% train / 30% held out, stratified on the binary label.
df_train, df_temp = train_test_split(
    df,
    test_size=0.30,
    stratify=df["label"],
    random_state=42,
)

# Second cut: one third of the held-out rows (~10% overall) go to validation,
# the remaining two thirds (~20% overall) to test.
val_size = 1/3
df_val, df_test = train_test_split(
    df_temp,
    test_size=(1 - val_size),
    stratify=df_temp["label"],
    random_state=42,
)

print("Dataset split:")
for caption, split in (
    ("Train samples:", df_train),
    ("Validation samples:", df_val),
    ("Test samples:", df_test),
):
    print(caption, len(split))

# 3. Define a function to load and preprocess images.
def load_and_preprocess_image(image_filename, label):
    """Load one image file from IMAGES_DIR and prepare it for the model.

    Args:
        image_filename: String tensor holding the file name inside IMAGES_DIR.
        label: Label for the image; returned unchanged.

    Returns:
        Tuple ``(image, label)`` where ``image`` was decoded as a 3-channel
        PNG, resized to IMG_SIZE x IMG_SIZE and divided by 255.
    """
    # The path is built with a TF string op (not os.path.join) because the
    # file name is a symbolic tensor when this runs inside Dataset.map.
    full_path = tf.strings.join([IMAGES_DIR, image_filename], separator=os.sep)
    raw_bytes = tf.io.read_file(full_path)
    pixels = tf.image.decode_png(raw_bytes, channels=3)
    resized = tf.image.resize(pixels, [IMG_SIZE, IMG_SIZE])
    # Scale pixel values to [0, 1].
    return resized / 255.0, label

# 4. Create TensorFlow Dataset objects for train, validation, and test splits.
def create_dataset(df, shuffle=False, seed=None):
    """Build an unbatched tf.data pipeline of (image, label) pairs for one split.

    Args:
        df: DataFrame with an "Image Index" column (image file names) and a
            "label" column.
        shuffle: When True, shuffle the (filename, label) records with a
            buffer covering the whole split before any image is loaded.
            Defaults to False, which keeps the DataFrame row order.
        seed: Optional integer seed for the shuffle; ignored when ``shuffle``
            is False.

    Returns:
        A ``tf.data.Dataset`` yielding whatever ``load_and_preprocess_image``
        returns for each row.
    """
    filenames = df["Image Index"].values
    labels = df["label"].values
    ds = tf.data.Dataset.from_tensor_slices((filenames, labels))
    if shuffle:
        # Shuffle the lightweight records rather than decoded images: a
        # full-size buffer placed after `map` would have to hold every decoded
        # IMG_SIZE x IMG_SIZE x 3 float image and be refilled each epoch before
        # the first batch is produced. max(..., 1) keeps buffer_size valid for
        # an empty split.
        ds = ds.shuffle(buffer_size=max(len(df), 1), seed=seed)
    ds = ds.map(load_and_preprocess_image, num_parallel_calls=tf.data.AUTOTUNE)
    return ds

# Only the training split is shuffled; validation and test keep row order.
ds_train = create_dataset(df_train, shuffle=True, seed=42).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
ds_val   = create_dataset(df_val).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
ds_test  = create_dataset(df_test).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# 5. Define Custom Layers for the Vision Transformer (ViT)

# 5a. Patch Extraction Layer: splits the image into patches.
class PatchExtractor(layers.Layer):
    """Split a batch of images into non-overlapping square patches.

    Each patch is flattened, so the output has shape
    ``(batch, num_patches, patch_size * patch_size * channels)``.

    Args:
        patch_size: Side length, in pixels, of each square patch.
        **kwargs: Standard ``keras.layers.Layer`` arguments (``name``,
            ``dtype``, ...), forwarded to the base class.
    """

    def __init__(self, patch_size, **kwargs):
        super().__init__(**kwargs)
        self.patch_size = patch_size

    def call(self, images):
        """Return the flattened patches of ``images`` (a 4-D image batch)."""
        batch_size = tf.shape(images)[0]
        # Window size == stride, so patches tile the image without overlap;
        # "VALID" padding means only full windows are extracted.
        patches = tf.image.extract_patches(
            images=images,
            sizes=[1, self.patch_size, self.patch_size, 1],
            strides=[1, self.patch_size, self.patch_size, 1],
            rates=[1, 1, 1, 1],
            padding="VALID",
        )
        # Each patch is flattened into a vector; collapse the patch grid into
        # a single sequence axis.
        patch_dims = patches.shape[-1]
        patches = tf.reshape(patches, [batch_size, -1, patch_dims])
        return patches

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created
        when a model containing it is saved and loaded."""
        config = super().get_config()
        config.update({"patch_size": self.patch_size})
        return config

# 5b. Patch Encoder Layer: projects patches and adds positional embeddings.
class PatchEncoder(layers.Layer):
    """Linearly project flattened patches and add a learned position embedding.

    Args:
        num_patches: Number of patches per image; also the size of the
            position-embedding table.
        projection_dim: Output dimension of both the patch projection and the
            position embedding.
        **kwargs: Standard ``keras.layers.Layer`` arguments (``name``,
            ``dtype``, ...), forwarded to the base class.
    """

    def __init__(self, num_patches, projection_dim, **kwargs):
        super().__init__(**kwargs)
        self.num_patches = num_patches
        # Kept so get_config can report it.
        self.projection_dim = projection_dim
        self.projection = layers.Dense(units=projection_dim)
        self.position_embedding = layers.Embedding(input_dim=num_patches, output_dim=projection_dim)

    def call(self, patches):
        """Return ``projection(patches) + position_embedding(0..num_patches-1)``."""
        positions = tf.range(start=0, limit=self.num_patches, delta=1)
        # Project each patch and add its corresponding positional embedding
        # (the embedding is broadcast over the batch axis).
        encoded = self.projection(patches) + self.position_embedding(positions)
        return encoded

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created
        when a model containing it is saved and loaded."""
        config = super().get_config()
        config.update(
            {
                "num_patches": self.num_patches,
                "projection_dim": self.projection_dim,
            }
        )
        return config

# 5c. Transformer Encoder Block: includes multi-head self-attention and MLP.
def transformer_encoder(inputs, projection_dim, num_heads, transformer_units, dropout_rate):
    """Apply one pre-norm transformer encoder block to a token sequence.

    Args:
        inputs: Token tensor entering the block.
        projection_dim: Token embedding size; used as the attention ``key_dim``
            and as the output width of the feed-forward network.
        num_heads: Number of attention heads.
        transformer_units: Hidden width of the feed-forward network.
        dropout_rate: Dropout rate used inside attention and the feed-forward
            network.

    Returns:
        The block output: attention and feed-forward sub-blocks, each wrapped
        in a residual connection.
    """
    # --- Self-attention sub-block with residual connection ---
    normed_tokens = layers.LayerNormalization(epsilon=1e-6)(inputs)
    self_attention = layers.MultiHeadAttention(
        num_heads=num_heads,
        key_dim=projection_dim,
        dropout=dropout_rate,
    )
    attended = self_attention(normed_tokens, normed_tokens)
    after_attention = layers.Add()([attended, inputs])

    # --- Feed-forward sub-block with residual connection ---
    hidden = layers.LayerNormalization(epsilon=1e-6)(after_attention)
    hidden = layers.Dense(transformer_units, activation=tf.nn.gelu)(hidden)
    hidden = layers.Dropout(dropout_rate)(hidden)
    hidden = layers.Dense(projection_dim)(hidden)
    return layers.Add()([hidden, after_attention])

# 6. Build the Vision Transformer Model
def create_vit_classifier(
    input_shape=(IMG_SIZE, IMG_SIZE, 3),
    patch_size=PATCH_SIZE,
    num_layers=8,                 # Number of transformer blocks.
    projection_dim=64,            # Embedding dimension.
    num_heads=4,                  # Number of attention heads.
    transformer_units=128,        # MLP hidden dimension.
    mlp_head_units=(128, 64),     # Dense layers before classification.
    dropout_rate=0.1,
):
    """Build a Vision Transformer classifier as a Keras functional model.

    Args:
        input_shape: ``(height, width, channels)`` of the input images.
        patch_size: Side length of the square patches.
        num_layers: Number of transformer encoder blocks.
        projection_dim: Patch embedding dimension.
        num_heads: Attention heads per block.
        transformer_units: Hidden width of each block's feed-forward network.
        mlp_head_units: Iterable with the widths of the dense layers in the
            classification head.
        dropout_rate: Dropout rate used in the blocks and in the head.

    Returns:
        An uncompiled ``keras.Model`` mapping images to NUM_CLASSES softmax
        probabilities.

    Raises:
        ValueError: If the image is smaller than one patch in either spatial
            dimension (no patch could be extracted).
    """
    height, width = input_shape[0], input_shape[1]
    if height < patch_size or width < patch_size:
        raise ValueError(
            f"input_shape {tuple(input_shape)} is smaller than patch_size {patch_size}"
        )

    inputs = layers.Input(shape=input_shape)

    # Create patches from the input image.
    patches = PatchExtractor(patch_size)(inputs)
    # Derive the patch count from the actual input shape (not the global
    # IMG_SIZE) so non-default and non-square inputs get a position table of
    # the right size. For 224x224 with 16x16 patches -> 14 * 14 = 196.
    num_patches = (height // patch_size) * (width // patch_size)
    encoded_patches = PatchEncoder(num_patches, projection_dim)(patches)

    # Add transformer blocks.
    x = encoded_patches
    for _ in range(num_layers):
        x = transformer_encoder(x, projection_dim, num_heads, transformer_units, dropout_rate)

    # Pool the outputs of the transformer: use global average pooling.
    representation = layers.GlobalAveragePooling1D()(x)
    representation = layers.LayerNormalization(epsilon=1e-6)(representation)

    # MLP head.
    for units in mlp_head_units:
        representation = layers.Dense(units, activation=tf.nn.gelu)(representation)
        representation = layers.Dropout(dropout_rate)(representation)

    # Final classification layer; softmax makes these class probabilities,
    # not raw logits.
    probabilities = layers.Dense(NUM_CLASSES, activation="softmax")(representation)

    model = keras.Model(inputs=inputs, outputs=probabilities)
    return model

# Create the ViT model.
# Seed the Python, NumPy and TensorFlow RNGs first so that weight
# initialisation and dropout are repeatable from run to run.
keras.utils.set_random_seed(42)
vit_model = create_vit_classifier()
vit_model.summary()

# 7. Compile and Train the Model
# The model outputs softmax probabilities and the labels are integer class
# ids, hence the sparse categorical cross-entropy loss.
vit_model.compile(
    optimizer=keras.optimizers.Adam(),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

history = vit_model.fit(
    ds_train,
    validation_data=ds_val,
    epochs=EPOCHS,
)

# 8. Evaluate the Model using Classification Metrics
# Evaluate on the test set.
test_loss, test_accuracy = vit_model.evaluate(ds_test)
print(f"\nTest Loss: {test_loss:.4f}  Test Accuracy: {test_accuracy:.4f}")

# Predict labels on the test set.
y_pred_probs = vit_model.predict(ds_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# Take the true labels straight from df_test. ds_test is built from df_test in
# row order and is never shuffled, so this matches the prediction order and
# avoids another full pass that would re-read and decode every test image.
y_true = df_test["label"].to_numpy()
assert len(y_true) == len(y_pred), "prediction count does not match the test split"

# Print classification report. `labels` is pinned so both classes are always
# reported, even if the model predicts only one of them.
print("\nClassification Report:")
print(
    classification_report(
        y_true,
        y_pred,
        labels=[0, 1],
        target_names=["No Finding", "With Findings"],
    )
)

# Display the confusion matrix (rows: true class, columns: predicted class).
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
print("Confusion Matrix:")
print(cm)


Dataset split:
Train samples: 2034
Validation samples: 290
Test samples: 582


Epoch 1/20


I0000 00:00:1744439983.124116      88 service.cc:148] XLA service 0x7bc50c001ee0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1744439983.124836      88 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1744439986.520007      88 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m 2/64[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3s[0m 51ms/step - accuracy: 0.4844 - loss: 0.7474    

I0000 00:00:1744440004.137750      88 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 438ms/step - accuracy: 0.5035 - loss: 0.7203 - val_accuracy: 0.5172 - val_loss: 0.6936
Epoch 2/20
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 64ms/step - accuracy: 0.5026 - loss: 0.6954 - val_accuracy: 0.4828 - val_loss: 0.6986
Epoch 3/20
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 81ms/step - accuracy: 0.4790 - loss: 0.7006 - val_accuracy: 0.5103 - val_loss: 0.6920
Epoch 4/20
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 65ms/step - accuracy: 0.5095 - loss: 0.6949 - val_accuracy: 0.4828 - val_loss: 0.6942
Epoch 5/20
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 64ms/step - accuracy: 0.5189 - loss: 0.6926 - val_accuracy: 0.4828 - val_loss: 0.6956
Epoch 6/20
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 65ms/step - accuracy: 0.5053 - loss: 0.6967 - val_accuracy: 0.5034 - val_loss: 0.6893
Epoch 7/20
[1m64/64[0m [32m━━━━━━━━

In [None]:
# 1. Parameters & Paths
CSV_PATH = "/kaggle/input/xray-dataset/xray_dataset/chest_xray.csv"          # CSV file with metadata
IMAGES_DIR = "/kaggle/input/xray-dataset/xray_dataset/images"                # Folder containing X-ray images