# Pneumonia Detection - Data Exploration

This notebook explores the chest X-ray dataset for pneumonia detection.

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from collections import Counter

# Global plot styling: matplotlib style sheet first, then the seaborn palette.
plt.style.use("seaborn-v0_8")
sns.set_palette(palette="husl")

## Dataset Overview

In [None]:
# Dataset location: the root folder holds one sub-directory per split.
data_dir = '../data/chest_xray'
train_dir, val_dir, test_dir = (
    os.path.join(data_dir, split_name) for split_name in ('train', 'val', 'test')
)

# Count images in each category
def count_images(directory, extensions=('.jpeg', '.jpg', '.png', '.bmp')):
    """Count the image files in each class sub-directory of ``directory``.

    Only regular files whose name ends with one of ``extensions`` are
    counted, so hidden files (e.g. ``.DS_Store``), stray text files and
    nested folders do not inflate the totals.

    Args:
        directory: Path to a split directory expected to contain ``NORMAL``
            and ``PNEUMONIA`` sub-directories.
        extensions: File-name suffixes, compared case-insensitively, that
            identify an image file.

    Returns:
        dict mapping ``'NORMAL'`` and ``'PNEUMONIA'`` to the number of image
        files found. A category whose sub-directory does not exist (or is
        not a directory) is reported as 0.
    """
    suffixes = tuple(ext.lower() for ext in extensions)
    counts = {}
    for category in ['NORMAL', 'PNEUMONIA']:
        category_path = os.path.join(directory, category)
        # isdir (rather than exists) so a stray file with the category's
        # name is treated as "no images" instead of crashing listdir.
        if not os.path.isdir(category_path):
            counts[category] = 0
            continue
        counts[category] = sum(
            1
            for name in os.listdir(category_path)
            if name.lower().endswith(suffixes)
            and os.path.isfile(os.path.join(category_path, name))
        )
    return counts

# Tally every split, then report the per-class totals.
train_counts, val_counts, test_counts = (
    count_images(split_dir) for split_dir in (train_dir, val_dir, test_dir)
)

print("Dataset Distribution:")
for split_label, split_counts in (
    ("Training", train_counts),
    ("Validation", val_counts),
    ("Testing", test_counts),
):
    print(f"{split_label}: {split_counts}")

In [None]:
# Visualize dataset distribution: one bar chart per split, one bar per class.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

datasets = [('Training', train_counts), ('Validation', val_counts), ('Testing', test_counts)]

for ax, (name, counts) in zip(axes, datasets):
    categories = list(counts.keys())
    values = list(counts.values())

    bars = ax.bar(categories, values, color=['skyblue', 'lightcoral'])
    ax.set_title(f'{name} Set Distribution')
    ax.set_ylabel('Number of Images')

    # Label each bar with its count. The padding is in points, not data
    # units, so the label sits just above the bar whether the split holds a
    # handful of images or several thousand.
    ax.bar_label(bars, labels=[str(v) for v in values], padding=3)
    # Leave head-room so the labels stay inside the axes.
    ax.margins(y=0.1)

plt.tight_layout()
plt.show()

## Sample Images Visualization

In [None]:
def display_sample_images(data_dir, num_samples=4):
    """Display sample images from each category.

    Draws a grid with one row per category (NORMAL, then PNEUMONIA) and
    ``num_samples`` columns. Images are taken in sorted file-name order so
    the figure is reproducible across runs and machines.

    Args:
        data_dir: Split directory containing ``NORMAL`` and ``PNEUMONIA``
            sub-directories.
        num_samples: Number of images to show per category.

    Raises:
        FileNotFoundError: If a category sub-directory does not exist
            (raised by ``os.listdir``).
    """
    categories = ['NORMAL', 'PNEUMONIA']
    image_exts = ('.jpeg', '.jpg', '.png', '.bmp')

    # squeeze=False keeps `axes` two-dimensional even when num_samples == 1,
    # so axes[row, col] indexing is always valid.
    fig, axes = plt.subplots(
        len(categories), num_samples, figsize=(16, 8), squeeze=False
    )

    # Hide every frame up front so cells left without an image stay blank.
    for ax in axes.ravel():
        ax.axis('off')

    for row, category in enumerate(categories):
        category_path = os.path.join(data_dir, category)
        # Skip non-image entries (e.g. .DS_Store) before taking the sample.
        image_files = sorted(
            name
            for name in os.listdir(category_path)
            if name.lower().endswith(image_exts)
        )[:num_samples]

        for col, image_file in enumerate(image_files):
            img_path = os.path.join(category_path, image_file)
            img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
            ax = axes[row, col]

            # cv2.imread signals a failed read by returning None.
            if img is None:
                ax.set_title(f'{category}\n{image_file}\n(unreadable)')
                continue

            ax.imshow(img, cmap='gray')
            ax.set_title(f'{category}\n{image_file}')

    plt.suptitle('Sample Chest X-Ray Images', fontsize=16)
    plt.tight_layout()
    plt.show()

display_sample_images(train_dir)

## Image Properties Analysis

In [None]:
def analyze_image_properties(data_dir, sample_size=100):
    """Analyze image dimensions and properties.

    For each category (NORMAL, PNEUMONIA), reads up to ``sample_size``
    decodable images in sorted file-name order and records their size.

    Args:
        data_dir: Split directory containing the category sub-directories.
        sample_size: Maximum number of readable images to measure per
            category.

    Returns:
        pandas.DataFrame with one row per measured image and the columns
        ``category``, ``width``, ``height`` (pixels), ``aspect_ratio``
        (width / height) and ``file_size`` (KB). The frame is empty, but
        keeps these columns, when no image could be read.
    """
    columns = ['category', 'width', 'height', 'aspect_ratio', 'file_size']
    records = []

    for category in ['NORMAL', 'PNEUMONIA']:
        category_path = os.path.join(data_dir, category)
        # A missing category contributes no rows, mirroring how
        # count_images reports it as 0.
        if not os.path.isdir(category_path):
            continue

        measured = 0
        # Sorted so the sample is the same on every run; os.listdir order
        # is arbitrary.
        for image_file in sorted(os.listdir(category_path)):
            if measured >= sample_size:
                break

            img_path = os.path.join(category_path, image_file)
            img = cv2.imread(img_path)
            # Entries that do not decode (hidden files, corrupt images) are
            # skipped without using up a sample slot.
            if img is None:
                continue

            h, w = img.shape[:2]
            records.append({
                'category': category,
                'width': w,
                'height': h,
                'aspect_ratio': w / h,
                'file_size': os.path.getsize(img_path) / 1024,  # KB
            })
            measured += 1

    return pd.DataFrame(records, columns=columns)

# Measure a sample of the training images and summarise the numbers per class.
df_props = analyze_image_properties(data_dir=train_dir)
props_summary = df_props.groupby('category').describe()

print("Image Properties Summary:")
print(props_summary)

In [None]:
# Visualize image properties: one box plot per measured property, split by class.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# (DataFrame column, panel title), listed in row-major panel order.
property_panels = [
    ('width', 'Image Width Distribution'),
    ('height', 'Image Height Distribution'),
    ('aspect_ratio', 'Aspect Ratio Distribution'),
    ('file_size', 'File Size Distribution (KB)'),
]

for ax, (column, panel_title) in zip(axes.flat, property_panels):
    sns.boxplot(data=df_props, x='category', y=column, ax=ax)
    ax.set_title(panel_title)

plt.tight_layout()
plt.show()

## Image Preprocessing Comparison

In [None]:
def show_preprocessing_steps(image_path, target_size=(224, 224)):
    """Show preprocessing steps on a sample image.

    Displays four panels side by side: the original grayscale image, the
    resized image, the result of CLAHE on the resized image, and the CLAHE
    result scaled to the [0, 1] range.

    Args:
        image_path: Path of the image file to load.
        target_size: ``(width, height)`` passed to ``cv2.resize``.

    Raises:
        ValueError: If the file cannot be read as an image.
    """
    # Read original image
    img_original = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread returns None instead of raising; fail here with a clear
    # message rather than inside cv2.resize.
    if img_original is None:
        raise ValueError(f"Could not read image: {image_path}")

    # Resize
    img_resized = cv2.resize(img_original, target_size)

    # Apply CLAHE
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    img_clahe = clahe.apply(img_resized)

    # Normalize
    img_normalized = img_clahe.astype(np.float32) / 255.0

    width, height = target_size
    # (image, title, top of its intensity scale)
    panels = [
        (img_original, 'Original', 255),
        (img_resized, f'Resized ({width}x{height})', 255),
        (img_clahe, 'CLAHE Applied', 255),
        (img_normalized, 'Normalized', 1.0),
    ]

    # Display all steps
    fig, axes = plt.subplots(1, len(panels), figsize=(16, 4))

    for ax, (img, title, max_value) in zip(axes, panels):
        # Pin the display range: by default imshow stretches each image's
        # own min/max to the full colormap, which would hide the contrast
        # differences between the steps.
        ax.imshow(img, cmap='gray', vmin=0, vmax=max_value)
        ax.set_title(title)
        ax.axis('off')

    plt.suptitle('Image Preprocessing Pipeline', fontsize=16)
    plt.tight_layout()
    plt.show()

# Show preprocessing for one normal and one pneumonia image.
# Take the first image file in sorted order: os.listdir order is arbitrary
# and the folder may hold non-image entries (e.g. .DS_Store), so indexing
# the raw listing with [0] is neither reproducible nor safe.
image_exts = ('.jpeg', '.jpg', '.png', '.bmp')
sample_paths = {}
for category in ('NORMAL', 'PNEUMONIA'):
    category_path = os.path.join(train_dir, category)
    image_names = sorted(
        name for name in os.listdir(category_path)
        if name.lower().endswith(image_exts)
    )
    if not image_names:
        raise FileNotFoundError(f"No image files found in {category_path}")
    sample_paths[category] = os.path.join(category_path, image_names[0])

normal_img = sample_paths['NORMAL']
pneumonia_img = sample_paths['PNEUMONIA']

print("Normal X-Ray Preprocessing:")
show_preprocessing_steps(normal_img)

print("\nPneumonia X-Ray Preprocessing:")
show_preprocessing_steps(pneumonia_img)

## Pixel Intensity Analysis

In [None]:
def analyze_pixel_intensities(data_dir, num_samples=20):
    """Analyze pixel intensity distributions.

    For each category (NORMAL, PNEUMONIA), loads up to ``num_samples``
    readable images in sorted file-name order, resizes each to 224x224 and
    plots a density histogram of the pooled grayscale pixel values.

    Args:
        data_dir: Split directory containing the category sub-directories.
        num_samples: Maximum number of readable images pooled per category.

    Raises:
        FileNotFoundError: If a category sub-directory does not exist
            (raised by ``os.listdir``).
    """
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    categories = ['NORMAL', 'PNEUMONIA']
    colors = ['blue', 'red']

    for ax, category, color in zip(axes, categories, colors):
        category_path = os.path.join(data_dir, category)

        pixel_chunks = []
        # Sorted for a reproducible sample; os.listdir order is arbitrary.
        for image_file in sorted(os.listdir(category_path)):
            if len(pixel_chunks) >= num_samples:
                break

            img_path = os.path.join(category_path, image_file)
            img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
            # cv2.imread returns None for entries it cannot decode.
            if img is None:
                continue

            pixel_chunks.append(cv2.resize(img, (224, 224)).ravel())

        if pixel_chunks:
            # Keep the pixels in one NumPy array instead of a Python list of
            # about a million scalar objects.
            all_intensities = np.concatenate(pixel_chunks)
            # A fixed 0-255 range gives both panels identical bins, so the
            # two distributions can be compared directly.
            ax.hist(all_intensities, bins=50, range=(0, 255), alpha=0.7,
                    color=color, density=True)
        else:
            ax.text(0.5, 0.5, 'No readable images', ha='center', va='center',
                    transform=ax.transAxes)

        ax.set_title(f'{category} - Pixel Intensity Distribution')
        ax.set_xlabel('Pixel Intensity')
        ax.set_ylabel('Density')
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

analyze_pixel_intensities(train_dir)

## Key Insights

1. **Dataset Imbalance**: The dataset shows class imbalance with more pneumonia cases than normal cases
2. **Image Variability**: X-ray images have varying dimensions and file sizes
3. **Preprocessing Impact**: CLAHE significantly improves contrast in chest X-rays
4. **Pixel Patterns**: Normal and pneumonia X-rays show different intensity distributions

## Next Steps

1. Address class imbalance with class weighting or resampling, and implement data augmentation to improve generalization
2. Use transfer learning with pre-trained models
3. Apply proper preprocessing pipeline
4. Evaluate multiple model architectures