In [None]:
# --- Cell 1: Import Libraries and Set Up Environment ---
"""
# Handwritten Character Recognition: Data Analysis Notebook

This notebook focuses on data analysis, preprocessing, and visualization for handwritten character recognition.
It explores dataset characteristics, demonstrates transformations, and showcases data augmentation techniques.
"""

import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from collections import Counter
import cv2
from PIL import Image, ImageOps, ImageEnhance, ImageFilter

# PyTorch imports
import torch
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as transforms
from torchvision import datasets

# Import utility modules
from data_utils_file import HandwritingDataPipeline, get_class_labels_from_directory
from data_utils_file import RandomChoice, ThicknessTransform, create_custom_transforms

# Seed every random number generator used in this notebook (random, NumPy, PyTorch)
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Folders that receive the saved figures; exist_ok makes re-running this cell harmless
for _output_dir in (
    "data_analysis",
    "data_analysis/transformations",
    "data_analysis/augmentations",
    "data_analysis/statistics",
):
    os.makedirs(_output_dir, exist_ok=True)

# Plot appearance: matplotlib style sheet first, then the seaborn theme and palette
plt.style.use('seaborn-v0_8-whitegrid')
sns.set(style="whitegrid")
sns.set_palette("muted")



In [None]:
# --- Cell 2: Data Loading and Exploration ---
"""
## Data Loading and Exploration

In this section, we'll load the handwritten character dataset and explore its basic properties:
- Dataset structure and size
- Class distribution
- Sample images from each class
"""

def load_and_explore_dataset(data_root):
    """
    Open an image-folder dataset and summarise how its samples are spread over classes.

    Progress and the resulting summary are printed. Failures are reported on
    stdout (with a traceback) rather than raised.

    Args:
        data_root: Path to the dataset root directory

    Returns:
        tuple: (dataset, class_names, stats) on success. stats is a dict with the keys
        'total_samples', 'num_classes', 'class_counts', 'min_samples_per_class',
        'max_samples_per_class' and 'avg_samples_per_class'.
        (None, None, None) when data_root does not exist or any step fails.
    """
    failure = (None, None, None)

    print(f"Loading dataset from: {data_root}")

    if not os.path.exists(data_root):
        print(f"ERROR: Data root directory '{data_root}' not found")
        return failure

    try:
        # Class labels as reported by the project helper; only the first ten are echoed
        class_names = get_class_labels_from_directory(data_root)
        preview = ', '.join(class_names[:10])
        print(f"Found {len(class_names)} classes: {preview}...")

        # No transform is passed to ImageFolder here
        dataset = datasets.ImageFolder(root=data_root)
        print(f"Dataset loaded with {len(dataset)} samples")

        # Tally samples per class name from the dataset's (path, label) index
        class_counts = Counter(dataset.classes[label] for _, label in dataset.samples)
        per_class = list(class_counts.values())

        # Note: 'num_classes' comes from the helper's label list, not from dataset.classes
        stats = {
            'total_samples': len(dataset),
            'num_classes': len(class_names),
            'class_counts': dict(class_counts),
            'min_samples_per_class': min(per_class),
            'max_samples_per_class': max(per_class),
            'avg_samples_per_class': sum(per_class) / len(class_counts)
        }

        print("Dataset statistics:")
        print(f"  Total samples: {stats['total_samples']}")
        print(f"  Number of classes: {stats['num_classes']}")
        print(f"  Min samples per class: {stats['min_samples_per_class']}")
        print(f"  Max samples per class: {stats['max_samples_per_class']}")
        print(f"  Avg samples per class: {stats['avg_samples_per_class']:.2f}")

        return dataset, class_names, stats

    except Exception as e:
        # Best-effort loader: report the problem and hand back the failure triple
        print(f"Error exploring dataset: {e}")
        import traceback
        traceback.print_exc()
        return failure

def visualize_class_distribution(stats, save_path=None):
    """
    Draw a bar chart of samples per class, largest classes first.

    With more than 30 classes only the 15 largest and the 15 smallest are drawn,
    separated by a dashed red marker. Every bar is annotated with its count.

    Args:
        stats: Mapping with a 'class_counts' entry (class name -> sample count).
        save_path: Optional file path; when given, the figure is also written there.

    Returns:
        None. Prints a notice and returns early when stats is empty or has no
        'class_counts' entry.
    """
    if not stats or 'class_counts' not in stats:
        print("No statistics available for visualization")
        return

    edge = 15  # number of classes kept from each end when the chart is truncated

    # Rank classes by descending sample count
    ranked = sorted(stats['class_counts'].items(), key=lambda pair: pair[1], reverse=True)
    truncated = len(ranked) > 2 * edge
    shown = dict(ranked[:edge] + ranked[-edge:]) if truncated else dict(ranked)

    plt.figure(figsize=(12, 6))

    bars = plt.bar(range(len(shown)), shown.values())
    plt.xticks(range(len(shown)), shown.keys(), rotation=90)

    if truncated:
        # Marker sits between the last "top" bar and the first "bottom" bar
        divider = edge - 0.5
        plt.axvline(x=divider, color='red', linestyle='--')
        plt.text(divider, max(shown.values())/2, 'Middle classes omitted',
                 rotation=90, verticalalignment='center', horizontalalignment='center')

    # Count label slightly above each bar (fixed offset of 5 data units)
    for bar in bars:
        top = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., top + 5,
                 f'{top}', ha='center', va='bottom', fontsize=8)

    plt.xlabel('Class')
    plt.ylabel('Number of Samples')
    plt.title('Class Distribution in Dataset')
    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"Class distribution saved to {save_path}")

    plt.show()

def visualize_sample_images(dataset, num_classes=10, samples_per_class=5, figsize=(15, 10), save_path=None):
    """
    Show a grid of randomly chosen example images, one row per class.

    Args:
        dataset: Object exposing `samples` (iterable of (path, label) pairs),
            `classes` (label index -> class name) and integer indexing that
            returns (image, label), as torchvision's ImageFolder does.
        num_classes: Maximum number of classes (rows) to display; the first
            classes encountered in `dataset.samples` are used.
        samples_per_class: Number of columns; classes with fewer images simply
            leave the remaining cells blank.
        figsize: Figure size passed to matplotlib.
        save_path: Optional file path; when given, the figure is also written there.

    Returns:
        None. Prints a notice and returns early when there is nothing to draw.
    """
    if dataset is None:
        print("No dataset available for visualization")
        return

    # Group sample indices by class name, preserving first-seen class order
    samples_by_class = {}
    for idx, (_, label) in enumerate(dataset.samples):
        samples_by_class.setdefault(dataset.classes[label], []).append(idx)

    # Limit number of classes to display
    selected_classes = list(samples_by_class)[:num_classes]

    # plt.subplots rejects a zero-sized grid, so bail out before building one
    if not selected_classes or samples_per_class < 1:
        print("No samples available for visualization")
        return

    # squeeze=False keeps `axes` two-dimensional even for a single row or column,
    # so axes[row, col] indexing below is always valid
    fig, axes = plt.subplots(len(selected_classes), samples_per_class,
                             figsize=figsize, squeeze=False)

    # Switch every cell off up front so cells without an image show no empty frame
    for ax in axes.flat:
        ax.axis('off')

    for row, class_name in enumerate(selected_classes):
        # Random draw without replacement, capped at what the class actually has
        pool = samples_by_class[class_name]
        chosen = random.sample(pool, min(samples_per_class, len(pool)))

        for col, idx in enumerate(chosen):
            img, _ = dataset[idx]
            axes[row, col].imshow(img)

            # Add class label to the first image in each row
            if col == 0:
                axes[row, col].set_title(f"Class: {class_name}")

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"Sample images saved to {save_path}")

    plt.show()

# Example usage (commented out)
"""
# Specify your dataset path
DATA_ROOT = "./datasets/handwritten-english/augmented_images1"

# Load and explore the dataset
dataset, class_names, stats = load_and_explore_dataset(DATA_ROOT)

# Visualize class distribution
visualize_class_distribution(stats, save_path="data_analysis/statistics/class_distribution.png")

# Visualize sample images
visualize_sample_images(dataset, num_classes=10, samples_per_class=5, 
                     save_path="data_analysis/statistics/sample_images.png")
"""



In [None]:
# --- Cell 3: Image Property Analysis ---
"""
## Image Property Analysis

Analyze the properties of the images in the dataset:
- Image sizes and aspect ratios
- Pixel intensity distributions
- Stroke characteristics
"""

def analyze_image_properties(dataset, sample_size=100, save_dir="data_analysis/statistics"):
    """
    Measure size and grayscale-intensity statistics on a random subset of images.

    Prints a summary, shows two figures and saves them as
    `size_distributions.png` and `intensity_distributions.png` inside save_dir.

    Args:
        dataset: Indexable collection whose items are (image, label) pairs.
            PIL images are converted to 8-bit grayscale; anything else is used
            as-is (assumed to be a height-first array with values in 0..255 —
            TODO confirm against callers).
        sample_size: Upper bound on the number of images inspected.
        save_dir: Existing directory that receives the two PNG files.

    Returns:
        dict with 'width', 'height', 'aspect_ratio', 'mean_intensity' and
        'std_intensity' entries (each a dict of mean/std/min/max/values) plus
        'avg_histogram' (256-bin histogram averaged over the inspected images),
        or None when the dataset is missing or empty.
    """
    if dataset is None or len(dataset) == 0:
        print("No dataset available for analysis")
        return None

    def summarize(values):
        """Bundle mean/std/min/max of a measurement list together with the raw list."""
        return {
            'mean': np.mean(values),
            'std': np.std(values),
            'min': np.min(values),
            'max': np.max(values),
            'values': values
        }

    def hist_panel(slot, values, color, xlabel, title):
        """Draw one 20-bin histogram in a 1x3 layout with a dashed red line at the mean."""
        plt.subplot(1, 3, slot)
        plt.hist(values, bins=20, alpha=0.7, color=color)
        plt.axvline(x=np.mean(values), color='red', linestyle='--')
        plt.xlabel(xlabel)
        plt.ylabel('Frequency')
        plt.title(title)

    # Random subset, drawn without replacement
    picked = random.sample(range(len(dataset)), min(sample_size, len(dataset)))

    widths, heights, aspect_ratios = [], [], []
    mean_intensities, std_intensities, histograms = [], [], []

    for idx in picked:
        img, _ = dataset[idx]

        # PIL images become 2-D grayscale arrays; other inputs pass through untouched
        pixels = np.array(img.convert('L')) if isinstance(img, Image.Image) else img

        n_rows, n_cols = pixels.shape[:2]
        widths.append(n_cols)
        heights.append(n_rows)
        aspect_ratios.append(n_cols/n_rows)

        mean_intensities.append(np.mean(pixels))
        std_intensities.append(np.std(pixels))

        # One bin per integer intensity level
        counts, _ = np.histogram(pixels.flatten(), bins=256, range=[0, 256])
        histograms.append(counts)

    # Bin-wise mean across all inspected images
    avg_histogram = np.mean(histograms, axis=0)

    results = {
        'width': summarize(widths),
        'height': summarize(heights),
        'aspect_ratio': summarize(aspect_ratios),
        'mean_intensity': summarize(mean_intensities),
        'std_intensity': summarize(std_intensities),
        'avg_histogram': avg_histogram
    }

    # Text summary
    print("Image Property Analysis:")
    print(f"  Width (pixels): mean={results['width']['mean']:.2f}, std={results['width']['std']:.2f}, "
          f"min={results['width']['min']}, max={results['width']['max']}")
    print(f"  Height (pixels): mean={results['height']['mean']:.2f}, std={results['height']['std']:.2f}, "
          f"min={results['height']['min']}, max={results['height']['max']}")
    print(f"  Aspect Ratio (w/h): mean={results['aspect_ratio']['mean']:.2f}, std={results['aspect_ratio']['std']:.2f}, "
          f"min={results['aspect_ratio']['min']:.2f}, max={results['aspect_ratio']['max']:.2f}")
    print(f"  Mean Intensity: mean={results['mean_intensity']['mean']:.2f}, std={results['mean_intensity']['std']:.2f}")
    print(f"  Std Intensity: mean={results['std_intensity']['mean']:.2f}, std={results['std_intensity']['std']:.2f}")

    # Figure 1: geometry
    plt.figure(figsize=(15, 5))
    hist_panel(1, widths, 'blue', 'Width (pixels)',
               f'Width Distribution (mean={np.mean(widths):.1f})')
    hist_panel(2, heights, 'green', 'Height (pixels)',
               f'Height Distribution (mean={np.mean(heights):.1f})')
    hist_panel(3, aspect_ratios, 'purple', 'Aspect Ratio (width/height)',
               f'Aspect Ratio Distribution (mean={np.mean(aspect_ratios):.2f})')
    plt.tight_layout()
    plt.savefig(f"{save_dir}/size_distributions.png", dpi=300, bbox_inches='tight')
    plt.show()

    # Figure 2: intensities
    plt.figure(figsize=(15, 5))
    hist_panel(1, mean_intensities, 'orange', 'Mean Intensity',
               f'Mean Intensity Distribution (mean={np.mean(mean_intensities):.1f})')
    hist_panel(2, std_intensities, 'cyan', 'Std Intensity',
               f'Std Intensity Distribution (mean={np.mean(std_intensities):.1f})')

    plt.subplot(1, 3, 3)
    plt.bar(range(256), avg_histogram, alpha=0.7, color='teal')
    plt.xlabel('Pixel Intensity')
    plt.ylabel('Average Frequency')
    plt.title('Average Intensity Histogram')

    plt.tight_layout()
    plt.savefig(f"{save_dir}/intensity_distributions.png", dpi=300, bbox_inches='tight')
    plt.show()

    return results

def extract_stroke_features(dataset, sample_size=20, save_dir="data_analysis/statistics"):
    """
    Extract and analyze stroke features in handwritten characters.

    For a random subset of images the function binarizes each image at a fixed
    threshold of 128, treats dark pixels as ink, and records:
      * max stroke thickness - twice the maximum of the L2 distance transform
      * stroke continuity    - number of external contours of the ink mask
      * character density    - fraction of pixels that are ink

    A summary is printed, and two figures are shown and saved inside save_dir
    (`stroke_feature_distributions.png`, `stroke_feature_samples.png`).

    Args:
        dataset: Indexable collection of (image, label) pairs that also exposes
            `classes` (label index -> class name). PIL images are converted to
            grayscale; other inputs are used as-is (assumed to be 8-bit
            single-channel arrays accepted by OpenCV — TODO confirm).
        sample_size: Upper bound on the number of images analysed.
        save_dir: Existing directory that receives the PNG files.

    Returns:
        dict with 'stroke_thickness', 'stroke_continuity' and 'character_density'
        entries (each a dict of mean/std/min/max), or None when the dataset is
        missing or empty.
    """
    if dataset is None or len(dataset) == 0:
        print("No dataset available for analysis")
        return None

    # Number of samples shown in the per-image figure (also its row count)
    max_vis = 5

    # Randomly sample images for analysis
    sample_indices = random.sample(range(len(dataset)), min(sample_size, len(dataset)))

    # Features to extract
    stroke_thickness = []
    stroke_continuity = []  # Measured by number of contours
    character_density = []  # Ratio of foreground to total pixels

    # Visualization samples
    vis_samples = []

    for i, idx in enumerate(sample_indices):
        img, label = dataset[idx]
        class_name = dataset.classes[label]

        # Convert PIL image to numpy array
        if isinstance(img, Image.Image):
            img_np = np.array(img.convert('L'))
        else:
            img_np = img

        # Ensure binary image (black text on white background)
        _, binary = cv2.threshold(img_np, 128, 255, cv2.THRESH_BINARY)

        # Invert for processing (white text on black background)
        binary_inv = cv2.bitwise_not(binary)

        # Find contours
        contours, _ = cv2.findContours(binary_inv, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        # Measure stroke thickness using distance transform
        dist_transform = cv2.distanceTransform(binary_inv, cv2.DIST_L2, 5)
        max_thickness = np.max(dist_transform) * 2  # Diameter is twice the radius

        # Calculate character density (ratio of foreground to total pixels)
        density = np.sum(binary_inv > 0) / (binary_inv.shape[0] * binary_inv.shape[1])

        # Store features
        stroke_thickness.append(max_thickness)
        stroke_continuity.append(len(contours))
        character_density.append(density)

        # Keep the first few samples for the per-image figure
        if i < max_vis:
            # Create visualization of thickness
            thickness_vis = cv2.normalize(dist_transform, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
            thickness_vis = cv2.applyColorMap(thickness_vis, cv2.COLORMAP_JET)
            # applyColorMap returns BGR while matplotlib's imshow expects RGB;
            # without this conversion thick regions would be drawn blue instead of red
            thickness_vis = cv2.cvtColor(thickness_vis, cv2.COLOR_BGR2RGB)

            # Original binary image for comparison
            binary_rgb = cv2.cvtColor(binary, cv2.COLOR_GRAY2RGB)

            # Draw contours on a copy of the original
            contour_img = binary_rgb.copy()
            cv2.drawContours(contour_img, contours, -1, (0, 255, 0), 1)

            vis_samples.append({
                'class': class_name,
                'original': binary_rgb,
                'thickness': thickness_vis,
                'contours': contour_img,
                'features': {
                    'max_thickness': max_thickness,
                    'num_contours': len(contours),
                    'density': density
                }
            })

    # Compute statistics
    results = {
        'stroke_thickness': {
            'mean': np.mean(stroke_thickness),
            'std': np.std(stroke_thickness),
            'min': np.min(stroke_thickness),
            'max': np.max(stroke_thickness)
        },
        'stroke_continuity': {
            'mean': np.mean(stroke_continuity),
            'std': np.std(stroke_continuity),
            'min': np.min(stroke_continuity),
            'max': np.max(stroke_continuity)
        },
        'character_density': {
            'mean': np.mean(character_density),
            'std': np.std(character_density),
            'min': np.min(character_density),
            'max': np.max(character_density)
        }
    }

    # Print summary
    print("Stroke Feature Analysis:")
    print(f"  Max Stroke Thickness: mean={results['stroke_thickness']['mean']:.2f} pixels, "
          f"std={results['stroke_thickness']['std']:.2f}")
    print(f"  Number of Contours: mean={results['stroke_continuity']['mean']:.2f}, "
          f"std={results['stroke_continuity']['std']:.2f}")
    print(f"  Character Density: mean={results['character_density']['mean']:.2f}, "
          f"std={results['character_density']['std']:.2f}")

    # Create visualizations
    # 1. Feature distributions
    plt.figure(figsize=(15, 5))

    plt.subplot(1, 3, 1)
    plt.hist(stroke_thickness, bins=20, alpha=0.7, color='red')
    plt.axvline(x=np.mean(stroke_thickness), color='blue', linestyle='--')
    plt.xlabel('Max Stroke Thickness (pixels)')
    plt.ylabel('Frequency')
    plt.title('Stroke Thickness Distribution')

    plt.subplot(1, 3, 2)
    # At least 10 bins, otherwise one bin per possible contour count
    plt.hist(stroke_continuity, bins=max(10, max(stroke_continuity)), alpha=0.7, color='green')
    plt.axvline(x=np.mean(stroke_continuity), color='blue', linestyle='--')
    plt.xlabel('Number of Contours')
    plt.ylabel('Frequency')
    plt.title('Stroke Continuity Distribution')

    plt.subplot(1, 3, 3)
    plt.hist(character_density, bins=20, alpha=0.7, color='purple')
    plt.axvline(x=np.mean(character_density), color='blue', linestyle='--')
    plt.xlabel('Character Density')
    plt.ylabel('Frequency')
    plt.title('Character Density Distribution')

    plt.tight_layout()
    plt.savefig(f"{save_dir}/stroke_feature_distributions.png", dpi=300, bbox_inches='tight')
    plt.show()

    # 2. Visualization samples: one row per sample (binary image, thickness map, contours)
    if vis_samples:
        plt.figure(figsize=(15, 10))

        for i, sample in enumerate(vis_samples):
            # Original
            plt.subplot(max_vis, 3, i*3 + 1)
            plt.imshow(sample['original'])
            plt.title(f"Class: {sample['class']}")
            plt.axis('off')

            # Thickness map
            plt.subplot(max_vis, 3, i*3 + 2)
            plt.imshow(sample['thickness'])
            plt.title(f"Thickness Map\nMax: {sample['features']['max_thickness']:.1f}px")
            plt.axis('off')

            # Contours
            plt.subplot(max_vis, 3, i*3 + 3)
            plt.imshow(sample['contours'])
            plt.title(f"Contours: {sample['features']['num_contours']}\nDensity: {sample['features']['density']:.2f}")
            plt.axis('off')

        plt.tight_layout()
        plt.savefig(f"{save_dir}/stroke_feature_samples.png", dpi=300, bbox_inches='tight')
        plt.show()

    return results

# Example usage (commented out)
"""
# Analyze image properties
image_properties = analyze_image_properties(dataset, sample_size=100)

# Extract and analyze stroke features
stroke_features = extract_stroke_features(dataset, sample_size=20)
"""



In [None]:
# --- Cell 4: Basic Image Transformations ---
"""
## Basic Image Transformations

Explore the effect of basic image transformations on handwritten characters:
- Resizing and scaling
- Rotation and affine transformations
- Thresholding and binarization
- Morphological operations
"""

def apply_basic_transformations(img, save_dir="data_analysis/transformations"):
    """
    Produce a gallery of simple PIL and OpenCV transformations of one image.

    The image is converted to grayscale, every transformation is applied to that
    grayscale version, and the results are shown in a 4-column grid which is also
    saved as `basic_transformations.png` inside save_dir.

    Args:
        img: PIL image, or an array accepted by Image.fromarray.
        save_dir: Existing directory that receives the PNG file.

    Returns:
        dict mapping a display label to the transformed PIL image, in display
        order; None when img is None.
    """
    if img is None:
        print("No image provided for transformations")
        return

    if not isinstance(img, Image.Image):
        img = Image.fromarray(img)
    gray = img.convert('L')

    brightness = ImageEnhance.Brightness(gray)
    contrast = ImageEnhance.Contrast(gray)

    # PIL-based variants; rotations pad the exposed corners with white (255)
    transformations = {
        'Original': gray,
        'Resize (32x32)': gray.resize((32, 32), Image.LANCZOS),
        'Resize (128x128)': gray.resize((128, 128), Image.LANCZOS),
        'Rotate (15°)': gray.rotate(15, fillcolor=255),
        'Rotate (45°)': gray.rotate(45, fillcolor=255),
        'Invert': ImageOps.invert(gray),
        'Flip Horizontal': ImageOps.mirror(gray),
        'Flip Vertical': ImageOps.flip(gray),
        'Brighten': brightness.enhance(1.5),
        'Darken': brightness.enhance(0.5),
        'Contrast+': contrast.enhance(2.0),
        'Contrast-': contrast.enhance(0.5),
        'Blur': gray.filter(ImageFilter.GaussianBlur(radius=1)),
        'Sharpen': gray.filter(ImageFilter.SHARPEN),
        'Edge Enhance': gray.filter(ImageFilter.EDGE_ENHANCE),
        'Find Edges': gray.filter(ImageFilter.FIND_EDGES),
    }

    # OpenCV-based variants operate on the raw pixel array
    pixels = np.array(gray)
    structuring = np.ones((3, 3), np.uint8)  # 3x3 structuring element for morphology

    _, otsu = cv2.threshold(pixels, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    adaptive = cv2.adaptiveThreshold(pixels, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                     cv2.THRESH_BINARY, 11, 2)

    opencv_results = (
        ('Binary (Otsu)', otsu),
        ('Binary (Adaptive)', adaptive),
        ('Erosion', cv2.erode(pixels, structuring, iterations=1)),
        ('Dilation', cv2.dilate(pixels, structuring, iterations=1)),
        ('Opening', cv2.morphologyEx(pixels, cv2.MORPH_OPEN, structuring)),
        ('Closing', cv2.morphologyEx(pixels, cv2.MORPH_CLOSE, structuring)),
    )
    for label, array in opencv_results:
        transformations[label] = Image.fromarray(array)

    # Lay everything out on a grid with just enough rows
    n_cols = 4
    n_rows, leftover = divmod(len(transformations), n_cols)
    if leftover:
        n_rows += 1

    _fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 3 * n_rows))
    panels = axes.flatten()

    for panel, (label, picture) in zip(panels, transformations.items()):
        panel.imshow(picture, cmap='gray')
        panel.set_title(label)
        panel.axis('off')

    # Blank out the cells that did not receive an image
    for panel in panels[len(transformations):]:
        panel.axis('off')

    plt.tight_layout()
    plt.savefig(f"{save_dir}/basic_transformations.png", dpi=300, bbox_inches='tight')
    plt.show()

    return transformations

def apply_affine_transformations(img, save_dir="data_analysis/transformations"):
    """
    Produce a gallery of geometric warps (translation, rotation, scaling, shear,
    perspective) of one image using OpenCV.

    The image is converted to grayscale first. Every warp keeps the original
    canvas size and fills exposed areas with white (255). The gallery is shown
    in a 4-column grid and saved as `affine_transformations.png` inside save_dir.

    Args:
        img: PIL image, or an array accepted by Image.fromarray.
        save_dir: Existing directory that receives the PNG file.

    Returns:
        dict mapping a display label to the warped image as a NumPy array, in
        display order; None when img is None.
    """
    if img is None:
        print("No image provided for transformations")
        return

    if not isinstance(img, Image.Image):
        img = Image.fromarray(img)
    pixels = np.array(img.convert('L'))

    h, w = pixels.shape
    canvas = (w, h)
    center = (w//2, h//2)

    def warp(matrix):
        """Apply a 2x3 affine matrix on the original canvas with a white border."""
        return cv2.warpAffine(pixels, matrix, canvas, borderValue=255)

    def rotate(angle):
        """Rotate about the image centre by `angle` degrees without scaling."""
        return warp(cv2.getRotationMatrix2D(center, angle, 1.0))

    def rescale(factor):
        """Scale about the image centre without rotating."""
        return warp(cv2.getRotationMatrix2D(center, 0, factor))

    # Shears are defined by where three reference corners end up
    corners = np.float32([[0, 0], [w, 0], [0, h]])
    # 'Shear X': the top-right corner (w, 0) is moved down to (w, h//4)
    shear_x_target = np.float32([[0, 0], [w, h//4], [0, h]])
    # 'Shear Y': the top-left corner (0, 0) is moved right to (w//4, 0)
    shear_y_target = np.float32([[w//4, 0], [w, 0], [0, h]])

    # Perspective: the bottom edge is squeezed to the middle half of the width
    quad_src = np.float32([[0, 0], [w, 0], [0, h], [w, h]])
    quad_dst = np.float32([[0, 0], [w, 0], [w//4, h], [3*w//4, h]])
    homography = cv2.getPerspectiveTransform(quad_src, quad_dst)

    transformations = {
        'Original': pixels,
        'Translate Right': warp(np.float32([[1, 0, w//4], [0, 1, 0]])),
        'Translate Down': warp(np.float32([[1, 0, 0], [0, 1, h//4]])),
        'Rotate 15°': rotate(15),
        'Rotate 30°': rotate(30),
        'Rotate 45°': rotate(45),
        'Scale Up (1.5x)': rescale(1.5),
        'Scale Down (0.7x)': rescale(0.7),
        'Shear X': warp(cv2.getAffineTransform(corners, shear_x_target)),
        'Shear Y': warp(cv2.getAffineTransform(corners, shear_y_target)),
        'Perspective': cv2.warpPerspective(pixels, homography, canvas, borderValue=255)
    }

    # Lay everything out on a grid with just enough rows
    n_cols = 4
    n_rows, leftover = divmod(len(transformations), n_cols)
    if leftover:
        n_rows += 1

    _fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 3 * n_rows))
    panels = axes.flatten()

    for panel, (label, picture) in zip(panels, transformations.items()):
        panel.imshow(picture, cmap='gray')
        panel.set_title(label)
        panel.axis('off')

    # Blank out the cells that did not receive an image
    for panel in panels[len(transformations):]:
        panel.axis('off')

    plt.tight_layout()
    plt.savefig(f"{save_dir}/affine_transformations.png", dpi=300, bbox_inches='tight')
    plt.show()

    return transformations

def visualize_transformations_on_samples(dataset, num_samples=3, save_dir="data_analysis/transformations"):
    """
    Apply and visualize transformations on multiple samples from the dataset.

    For each randomly chosen sample, the basic and the affine transformation
    galleries are rendered and saved into a per-sample sub-directory named
    `sample_<n>_class_<class name>` under save_dir.

    Args:
        dataset: Indexable collection of (image, label) pairs that also exposes
            `classes` (label index -> class name).
        num_samples: Upper bound on the number of samples to process.
        save_dir: Parent directory for the per-sample output folders.

    Returns:
        None. Prints a notice and returns early when the dataset is missing or empty.
    """
    if dataset is None or len(dataset) == 0:
        print("No dataset available for transformation visualization")
        return

    # Randomly select samples
    sample_indices = random.sample(range(len(dataset)), min(num_samples, len(dataset)))

    for i, idx in enumerate(sample_indices):
        img, label = dataset[idx]
        class_name = dataset.classes[label]

        print(f"\nSample {i+1}: Class '{class_name}'")

        # The per-sample folder must exist before the callees call plt.savefig,
        # which does not create missing directories
        sample_dir = f"{save_dir}/sample_{i+1}_class_{class_name}"
        os.makedirs(sample_dir, exist_ok=True)

        # Apply basic transformations
        print("Applying basic transformations...")
        apply_basic_transformations(img, save_dir=sample_dir)

        # Apply affine transformations
        print("Applying affine transformations...")
        apply_affine_transformations(img, save_dir=sample_dir)

# Example usage (commented out)
"""
# Visualize transformations on sample images
visualize_transformations_on_samples(dataset, num_samples=3)
"""



In [None]:
# --- Cell 5: Data Augmentation Experiments ---
"""
## Data Augmentation Experiments

Explore various data augmentation techniques for handwritten character recognition:
- Comparing different augmentation strategies
- Visualizing augmented samples
- Creating custom augmentation pipelines
"""

def visualize_augmentation_techniques(img, save_dir="data_analysis/augmentations"):
    """
    Show what individual augmentations and the preset pipelines do to one image.

    Two figures are shown and saved inside save_dir: `basic_augmentations.png`
    (one result per single augmentation) and `pipeline_augmentations.png`
    (three independent draws from each of the 'light', 'medium' and 'heavy'
    pipelines built by create_custom_transforms).

    Args:
        img: PIL image, or an array accepted by Image.fromarray. It is converted
            to grayscale before any augmentation is applied.
        save_dir: Existing directory that receives the PNG files.

    Returns:
        tuple (basic_results, pipeline_results): a dict label -> augmented image
        and a dict pipeline label -> list of three PIL images; None when img is None.
    """
    if img is None:
        print("No image provided for augmentation")
        return

    if not isinstance(img, Image.Image):
        img = Image.fromarray(img)
    gray = img.convert('L')

    def pil_roundtrip(*ops):
        """Chain `ops`, then pass the result through ToTensor and back to a PIL image."""
        return transforms.Compose([*ops, transforms.ToTensor(), transforms.ToPILImage()])

    # Single augmentations; geometric ones fill exposed areas with white (255)
    basic_transforms = {
        'Original': pil_roundtrip(),
        'RandomRotation(±30°)': pil_roundtrip(
            transforms.RandomRotation(30, fill=255)),
        'RandomAffine': pil_roundtrip(
            transforms.RandomAffine(degrees=10, translate=(0.1, 0.1), scale=(0.9, 1.1), shear=10, fill=255)),
        'ColorJitter': pil_roundtrip(
            transforms.ColorJitter(brightness=0.3, contrast=0.3)),
        'GaussianBlur': pil_roundtrip(
            transforms.GaussianBlur(kernel_size=3, sigma=(0.1, 1.0))),
        'RandomPerspective': pil_roundtrip(
            transforms.RandomPerspective(distortion_scale=0.3, p=1.0, fill=255)),
        # Project transform, applied directly without the tensor round trip
        'Thickness': ThicknessTransform(kernel_range=(1, 3), p=1.0)
    }

    # Preset pipelines from the project utilities
    custom_pipelines = {
        f'{level.capitalize()} Augmentation': create_custom_transforms(level)
        for level in ('light', 'medium', 'heavy')
    }

    to_pil = transforms.ToPILImage()

    basic_results = {label: op(gray) for label, op in basic_transforms.items()}

    # Three draws per pipeline to show the variation; outputs are clamped to
    # [0, 1] before conversion (presumably the pipelines return tensors that may
    # fall outside that range — confirm against create_custom_transforms)
    pipeline_results = {}
    for label, pipeline in custom_pipelines.items():
        pipeline_results[label] = [
            transforms.ToPILImage()(torch.clamp(pipeline(gray), 0, 1))
            for _ in range(3)
        ]

    # Figure 1: single augmentations on a 3x3 grid
    plt.figure(figsize=(15, 10))
    for slot, (label, picture) in enumerate(basic_results.items(), start=1):
        plt.subplot(3, 3, slot)
        plt.imshow(picture, cmap='gray')
        plt.title(label)
        plt.axis('off')

    plt.tight_layout()
    plt.savefig(f"{save_dir}/basic_augmentations.png", dpi=300, bbox_inches='tight')
    plt.show()

    # Figure 2: one row per pipeline, one column per draw
    plt.figure(figsize=(15, 10))
    for row, (name, draws) in enumerate(pipeline_results.items()):
        for i, picture in enumerate(draws):
            plt.subplot(3, 3, row*3 + i + 1)
            plt.imshow(picture, cmap='gray')
            plt.title(f"{name} (Sample {i+1})")
            plt.axis('off')

    plt.tight_layout()
    plt.savefig(f"{save_dir}/pipeline_augmentations.png", dpi=300, bbox_inches='tight')
    plt.show()

    return basic_results, pipeline_results

def create_augmentation_grid(img, n_samples=10, transform=None, save_path=None):
    """Show an image next to repeated applications of a single transform.

    ``transform`` is applied ``n_samples`` times to the same input. The
    original image and every successfully augmented result are drawn in a
    near-square grid: the original first, then "Aug 1", "Aug 2", ...

    Args:
        img: PIL image, or an array accepted by ``Image.fromarray``.
        n_samples: Number of times ``transform`` is applied.
        transform: Callable applied to the PIL image. A ``torch.Tensor``
            result is clamped to [0, 1] and converted back to a PIL image.
        save_path: Optional path of the image file to write. Its parent
            directory is created when missing.

    Returns:
        list | None: The augmented samples that were produced (applications
        that raised are reported and skipped), or ``None`` when ``img`` or
        ``transform`` is missing.
    """
    if img is None:
        print("No image provided for augmentation grid")
        return

    if transform is None:
        print("No transform provided for augmentation grid")
        return

    # Convert to PIL image if needed
    if not isinstance(img, Image.Image):
        img = Image.fromarray(img)

    # Apply the transform multiple times; one failing draw must not abort the grid.
    samples = []
    for _ in range(n_samples):
        try:
            aug_img = transform(img)

            if isinstance(aug_img, torch.Tensor):
                # NOTE(review): pipelines ending in Normalize yield values outside
                # [0, 1]; clamping keeps ToPILImage valid but the displayed contrast
                # is then not faithful. Consider de-normalizing before display.
                aug_img = transforms.ToPILImage()(torch.clamp(aug_img, 0, 1))

            samples.append(aug_img)
        except Exception as e:
            print(f"Error applying transform: {e}")

    # The grid has to hold the original PLUS every sample. Sizing it for the
    # samples alone dropped the last one whenever the grid was exactly full
    # (e.g. 9 samples -> 3x3 = 9 cells, one of them taken by the original).
    n_cells = len(samples) + 1
    rows = int(np.ceil(np.sqrt(n_cells)))
    cols = int(np.ceil(n_cells / rows))

    # squeeze=False always returns a 2-D array, so a 1x1 grid needs no special case.
    fig, axes = plt.subplots(rows, cols, figsize=(cols*2, rows*2), squeeze=False)
    axes = axes.ravel()

    axes[0].imshow(img, cmap='gray')
    axes[0].set_title("Original")

    for i, sample in enumerate(samples, start=1):
        axes[i].imshow(sample, cmap='gray')
        axes[i].set_title(f"Aug {i}")

    # Strip ticks/frames from every cell, including the unused trailing ones.
    for ax in axes:
        ax.axis('off')

    plt.tight_layout()

    if save_path:
        parent = os.path.dirname(save_path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        plt.savefig(save_path, dpi=300, bbox_inches='tight')

    plt.show()
    # This function is called in loops; release the figure once it has been shown.
    plt.close(fig)
    return samples

def compare_augmentation_strategies(dataset, save_dir="data_analysis/augmentations"):
    """Render augmentation grids for several strategies on random dataset samples.

    Up to three random samples are drawn from ``dataset``. For each one a
    directory ``sample_<n>_class_<name>`` is created under ``save_dir`` and
    every strategy is visualised with ``create_augmentation_grid`` (9 draws
    per strategy), saved as ``<strategy>_strategy.png``.

    Args:
        dataset: Indexable dataset yielding ``(image, label)`` pairs and
            exposing ``classes`` (label index -> class name).
        save_dir: Root directory for the per-sample output folders.

    Returns:
        dict | None: Strategy name -> transform pipeline, or ``None`` when the
        dataset is missing or empty.
    """
    if dataset is None or len(dataset) == 0:
        print("No dataset available for augmentation comparison")
        return

    def focused_pipeline(*augmentations):
        """Wrap PIL-level augmentations in the shared resize/grayscale prefix and tensor suffix."""
        return transforms.Compose([
            transforms.Resize((64, 64)),
            transforms.Grayscale(num_output_channels=1),
            *augmentations,
            transforms.ToTensor(),
            # Replicate a single channel three times so the 3-channel statistics below apply.
            transforms.Lambda(lambda x: x.repeat(3, 1, 1) if x.size(0) == 1 else x),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

    # Three preset pipelines plus four pipelines that each stress one augmentation family.
    strategies = {
        'Light': create_custom_transforms('light'),
        'Medium': create_custom_transforms('medium'),
        'Heavy': create_custom_transforms('heavy'),
        'Rotation-focused': focused_pipeline(
            transforms.RandomRotation(30, fill=255),
        ),
        'Distortion-focused': focused_pipeline(
            transforms.RandomPerspective(distortion_scale=0.4, p=0.7, fill=255),
            transforms.RandomAffine(degrees=0, translate=(0.1, 0.1), scale=(0.8, 1.2), shear=15, fill=255),
        ),
        'Noise-focused': focused_pipeline(
            transforms.GaussianBlur(kernel_size=3, sigma=(0.1, 1.0)),
            transforms.ColorJitter(brightness=0.3, contrast=0.3),
        ),
        'Custom-Thickness': focused_pipeline(
            ThicknessTransform(kernel_range=(1, 3), p=0.7),
        ),
    }

    # Pick at most three distinct samples.
    chosen_indices = random.sample(range(len(dataset)), min(3, len(dataset)))

    for sample_no, dataset_idx in enumerate(chosen_indices, start=1):
        img, label = dataset[dataset_idx]
        class_name = dataset.classes[label]

        print(f"\nSample {sample_no}: Class '{class_name}'")

        # One output folder per sample
        sample_dir = f"{save_dir}/sample_{sample_no}_class_{class_name}"
        os.makedirs(sample_dir, exist_ok=True)

        for name, strategy in strategies.items():
            print(f"Applying {name} augmentation strategy...")
            create_augmentation_grid(
                img,
                n_samples=9,
                transform=strategy,
                save_path=f"{sample_dir}/{name}_strategy.png",
            )

    # Compare and analyze augmentation diversity across strategies
    # (Advanced analysis could be added here)

    return strategies

# Example usage (commented out). Kept as `#` comments rather than a bare
# triple-quoted string: a string literal that is the last expression of a
# notebook cell is echoed as the cell's output. Remove one leading "# " from
# each line below to run it.
#
# # Visualize augmentation techniques on a sample image
# sample_idx = random.randint(0, len(dataset)-1)
# sample_img, _ = dataset[sample_idx]
# basic_augs, pipeline_augs = visualize_augmentation_techniques(sample_img)
#
# # Compare different augmentation strategies
# augmentation_strategies = compare_augmentation_strategies(dataset)



In [None]:
# --- Cell 6: Preprocessing and Normalization ---
"""
## Preprocessing and Normalization

Explore preprocessing and normalization techniques for handwritten character recognition:
- Noise removal
- Thresholding and binarization methods
- Normalization approaches
- Character extraction and segmentation
"""

def explore_preprocessing_techniques(img, save_dir="data_analysis/transformations"):
    """Run a catalogue of OpenCV preprocessing operations on one image and plot them.

    The image is converted to grayscale and passed through smoothing filters,
    global / Otsu / adaptive thresholding, Sobel and Canny edge detection,
    3x3 morphological operations on the Otsu mask and a distance-transform
    based skeleton approximation. All results are drawn in a 4-column grid
    and saved as ``preprocessing_techniques.png``.

    Args:
        img: PIL image, or an array accepted by ``Image.fromarray``.
        save_dir: Directory the figure is written to; created when missing.

    Returns:
        dict | None: Technique name -> resulting image array, in display
        order, or ``None`` when ``img`` is ``None``.
    """
    if img is None:
        print("No image provided for preprocessing")
        return

    # Convert to PIL image if needed
    if not isinstance(img, Image.Image):
        img = Image.fromarray(img)

    # Ensure grayscale
    img_gray = img.convert('L')
    img_np = np.array(img_gray)

    # 1. Noise removal
    blur = cv2.GaussianBlur(img_np, (3, 3), 0)
    median = cv2.medianBlur(img_np, 3)
    bilateral = cv2.bilateralFilter(img_np, 9, 75, 75)

    # 2. Thresholding
    # NOTE(review): THRESH_BINARY leaves bright pixels at 255. If the characters
    # are dark ink on a light background (as the white fill/padding used elsewhere
    # in this notebook suggests), the morphology and distance transform below act
    # on the background: 'Erosion' thickens strokes and 'Dilation' thins them.
    # Confirm the intended polarity.
    _, binary_global = cv2.threshold(img_np, 127, 255, cv2.THRESH_BINARY)
    _, binary_otsu = cv2.threshold(img_np, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    binary_adaptive = cv2.adaptiveThreshold(img_np, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                            cv2.THRESH_BINARY, 11, 2)

    # 3. Edge detection: Sobel gradient magnitude rescaled to 0..255, plus Canny
    edges_sobel_x = cv2.Sobel(img_np, cv2.CV_64F, 1, 0, ksize=3)
    edges_sobel_y = cv2.Sobel(img_np, cv2.CV_64F, 0, 1, ksize=3)
    edges_sobel = cv2.magnitude(edges_sobel_x, edges_sobel_y)
    edges_sobel = cv2.normalize(edges_sobel, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)

    edges_canny = cv2.Canny(img_np, 100, 200)

    # 4. Morphological operations on the Otsu mask
    kernel = np.ones((3, 3), np.uint8)
    erosion = cv2.erode(binary_otsu, kernel, iterations=1)
    dilation = cv2.dilate(binary_otsu, kernel, iterations=1)
    opening = cv2.morphologyEx(binary_otsu, cv2.MORPH_OPEN, kernel)
    closing = cv2.morphologyEx(binary_otsu, cv2.MORPH_CLOSE, kernel)

    # 5. Skeletonization
    # Using distance transform and threshold for a simple skeleton approximation
    dist_transform = cv2.distanceTransform(binary_otsu, cv2.DIST_L2, 5)
    dist_transform = cv2.normalize(dist_transform, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
    _, skeleton_approx = cv2.threshold(dist_transform, 50, 255, cv2.THRESH_BINARY)

    # Combined example.
    # NOTE(review): this erodes the adaptive threshold of the *unfiltered* image;
    # none of the denoised images above feeds into it despite the "Clean" in the
    # label. Confirm intent.
    clean_binary_thin = cv2.erode(binary_adaptive, kernel, iterations=1)

    # Name -> result, in the order the panels are displayed
    preprocessing = {
        'Original': img_np,
        'Gaussian Blur': blur,
        'Median Blur': median,
        'Bilateral Filter': bilateral,
        'Global Threshold': binary_global,
        'Otsu Threshold': binary_otsu,
        'Adaptive Threshold': binary_adaptive,
        'Sobel Edges': edges_sobel,
        'Canny Edges': edges_canny,
        'Erosion': erosion,
        'Dilation': dilation,
        'Opening': opening,
        'Closing': closing,
        'Skeleton Approximation': skeleton_approx,
        'Clean+Binary+Thin': clean_binary_thin
    }

    # Display preprocessing results
    n_preproc = len(preprocessing)
    n_cols = 4
    n_rows = (n_preproc + n_cols - 1) // n_cols  # ceiling division

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 3 * n_rows))
    axes = axes.flatten()

    for ax, (name, proc_img) in zip(axes, preprocessing.items()):
        ax.imshow(proc_img, cmap='gray')
        ax.set_title(name)

    # Strip ticks from every cell, including the unused trailing ones
    for ax in axes:
        ax.axis('off')

    plt.tight_layout()
    # Callers pass per-class sub-directories that nothing else creates;
    # without this, savefig raises FileNotFoundError.
    os.makedirs(save_dir, exist_ok=True)
    plt.savefig(f"{save_dir}/preprocessing_techniques.png", dpi=300, bbox_inches='tight')
    plt.show()
    plt.close(fig)

    return preprocessing

def explore_normalization_methods(img, save_dir="data_analysis/transformations"):
    """Compare common input-normalization schemes on one image.

    The image is converted to grayscale, resized to 64x64 and normalized in
    five ways: rescaling to [0, 1], rescaling to [-1, 1], per-image
    standardization, per-image min-max scaling and ImageNet mean/std
    normalization (on a 3-channel copy; only channel 0 is displayed and
    returned). The results are plotted with colorbars and min/max
    annotations and saved as ``normalization_methods.png``.

    Args:
        img: PIL image, or an array accepted by ``Image.fromarray``.
        save_dir: Directory the figure is written to; created when missing.

    Returns:
        dict | None: Method name -> 64x64 array, or ``None`` when ``img`` is
        ``None``.
    """
    if img is None:
        print("No image provided for normalization")
        return

    # Convert to PIL image if needed
    if not isinstance(img, Image.Image):
        img = Image.fromarray(img)

    # Ensure grayscale
    img_gray = img.convert('L')

    # Resize to common input size for neural networks
    img_resized = img_gray.resize((64, 64), Image.LANCZOS)
    img_np = np.array(img_resized)

    # 1. Simple rescaling to [0, 1]
    norm_01 = img_np / 255.0

    # 2. Rescaling to [-1, 1]
    norm_11 = 2 * (img_np / 255.0) - 1

    # 3. Standard normalization (zero mean, unit variance).
    #    A constant image has std == 0; divide by 1 instead.
    mean = np.mean(img_np)
    std = np.std(img_np)
    norm_std = (img_np - mean) / (std if std > 0 else 1)

    # 4. Min-max normalization (same guard for a constant image)
    min_val = np.min(img_np)
    max_val = np.max(img_np)
    norm_minmax = (img_np - min_val) / (max_val - min_val if max_val > min_val else 1)

    # 5. ImageNet normalization (common for transfer learning):
    #    replicate the gray channel three times, then apply per-channel mean/std.
    img_rgb = np.stack([img_np, img_np, img_np], axis=2) / 255.0
    imagenet_mean = np.array([0.485, 0.456, 0.406])
    imagenet_std = np.array([0.229, 0.224, 0.225])
    norm_imagenet = (img_rgb - imagenet_mean) / imagenet_std

    # Only the first channel is shown
    norm_imagenet_vis = norm_imagenet[:, :, 0]

    normalizations = {
        'Original': img_np,
        'Rescaled [0, 1]': norm_01,
        'Rescaled [-1, 1]': norm_11,
        'Standardized': norm_std,
        'Min-Max': norm_minmax,
        'ImageNet': norm_imagenet_vis
    }

    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    axes = axes.flatten()

    for ax, (name, norm_img) in zip(axes, normalizations.items()):
        if name == 'Original':
            ax.imshow(norm_img, cmap='gray')
        else:
            # Stretch the colormap over each array's own value range so the
            # panels differ only in the scale shown on the colorbar.
            vmin = np.min(norm_img)
            vmax = np.max(norm_img)
            img_display = ax.imshow(norm_img, cmap='gray', vmin=vmin, vmax=vmax)

            plt.colorbar(img_display, ax=ax, fraction=0.046, pad=0.04)

            # Min/max annotations in axes coordinates (top-left corner)
            ax.text(0.05, 0.95, f"Min: {vmin:.2f}", transform=ax.transAxes,
                    fontsize=8, verticalalignment='top', color='red')
            ax.text(0.05, 0.90, f"Max: {vmax:.2f}", transform=ax.transAxes,
                    fontsize=8, verticalalignment='top', color='red')

        ax.set_title(name)
        ax.axis('off')

    plt.tight_layout()
    # Callers pass per-class sub-directories that nothing else creates;
    # without this, savefig raises FileNotFoundError.
    os.makedirs(save_dir, exist_ok=True)
    plt.savefig(f"{save_dir}/normalization_methods.png", dpi=300, bbox_inches='tight')
    plt.show()
    plt.close(fig)

    return normalizations

def _fit_character_to_canvas(char_img, target_size=64, padding=10):
    """Scale a character crop to fit a white square canvas, keeping its aspect ratio."""
    h, w = char_img.shape[:2]
    inner = target_size - 2 * padding

    # The longer side fills the inner area. The shorter side is clamped to at
    # least 1 px: for very elongated crops the integer truncation would give 0,
    # which cv2.resize rejects.
    if w > h:
        new_w = inner
        new_h = max(1, int(h * new_w / w))
    else:
        new_h = inner
        new_w = max(1, int(w * new_h / h))

    resized = cv2.resize(char_img, (new_w, new_h), interpolation=cv2.INTER_AREA)

    # White canvas with the resized crop pasted in the centre
    canvas = np.full((target_size, target_size), 255, dtype=np.uint8)
    x_offset = (target_size - new_w) // 2
    y_offset = (target_size - new_h) // 2
    canvas[y_offset:y_offset + new_h, x_offset:x_offset + new_w] = resized
    return canvas


def character_segmentation_demo(img, save_dir="data_analysis/transformations"):
    """Demonstrate character segmentation from an image with multiple characters.

    Steps: inverted Otsu binarization, 3x3 morphological opening, external
    contour detection, removal of contours with area <= 50, left-to-right
    ordering, and extraction of each bounding box as (a) the raw crop and
    (b) a 64x64 white-padded, aspect-preserving version. Two figures are
    saved to ``save_dir``: ``segmentation_process.png`` and, when at least
    one character was found, ``extracted_characters.png``.

    Args:
        img: PIL image, or an array accepted by ``Image.fromarray``.
        save_dir: Directory the figures are written to; created when missing.

    Returns:
        dict | None: ``{'binary', 'contours', 'extracted', 'normalized'}``,
        or ``None`` when ``img`` is ``None``.
    """
    if img is None:
        print("No image provided for segmentation")
        return

    # Convert to PIL image if needed
    if not isinstance(img, Image.Image):
        img = Image.fromarray(img)

    # Ensure grayscale
    img_gray = img.convert('L')
    img_np = np.array(img_gray)

    # 1. Binarization (inverted, so pixels below the Otsu threshold become 255)
    _, binary = cv2.threshold(img_np, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # 2. Noise removal
    kernel = np.ones((3, 3), np.uint8)
    binary = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)

    # 3. Find contours
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # 4./5. Drop small components, then order the rest left to right
    min_area = 50
    valid_contours = sorted(
        (cnt for cnt in contours if cv2.contourArea(cnt) > min_area),
        key=lambda cnt: cv2.boundingRect(cnt)[0],
    )
    boxes = [cv2.boundingRect(cnt) for cnt in valid_contours]

    # Original image with contours
    contour_img = cv2.cvtColor(img_np, cv2.COLOR_GRAY2RGB)
    cv2.drawContours(contour_img, valid_contours, -1, (0, 255, 0), 2)

    # Original image with numbered bounding boxes
    bbox_img = cv2.cvtColor(img_np, cv2.COLOR_GRAY2RGB)
    for i, (x, y, w, h) in enumerate(boxes):
        cv2.rectangle(bbox_img, (x, y), (x + w, y + h), (255, 0, 0), 2)
        cv2.putText(bbox_img, str(i+1), (x, y-5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)

    # Extract and normalize individual characters
    extracted_chars = [img_np[y:y+h, x:x+w] for (x, y, w, h) in boxes]
    normalized_chars = [_fit_character_to_canvas(char_img) for char_img in extracted_chars]

    os.makedirs(save_dir, exist_ok=True)

    # Visualize the segmentation process
    panels = [
        (img_np, "Original Image", 'gray'),
        (binary, "Binary Image", 'gray'),
        (contour_img, f"Contours ({len(valid_contours)} found)", None),
        (bbox_img, "Bounding Boxes", None),
    ]
    fig = plt.figure(figsize=(15, 10))
    for position, (panel, title, cmap) in enumerate(panels, start=1):
        plt.subplot(2, 2, position)
        plt.imshow(panel, cmap=cmap)
        plt.title(title)
        plt.axis('off')

    plt.tight_layout()
    plt.savefig(f"{save_dir}/segmentation_process.png", dpi=300, bbox_inches='tight')
    plt.show()
    plt.close(fig)

    # Visualize extracted (top row) and normalized (bottom row) characters
    if extracted_chars:
        n_chars = len(extracted_chars)

        fig = plt.figure(figsize=(15, 5))

        for i, char_img in enumerate(extracted_chars):
            plt.subplot(2, n_chars, i+1)
            plt.imshow(char_img, cmap='gray')
            plt.title(f"Extracted {i+1}")
            plt.axis('off')

        for i, char_img in enumerate(normalized_chars):
            plt.subplot(2, n_chars, n_chars+i+1)
            plt.imshow(char_img, cmap='gray')
            plt.title(f"Normalized {i+1}")
            plt.axis('off')

        plt.tight_layout()
        plt.savefig(f"{save_dir}/extracted_characters.png", dpi=300, bbox_inches='tight')
        plt.show()
        plt.close(fig)

    return {
        'binary': binary,
        'contours': valid_contours,
        'extracted': extracted_chars,
        'normalized': normalized_chars
    }

# Example usage (commented out). Kept as `#` comments rather than a bare
# triple-quoted string: a string literal that is the last expression of a
# notebook cell is echoed as the cell's output. Remove one leading "# " from
# each line below to run it.
#
# # Get a sample image from the dataset
# sample_idx = random.randint(0, len(dataset)-1)
# sample_img, _ = dataset[sample_idx]
#
# # Explore preprocessing techniques
# preproc_results = explore_preprocessing_techniques(sample_img)
#
# # Explore normalization methods
# norm_results = explore_normalization_methods(sample_img)
#
# # Demonstrate character segmentation
# # Note: This requires an image with multiple characters
# sample_text_img = Image.open("path/to/handwritten_text.png")
# segmentation_results = character_segmentation_demo(sample_text_img)



In [None]:
# --- Cell 7: Complete Data Analysis Pipeline ---
"""
## Complete Data Analysis Pipeline

This section combines the previous analyses into a comprehensive pipeline for dataset exploration.
It demonstrates the complete process from data loading to preprocessing and augmentation.
"""

def run_complete_data_analysis(data_root, save_dir="data_analysis"):
    """
    Run a complete data analysis pipeline on the handwritten character dataset.

    The pipeline loads the dataset, plots class distribution and sample
    images, analyzes image properties and stroke features, visualizes
    transformations on up to two random samples, visualizes augmentation,
    preprocessing and normalization on one further random sample, and writes
    ``data_analysis_summary.txt`` into ``save_dir``.

    Args:
        data_root: Path to the dataset root directory
        save_dir: Directory to save analysis results. It and every
            sub-directory written to by this function are created on demand.

    Returns:
        dict | None: ``{'dataset', 'class_names', 'stats', 'image_properties',
        'stroke_features'}``, or ``None`` when the dataset could not be
        loaded or is empty.
    """
    print(f"Running complete data analysis pipeline on: {data_root}")

    # Step 1: Load and explore dataset
    print("\nStep 1: Loading and exploring dataset...")
    dataset, class_names, stats = load_and_explore_dataset(data_root)

    if dataset is None:
        print("Failed to load dataset. Aborting analysis.")
        return None

    # The sampling steps below cannot draw from an empty dataset.
    if len(dataset) == 0:
        print("Dataset is empty. Aborting analysis.")
        return None

    # Only the default "data_analysis" tree is created at notebook start-up;
    # make sure a custom save_dir has the same layout.
    stats_dir = f"{save_dir}/statistics"
    transform_dir = f"{save_dir}/transformations"
    augment_dir = f"{save_dir}/augmentations"
    for directory in (stats_dir, transform_dir, augment_dir):
        os.makedirs(directory, exist_ok=True)

    # Step 2: Visualize class distribution
    print("\nStep 2: Visualizing class distribution...")
    visualize_class_distribution(stats, save_path=f"{stats_dir}/class_distribution.png")

    # Step 3: Visualize sample images
    print("\nStep 3: Visualizing sample images...")
    visualize_sample_images(dataset, num_classes=min(10, len(class_names)), samples_per_class=5,
                            save_path=f"{stats_dir}/sample_images.png")

    # Step 4: Analyze image properties
    print("\nStep 4: Analyzing image properties...")
    image_properties = analyze_image_properties(dataset, sample_size=100, save_dir=stats_dir)

    # Step 5: Extract stroke features
    print("\nStep 5: Extracting stroke features...")
    stroke_features = extract_stroke_features(dataset, sample_size=20, save_dir=stats_dir)

    # Step 6: Visualize transformations on sample images
    print("\nStep 6: Visualizing transformations on sample images...")
    # random.sample raises if asked for more items than exist, so cap at the dataset size.
    sample_indices = random.sample(range(len(dataset)), min(2, len(dataset)))

    for i, idx in enumerate(sample_indices):
        img, label = dataset[idx]
        class_name = dataset.classes[label]

        print(f"  Sample {i+1}: Class '{class_name}'")

        sample_dir = f"{transform_dir}/sample_{i+1}_class_{class_name}"
        os.makedirs(sample_dir, exist_ok=True)

        # Apply and visualize basic and affine transformations
        apply_basic_transformations(img, save_dir=sample_dir)
        apply_affine_transformations(img, save_dir=sample_dir)

    # Step 7: Visualize augmentation techniques
    print("\nStep 7: Visualizing augmentation techniques...")
    # One further random sample is shared by steps 7-9
    aug_idx = random.randint(0, len(dataset)-1)
    aug_img, aug_label = dataset[aug_idx]
    aug_class = dataset.classes[aug_label]

    class_augment_dir = f"{augment_dir}/class_{aug_class}"
    class_transform_dir = f"{transform_dir}/class_{aug_class}"
    os.makedirs(class_augment_dir, exist_ok=True)
    os.makedirs(class_transform_dir, exist_ok=True)

    print(f"  Using sample from class '{aug_class}'")
    visualize_augmentation_techniques(aug_img, save_dir=class_augment_dir)

    # Step 8: Explore preprocessing techniques
    print("\nStep 8: Exploring preprocessing techniques...")
    explore_preprocessing_techniques(aug_img, save_dir=class_transform_dir)

    # Step 9: Explore normalization methods
    print("\nStep 9: Exploring normalization methods...")
    explore_normalization_methods(aug_img, save_dir=class_transform_dir)

    # Step 10: Generate summary report
    print("\nStep 10: Generating summary report...")
    # Explicit encoding: the report embeds data_root, which may be non-ASCII.
    with open(f"{save_dir}/data_analysis_summary.txt", 'w', encoding='utf-8') as f:
        f.write("# Handwritten Character Recognition Dataset Analysis\n\n")

        f.write("## Dataset Summary\n")
        f.write(f"- Dataset path: {data_root}\n")
        f.write(f"- Total samples: {stats['total_samples']}\n")
        f.write(f"- Number of classes: {stats['num_classes']}\n")
        f.write(f"- Min samples per class: {stats['min_samples_per_class']}\n")
        f.write(f"- Max samples per class: {stats['max_samples_per_class']}\n")
        f.write(f"- Avg samples per class: {stats['avg_samples_per_class']:.2f}\n\n")

        f.write("## Image Properties\n")
        # The analysis helpers may return nothing; the section is then left empty.
        if image_properties:
            f.write(f"- Width (pixels): mean={image_properties['width']['mean']:.2f}, std={image_properties['width']['std']:.2f}\n")
            f.write(f"- Height (pixels): mean={image_properties['height']['mean']:.2f}, std={image_properties['height']['std']:.2f}\n")
            f.write(f"- Aspect Ratio (w/h): mean={image_properties['aspect_ratio']['mean']:.2f}, std={image_properties['aspect_ratio']['std']:.2f}\n")
            f.write(f"- Mean Intensity: mean={image_properties['mean_intensity']['mean']:.2f}, std={image_properties['mean_intensity']['std']:.2f}\n\n")

        f.write("## Stroke Features\n")
        if stroke_features:
            f.write(f"- Max Stroke Thickness: mean={stroke_features['stroke_thickness']['mean']:.2f} pixels, std={stroke_features['stroke_thickness']['std']:.2f}\n")
            f.write(f"- Number of Contours: mean={stroke_features['stroke_continuity']['mean']:.2f}, std={stroke_features['stroke_continuity']['std']:.2f}\n")
            f.write(f"- Character Density: mean={stroke_features['character_density']['mean']:.2f}, std={stroke_features['character_density']['std']:.2f}\n\n")

        f.write("## Analysis Results\n")
        f.write("- The dataset has been analyzed for various properties and characteristics.\n")
        f.write("- Visualizations of transformations, augmentations, and preprocessing techniques have been generated.\n")
        f.write("- Refer to the individual image files in the analysis directory for detailed visualizations.\n")

    print(f"\nData analysis complete. Results saved to {save_dir}")
    return {
        'dataset': dataset,
        'class_names': class_names,
        'stats': stats,
        'image_properties': image_properties,
        'stroke_features': stroke_features
    }

# Example usage (commented out). Kept as `#` comments rather than a bare
# triple-quoted string: a string literal that is the last expression of a
# notebook cell is echoed as the cell's output. Remove one leading "# " from
# each line below to run it.
#
# # Specify your dataset path
# DATA_ROOT = "./datasets/handwritten-english/augmented_images1"
#
# # Run the complete data analysis pipeline
# analysis_results = run_complete_data_analysis(DATA_ROOT)



In [None]:
# --- Cell 8: Run Data Analysis (User Code) ---
"""
## Run Data Analysis

This is where you run the actual data analysis pipeline with your dataset.
Uncomment and modify the code below to analyze your own dataset.
"""

# How to use this cell: every runnable line below is commented out with a
# single leading "# ". Remove that prefix from DATA_ROOT and from the lines of
# ONE option, then run the cell. Lines that still start with "#" afterwards
# are ordinary explanatory comments.

# Define your dataset path
# DATA_ROOT = "./datasets/handwritten-english/augmented_images1"

# Option 1: Run the complete data analysis pipeline
# analysis_results = run_complete_data_analysis(DATA_ROOT)

# Option 2: Run specific analyses
# # Load and explore the dataset
# dataset, class_names, stats = load_and_explore_dataset(DATA_ROOT)
#
# # Visualize class distribution
# visualize_class_distribution(stats)
#
# # Analyze image properties
# image_properties = analyze_image_properties(dataset, sample_size=100)
#
# # Extract and analyze stroke features
# stroke_features = extract_stroke_features(dataset, sample_size=20)
#
# # Select a sample image for detailed analysis
# sample_idx = random.randint(0, len(dataset)-1)
# sample_img, sample_label = dataset[sample_idx]
# sample_class = dataset.classes[sample_label]
# print(f"Selected sample from class '{sample_class}'")
#
# # Visualize transformations
# basic_trans = apply_basic_transformations(sample_img)
# affine_trans = apply_affine_transformations(sample_img)
#
# # Visualize augmentation techniques
# basic_augs, pipeline_augs = visualize_augmentation_techniques(sample_img)
#
# # Explore preprocessing techniques
# preproc_results = explore_preprocessing_techniques(sample_img)
#
# # Explore normalization methods
# norm_results = explore_normalization_methods(sample_img)

# Option 3: Character segmentation demo
# # Load a sample multi-character image
# # This requires an image with multiple characters
# sample_text_img = Image.open("path/to/handwritten_text.png")
# segmentation_results = character_segmentation_demo(sample_text_img)

print("This notebook is ready for data analysis of handwritten character recognition datasets.")
print("Uncomment one of the analysis options above and run this cell to start the analysis.")