# Data Exploration and Analysis

This notebook explores the dataset for a multi-class image classification project.

In [None]:
import sys
import os
sys.path.append('../src')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from config import Config
from data_preprocessing import DataPreprocessor
from utils import set_random_seed

# Set style
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')
%matplotlib inline

## Dataset Overview

In [None]:
# Create the project configuration and apply its random seed up front
config = Config()
set_random_seed(config.RANDOM_SEED)

# Preprocessor built from the same configuration object
preprocessor = DataPreprocessor(config)

# Report the key dataset settings, one per line; each attribute is read
# just before it is printed
for _label, _attr in (
    ('Dataset directory', 'DATA_DIR'),
    ('Image size', 'IMAGE_SIZE'),
    ('Number of classes', 'NUM_CLASSES'),
):
    print(f'{_label}: {getattr(config, _attr)}')

In [None]:
# Build the train / validation / test generators in a single call
generators = preprocessor.create_data_generators()
train_gen, val_gen, test_gen = generators

# Skip the inspection when the training generator is falsy (e.g. None or empty)
if train_gen:
    print('Training Data Analysis:')
    preprocessor.analyze_dataset(train_gen)

    # Follow the summary with a plot of one sample batch
    print('\nCreating sample visualization...')
    preprocessor.create_sample_batch_visualization(train_gen)

## Data Augmentation Examples

In [None]:
# Find a sample image for augmentation demo.
# Any .jpg/.jpeg/.png file under the training directory qualifies; the suffix is
# compared case-insensitively so datasets that are not lowercase '.jpg' still
# yield a sample. Sorting makes the chosen image the same on every run,
# independent of the order in which the filesystem lists the files.
_demo_suffixes = {'.jpg', '.jpeg', '.png'}
sample_images = sorted(
    path
    for path in config.TRAIN_DIR.rglob('*')
    if path.suffix.lower() in _demo_suffixes
)
if sample_images:
    sample_image = sample_images[0]
    print(f'Using sample image: {sample_image}')

    # Visualize augmentations
    preprocessor.visualize_augmentations(sample_image, num_augmentations=9)
else:
    print('No sample images found for augmentation demo')

## Class Distribution Analysis

In [None]:
# Analyze class distribution
if train_gen:
    # Count the image files sitting directly inside each class folder.
    # Suffixes are compared case-insensitively so files such as 'IMG_001.JPG'
    # are counted, and only real image suffixes are accepted.
    image_suffixes = {'.jpg', '.jpeg', '.png'}
    class_counts = {
        class_name: sum(
            1
            for path in (Path(train_gen.directory) / class_name).glob('*')
            if path.suffix.lower() in image_suffixes
        )
        for class_name in train_gen.class_indices
    }

    classes = list(class_counts.keys())
    counts = list(class_counts.values())

    if not counts:
        # Without this guard the summary below would divide by zero and
        # min()/max() would raise on an empty sequence.
        print('No classes found in training data')
    else:
        # Create visualization
        plt.figure(figsize=(12, 6))
        plt.bar(classes, counts)
        plt.title('Class Distribution in Training Data')
        plt.xlabel('Class')
        plt.ylabel('Number of Images')
        plt.xticks(rotation=45)

        # Add count labels slightly above each bar (1% of the tallest bar)
        label_offset = max(counts) * 0.01
        for i, count in enumerate(counts):
            plt.text(i, count + label_offset, str(count), ha='center')

        plt.tight_layout()
        plt.show()

        print(f'Total training images: {sum(counts)}')
        print(f'Average images per class: {sum(counts)/len(counts):.1f}')
        print(f'Min images in a class: {min(counts)}')
        print(f'Max images in a class: {max(counts)}')

## Image Statistics

In [None]:
# Analyze image properties
import cv2
from tqdm.notebook import tqdm

def analyze_image_properties(image_dir, max_images=1000):
    '''Collect width, height and channel count for images under a directory.

    Args:
        image_dir: pathlib.Path that is searched recursively for .jpg, .jpeg
            and .png files (suffix compared case-insensitively).
        max_images: Maximum number of files to inspect. The first
            ``max_images`` paths in sorted order are used, so repeated runs
            look at the same files.

    Returns:
        Tuple ``(widths, heights, channels)`` of equal-length lists holding one
        entry per image that could be decoded. Files that cannot be decoded
        are skipped and reported in a single summary line.
    '''
    widths, heights, channels = [], [], []
    image_suffixes = {'.jpg', '.jpeg', '.png'}

    image_files = sorted(
        path
        for path in image_dir.rglob('*')
        if path.suffix.lower() in image_suffixes
    )[:max_images]

    unreadable = 0
    for img_path in tqdm(image_files, desc='Analyzing images'):
        try:
            # IMREAD_UNCHANGED keeps the channel layout stored in the file.
            # The default flag converts every image to 3-channel BGR, which
            # would make the channel statistic report 3 for all inputs.
            # This flag also ignores EXIF orientation, so the dimensions are
            # the ones stored in the file.
            img = cv2.imread(str(img_path), cv2.IMREAD_UNCHANGED)
            if img is None:
                # imread signals an undecodable file by returning None
                unreadable += 1
                continue

            h, w = img.shape[:2]
            # Single-channel images come back as 2-D arrays (no channel axis)
            c = img.shape[2] if img.ndim == 3 else 1
            widths.append(w)
            heights.append(h)
            channels.append(c)
        except Exception as e:
            print(f'Error processing {img_path}: {e}')

    if unreadable:
        print(f'Skipped {unreadable} unreadable image(s)')

    return widths, heights, channels

# Analyze training images
if config.TRAIN_DIR.exists():
    widths, heights, channels = analyze_image_properties(config.TRAIN_DIR)

    if not widths:
        # min()/max() below raise ValueError on empty lists, so report the
        # situation instead of plotting empty histograms and crashing.
        print('No readable images found for property analysis')
    else:
        fig, axes = plt.subplots(1, 3, figsize=(15, 5))

        # One histogram per property: (values, bins, title, x-axis label)
        panels = (
            (widths, 30, 'Width Distribution', 'Width (pixels)'),
            (heights, 30, 'Height Distribution', 'Height (pixels)'),
            (channels, 10, 'Channel Distribution', 'Number of Channels'),
        )
        for ax, (values, bins, title, xlabel) in zip(axes, panels):
            ax.hist(values, bins=bins, alpha=0.7)
            ax.set_title(title)
            ax.set_xlabel(xlabel)
            ax.set_ylabel('Number of Images')

        plt.tight_layout()
        plt.show()

        print(f'Width - Min: {min(widths)}, Max: {max(widths)}, Mean: {np.mean(widths):.1f}')
        print(f'Height - Min: {min(heights)}, Max: {max(heights)}, Mean: {np.mean(heights):.1f}')
        print(f'Channels - Unique values: {set(channels)}')