# Exploratory Data Analysis (EDA) for Crop Disease Prediction

This notebook performs exploratory data analysis on the Plant Disease dataset, focusing on Fresno County-relevant crops.

## Objectives:
1. Visualize the dataset structure
2. Analyze class distributions
3. Explore sample images from each disease category
4. Identify class imbalances
5. Visualize disease vs. healthy distribution per crop
6. Analyze image properties (dimensions, aspect ratios, file sizes)


In [None]:
# Import required libraries
import os
import sys
from pathlib import Path
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image

# Add src to path
sys.path.append(str(Path.cwd().parent / 'src'))

import config

# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("✓ Libraries imported successfully")


## 1. Dataset Structure Analysis


In [None]:
# Analyze dataset structure
def analyze_dataset(data_dir):
    """Count the images in every class folder of each dataset split.

    The layout read here is ``data_dir/<split>/<class_name>/<image file>`` for
    the splits ``train``, ``val`` and ``test``. Files are counted when their
    suffix is ``.jpg``, ``.jpeg`` or ``.png`` in any letter case.

    Args:
        data_dir: Root directory holding the split folders (``str`` or
            ``pathlib.Path``).

    Returns:
        A nested ``defaultdict`` mapping ``split -> class_name -> image count``.
        A split whose directory is missing (a warning is printed) or that
        contains no class folders has no entry.
    """
    # Compare suffixes case-insensitively: a glob such as '*.jpg' silently
    # skips 'leaf.JPG' on case-sensitive filesystems.
    image_suffixes = {'.jpg', '.jpeg', '.png'}
    data_dir = Path(data_dir)
    dataset_info = defaultdict(lambda: defaultdict(int))

    for split in ['train', 'val', 'test']:
        split_dir = data_dir / split
        if not split_dir.exists():
            print(f"⚠ Warning: {split} directory not found")
            continue

        for class_dir in split_dir.iterdir():
            if not class_dir.is_dir():
                continue
            dataset_info[split][class_dir.name] = sum(
                1
                for entry in class_dir.iterdir()
                if entry.is_file() and entry.suffix.lower() in image_suffixes
            )

    return dataset_info

# Count images per split/class under the parent of the processed-data directory
data_dir = config.PROCESSED_DATA_DIR.parent
dataset_info = analyze_dataset(data_dir)

# Print one summary section per split that was actually found on disk
banner = "=" * 80
print(banner)
print("DATASET SUMMARY")
print(banner)
for split in ('train', 'val', 'test'):
    if split not in dataset_info:
        continue
    class_counts = dataset_info[split]
    print(f"\n{split.upper()}:")
    print(f"  Total images: {sum(class_counts.values()):,}")
    print(f"  Number of classes: {len(class_counts)}")
print(banner)


## 2. Class Distribution Analysis


In [None]:
# Create DataFrame for easier visualization
def create_dataset_df(dataset_info):
    """Flatten the nested split/class counts into a long-format DataFrame.

    Args:
        dataset_info: Mapping of ``split -> {class_name: image_count}``.

    Returns:
        ``pandas.DataFrame`` with one row per (split, class) pair and the
        columns ``split``, ``class`` and ``count``. The columns are present
        even when ``dataset_info`` is empty, so column lookups on the result
        do not raise ``KeyError``.
    """
    rows = [
        {'split': split, 'class': class_name, 'count': count}
        for split, classes in dataset_info.items()
        for class_name, count in classes.items()
    ]
    # Explicit columns keep the schema stable when there are no rows at all.
    return pd.DataFrame(rows, columns=['split', 'class', 'count'])

df = create_dataset_df(dataset_info)

# Horizontal bar chart of image counts per class in the training split
if 'train' in dataset_info:
    train_df = df[df['split'] == 'train'].sort_values('count', ascending=True)
    class_counts = train_df['count']

    # Classes whose name contains "healthy" are drawn green, all others red
    colors = []
    for label in train_df['class']:
        colors.append('green' if 'healthy' in label.lower() else 'red')

    # Grow the figure with the number of classes so the labels stay readable
    chart_height = max(10, len(train_df) * 0.3)
    plt.figure(figsize=(14, chart_height))
    plt.barh(train_df['class'], class_counts, color=colors, alpha=0.7)
    plt.title('Training Set: Class Distribution (Green=Healthy, Red=Diseased)', fontsize=14, fontweight='bold')
    plt.xlabel('Number of Images')
    plt.ylabel('Class')
    plt.grid(axis='x', alpha=0.3)
    plt.tight_layout()
    plt.show()

    # Row labels of the smallest and largest classes
    fewest_idx = class_counts.idxmin()
    most_idx = class_counts.idxmax()

    print("\nTraining Set Statistics:")
    print(f"  Mean images per class: {class_counts.mean():.1f}")
    print(f"  Median images per class: {class_counts.median():.1f}")
    print(f"  Min images: {class_counts.min()} ({train_df.loc[fewest_idx, 'class']})")
    print(f"  Max images: {class_counts.max()} ({train_df.loc[most_idx, 'class']})")


## 3. Crop-wise Analysis


In [None]:
# Analyze by crop type
def extract_crop_name(class_name):
    """Return the crop part of a class folder name.

    The text before the first ``___`` is returned when that separator is
    present; otherwise the text before the first ``_``; otherwise the name
    itself.
    """
    # Try the longer separator first so "Crop_name___Disease" keeps "Crop_name"
    for separator in ('___', '_'):
        prefix, found, _rest = class_name.partition(separator)
        if found:
            return prefix
    return class_name

# Derive the crop for every class row
df['crop'] = [extract_crop_name(name) for name in df['class']]

# Per-crop totals for the training split: image count and number of classes
if 'train' in dataset_info:
    train_rows = df[df['split'] == 'train']
    train_crop_stats = (
        train_rows.groupby('crop')
        .agg(total_images=('count', 'sum'), num_classes=('class', 'count'))
        .sort_values('total_images', ascending=False)
    )

    print("\nCrop Statistics (Training Set):")
    print(train_crop_stats)

    # Two side-by-side bar charts: (column, bar colour, y-axis label, title)
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    panels = [
        ('total_images', 'steelblue', 'Number of Images', 'Total Images per Crop'),
        ('num_classes', 'coral', 'Number of Disease Classes', 'Disease Classes per Crop'),
    ]
    for ax, (column, bar_color, y_label, panel_title) in zip(axes, panels):
        train_crop_stats[column].plot(kind='bar', ax=ax, color=bar_color, alpha=0.8)
        ax.set_xlabel('Crop')
        ax.set_ylabel(y_label)
        ax.set_title(panel_title, fontsize=12, fontweight='bold')
        ax.tick_params(axis='x', rotation=45)
        ax.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()


## 4. Healthy vs Diseased Distribution


In [None]:
# Label every class row: names containing "healthy" are Healthy, the rest Diseased
df['status'] = df['class'].apply(lambda x: 'Healthy' if 'healthy' in x.lower() else 'Diseased')

# Overall healthy vs diseased distribution
if 'train' in dataset_info:
    train_rows = df[df['split'] == 'train']
    train_status = train_rows.groupby('status')['count'].sum()

    # groupby/unstack order the labels alphabetically ('Diseased' before
    # 'Healthy'), so colours are looked up by label instead of by position;
    # a positional list would paint Diseased green and Healthy red.
    status_colors = {'Healthy': 'lightgreen', 'Diseased': 'lightcoral'}

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Pie chart
    axes[0].pie(train_status.values, labels=train_status.index, autopct='%1.1f%%',
                colors=[status_colors[label] for label in train_status.index],
                startangle=90, textprops={'fontsize': 12, 'fontweight': 'bold'})
    axes[0].set_title('Overall: Healthy vs Diseased Images', fontsize=14, fontweight='bold')

    # Bar chart per crop
    train_crop_status = train_rows.groupby(['crop', 'status'])['count'].sum().unstack(fill_value=0)
    train_crop_status.plot(kind='bar', stacked=False, ax=axes[1],
                           color=[status_colors[label] for label in train_crop_status.columns],
                           alpha=0.8)
    axes[1].set_xlabel('Crop')
    axes[1].set_ylabel('Number of Images')
    axes[1].set_title('Healthy vs Diseased Images per Crop', fontsize=14, fontweight='bold')
    axes[1].tick_params(axis='x', rotation=45)
    axes[1].legend(title='Status')
    axes[1].grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()

    # .get() keeps the summary working when one of the two statuses is absent
    total_count = train_status.sum()
    diseased_count = train_status.get('Diseased', 0)
    healthy_count = train_status.get('Healthy', 0)

    print("\nHealthy vs Diseased Summary (Training Set):")
    print(train_status)
    print(f"\nPercentage Diseased: {diseased_count / total_count * 100:.1f}%")
    print(f"Percentage Healthy: {healthy_count / total_count * 100:.1f}%")


## 5. Sample Image Visualization


In [None]:
# Visualize sample images from different classes
def visualize_samples(data_dir, split='train', num_classes=6, samples_per_class=4):
    """Show a grid of randomly chosen images, one row per randomly chosen class.

    Classes and images are drawn with ``np.random.choice`` (without
    replacement). Image files are recognised by a ``.jpg``, ``.jpeg`` or
    ``.png`` suffix in any letter case.

    Args:
        data_dir: Dataset root containing one folder per split (``str`` or
            ``pathlib.Path``).
        split: Name of the split folder to sample from.
        num_classes: Maximum number of classes (grid rows) to show.
        samples_per_class: Maximum number of images (grid columns) per class.

    Returns:
        None. The figure is displayed with ``plt.show()``. If the split folder
        is missing or holds no class folders, a message is printed instead.
    """
    image_suffixes = {'.jpg', '.jpeg', '.png'}
    split_dir = Path(data_dir) / split

    if not split_dir.exists():
        print(f"⚠ {split} directory not found")
        return

    all_classes = [d.name for d in split_dir.iterdir() if d.is_dir()]
    if not all_classes:
        print(f"⚠ No class folders found in {split_dir}")
        return

    # Get random classes
    selected_classes = np.random.choice(all_classes, size=min(num_classes, len(all_classes)), replace=False)

    # Size the grid by the classes actually selected (there may be fewer than
    # num_classes); squeeze=False keeps `axes` two-dimensional for any shape.
    num_rows = len(selected_classes)
    fig, axes = plt.subplots(num_rows, samples_per_class,
                             figsize=(16, num_rows * 3), squeeze=False)

    # Hide every axis up front so cells left without an image stay blank.
    for ax in axes.flat:
        ax.axis('off')

    for i, class_name in enumerate(selected_classes):
        class_dir = split_dir / class_name
        image_files = [
            path for path in class_dir.iterdir()
            if path.is_file() and path.suffix.lower() in image_suffixes
        ]

        # Label the row on its first cell, even when the class has no images.
        axes[i, 0].set_title(f"{class_name}\n({len(image_files)} images)",
                             fontsize=10, fontweight='bold', loc='left')

        # Select random samples
        sample_images = np.random.choice(image_files, size=min(samples_per_class, len(image_files)), replace=False)

        for j, img_path in enumerate(sample_images):
            # The context manager closes the file handle once the image is drawn.
            with Image.open(img_path) as img:
                axes[i, j].imshow(img)

    plt.suptitle(f'Sample Images from {split.capitalize()} Set', fontsize=16, fontweight='bold', y=0.995)
    plt.tight_layout()
    plt.show()

# Render a sample grid from the training split: up to 8 classes, 4 images each
print("Visualizing sample images from training set...")
visualize_samples(data_dir, 'train', num_classes=8, samples_per_class=4)


## 6. Image Properties Analysis


In [None]:
# Analyze image properties (dimensions, aspect ratios)
def analyze_image_properties(data_dir, split='train', sample_size=1000):
    """Plot and print size statistics for a random sample of images in a split.

    Up to ``sample_size`` images are drawn with ``np.random.choice`` (without
    replacement) from all class folders of the split. Width, height, aspect
    ratio (width / height) and file size in KB are collected for each one,
    shown as four histograms and summarised on stdout. Image files are
    recognised by a ``.jpg``, ``.jpeg`` or ``.png`` suffix in any letter case.

    Args:
        data_dir: Dataset root containing one folder per split (``str`` or
            ``pathlib.Path``).
        split: Name of the split folder to analyse.
        sample_size: Maximum number of images to inspect.

    Returns:
        None. If the split folder is missing, holds no images, or none of the
        sampled images can be read, a message is printed and nothing is
        plotted.
    """
    image_suffixes = {'.jpg', '.jpeg', '.png'}
    split_dir = Path(data_dir) / split

    if not split_dir.exists():
        print(f"⚠ {split} directory not found")
        return

    # Collect every image path across all class folders
    all_images = []
    for class_dir in split_dir.iterdir():
        if class_dir.is_dir():
            all_images.extend(
                path for path in class_dir.iterdir()
                if path.is_file() and path.suffix.lower() in image_suffixes
            )

    if not all_images:
        print(f"⚠ No images found in {split_dir}")
        return

    sampled_images = np.random.choice(all_images, size=min(sample_size, len(all_images)), replace=False)

    print(f"Analyzing {len(sampled_images)} sample images...")

    widths = []
    heights = []
    aspect_ratios = []
    file_sizes = []
    skipped = 0
    first_error = None

    for img_path in sampled_images:
        # Measure everything before appending anything, so a failure part-way
        # through cannot leave the four lists with different lengths.
        try:
            with Image.open(img_path) as img:
                w, h = img.size
            ratio = w / h
            size_kb = os.path.getsize(img_path) / 1024  # KB
        except Exception as exc:
            # Best effort: one unreadable file must not abort the analysis,
            # but it is counted and reported below instead of being hidden.
            skipped += 1
            if first_error is None:
                first_error = f"{img_path}: {exc}"
            continue
        widths.append(w)
        heights.append(h)
        aspect_ratios.append(ratio)
        file_sizes.append(size_kb)

    if skipped:
        print(f"⚠ Skipped {skipped} unreadable image(s); first error: {first_error}")

    if not widths:
        # The summary statistics below are undefined for an empty sample.
        print("⚠ No readable images to analyze")
        return

    # Plot distributions: (row, column, values, colour, x-axis label, title)
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    panels = [
        (0, 0, widths, 'steelblue', 'Width (pixels)', 'Image Width Distribution'),
        (0, 1, heights, 'coral', 'Height (pixels)', 'Image Height Distribution'),
        (1, 0, aspect_ratios, 'green', 'Aspect Ratio (W/H)', 'Aspect Ratio Distribution'),
        (1, 1, file_sizes, 'purple', 'File Size (KB)', 'File Size Distribution'),
    ]
    for row, col, values, color, x_label, title in panels:
        ax = axes[row, col]
        ax.hist(values, bins=30, color=color, alpha=0.7, edgecolor='black')
        ax.set_xlabel(x_label)
        ax.set_ylabel('Frequency')
        ax.set_title(title)
        ax.grid(axis='y', alpha=0.3)

    plt.suptitle(f'Image Properties Analysis ({split.capitalize()} Set)', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.show()

    print("\nImage Properties Summary:")
    print(f"  Width:  Mean={np.mean(widths):.0f}, Median={np.median(widths):.0f}, Range=[{np.min(widths)}, {np.max(widths)}]")
    print(f"  Height: Mean={np.mean(heights):.0f}, Median={np.median(heights):.0f}, Range=[{np.min(heights)}, {np.max(heights)}]")
    print(f"  Aspect Ratio: Mean={np.mean(aspect_ratios):.2f}, Median={np.median(aspect_ratios):.2f}")
    print(f"  File Size (KB): Mean={np.mean(file_sizes):.1f}, Median={np.median(file_sizes):.1f}")

# Summarise dimensions and file sizes over a sample of up to 1000 training images
analyze_image_properties(data_dir, 'train', sample_size=1000)


In [None]:
# Closing summary cell: fixed text framed by two 80-character rules
separator = "=" * 80
findings = """
1. Dataset Composition:
   - The dataset includes multiple Fresno-relevant crops with varying numbers of disease classes
   - Each crop has at least one healthy class and multiple disease classes
   
2. Class Imbalance:
   - Some classes have significantly more images than others
   - This may require techniques like class weighting or data augmentation during training
   
3. Disease Distribution:
   - Diseased samples outnumber healthy samples in most crops
   - This reflects real-world scenarios where disease identification is critical
   
4. Image Properties:
   - Images have varying dimensions but relatively consistent aspect ratios
   - Preprocessing will normalize images to a standard size (224x224) for the model
   
5. Visual Characteristics:
   - Diseased leaves show clear visual patterns: discoloration, spotting, texture irregularities
   - These features should be learnable by CNNs
   
NEXT STEPS:
- Proceed to model training with data augmentation to handle class imbalances
- Use transfer learning with pre-trained models (ResNet34, VGG16) for better performance
- Implement class weighting if needed
- Monitor per-class performance, especially for minority classes
"""

print(separator)
print("KEY FINDINGS FROM EDA")
print(separator)
print(findings)
print(separator)
