# 🎭 Facial Emotion Recognition - Exploratory Data Analysis (EDA)

**Purpose**: Understand the dataset before building models

**What we'll explore**:
1. Dataset structure and organization
2. Class distribution (balanced or imbalanced?)
3. Image properties (size, quality, format)
4. Visual inspection of samples
5. Data quality issues
6. Insights for preprocessing and training

**Date**: `YYYY-MM-DD`  
**Author**: Your Name

## 📦 Setup & Imports

In [1]:
# Standard library imports
import os
import sys
from pathlib import Path
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Data manipulation
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Rectangle

# Image processing
from PIL import Image
import cv2

# Configure visualization
# Apply the 'seaborn-v0_8-darkgrid' matplotlib style sheet and make "husl"
# the default seaborn colour palette for the figures created below.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
# IPython magics (only valid inside a Jupyter/IPython kernel): draw figures
# inline and request the 'retina' inline figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Set random seed for reproducibility
# Seeds NumPy's global RNG, which the np.random.choice sampling in this
# notebook draws from.
np.random.seed(42)

print("‚úÖ All imports successful!")

‚úÖ All imports successful!


## 🗂️ 1. Dataset Structure & Organization

In [None]:
# Define paths
DATA_DIR = Path('../data/raw')  # Adjust this path based on your setup

# Check if data directory exists.
# `subdirs` is always defined (empty when the directory is missing) so the
# listing below never calls iterdir() on a path that is not a directory.
if DATA_DIR.is_dir():
    print(f"‚úÖ Data directory found: {DATA_DIR}")
    # Sorted once here so the printed order is stable across platforms.
    subdirs = sorted(d for d in DATA_DIR.iterdir() if d.is_dir())
else:
    print(f"‚ùå Data directory not found: {DATA_DIR}")
    print("Please download the dataset first using: make data")
    subdirs = []

# List all subdirectories
print(f"\nüìÅ Found {len(subdirs)} subdirectories:")
for subdir in subdirs:
    print(f"   - {subdir.name}")

In [None]:
# Explore directory structure in detail
def explore_directory_structure(root_path):
    """Summarize every directory in the tree rooted at ``root_path``.

    Args:
        root_path: Directory to walk (``str`` or ``Path``).

    Returns:
        A dict keyed by each visited directory's path relative to
        ``root_path`` (the root itself is ``'.'``). Each value is a dict with:

        * ``'num_subdirs'``: number of immediate subdirectories
        * ``'num_files'``: number of files directly inside the directory
        * ``'subdirs'``: names of the immediate subdirectories
        * ``'file_extensions'``: sorted unique suffixes of those files
          (``''`` for files without an extension)
    """
    structure = {}

    for dirpath, dirnames, filenames in os.walk(root_path):
        rel_path = Path(dirpath).relative_to(root_path)
        structure[str(rel_path)] = {
            'num_subdirs': len(dirnames),
            'num_files': len(filenames),
            # Copy the list: os.walk keeps using its own `dirnames` object
            # to decide which directories to descend into.
            'subdirs': list(dirnames),
            # Sorted so the reported extensions are deterministic.
            'file_extensions': sorted({Path(f).suffix for f in filenames}),
        }

    return structure

structure = explore_directory_structure(DATA_DIR)

print("\nüìä Directory Structure Analysis:")
print("=" * 60)

# Report at most the first 10 visited directories; for each one, build the
# lines of its entry and emit them together.
for position, (rel_dir, details) in enumerate(structure.items()):
    if position >= 10:
        break

    entry_lines = [
        f"\n{rel_dir}:",
        f"  Subdirectories: {details['num_subdirs']}",
        f"  Files: {details['num_files']}",
    ]

    child_names = details['subdirs']
    if child_names:
        # Only the first five child directory names are listed.
        entry_lines.append(f"  Contains: {', '.join(child_names[:5])}")

    suffixes = details['file_extensions']
    if suffixes:
        entry_lines.append(f"  File types: {', '.join(suffixes)}")

    print("\n".join(entry_lines))

## 📊 2. Dataset Statistics & Class Distribution

In [None]:
# Define emotion classes (adjust based on your dataset)
EMOTIONS = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']

def collect_dataset_info(data_path, splits=('train', 'test', 'validation')):
    """Count the images of every emotion class in each dataset split.

    The expected layout is ``data_path/<split>/<emotion>/<image files>``.
    Files matching ``*.jpg``, ``*.png`` or ``*.jpeg`` are counted.

    Args:
        data_path: Dataset root directory (``str`` or ``Path``).
        splits: Names of the split directories to inspect. Splits that do
            not exist are reported with a warning and left out of the result.

    Returns:
        A dict mapping each existing split name to
        ``{'total': int, 'classes': {emotion: count}}``. Every entry of
        ``EMOTIONS`` appears in ``'classes'``; a missing class directory
        counts as 0.
    """
    data_path = Path(data_path)
    patterns = ('*.jpg', '*.png', '*.jpeg')

    dataset_info = {}

    for split in splits:
        split_path = data_path / split

        if not split_path.exists():
            print(f"‚ö†Ô∏è  Split '{split}' not found at {split_path}")
            continue

        class_counts = {}
        for emotion in EMOTIONS:
            emotion_path = split_path / emotion
            if emotion_path.exists():
                class_counts[emotion] = sum(
                    1 for pattern in patterns for _ in emotion_path.glob(pattern)
                )
            else:
                class_counts[emotion] = 0

        dataset_info[split] = {
            'total': sum(class_counts.values()),
            'classes': class_counts,
        }

    return dataset_info

# Gather the per-split / per-class image counts.
dataset_info = collect_dataset_info(DATA_DIR)

# Banner for the statistics report.
banner = "=" * 70
print("\n" + banner)
print("üìà DATASET STATISTICS")
print(banner)

# One section per split; the grand total is accumulated along the way.
total_images = 0
for split_name, split_stats in dataset_info.items():
    split_total = split_stats['total']
    total_images += split_total

    print(f"\n{split_name.upper()} SET: {split_total:,} images")
    print("-" * 40)

    for label, n_images in split_stats['classes'].items():
        # An empty split reports 0% for every class instead of dividing by zero.
        if split_total > 0:
            share = n_images / split_total * 100
        else:
            share = 0
        print(f"  {label:12s}: {n_images:5,} ({share:5.2f}%)")

# Grand total across all splits.
rule = '=' * 40
print(f"\n{rule}")
print(f"TOTAL DATASET: {total_images:,} images")
print(rule)

In [None]:
# Flatten the nested split -> class -> count mapping into one record per
# (split, emotion) pair so it can be analysed as a table.
data_records = [
    {
        'split': split_name,
        'emotion': label,
        'count': n_images,
        # Share of the class within its split; 0 when the split is empty.
        'percentage': (n_images / stats['total'] * 100) if stats['total'] > 0 else 0,
    }
    for split_name, stats in dataset_info.items()
    for label, n_images in stats['classes'].items()
]

df_stats = pd.DataFrame(data_records)

print("\nüìä Dataset Statistics DataFrame:")
print(df_stats.head(10))

# Aggregate views of the class counts.
counts_by_split = df_stats.groupby('split')['count']
print("\nüìà Summary by Split:")
print(counts_by_split.agg(['sum', 'mean', 'std', 'min', 'max']))

counts_by_emotion = df_stats.groupby('emotion')['count']
print("\nüìà Summary by Emotion (across all splits):")
print(counts_by_emotion.agg(['sum', 'mean', 'std']))

## 📈 3. Visualize Class Distribution

In [None]:
# Create comprehensive visualization of class distribution:
# a 2x2 figure built from `df_stats` and saved as a PNG.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Bar plot by split (one bar group per emotion, one bar per split)
ax1 = axes[0, 0]
df_pivot = df_stats.pivot(index='emotion', columns='split', values='count')
df_pivot.plot(kind='bar', ax=ax1, width=0.8)
ax1.set_title('Images per Emotion by Split', fontsize=14, fontweight='bold')
ax1.set_xlabel('Emotion', fontsize=12)
ax1.set_ylabel('Number of Images', fontsize=12)
ax1.legend(title='Split', fontsize=10)
ax1.grid(axis='y', alpha=0.3)
plt.setp(ax1.xaxis.get_majorticklabels(), rotation=45, ha='right')

# 2. Pie chart - overall distribution (counts summed over all splits)
ax2 = axes[0, 1]
emotion_totals = df_stats.groupby('emotion')['count'].sum()
colors = sns.color_palette('husl', len(emotion_totals))
_, _, autotexts = ax2.pie(emotion_totals, labels=emotion_totals.index,
                          autopct='%1.1f%%', startangle=90, colors=colors)
ax2.set_title('Overall Emotion Distribution', fontsize=14, fontweight='bold')
for autotext in autotexts:
    autotext.set_color('white')
    autotext.set_fontsize(10)
    autotext.set_fontweight('bold')

# 3. Stacked bar chart: each emotion's images as a percentage per split
# (every bar sums to 100).
ax3 = axes[1, 0]
df_pivot_pct = df_pivot.div(df_pivot.sum(axis=1), axis=0) * 100
df_pivot_pct.plot(kind='bar', stacked=True, ax=ax3, width=0.8)
ax3.set_title('Percentage Distribution by Split', fontsize=14, fontweight='bold')
ax3.set_xlabel('Emotion', fontsize=12)
ax3.set_ylabel('Percentage', fontsize=12)
ax3.legend(title='Split', fontsize=10)
ax3.grid(axis='y', alpha=0.3)
plt.setp(ax3.xaxis.get_majorticklabels(), rotation=45, ha='right')

# 4. Box plot - distribution of class sizes within each split
ax4 = axes[1, 1]
df_stats.boxplot(column='count', by='split', ax=ax4)
ax4.set_title('Distribution of Class Sizes by Split', fontsize=14, fontweight='bold')
ax4.set_xlabel('Split', fontsize=12)
ax4.set_ylabel('Number of Images', fontsize=12)

# The grouped boxplot replaces the figure-level title with its own
# "Boxplot grouped by ..." text, so the real title is set only afterwards.
fig.suptitle('Class Distribution Analysis', fontsize=20, fontweight='bold')

plt.tight_layout()
# savefig does not create missing directories.
plot_path = Path('../results/plots/class_distribution.png')
plot_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(plot_path, dpi=300, bbox_inches='tight')
plt.show()

print("üíæ Saved visualization to: ../results/plots/class_distribution.png")

In [None]:
# Compare the largest and smallest class of every split and classify the
# max/min ratio as balanced, moderately or significantly imbalanced.
print("\n‚öñÔ∏è  CLASS IMBALANCE ANALYSIS")
print("="*60)

for split_name, split_stats in dataset_info.items():
    class_sizes = list(split_stats['classes'].values())
    if not class_sizes:
        continue

    max_count, min_count = max(class_sizes), min(class_sizes)
    # A class with no images makes the ratio infinite.
    if min_count > 0:
        imbalance_ratio = max_count / min_count
    else:
        imbalance_ratio = float('inf')

    print(f"\n{split_name.upper()} Set:")
    print(f"  Max class size: {max_count:,}")
    print(f"  Min class size: {min_count:,}")
    print(f"  Imbalance ratio: {imbalance_ratio:.2f}:1")

    if imbalance_ratio <= 1.5:
        print("  ‚úÖ WELL BALANCED")
    elif imbalance_ratio <= 3:
        print("  ‚ö†Ô∏è  MODERATE IMBALANCE - Monitor class-wise metrics")
    else:
        print("  ‚ö†Ô∏è  SIGNIFICANT IMBALANCE - Consider using:")
        for remedy in ("Class weights", "Focal loss", "Oversampling minority classes"):
            print(f"     - {remedy}")

## 🖼️ 4. Image Properties Analysis

In [None]:
# Analyze image properties (sample from each class)
def analyze_image_properties(data_path, split='train', samples_per_class=50):
    """Measure dimensions, file size and format on a random sample of images.

    Up to ``samples_per_class`` images are drawn without replacement from
    each ``data_path/<split>/<emotion>`` directory (``*.jpg``, ``*.png``,
    ``*.jpeg``) using NumPy's global RNG.

    Args:
        data_path: Dataset root as a ``Path``.
        split: Name of the split directory to sample from.
        samples_per_class: Maximum number of images inspected per emotion.

    Returns:
        A ``pandas.DataFrame`` with one row per successfully opened image and
        the columns ``emotion``, ``width``, ``height``, ``aspect_ratio``
        (width / height), ``channels``, ``total_pixels``, ``file_size_kb``
        and ``format``. The frame has no columns when nothing was analysed.
        Images that fail to open are reported on stdout and skipped.
    """
    results = []

    for emotion in EMOTIONS:
        emotion_path = data_path / split / emotion

        if not emotion_path.exists():
            continue

        # Get sample of images
        images = list(emotion_path.glob('*.jpg')) + \
                list(emotion_path.glob('*.png')) + \
                list(emotion_path.glob('*.jpeg'))

        # Nothing to sample from; same guard as the other sampling helpers.
        if not images:
            continue

        sample = np.random.choice(images, min(samples_per_class, len(images)), replace=False)

        for img_path in sample:
            try:
                # The context manager closes the image file as soon as the
                # header data has been read instead of leaving it open.
                with Image.open(img_path) as img:
                    width, height = img.size
                    channels = len(img.getbands())
                    img_format = img.format
                file_size = os.path.getsize(img_path) / 1024  # KB

                results.append({
                    'emotion': emotion,
                    'width': width,
                    'height': height,
                    'aspect_ratio': width / height,
                    'channels': channels,
                    'total_pixels': width * height,
                    'file_size_kb': file_size,
                    'format': img_format
                })
            except Exception as e:
                # Best effort: one unreadable file must not abort the analysis.
                print(f"‚ùå Error processing {img_path}: {e}")

    return pd.DataFrame(results)

print("üîç Analyzing image properties (this may take a moment)...")
df_images = analyze_image_properties(DATA_DIR, split='train', samples_per_class=100)

print("\nüìä Image Properties Summary:")
print("="*70)
print(f"Total images analyzed: {len(df_images):,}")
print(f"\nDimensions:")
print(f"  Width:  {df_images['width'].min():.0f} - {df_images['width'].max():.0f} px (mean: {df_images['width'].mean():.1f})")
print(f"  Height: {df_images['height'].min():.0f} - {df_images['height'].max():.0f} px (mean: {df_images['height'].mean():.1f})")
print(f"  Aspect ratio: {df_images['aspect_ratio'].min():.2f} - {df_images['aspect_ratio'].max():.2f} (mean: {df_images['aspect_ratio'].mean():.2f})")
print(f"\nFile properties:")
print(f"  File size: {df_images['file_size_kb'].min():.1f} - {df_images['file_size_kb'].max():.1f} KB (mean: {df_images['file_size_kb'].mean():.1f} KB)")
print(f"  Channels: {df_images['channels'].value_counts().to_dict()}")
print(f"  Formats: {df_images['format'].value_counts().to_dict()}")

# Detailed statistics
print("\nüìà Detailed Statistics:")
print(df_images[['width', 'height', 'aspect_ratio', 'file_size_kb']].describe())

In [None]:
# Visualize image properties: a 2x3 figure built from `df_images`
# (four histograms, a width/height scatter and a per-emotion box plot).
fig, axes = plt.subplots(2, 3, figsize=(18, 10))

# Panels 1-4: histograms with a dashed red line at the mean.
# Each spec is (column, axes, title, x-label, bar colour, mean label format);
# a colour of None keeps matplotlib's default.
histogram_specs = [
    ('width', axes[0, 0], 'Width Distribution', 'Width (pixels)', None, 'Mean: {:.0f}'),
    ('height', axes[0, 1], 'Height Distribution', 'Height (pixels)', 'orange', 'Mean: {:.0f}'),
    ('aspect_ratio', axes[0, 2], 'Aspect Ratio Distribution', 'Aspect Ratio (W/H)', 'green', 'Mean: {:.2f}'),
    ('file_size_kb', axes[1, 0], 'File Size Distribution', 'File Size (KB)', 'purple', 'Mean: {:.1f} KB'),
]
for column, ax, title, xlabel, color, mean_label in histogram_specs:
    values = df_images[column]
    mean_value = values.mean()
    ax.hist(values, bins=30, edgecolor='black', alpha=0.7, color=color)
    ax.axvline(mean_value, color='red', linestyle='--', label=mean_label.format(mean_value))
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.legend()
    ax.grid(alpha=0.3)

# 5. Scatter: width vs height, coloured by emotion category code
axes[1, 1].scatter(df_images['width'], df_images['height'],
                   c=df_images['emotion'].astype('category').cat.codes,
                   alpha=0.5, cmap='tab10')
# Diagonal reference line: points on it are square images.
longest_side = max(df_images['width'].max(), df_images['height'].max())
axes[1, 1].plot([0, longest_side], [0, longest_side],
                'r--', alpha=0.5, label='Square (1:1)')
axes[1, 1].set_title('Width vs Height')
axes[1, 1].set_xlabel('Width (pixels)')
axes[1, 1].set_ylabel('Height (pixels)')
axes[1, 1].legend()
axes[1, 1].grid(alpha=0.3)

# 6. Box plot by emotion
df_images.boxplot(column='total_pixels', by='emotion', ax=axes[1, 2])
axes[1, 2].set_title('Image Size by Emotion')
axes[1, 2].set_xlabel('Emotion')
axes[1, 2].set_ylabel('Total Pixels')
plt.setp(axes[1, 2].xaxis.get_majorticklabels(), rotation=45, ha='right')

# The grouped boxplot replaces the figure-level title with its own
# "Boxplot grouped by ..." text, so the real title is set only afterwards.
fig.suptitle('Image Properties Analysis', fontsize=18, fontweight='bold')

plt.tight_layout()
# savefig does not create missing directories.
plot_path = Path('../results/plots/image_properties.png')
plot_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(plot_path, dpi=300, bbox_inches='tight')
plt.show()

print("üíæ Saved visualization to: ../results/plots/image_properties.png")

## 🔍 5. Visual Inspection of Sample Images

In [None]:
# Display sample images from each emotion class
def display_emotion_samples(data_path, split='train', samples_per_emotion=5):
    """Show a grid of random sample images, one row per emotion class.

    Up to ``samples_per_emotion`` images are drawn without replacement from
    each ``data_path/<split>/<emotion>`` directory using NumPy's global RNG.
    The figure is shown and saved to ``../results/plots/emotion_samples.png``.

    Args:
        data_path: Dataset root as a ``Path``.
        split: Name of the split directory to sample from.
        samples_per_emotion: Number of columns (images per emotion).

    Returns:
        None.
    """
    # squeeze=False keeps `axes` two-dimensional even with a single column,
    # so axes[i, j] always addresses row i (emotion) and column j (sample).
    fig, axes = plt.subplots(len(EMOTIONS), samples_per_emotion,
                             figsize=(samples_per_emotion * 3, len(EMOTIONS) * 3),
                             squeeze=False)
    fig.suptitle(f'Sample Images from {split.upper()} Set',
                 fontsize=20, fontweight='bold', y=0.995)

    # Hide ticks and frames on every cell instead of calling ax.axis('off'):
    # turning the axis off would also hide the row label set via set_ylabel.
    for ax in axes.flat:
        ax.set_xticks([])
        ax.set_yticks([])
        for spine in ax.spines.values():
            spine.set_visible(False)

    for i, emotion in enumerate(EMOTIONS):
        # Row label on the first column.
        axes[i, 0].set_ylabel(emotion.upper(), fontsize=14, fontweight='bold', rotation=0,
                              ha='right', va='center')

        emotion_path = data_path / split / emotion

        if not emotion_path.exists():
            continue

        # Get random samples
        images = list(emotion_path.glob('*.jpg')) + \
                list(emotion_path.glob('*.png')) + \
                list(emotion_path.glob('*.jpeg'))

        if not images:
            continue

        sample = np.random.choice(images, min(samples_per_emotion, len(images)), replace=False)

        for j, img_path in enumerate(sample):
            ax = axes[i, j]
            try:
                # The context manager closes the image file after drawing.
                with Image.open(img_path) as img:
                    # cmap only affects single-channel data (it is ignored
                    # for RGB(A)), so grayscale faces are not false-coloured.
                    ax.imshow(img, cmap='gray')

                    # Add image info
                    ax.set_title(f"{img.size[0]}x{img.size[1]}", fontsize=9)

            except Exception as e:
                # Best effort: an unreadable file leaves its cell empty.
                print(f"Error loading {img_path}: {e}")

    plt.tight_layout()
    # savefig does not create missing directories.
    plot_path = Path('../results/plots/emotion_samples.png')
    plot_path.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    plt.show()

    print("üíæ Saved visualization to: ../results/plots/emotion_samples.png")

display_emotion_samples(DATA_DIR, split='train', samples_per_emotion=6)

## 🎨 6. Image Quality Analysis

In [None]:
# Analyze brightness, contrast, and color distribution
def analyze_image_quality(data_path, split='train', samples=100):
    """Measure brightness, contrast and colour means on a random image sample.

    Up to ``samples`` images are drawn without replacement from each
    ``data_path/<split>/<emotion>`` directory (``*.jpg``, ``*.png``,
    ``*.jpeg``) using NumPy's global RNG and loaded with OpenCV.

    Args:
        data_path: Dataset root as a ``Path``.
        split: Name of the split directory to sample from.
        samples: Maximum number of images inspected per emotion.

    Returns:
        A ``pandas.DataFrame`` with one row per processed image and the
        columns ``emotion``, ``brightness`` (mean of the grayscale image),
        ``contrast`` (standard deviation of the grayscale image),
        ``red_mean``, ``green_mean`` and ``blue_mean``. The frame has no
        columns when nothing was analysed. Images that cannot be processed
        are reported on stdout and skipped.
    """
    results = []

    for emotion in EMOTIONS:
        emotion_path = data_path / split / emotion

        if not emotion_path.exists():
            continue

        images = list(emotion_path.glob('*.jpg')) + \
                list(emotion_path.glob('*.png')) + \
                list(emotion_path.glob('*.jpeg'))

        if not images:
            continue

        sample = np.random.choice(images, min(samples, len(images)), replace=False)

        for img_path in sample:
            try:
                img = cv2.imread(str(img_path))
                # cv2.imread reports an unreadable file by returning None
                # rather than raising, so check it explicitly instead of
                # relying on the opaque cvtColor failure that would follow.
                if img is None:
                    print(f"Error processing {img_path}: file could not be read as an image")
                    continue

                img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

                # Calculate metrics
                brightness = np.mean(img_gray)
                contrast = np.std(img_gray)

                # Color channel means (channel order is R, G, B after the conversion)
                r_mean = np.mean(img_rgb[:, :, 0])
                g_mean = np.mean(img_rgb[:, :, 1])
                b_mean = np.mean(img_rgb[:, :, 2])

                results.append({
                    'emotion': emotion,
                    'brightness': brightness,
                    'contrast': contrast,
                    'red_mean': r_mean,
                    'green_mean': g_mean,
                    'blue_mean': b_mean
                })

            except Exception as e:
                # Best effort: one bad file must not abort the analysis.
                print(f"Error processing {img_path}: {e}")

    return pd.DataFrame(results)

print("üé® Analyzing image quality...")
df_quality = analyze_image_quality(DATA_DIR, split='train', samples=100)

print("\nüìä Image Quality Summary:")
print("="*70)
print(f"\nBrightness (0-255):")
print(f"  Mean: {df_quality['brightness'].mean():.2f}")
print(f"  Std:  {df_quality['brightness'].std():.2f}")
print(f"  Range: {df_quality['brightness'].min():.2f} - {df_quality['brightness'].max():.2f}")

print(f"\nContrast (std of pixel values):")
print(f"  Mean: {df_quality['contrast'].mean():.2f}")
print(f"  Std:  {df_quality['contrast'].std():.2f}")
print(f"  Range: {df_quality['contrast'].min():.2f} - {df_quality['contrast'].max():.2f}")

print(f"\nColor Channel Means:")
print(f"  Red:   {df_quality['red_mean'].mean():.2f}")
print(f"  Green: {df_quality['green_mean'].mean():.2f}")
print(f"  Blue:  {df_quality['blue_mean'].mean():.2f}")

In [None]:
# Visualize quality metrics: a 2x2 figure built from `df_quality`.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 1-2. Brightness and contrast distributions by emotion
boxplot_specs = (
    (axes[0, 0], 'brightness', 'Brightness by Emotion', 'Brightness (0-255)'),
    (axes[0, 1], 'contrast', 'Contrast by Emotion', 'Contrast (Std)'),
)
for ax, column, title, ylabel in boxplot_specs:
    df_quality.boxplot(column=column, by='emotion', ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Emotion')
    ax.set_ylabel(ylabel)
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=45, ha='right')

# 3. Color distribution (average of the per-image channel means)
color_means = df_quality[['red_mean', 'green_mean', 'blue_mean']].mean()
axes[1, 0].bar(['Red', 'Green', 'Blue'], color_means, color=['red', 'green', 'blue'], alpha=0.7)
axes[1, 0].set_title('Average Color Channel Values')
axes[1, 0].set_ylabel('Mean Value (0-255)')
axes[1, 0].grid(axis='y', alpha=0.3)

# 4. Brightness vs Contrast scatter, coloured by emotion category code
axes[1, 1].scatter(df_quality['brightness'], df_quality['contrast'],
                   c=df_quality['emotion'].astype('category').cat.codes,
                   alpha=0.5, cmap='tab10')
axes[1, 1].set_title('Brightness vs Contrast')
axes[1, 1].set_xlabel('Brightness')
axes[1, 1].set_ylabel('Contrast')
axes[1, 1].grid(alpha=0.3)

# The grouped boxplots replace the figure-level title with their own
# "Boxplot grouped by ..." text, so the real title is set only afterwards.
fig.suptitle('Image Quality Analysis', fontsize=18, fontweight='bold')

plt.tight_layout()
# savefig does not create missing directories.
plot_path = Path('../results/plots/image_quality.png')
plot_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(plot_path, dpi=300, bbox_inches='tight')
plt.show()

print("üíæ Saved visualization to: ../results/plots/image_quality.png")

## 🚨 7. Data Quality Issues

In [None]:
# Scan the sampled image tables for suspicious images and collect one
# message per finding in `issues`.
print("üîç Checking for Data Quality Issues...")
print("="*70)

issues = []


def _flag_issue(message, advice=None):
    """Record ``message`` in ``issues`` and print it, followed by ``advice`` if given."""
    issues.append(message)
    print(f"\n{message}")
    if advice is not None:
        print(advice)


# 1. Very small images (either side below 48 px)
small_images = df_images[(df_images['width'] < 48) | (df_images['height'] < 48)]
if len(small_images):
    _flag_issue(f"‚ö†Ô∏è  Found {len(small_images)} images smaller than 48x48 pixels",
                f"   Consider removing or excluding these from training")

# 2. Extreme aspect ratios (narrower than 1:2 or wider than 2:1)
extreme_ar = df_images[(df_images['aspect_ratio'] < 0.5) | (df_images['aspect_ratio'] > 2.0)]
if len(extreme_ar):
    _flag_issue(f"‚ö†Ô∏è  Found {len(extreme_ar)} images with extreme aspect ratios",
                f"   These might be cropped incorrectly")

# 3. Single-channel images
if 'channels' in df_images.columns:
    grayscale = df_images[df_images['channels'] == 1]
    if len(grayscale):
        _flag_issue(f"‚ö†Ô∏è  Found {len(grayscale)} grayscale images",
                    f"   Consider converting to RGB for consistency")

# 4 + 5. Brightness and contrast outliers (only if quality data exists)
if len(df_quality) > 0:
    very_dark = df_quality[df_quality['brightness'] < 30]
    very_bright = df_quality[df_quality['brightness'] > 225]

    if len(very_dark):
        _flag_issue(f"‚ö†Ô∏è  Found {len(very_dark)} very dark images (brightness < 30)")

    if len(very_bright):
        _flag_issue(f"‚ö†Ô∏è  Found {len(very_bright)} very bright images (brightness > 225)")

    low_contrast = df_quality[df_quality['contrast'] < 20]
    if len(low_contrast):
        _flag_issue(f"‚ö†Ô∏è  Found {len(low_contrast)} low contrast images (std < 20)",
                    f"   These might be blurry or have poor quality")

# Closing summary
if issues:
    print(f"\n\nüìã Summary: Found {len(issues)} potential issues")
    print("   Review these and consider preprocessing steps to address them")
else:
    print("\n‚úÖ No major data quality issues detected!")

## 📝 8. Key Findings & Recommendations

In [None]:
# Generate comprehensive summary report from `dataset_info` and `df_images`.
print("\n" + "="*70)
print("üìä KEY FINDINGS & RECOMMENDATIONS")
print("="*70)

print("\n1Ô∏è‚É£  DATASET SIZE:")
for split, info in dataset_info.items():
    print(f"   - {split.upper()}: {info['total']:,} images")
total = sum(info['total'] for info in dataset_info.values())
print(f"   - TOTAL: {total:,} images")

print("\n2Ô∏è‚É£  CLASS BALANCE:")
# Splits whose max/min class ratio exceeds 2. Section 7 relies on this list:
# class imbalance is never recorded in `issues`, so searching `issues` for it
# would always report balanced classes.
imbalanced_splits = []
for split, info in dataset_info.items():
    counts = list(info['classes'].values())
    if counts:
        imbalance_ratio = max(counts) / min(counts) if min(counts) > 0 else float('inf')
        print(f"   - {split.upper()}: {imbalance_ratio:.2f}:1 ratio")
        if imbalance_ratio > 2:
            imbalanced_splits.append(split)
            print(f"     ‚ö†Ô∏è  RECOMMENDATION: Use class weights or focal loss")

print("\n3Ô∏è‚É£  IMAGE PROPERTIES:")
print(f"   - Size range: {df_images['width'].min():.0f}x{df_images['height'].min():.0f} to "
      f"{df_images['width'].max():.0f}x{df_images['height'].max():.0f}")
print(f"   - Average size: {df_images['width'].mean():.0f}x{df_images['height'].mean():.0f}")
print(f"   ‚úÖ RECOMMENDATION: Resize all images to 224x224 for training")

print("\n4Ô∏è‚É£  DATA AUGMENTATION:")
for line in (
    "   ‚úÖ RECOMMENDED techniques:",
    "      - Random horizontal flips (faces are mostly symmetric)",
    "      - Small rotations (¬±15¬∞)",
    "      - Brightness/contrast adjustments",
    "      - Random crops",
    "   ‚ö†Ô∏è  AVOID:",
    "      - Vertical flips (unnatural)",
    "      - Large rotations (>20¬∞)",
    "      - Heavy color distortions",
):
    print(line)

print("\n5Ô∏è‚É£  PREPROCESSING PIPELINE:")
for line in (
    "   1. Resize to 224x224",
    "   2. Convert to RGB (if needed)",
    "   3. Normalize with ImageNet stats",
    "   4. Apply augmentation (training only)",
):
    print(line)

print("\n6Ô∏è‚É£  MODEL RECOMMENDATIONS:")
for line in (
    "   - Start with: Baseline CNN (fast iteration)",
    "   - Best results: EfficientNet-B0 (pretrained)",
    "   - Alternative: ResNet50 (proven architecture)",
):
    print(line)

print("\n7Ô∏è‚É£  TRAINING STRATEGY:")
if imbalanced_splits:
    print(f"   ‚ö†Ô∏è  Due to class imbalance:")
    print(f"      - Use weighted loss or focal loss")
    print(f"      - Monitor per-class metrics")
    print(f"      - Consider oversampling minority classes")
else:
    print(f"   ‚úÖ Classes are balanced, standard training should work well")

print("\n" + "="*70)
print("‚úÖ EDA COMPLETE - Ready to start preprocessing and training!")
print("="*70)

## 💾 9. Save Summary Report

In [None]:
# Save summary statistics to file
summary_path = Path('../results/reports/eda_summary.txt')
summary_path.parent.mkdir(parents=True, exist_ok=True)

# Explicit UTF-8: the collected issue messages contain non-ASCII symbols,
# which a platform-default encoding may be unable to write.
with open(summary_path, 'w', encoding='utf-8') as f:
    f.write("FACIAL EMOTION RECOGNITION - EDA SUMMARY\n")
    f.write("="*70 + "\n\n")

    # Per-split class counts and percentages
    f.write("DATASET STATISTICS:\n")
    f.write("-"*70 + "\n")
    for split, info in dataset_info.items():
        f.write(f"\n{split.upper()} SET: {info['total']:,} images\n")
        for emotion, count in info['classes'].items():
            percentage = (count / info['total'] * 100) if info['total'] > 0 else 0
            f.write(f"  {emotion:12s}: {count:5,} ({percentage:5.2f}%)\n")

    f.write("\n" + "="*70 + "\n")
    f.write(f"TOTAL: {sum(info['total'] for info in dataset_info.values()):,} images\n")

    # Dimension ranges of the sampled images
    f.write("\n\nIMAGE PROPERTIES:\n")
    f.write("-"*70 + "\n")
    f.write(f"Width:  {df_images['width'].min():.0f} - {df_images['width'].max():.0f} px\n")
    f.write(f"Height: {df_images['height'].min():.0f} - {df_images['height'].max():.0f} px\n")
    f.write(f"Average: {df_images['width'].mean():.0f}x{df_images['height'].mean():.0f}\n")

    # The section is omitted entirely when no issue was recorded.
    if issues:
        f.write("\n\nDATA QUALITY ISSUES:\n")
        f.write("-"*70 + "\n")
        for issue in issues:
            f.write(f"{issue}\n")

    f.write("\n\nRECOMMENDATIONS:\n")
    f.write("-"*70 + "\n")
    f.write("1. Resize all images to 224x224\n")
    f.write("2. Use ImageNet normalization\n")
    f.write("3. Apply data augmentation\n")
    f.write("4. Start with EfficientNet-B0\n")
    f.write("5. Monitor class-wise metrics\n")

print(f"\nüíæ Saved EDA summary to: {summary_path}")

# Save DataFrames
# to_csv does not create missing directories, so make sure the target exists.
metrics_dir = Path('../results/metrics')
metrics_dir.mkdir(parents=True, exist_ok=True)
df_stats.to_csv(metrics_dir / 'class_distribution.csv', index=False)
df_images.to_csv(metrics_dir / 'image_properties.csv', index=False)
if len(df_quality) > 0:
    df_quality.to_csv(metrics_dir / 'image_quality.csv', index=False)

print("üíæ Saved CSV files to: ../results/metrics/")
print("\n‚úÖ EDA notebook complete! You're ready to move to preprocessing.")

## 🎯 Next Steps

Based on this analysis:

1. **Preprocessing** (`02_preprocessing.ipynb`)
   - Implement image resizing
   - Setup data augmentation
   - Create train/val/test splits

2. **Baseline Model** (`03_baseline_model.ipynb`)
   - Build simple CNN
   - Establish baseline performance
   - Identify areas for improvement

3. **Advanced Models** (`04_model_experiments.ipynb`)
   - Transfer learning (ResNet, EfficientNet)
   - Hyperparameter tuning
   - Ensemble methods

Good luck! üöÄ