# Lab 02: Data Scaling Experiments - Practical Implementation

In this notebook, we'll run controlled experiments to understand how dataset size affects model performance. We'll train the same model on 10% and 20% of the data, compare the results, and look for early signs of diminishing returns.

## What We'll Do

1. **Download two dataset sizes** (10% and 20%)
2. **Train identical models** on each dataset
3. **Track experiments** with TensorBoard
4. **Compare results** statistically
5. **Analyze ROI** of data collection

## 1. Setup and Imports

In [None]:
# Core imports
import torch
import torchvision
from torch import nn
from torchvision import transforms
from torch.utils.tensorboard import SummaryWriter

# Utilities
import os
import zipfile
import requests
from pathlib import Path
from datetime import datetime
import random
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from typing import Dict, List, Tuple
import pandas as pd

# Report the library versions this notebook is running against
print(f"PyTorch Version: {torch.__version__}")
print(f"TorchVision Version: {torchvision.__version__}")

# Pick the compute device: the GPU when CUDA is usable, the CPU otherwise
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

## 2. Download Helper Modules

In [None]:
# Download helper scripts if not present
import sys
from pathlib import Path

# Add parent directory to path
# (lets Python import packages that live one level above the current working directory)
sys.path.append(str(Path.cwd().parent))

# Check if going_modular exists
# NOTE: only the current working directory is checked here, not the parent directory added above.
if not Path("going_modular").exists():
    print("Downloading helper modules...")
    # IPython shell escapes: clone the course repository, keep only its
    # going_modular folder, then delete the rest of the clone.
    !git clone https://github.com/mrdbourke/pytorch-deep-learning
    !mv pytorch-deep-learning/going_modular .
    !rm -rf pytorch-deep-learning
    print("Helper modules downloaded!")
else:
    print("Helper modules already exist.")

# Import helper functions
# NOTE(review): presumably the cloned folder nests the modules one level deeper
# (going_modular/going_modular/) and may not ship a `utils` module at this level —
# confirm this import resolves after a fresh download.
from going_modular import data_setup, engine, utils

## 3. Set Random Seeds for Reproducibility

In [None]:
def set_seeds(seed: int = 42):
    """Seed every random number generator used in this notebook.

    The same value is handed to PyTorch (CPU and CUDA), NumPy and Python's
    ``random`` module. The cuDNN ``deterministic`` flag is switched on and
    the cuDNN ``benchmark`` flag is switched off.

    Args:
        seed: Value passed to each generator.
    """
    # The individual generators are independent, so one loop covers them all
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, np.random.seed, random.seed):
        seed_fn(seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


# Seed once up front; the experiment cells re-seed before each run
set_seeds(42)
print("Random seeds set for reproducibility")

## 4. Data Download Functions

In [None]:
def download_data(url: str, destination_name: str) -> Path:
    """Download a zip archive and extract it into ``data/<destination_name>``.

    The download is skipped when the destination folder already exists and
    contains at least one entry. An empty folder (for example one left
    behind by an interrupted earlier attempt) triggers a fresh download.

    Args:
        url: URL of the zip archive to download.
        destination_name: Name of the folder under ``data/`` that receives
            the extracted files; also used for the temporary zip file name.

    Returns:
        Path to the folder holding the extracted data.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        zipfile.BadZipFile: If the downloaded file is not a valid zip archive.
    """
    data_path = Path("data/")
    image_path = data_path / destination_name

    # An existing but empty folder means a previous attempt never finished,
    # so only a populated folder counts as "already downloaded".
    if image_path.is_dir() and any(image_path.iterdir()):
        print(f"[INFO] {image_path} already exists, skipping download.")
        return image_path

    print(f"[INFO] Creating {image_path} directory...")
    image_path.mkdir(parents=True, exist_ok=True)

    # Download the data; fail loudly on HTTP errors instead of writing an
    # error page to disk and tripping over it during extraction.
    zip_path = data_path / (destination_name + ".zip")
    print(f"[INFO] Downloading {destination_name}...")
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    zip_path.write_bytes(response.content)

    try:
        # Extract into the dataset's own folder. Extracting into data/ would
        # leave image_path empty and let different archives overwrite each
        # other's train/ and test/ folders.
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            print(f"[INFO] Unzipping {destination_name}...")
            zip_ref.extractall(image_path)
    finally:
        # Clean up the zip file even when extraction fails
        zip_path.unlink()

    return image_path

## 5. Download Both Dataset Sizes

In [None]:
# Destination folder -> archive URL.
# Per the original notes: the 10% set has 225 training images, the 20% set has 450.
_archives = {
    "pizza_steak_sushi_10_percent": "https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi.zip",
    "pizza_steak_sushi_20_percent": "https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi_20_percent.zip",
}

# Dicts keep insertion order, so the 10% archive is fetched first, then the 20% one
data_10_percent, data_20_percent = [
    download_data(url=archive_url, destination_name=folder_name)
    for folder_name, archive_url in _archives.items()
]

print(f"\n[INFO] Dataset paths:")
print(f"  10% data: {data_10_percent}")
print(f"  20% data: {data_20_percent}")

## 6. Analyze Dataset Sizes

In [None]:
def count_images(path: Path) -> Tuple[Dict[str, int], int, int]:
    """Tally the ``.jpg`` files under ``path/train`` and ``path/test``.

    Each split is scanned one level deep: every sub-directory is treated as
    a class, and only files directly inside it that match ``*.jpg`` are
    counted. A split directory that does not exist contributes nothing.

    Args:
        path: Dataset root containing the ``train`` and ``test`` folders.

    Returns:
        Tuple of (class_counts, total_train, total_test), where
        ``class_counts`` maps each training class name to its image count.
    """
    def jpgs_per_class(split_dir: Path) -> Dict[str, int]:
        # Map class-folder name -> number of *.jpg files directly inside it
        if not split_dir.exists():
            return {}
        return {
            entry.name: sum(1 for _ in entry.glob("*.jpg"))
            for entry in split_dir.iterdir()
            if entry.is_dir()
        }

    class_counts = jpgs_per_class(path / "train")
    test_counts = jpgs_per_class(path / "test")

    return class_counts, sum(class_counts.values()), sum(test_counts.values())

# Analyze both datasets
print("="*50)
print("DATASET ANALYSIS")
print("="*50)


def _report_dataset(label, dataset_path):
    """Print per-class and total image counts for one dataset and return them."""
    print(f"\n{label} Dataset:")
    counts, n_train, n_test = count_images(dataset_path)
    for class_name, count in counts.items():
        print(f"  {class_name}: {count} images")
    print(f"  Total train: {n_train}, Total test: {n_test}")
    return counts, n_train, n_test


class_counts_10, train_10, test_10 = _report_dataset("10%", data_10_percent)
class_counts_20, train_20, test_20 = _report_dataset("20%", data_20_percent)

# Guard the ratio: an empty or missing 10% train folder (e.g. after a failed
# download) would otherwise end this cell with a bare ZeroDivisionError.
if train_10 > 0:
    print(f"\nData Increase: {(train_20/train_10 - 1)*100:.0f}%")
else:
    print("\nData Increase: n/a (no training images found in the 10% dataset)")

## 7. Create DataLoaders

In [None]:
# Use the preprocessing bundled with the pretrained EfficientNet-B0 weights
weights = torchvision.models.EfficientNet_B0_Weights.DEFAULT
auto_transforms = weights.transforms()

print(f"Using transforms: {auto_transforms}")

# Settings shared by both loaders. The test split of the 10% dataset is used
# for BOTH experiments so the two models are scored on identical images.
_shared_loader_args = dict(
    test_dir=data_10_percent / "test",
    transform=auto_transforms,
    batch_size=32,
)

# Loaders for the 10% training split (this call also yields the class names)
train_dataloader_10, test_dataloader_10, class_names = data_setup.create_dataloaders(
    train_dir=data_10_percent / "train",
    **_shared_loader_args,
)

# Loaders for the 20% training split, evaluated on the same test set
train_dataloader_20, test_dataloader_20, _ = data_setup.create_dataloaders(
    train_dir=data_20_percent / "train",
    **_shared_loader_args,
)

print(f"\n[INFO] DataLoaders created:")
print(f"  10% data: {len(train_dataloader_10)} train batches")
print(f"  20% data: {len(train_dataloader_20)} train batches")
print(f"  Test data: {len(test_dataloader_10)} test batches (same for both)")
print(f"  Classes: {class_names}")

## 8. Model Creation Function

In [None]:
def create_effnetb0_model(num_classes: int = 3) -> nn.Module:
    """Build an EfficientNet-B0 feature extractor with a new classifier head.

    The network is loaded with torchvision's default pretrained weights, all
    parameters under ``features`` are frozen, and the classifier is replaced
    by dropout followed by a linear layer with ``num_classes`` outputs.

    Args:
        num_classes: Number of output classes

    Returns:
        The model, moved to the notebook-level ``device``
    """
    backbone = torchvision.models.efficientnet_b0(
        weights=torchvision.models.EfficientNet_B0_Weights.DEFAULT
    )

    # Freeze the base layers so only the new head receives gradient updates
    backbone.features.requires_grad_(False)

    # Swap in a classifier head sized for our classes
    head_layers = [
        nn.Dropout(p=0.2, inplace=True),
        nn.Linear(in_features=1280, out_features=num_classes),
    ]
    backbone.classifier = nn.Sequential(*head_layers)

    return backbone.to(device)

# Smoke test: build one model, report how many parameters will train, then drop it
test_model = create_effnetb0_model()
_trainable_params = sum(p.numel() for p in test_model.parameters() if p.requires_grad)
print(f"Model created with {_trainable_params:,} trainable parameters")
del test_model  # Clean up

## 9. TensorBoard Writer Function

In [None]:
def create_writer(experiment_name: str,
                  model_name: str,
                  extra: str = None) -> SummaryWriter:
    """Build a SummaryWriter whose log directory encodes the run's identity.

    The directory is ``runs/<YYYY-MM-DD>/<experiment_name>/<model_name>``,
    with ``<extra>`` appended as a final component when it is given and
    non-empty.

    Args:
        experiment_name: Name of the experiment (e.g., "data_10_percent")
        model_name: Name of the model (e.g., "effnetb0")
        extra: Additional info (e.g., "5_epochs")

    Returns:
        SummaryWriter instance writing to that directory
    """
    today = datetime.now().strftime("%Y-%m-%d")

    # Collect the path components, then join them once
    path_parts = ["runs", today, experiment_name, model_name]
    if extra:
        path_parts.append(extra)
    log_dir = os.path.join(*path_parts)

    print(f"[INFO] Created SummaryWriter saving to: {log_dir}")
    return SummaryWriter(log_dir=log_dir)

## 10. Training Function with Tracking

In [None]:
def train_and_track(model: nn.Module,
                    train_dataloader: torch.utils.data.DataLoader,
                    test_dataloader: torch.utils.data.DataLoader,
                    optimizer: torch.optim.Optimizer,
                    loss_fn: nn.Module,
                    writer: SummaryWriter,
                    epochs: int = 5,
                    device: str = "cpu") -> Dict[str, List[float]]:
    """Run the train/test loop and mirror every epoch's metrics to TensorBoard.

    Each epoch calls ``engine.train_step`` and then ``engine.test_step``,
    prints the four resulting metrics, stores them, and logs them under the
    "Loss" and "Accuracy" scalar groups. The writer is not closed here.

    Args:
        model: Model to train; handed to the engine helpers unchanged.
        train_dataloader: Batches used by ``engine.train_step``.
        test_dataloader: Batches used by ``engine.test_step``.
        optimizer: Optimizer passed to ``engine.train_step``.
        loss_fn: Loss passed to both engine helpers.
        writer: TensorBoard writer that receives the per-epoch scalars.
        epochs: Number of epochs to run.
        device: Device string forwarded to the engine helpers.

    Returns:
        Dictionary with keys "train_loss", "train_acc", "test_loss" and
        "test_acc", each holding one value per epoch.
    """
    metric_names = ("train_loss", "train_acc", "test_loss", "test_acc")
    results = {name: [] for name in metric_names}

    for epoch in tqdm(range(epochs), desc="Training"):
        # One optimisation pass over the training batches
        train_loss, train_acc = engine.train_step(
            model=model,
            dataloader=train_dataloader,
            loss_fn=loss_fn,
            optimizer=optimizer,
            device=device
        )

        # One evaluation pass over the test batches
        test_loss, test_acc = engine.test_step(
            model=model,
            dataloader=test_dataloader,
            loss_fn=loss_fn,
            device=device
        )

        # NOTE(review): the "%" suffix assumes the engine helpers return
        # accuracy on a 0-100 scale; if they return fractions in [0, 1] the
        # labels here (and the percentage thresholds used later) are off by
        # a factor of 100 — confirm against going_modular.engine.
        print(f"Epoch: {epoch+1} | "
              f"train_loss: {train_loss:.4f} | train_acc: {train_acc:.2f}% | "
              f"test_loss: {test_loss:.4f} | test_acc: {test_acc:.2f}%")

        # Append this epoch's values to their matching history lists
        epoch_values = (train_loss, train_acc, test_loss, test_acc)
        for name, value in zip(metric_names, epoch_values):
            results[name].append(value)

        # Train and test curves share one chart per metric in TensorBoard
        writer.add_scalars("Loss",
                           {"train": train_loss, "test": test_loss},
                           epoch)
        writer.add_scalars("Accuracy",
                           {"train": train_acc, "test": test_acc},
                           epoch)

    return results

## 11. Experiment 1: Training with 10% Data

In [None]:
# Set seed for this experiment
# (Experiment 2 re-seeds with the same value before building its model)
set_seeds(42)

# Create fresh model for 10% data
model_10_percent = create_effnetb0_model()
# loss_fn is created once here and reused by the later experiment cells
loss_fn = nn.CrossEntropyLoss()
# All parameters are handed to Adam; the `features` parameters were frozen
# (requires_grad=False) when the model was created
optimizer_10 = torch.optim.Adam(model_10_percent.parameters(), lr=0.001)

# Create writer for 10% experiment
writer_10 = create_writer(
    experiment_name="data_10_percent",
    model_name="effnetb0",
    extra="5_epochs"
)

# Train with 10% data
# NOTE: the image count in the banner below is hard-coded, not read from the dataset
print("\n" + "="*50)
print("EXPERIMENT 1: Training with 10% Data (225 images)")
print("="*50)

# `epochs` is also read by the later experiment, comparison, plotting and cost-analysis cells
epochs = 5
results_10 = train_and_track(
    model=model_10_percent,
    train_dataloader=train_dataloader_10,
    test_dataloader=test_dataloader_10,
    optimizer=optimizer_10,
    loss_fn=loss_fn,
    writer=writer_10,
    epochs=epochs,
    device=device
)

# train_and_track leaves the writer open, so close it here
writer_10.close()

# Report both the last-epoch accuracy and the best accuracy seen in any epoch
print(f"\n[RESULTS] 10% Data Final Performance:")
print(f"  Final Test Accuracy: {results_10['test_acc'][-1]:.2f}%")
print(f"  Best Test Accuracy: {max(results_10['test_acc']):.2f}%")

## 12. Experiment 2: Training with 20% Data

In [None]:
# Set seed for this experiment
# (same seed as Experiment 1, so both runs start from the same random state)
set_seeds(42)

# Create fresh model for 20% data
model_20_percent = create_effnetb0_model()
optimizer_20 = torch.optim.Adam(model_20_percent.parameters(), lr=0.001)

# Create writer for 20% experiment
writer_20 = create_writer(
    experiment_name="data_20_percent",
    model_name="effnetb0",
    extra="5_epochs"
)

# Train with 20% data
# NOTE: the image count in the banner below is hard-coded, not read from the dataset
print("\n" + "="*50)
print("EXPERIMENT 2: Training with 20% Data (450 images)")
print("="*50)

# `loss_fn` and `epochs` come from the Experiment 1 cell, so that cell must run first.
# test_dataloader_20 was built from the 10% dataset's test folder, so both
# experiments are evaluated on the same images.
results_20 = train_and_track(
    model=model_20_percent,
    train_dataloader=train_dataloader_20,
    test_dataloader=test_dataloader_20,
    optimizer=optimizer_20,
    loss_fn=loss_fn,
    writer=writer_20,
    epochs=epochs,
    device=device
)

# train_and_track leaves the writer open, so close it here
writer_20.close()

# Report both the last-epoch accuracy and the best accuracy seen in any epoch
print(f"\n[RESULTS] 20% Data Final Performance:")
print(f"  Final Test Accuracy: {results_20['test_acc'][-1]:.2f}%")
print(f"  Best Test Accuracy: {max(results_20['test_acc']):.2f}%")

## 13. Direct Comparison

In [None]:
print("\n" + "="*50)
print("DIRECT COMPARISON")
print("="*50)

# Per-epoch test accuracy of each experiment, pulled out once for readability
_test_acc_10 = results_10['test_acc']
_test_acc_20 = results_20['test_acc']

print("\nAccuracy Improvement Per Epoch:")
for epoch in range(epochs):
    _acc_10, _acc_20 = _test_acc_10[epoch], _test_acc_20[epoch]
    improvement = _acc_20 - _acc_10
    print(f"  Epoch {epoch+1}: "
          f"10% = {_acc_10:.2f}% | "
          f"20% = {_acc_20:.2f}% | "
          f"Diff = {improvement:+.2f}%")

# Difference between the two experiments' mean test accuracy over all epochs
_avg_acc_20 = sum(_test_acc_20) / len(_test_acc_20)
_avg_acc_10 = sum(_test_acc_10) / len(_test_acc_10)
avg_improvement = _avg_acc_20 - _avg_acc_10
print(f"\nAverage Accuracy Improvement: {avg_improvement:.2f}%")

# Train-minus-test accuracy after the last epoch serves as the overfitting proxy
overfit_10 = results_10['train_acc'][-1] - _test_acc_10[-1]
overfit_20 = results_20['train_acc'][-1] - _test_acc_20[-1]

print(f"\nOverfitting Analysis:")
print(f"  10% data: Train-Test gap = {overfit_10:.2f}%")
print(f"  20% data: Train-Test gap = {overfit_20:.2f}%")
print(f"  Overfitting reduction with 20% data: {overfit_10 - overfit_20:.2f}%")

## 14. Visualization: Learning Curves

In [None]:
# 2x2 grid: test accuracy, test loss, train-test gap, per-epoch improvement
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

epochs_range = range(1, epochs + 1)


def _plot_pair(ax, series_10, series_20):
    """Draw the 10% and 20% curves on one axis with the shared styling."""
    ax.plot(epochs_range, series_10, label='10% Data', marker='o', linewidth=2)
    ax.plot(epochs_range, series_20, label='20% Data', marker='s', linewidth=2)


def _decorate(ax, title, ylabel, legend=True):
    """Apply title, axis labels, optional legend and grid to one axis."""
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(ylabel)
    if legend:
        ax.legend()
    ax.grid(True, alpha=0.3)


# Plot 1: Test Accuracy Comparison
_plot_pair(axes[0, 0], results_10['test_acc'], results_20['test_acc'])
_decorate(axes[0, 0], 'Test Accuracy: 10% vs 20% Data', 'Accuracy (%)')

# Plot 2: Test Loss Comparison
_plot_pair(axes[0, 1], results_10['test_loss'], results_20['test_loss'])
_decorate(axes[0, 1], 'Test Loss: 10% vs 20% Data', 'Loss')

# Plot 3: Train-Test Gap (Overfitting)
train_test_gap_10 = [train - test for train, test
                     in zip(results_10['train_acc'], results_10['test_acc'])]
train_test_gap_20 = [train - test for train, test
                     in zip(results_20['train_acc'], results_20['test_acc'])]

_plot_pair(axes[1, 0], train_test_gap_10, train_test_gap_20)
_decorate(axes[1, 0], 'Overfitting: Train-Test Accuracy Gap', 'Gap (%)')
axes[1, 0].axhline(y=0, color='red', linestyle='--', alpha=0.5)

# Plot 4: Improvement Analysis (green bars where the 20% run is ahead, red otherwise)
improvements = [acc_20 - acc_10 for acc_10, acc_20
                in zip(results_10['test_acc'], results_20['test_acc'])]
_bar_colors = ['green' if x > 0 else 'red' for x in improvements]
axes[1, 1].bar(epochs_range, improvements, color=_bar_colors)
_decorate(axes[1, 1], 'Accuracy Improvement: 20% vs 10% Data', 'Improvement (%)', legend=False)
axes[1, 1].axhline(y=0, color='black', linestyle='-', alpha=0.5)

plt.tight_layout()
plt.savefig('data_scaling_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n[INFO] Comparison plots saved to 'data_scaling_comparison.png'")

## 15. Cost-Benefit Analysis

In [None]:
print("\n" + "="*50)
print("COST-BENEFIT ANALYSIS")
print("="*50)

# Performance metrics: compare both the last-epoch and the best-epoch accuracy
final_acc_10 = results_10['test_acc'][-1]
final_acc_20 = results_20['test_acc'][-1]
best_acc_10 = max(results_10['test_acc'])
best_acc_20 = max(results_20['test_acc'])
acc_improvement_final = final_acc_20 - final_acc_10
acc_improvement_best = best_acc_20 - best_acc_10

# Data metrics: relative growth of the training set, in percent
data_increase = (train_20 / train_10 - 1) * 100

# Training time (approximate based on batches): wall-clock time is not
# measured, so the number of processed batches stands in for it
batches_10 = len(train_dataloader_10) * epochs
batches_20 = len(train_dataloader_20) * epochs
time_increase = (batches_20 / batches_10 - 1) * 100

print(f"\n📊 Data Investment:")
print(f"  10% dataset: {train_10} images")
print(f"  20% dataset: {train_20} images")
print(f"  Increase: {data_increase:.0f}%")

print(f"\n📈 Performance Gain:")
print(f"  10% final accuracy: {final_acc_10:.2f}%")
print(f"  20% final accuracy: {final_acc_20:.2f}%")
print(f"  Final improvement: {acc_improvement_final:.2f}%")
print(f"  Best improvement: {acc_improvement_best:.2f}%")

print(f"\n⏱️ Training Cost:")
print(f"  10% total batches: {batches_10}")
print(f"  20% total batches: {batches_20}")
print(f"  Time increase: {time_increase:.0f}%")

print(f"\n💰 Return on Investment:")
print(f"  {data_increase:.0f}% more data → {acc_improvement_final:.2f}% accuracy gain")
# Guard the ratio: equally sized training sets would make the divisor zero
if data_increase:
    print(f"  Efficiency: {acc_improvement_final / (data_increase/100):.2f}% gain per 100% data increase")
else:
    print("  Efficiency: n/a (training set size did not change)")

# Worth it analysis
# NOTE(review): the 5 and 2 thresholds assume accuracies on a 0-100 scale;
# confirm against the scale returned by the engine helpers.
if acc_improvement_final > 5:
    verdict = "✅ Definitely worth it!"
elif acc_improvement_final > 2:
    verdict = "⚠️ Moderate benefit"
else:
    verdict = "❌ Minimal benefit"
print(f"\n  Verdict: {verdict}")

## 16. Statistical Analysis with Multiple Runs

In [None]:
def run_multiple_experiments(dataloader_train, dataloader_test, num_runs=3, epochs=5):
    """Train a fresh model several times with different seeds.

    Run ``i`` is seeded with ``42 + i``, builds a new EfficientNet-B0 model
    with its own Adam optimizer and cross-entropy loss, and trains it with
    ``train_and_track``. TensorBoard output from these runs is not needed,
    so each run logs into a temporary directory that is deleted afterwards.
    This also keeps repeated calls (e.g. one per dataset size) from mixing
    their logs in one shared folder.

    Args:
        dataloader_train: DataLoader used for training in every run.
        dataloader_test: DataLoader used for evaluation in every run.
        num_runs: Number of independent runs.
        epochs: Number of epochs per run.

    Returns:
        List with the final-epoch test accuracy of each run, in run order.
    """
    import tempfile  # local import: only needed for the throwaway log directories

    all_results = []

    for run in range(num_runs):
        # Set different seed for each run
        set_seeds(42 + run)

        # Create fresh model
        model = create_effnetb0_model()
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
        loss_fn = nn.CrossEntropyLoss()

        # train_and_track requires a writer, so give it one that logs into a
        # directory which disappears as soon as the run is over.
        with tempfile.TemporaryDirectory(prefix=f"run_{run}_") as log_dir:
            dummy_writer = SummaryWriter(log_dir=log_dir)
            try:
                results = train_and_track(
                    model=model,
                    train_dataloader=dataloader_train,
                    test_dataloader=dataloader_test,
                    optimizer=optimizer,
                    loss_fn=loss_fn,
                    writer=dummy_writer,
                    epochs=epochs,
                    device=device
                )
            finally:
                # Close even when training raises, and before the temporary
                # directory is removed.
                dummy_writer.close()

        all_results.append(results['test_acc'][-1])  # Final test accuracy

    return all_results

print("Running multiple experiments for statistical significance...")
print("This may take a few minutes...\n")

# Run 3 experiments for each data size (3 epochs each to keep the runtime down)
num_runs = 3
results_10_multi = run_multiple_experiments(train_dataloader_10, test_dataloader_10, num_runs, epochs=3)
results_20_multi = run_multiple_experiments(train_dataloader_20, test_dataloader_20, num_runs, epochs=3)

# Calculate statistics.
# ddof=1 gives the sample standard deviation: the runs are a small sample of
# possible seeds, and NumPy's default (ddof=0) would understate the spread.
mean_10 = np.mean(results_10_multi)
std_10 = np.std(results_10_multi, ddof=1)
mean_20 = np.mean(results_20_multi)
std_20 = np.std(results_20_multi, ddof=1)

print("\n" + "="*50)
print("STATISTICAL ANALYSIS")
print("="*50)
print(f"\n10% Data Results ({num_runs} runs):")
print(f"  Individual runs: {[f'{x:.2f}%' for x in results_10_multi]}")
print(f"  Mean ± Std: {mean_10:.2f}% ± {std_10:.2f}%")

print(f"\n20% Data Results ({num_runs} runs):")
print(f"  Individual runs: {[f'{x:.2f}%' for x in results_20_multi]}")
print(f"  Mean ± Std: {mean_20:.2f}% ± {std_20:.2f}%")

# Spread of the difference, combining both standard deviations in quadrature
print(f"\nImprovement: {mean_20 - mean_10:.2f}% ± {np.sqrt(std_10**2 + std_20**2):.2f}%")

# Perform t-test.
# equal_var=False selects Welch's t-test, which does not assume the two
# groups share a variance — safer with only a handful of runs per group.
from scipy import stats
t_stat, p_value = stats.ttest_ind(results_20_multi, results_10_multi, equal_var=False)
print(f"\nStatistical Test:")
print(f"  t-statistic: {t_stat:.3f}")
print(f"  p-value: {p_value:.4f}")
print(f"  Significant? {'Yes' if p_value < 0.05 else 'No'} (α=0.05)")

## 17. Learning Curve Extrapolation

In [None]:
# Create hypothetical learning curve
data_percentages = [5, 10, 20, 40, 60, 80, 100]
data_samples = [int(p * 2250 / 100) for p in data_percentages]  # Assuming full dataset is 2250 samples

# Only the 10% and 20% points below are measured; every other value is an
# illustrative guess, not the output of a fitted model.
observed_acc_10 = results_10['test_acc'][-1]
observed_acc_20 = results_20['test_acc'][-1]

# NOTE(review): the constants below (75.0 and the +3 ... +5 offsets) assume
# accuracies on a 0-100 scale; confirm against the engine helpers.
hypothetical_accuracies = [
    75.0,  # 5% (assumed, not measured)
    observed_acc_10,  # 10% (observed)
    observed_acc_20,  # 20% (observed)
    observed_acc_20 + 3,  # 40% (estimated)
    observed_acc_20 + 4,  # 60% (estimated)
    observed_acc_20 + 4.5,  # 80% (estimated)
    observed_acc_20 + 5,  # 100% (estimated)
]

# Plot learning curve
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Plot 1: Learning Curve
# Only the two measured points go under "Observed". The hard-coded 5% value
# gets its own "Assumed" entry so the legend does not present it as data.
ax1.plot(data_samples[1:3], hypothetical_accuracies[1:3], 'o-', label='Observed', linewidth=2, markersize=8)
ax1.plot(data_samples[2:], hypothetical_accuracies[2:], 's--', label='Projected', linewidth=2, markersize=6, alpha=0.7)
ax1.plot(data_samples[:2], hypothetical_accuracies[:2], 's--', color='gray', markevery=[0],
         label='Assumed', linewidth=2, markersize=6, alpha=0.7)
ax1.set_xlabel('Number of Training Samples', fontsize=11)
ax1.set_ylabel('Test Accuracy (%)', fontsize=11)
ax1.set_title('Learning Curve: Data Size vs Performance', fontsize=12, fontweight='bold')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Annotate sweet spot
sweet_spot_idx = 2  # 20% seems to be good
ax1.annotate('Sweet Spot?', 
             xy=(data_samples[sweet_spot_idx], hypothetical_accuracies[sweet_spot_idx]),
             xytext=(data_samples[sweet_spot_idx]+100, hypothetical_accuracies[sweet_spot_idx]-2),
             arrowprops=dict(arrowstyle='->', color='red'),
             fontsize=10, color='red')

# Plot 2: Diminishing Returns (gain of each data size over the previous one)
marginal_gains = [hypothetical_accuracies[i] - hypothetical_accuracies[i-1] 
                  for i in range(1, len(hypothetical_accuracies))]
ax2.bar(data_percentages[1:], marginal_gains, width=5, color='skyblue', edgecolor='navy')
ax2.set_xlabel('Data Percentage (%)', fontsize=11)
ax2.set_ylabel('Marginal Accuracy Gain (%)', fontsize=11)
ax2.set_title('Diminishing Returns Analysis', fontsize=12, fontweight='bold')
ax2.grid(True, alpha=0.3, axis='y')
ax2.axhline(y=2, color='red', linestyle='--', alpha=0.5, label='Min Useful Gain')
ax2.legend()

plt.tight_layout()
plt.savefig('learning_curve_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

print("[INFO] Learning curve analysis saved to 'learning_curve_analysis.png'")

## 18. Summary and Recommendations

In [None]:
print("\n" + "="*60)
print("FINAL SUMMARY AND RECOMMENDATIONS")
print("="*60)

# Create summary dataframe: one row per metric, one column per dataset size.
# overfit_10 / overfit_20 come from the direct-comparison cell.
summary_data = {
    'Metric': [
        'Training Samples',
        'Final Test Accuracy',
        'Best Test Accuracy',
        'Train-Test Gap',
        'Training Batches',
        'Relative Efficiency'
    ],
    '10% Data': [
        train_10,
        f"{results_10['test_acc'][-1]:.2f}%",
        f"{max(results_10['test_acc']):.2f}%",
        f"{overfit_10:.2f}%",
        len(train_dataloader_10) * epochs,
        '100% (baseline)'
    ],
    '20% Data': [
        train_20,
        f"{results_20['test_acc'][-1]:.2f}%",
        f"{max(results_20['test_acc']):.2f}%",
        f"{overfit_20:.2f}%",
        len(train_dataloader_20) * epochs,
        # Relative change of the final accuracy versus the 10% baseline
        f"{(results_20['test_acc'][-1] / results_10['test_acc'][-1] - 1) * 100:.1f}% improvement"
    ]
}

summary_df = pd.DataFrame(summary_data)
print("\n📊 Experiment Summary:")
print(summary_df.to_string(index=False))

# acc_improvement_final and time_increase come from the cost-benefit cell
print("\n🎯 Key Findings:")
print(f"  1. Doubling data (10% → 20%) improved accuracy by {acc_improvement_final:.2f}%")
print(f"  2. Overfitting reduced by {overfit_10 - overfit_20:.2f}% with more data")
print(f"  3. Training time increased by {time_increase:.0f}%")
# NOTE(review): finding 4 is printed unconditionally; it is not derived from
# the measured results.
print(f"  4. Diminishing returns already visible (non-linear improvement)")

# Same thresholds as the verdict in the cost-benefit cell
print("\n💡 Recommendations:")
if acc_improvement_final > 5:
    print("  ✅ Continue collecting more data - significant gains observed")
    print("  ✅ Consider collecting up to 40% for optimal performance")
elif acc_improvement_final > 2:
    print("  ⚠️ More data provides moderate benefit")
    print("  💡 Consider data augmentation as cost-effective alternative")
    print("  💡 Focus on data quality over quantity")
else:
    print("  ❌ Minimal benefit from more data")
    print("  💡 Focus on model architecture improvements")
    print("  💡 Implement better data augmentation strategies")

print("\n🚀 Next Steps:")
print("  1. Test with data augmentation on 10% to match 20% performance")
print("  2. Try different model architectures (EfficientNet-B2)")
print("  3. Experiment with learning rate schedules")
print("  4. Consider active learning for selective data collection")

## 19. Bonus: Data Augmentation Experiment

Can we make 10% data perform like 20% using augmentation?

In [None]:
# Create augmented transform for 10% data.
# The resize/crop steps follow torchvision's documented inference preset for
# the EfficientNet-B0 weights (resize to 256 with bicubic interpolation, then
# a 224 center crop), which is what `auto_transforms` applies to the test
# images. Squashing straight to 224x224 would change the geometry as well,
# so any accuracy difference could not be attributed to augmentation alone.
augmented_transform = transforms.Compose([
    transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.CenterCrop(224),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(degrees=15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], 
                        std=[0.229, 0.224, 0.225])
])

# Create augmented dataloader. Only the training loader is kept: evaluation
# below reuses test_dataloader_10, so test images are never augmented.
train_dataloader_10_aug, _, _ = data_setup.create_dataloaders(
    train_dir=data_10_percent / "train",
    test_dir=data_10_percent / "test",
    transform=augmented_transform,
    batch_size=32
)

# Train with augmentation
print("\n" + "="*50)
print("BONUS: 10% Data with Augmentation")
print("="*50)

# Same seed, model recipe and optimizer settings as Experiments 1 and 2
set_seeds(42)
model_10_aug = create_effnetb0_model()
optimizer_10_aug = torch.optim.Adam(model_10_aug.parameters(), lr=0.001)

writer_10_aug = create_writer(
    experiment_name="data_10_percent_augmented",
    model_name="effnetb0",
    extra="5_epochs"
)

# Use the shared `epochs` value so this run trains exactly as long as the
# two experiments it is compared against.
results_10_aug = train_and_track(
    model=model_10_aug,
    train_dataloader=train_dataloader_10_aug,
    test_dataloader=test_dataloader_10,
    optimizer=optimizer_10_aug,
    loss_fn=loss_fn,
    writer=writer_10_aug,
    epochs=epochs,
    device=device
)

writer_10_aug.close()

print("\n📊 Augmentation Results:")
print(f"  10% Original: {results_10['test_acc'][-1]:.2f}%")
print(f"  10% Augmented: {results_10_aug['test_acc'][-1]:.2f}%")
print(f"  20% Original: {results_20['test_acc'][-1]:.2f}%")
print(f"\n  Augmentation improvement: {results_10_aug['test_acc'][-1] - results_10['test_acc'][-1]:.2f}%")
print(f"  Gap to 20% data: {results_20['test_acc'][-1] - results_10_aug['test_acc'][-1]:.2f}%")

# "Closes the gap" means landing within 2 points of the 20% run's final accuracy
if results_10_aug['test_acc'][-1] >= results_20['test_acc'][-1] - 2:
    print("\n  ✅ Augmentation successfully closes the gap!")
else:
    print("\n  ⚠️ Augmentation helps but doesn't fully match 20% data performance")

## 20. Conclusion

### What We Learned

1. **Data scaling follows diminishing returns** - The first samples you collect are the most valuable
2. **More data reduces overfitting** - Larger datasets improve generalization
3. **Cost-benefit analysis is crucial** - Always measure ROI of data collection
4. **Augmentation can help** - Virtual data can partially substitute for real data
5. **Statistical rigor matters** - Multiple runs provide confidence in results

### Practical Takeaways

- Start with a small dataset for prototyping
- Incrementally add data while monitoring improvements
- Use augmentation before collecting more data
- Find your domain's "sweet spot" empirically
- Track everything with TensorBoard for informed decisions

### Next Steps

In Lab 03, we'll run a full factorial experiment comparing:
- Multiple model architectures (EfficientNet-B0 vs B2)
- Multiple data sizes (10% vs 20%)
- Multiple training durations (5 vs 10 epochs)

This will give us a complete picture of what factors matter most for performance!