# 14. Data Size vs Accuracy

This notebook explores the relationship between dataset size and model accuracy.

## Experiment Overview
- **Goal**: Analyze how dataset size affects model performance
- **Model**: A fully connected network (MLP), trained from scratch on randomly sampled MNIST training subsets of increasing size
- **Features**: Learning curves, data efficiency analysis, scaling laws
- **Learning**: Understanding data requirements for neural networks

## What You'll Learn
- Data efficiency in neural networks
- Learning curves and scaling laws
- Sample complexity analysis
- Data augmentation effects


In [None]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
import sys
import os
from torch.utils.data import Subset, DataLoader

# Add scripts directory to path
sys.path.append('../scripts')
from utils import load_mnist_data, get_device, set_seed

# Fix the random seed through the project's set_seed helper so runs are repeatable
set_seed(42)

# Resolve the compute device once; later cells reuse this module-level name
device = get_device()
print(f"Using device: {device}")

# Build the train / validation / test loaders for MNIST
print("Loading MNIST dataset...")
mnist_loaders = load_mnist_data(batch_size=64, test_split=0.2)
train_loader, val_loader, test_loader = mnist_loaders

# Report how many samples each loader's dataset holds
print(f"Training samples: {len(train_loader.dataset)}")
print(f"Validation samples: {len(val_loader.dataset)}")
print(f"Test samples: {len(test_loader.dataset)}")


In [None]:
# Define MLP model
class DataSizeMLP(nn.Module):
    """Three-layer fully connected classifier used for the data-size sweep.

    Layout: Linear -> ReLU -> Dropout -> Linear -> ReLU -> Dropout -> Linear.
    The last layer returns raw logits; no softmax is applied here.

    Args:
        input_size: Number of features per sample after flattening
            (the default 784 matches a 28x28 input).
        hidden_size: Width of both hidden layers.
        num_classes: Number of output logits.
        dropout_rate: Drop probability used after each hidden activation.
    """

    def __init__(self, input_size=784, hidden_size=128, num_classes=10,
                 dropout_rate=0.2):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, num_classes)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Flatten each sample to a vector and return the class logits.

        Args:
            x: Batch whose first dimension is the batch size; all remaining
                dimensions are flattened and must multiply to ``input_size``.

        Returns:
            Tensor of shape ``(batch, num_classes)``.
        """
        # reshape (not view) so non-contiguous batches, e.g. transposed
        # images, are accepted instead of raising a RuntimeError.
        x = x.reshape(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

# Training and evaluation helpers
def _evaluate(model, loader, device):
    """Return the top-1 accuracy of ``model`` on ``loader`` as a percentage.

    Puts the model in eval mode and runs without gradient tracking.
    Returns 0.0 when ``loader`` yields no samples, so an empty validation
    set does not raise ZeroDivisionError.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data, target in loader:
            data, target = data.to(device), target.to(device)
            pred = model(data).argmax(dim=1)
            correct += pred.eq(target).sum().item()
            total += target.size(0)
    if total == 0:
        return 0.0
    return 100. * correct / total


def train_model(model, train_loader, val_loader, epochs=20, lr=0.001,
                log_every=10):
    """Train ``model`` with Adam and cross-entropy; return validation accuracy.

    Args:
        model: Module to train in place. Batches are moved to the device of
            its first parameter (CPU if it has no parameters).
        train_loader: Iterable of ``(data, target)`` training batches.
        val_loader: Iterable of ``(data, target)`` validation batches.
        epochs: Number of passes over ``train_loader``.
        lr: Adam learning rate.
        log_every: Print validation accuracy every this many epochs;
            0 or None disables the progress output.

    Returns:
        Validation accuracy in percent after the last epoch (0.0 when
        ``val_loader`` yields no samples).
    """
    # Follow the model's own device rather than a module-level global, so
    # data always lands where the weights are.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    accuracy = None
    for epoch in range(epochs):
        # Training
        model.train()
        for data, target in train_loader:
            data, target = data.to(run_device), target.to(run_device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

        # Any earlier measurement is stale once the weights have changed.
        accuracy = None

        # Validation is only needed on epochs whose result is reported.
        if log_every and (epoch + 1) % log_every == 0:
            accuracy = _evaluate(model, val_loader, run_device)
            print(f'Epoch {epoch+1}/{epochs}, Val Accuracy: {accuracy:.2f}%')

    # Final accuracy: reuse the last-epoch measurement when there is one.
    if accuracy is None:
        accuracy = _evaluate(model, val_loader, run_device)
    return accuracy

# Test different data sizes
data_sizes = [100, 500, 1000, 2000, 5000, 10000, 20000, 40000]
accuracies = []

# A subset drawn without replacement cannot exceed the training set, so drop
# any size the dataset cannot supply up front. Otherwise np.random.choice
# raises ValueError part-way through the sweep, after earlier runs finished.
num_available = len(train_loader.dataset)
skipped_sizes = [size for size in data_sizes if size > num_available]
if skipped_sizes:
    print(f"Skipping sizes larger than the {num_available} available training samples: {skipped_sizes}")
    # Filter data_sizes itself so it stays aligned with accuracies below.
    data_sizes = [size for size in data_sizes if size <= num_available]

print("Testing different data sizes...")
for size in data_sizes:
    print(f"\nTraining with {size} samples...")

    # Create subset: an independent random sample for each size
    indices = np.random.choice(num_available, size, replace=False)
    # Hand Subset plain Python ints rather than NumPy scalars.
    subset = Subset(train_loader.dataset, indices.tolist())
    subset_loader = DataLoader(subset, batch_size=64, shuffle=True)

    # Train a freshly initialised model so runs do not share weights
    model = DataSizeMLP().to(device)
    accuracy = train_model(model, subset_loader, val_loader, epochs=20)
    accuracies.append(accuracy)

    print(f"Final accuracy with {size} samples: {accuracy:.2f}%")

# Plot results
# Log-transformed series shared by the log-scale and power-law panels.
# NOTE: log10 of a 0% accuracy is -inf and would break the linear fit below;
# this assumes every run scored above 0%.
log_sizes = np.log10(data_sizes)
log_accuracies = np.log10(accuracies)

plt.figure(figsize=(12, 8))

# Panel 1: accuracy against the raw number of training samples
plt.subplot(2, 2, 1)
plt.plot(data_sizes, accuracies, 'bo-', linewidth=2, markersize=8)
plt.xlabel('Training Samples')
plt.ylabel('Validation Accuracy (%)')
plt.title('Accuracy vs Data Size')
plt.grid(True)

# Panel 2: same accuracies against log10 of the sample count
plt.subplot(2, 2, 2)
plt.plot(log_sizes, accuracies, 'ro-', linewidth=2, markersize=8)
plt.xlabel('Log10(Training Samples)')
plt.ylabel('Validation Accuracy (%)')
plt.title('Accuracy vs Log Data Size')
plt.grid(True)

# Panel 3: power law fit, i.e. a straight line fitted in log-log space
plt.subplot(2, 2, 3)
z = np.polyfit(log_sizes, log_accuracies, 1)
p = np.poly1d(z)
plt.plot(log_sizes, log_accuracies, 'go-', linewidth=2, markersize=8, label='Data')
# Signed format for the intercept so a negative value reads "x-0.50", not "x+-0.50".
plt.plot(log_sizes, p(log_sizes), 'r--', linewidth=2, label=f'Fit: y={z[0]:.2f}x{z[1]:+.2f}')
plt.xlabel('Log10(Training Samples)')
plt.ylabel('Log10(Accuracy)')
plt.title('Power Law Fit')
plt.legend()
plt.grid(True)

# Panel 4: efficiency analysis, accuracy points obtained per training sample
plt.subplot(2, 2, 4)
efficiency = [acc / size for acc, size in zip(accuracies, data_sizes)]
plt.plot(data_sizes, efficiency, 'mo-', linewidth=2, markersize=8)
plt.xlabel('Training Samples')
plt.ylabel('Accuracy per Sample')
plt.title('Data Efficiency')
plt.grid(True)

plt.tight_layout()
# savefig does not create missing directories; make sure the target exists
# so a fresh checkout does not fail after the whole sweep has been trained.
plot_path = '../results/plots/data_size_vs_accuracy.png'
os.makedirs(os.path.dirname(plot_path), exist_ok=True)
plt.savefig(plot_path, dpi=300, bbox_inches='tight')
plt.show()

# Print summary: one row per training-set size
print("\nData Size vs Accuracy Summary:")
for size, acc in zip(data_sizes, accuracies):
    print(f"{size:5d} samples: {acc:6.2f}% accuracy")

# Slope of the straight-line fit in log-log space
fit_slope = z[0]
print(f"\nPower law coefficient: {fit_slope:.3f}")

# Squared Pearson correlation between the two logged series
pearson_r = np.corrcoef(log_sizes, log_accuracies)[0, 1]
print(f"R-squared: {pearson_r ** 2:.3f}")
