In [None]:
!pip install -q seaborn tqdm


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import Subset
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import random

# Fix the seed of every RNG in play (Python, NumPy, PyTorch) so runs are repeatable
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Prefer CUDA when PyTorch reports it as available; otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# 1. Data Preparation
def load_mnist():
    """
    Return the MNIST training split with each image converted to a tensor.

    The files are read from ``./data`` and downloaded there when missing.
    """
    to_tensor = transforms.Compose([transforms.ToTensor()])
    return datasets.MNIST(root='./data', train=True, download=True, transform=to_tensor)

def create_unique_subsets(train_dataset, data_sizes, num_repeats):
    """
    Build disjoint training subsets for every (data_size, repeat) pair.

    The dataset indices are shuffled once with the ``random`` module and then
    consumed left to right, so no example appears in more than one subset.

    Args:
        train_dataset: Indexable, sized dataset whose items are
            ``(image_tensor, label)`` pairs.
        data_sizes: Sequence of subset sizes; each one becomes a key of the result.
        num_repeats: Number of independent subsets to draw for every size.

    Returns:
        Dict mapping each data_size to a list of ``num_repeats`` tuples
        ``(data, targets)``. ``data`` holds the flattened images, one row per
        example, and ``targets`` the labels as ``torch.long``. Both tensors are
        moved to the module-level ``device`` (a GPU only when one is available).

    Raises:
        ValueError: If the dataset has fewer examples than all subsets need.
    """
    total_required = sum(data_size * num_repeats for data_size in data_sizes)
    if total_required > len(train_dataset):
        raise ValueError("Not enough data to create unique subsets without overlap.")

    indices = list(range(len(train_dataset)))
    random.shuffle(indices)

    subsets = {data_size: [] for data_size in data_sizes}

    current_idx = 0
    for data_size in tqdm(data_sizes, desc="Creating subsets"):
        for _ in range(num_repeats):
            subset_indices = indices[current_idx:current_idx + data_size]
            current_idx += data_size

            # Fetch every example exactly once: indexing the dataset decodes the
            # image and runs the transform, so reading image and label through
            # two separate lookups would double that work.
            samples = [train_dataset[i] for i in subset_indices]
            subset_data = torch.stack([image.view(-1) for image, _ in samples]).to(device)
            subset_targets = torch.tensor(
                [label for _, label in samples], dtype=torch.long
            ).to(device)
            subsets[data_size].append((subset_data, subset_targets))

    return subsets

# 2. Model Definition
class SimpleNN(nn.Module):
    """
    Fully connected classifier: ``num_layers`` Linear+ReLU stages followed by
    a final Linear projection to ``output_size`` logits.
    """

    def __init__(self, input_size=784, hidden_size=128, num_layers=1, output_size=10):
        super().__init__()
        # Feature width entering each Linear stage: the input, then one entry per hidden layer
        widths = [input_size] + [hidden_size] * num_layers
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [nn.Linear(fan_in, fan_out), nn.ReLU()]
        # Output head has no activation; it emits raw logits
        stack.append(nn.Linear(widths[-1], output_size))
        self.network = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the whole stack and return the logits."""
        logits = self.network(x)
        return logits

# 3. Training and Evaluation
def train_until_memorization(model, data, targets, criterion, optimizer, 
                             tolerance=1e-4, patience=10, delta=1e-5, max_epochs=1000):
    """
    Run full-batch gradient steps until the loss drops below ``tolerance``.

    Training is abandoned early once the loss has failed to beat its best
    value by more than ``delta`` for ``patience`` consecutive epochs, or after
    ``max_epochs`` epochs.

    Returns True if the loss threshold was reached, otherwise False.
    """
    lowest_loss = float('inf')
    stale_epochs = 0

    for _ in range(max_epochs):
        # One optimisation step over the whole batch
        model.train()
        optimizer.zero_grad()
        batch_loss = criterion(model(data), targets)
        batch_loss.backward()
        optimizer.step()
        step_loss = batch_loss.item()

        # Track how long it has been since the loss last improved meaningfully
        if lowest_loss - step_loss > delta:
            lowest_loss, stale_epochs = step_loss, 0
        else:
            stale_epochs += 1

        # Success takes precedence over the stall check
        if step_loss < tolerance:
            return True
        if stale_epochs >= patience:
            return False

    return False

def evaluate_memorization(model, data, targets):
    """
    Check whether the model classifies every example in ``data`` correctly.

    The model is switched to eval mode and run without gradient tracking.
    Returns True only if the arg-max prediction equals the target for all rows.
    """
    model.eval()
    with torch.no_grad():
        predicted = torch.max(model(data), 1).indices
        all_correct = (predicted == targets).all()
    return all_correct.item()

# 4. Experiment Setup
def run_experiment(subsets, data_sizes, model_layers, num_repeats=10,
                   tolerance=1e-4, patience=10, delta=1e-5, max_epochs=1000):
    """
    Train one fresh model per (data_size, repeat, num_layers) combination and
    record whether it memorized its subset.

    Args:
        subsets: Dict mapping data_size to a list of ``(data, targets)`` tuples,
            indexed by repeat number.
        data_sizes: Subset sizes to evaluate; each must be a key of ``subsets``.
        model_layers: Hidden-layer counts to evaluate.
        num_repeats: Number of subsets consumed per data size.
        tolerance: Loss threshold below which training counts as memorization.
        patience: Epochs without improvement before training stops early.
        delta: Minimum loss decrease that counts as an improvement.
        max_epochs: Upper bound on training epochs per model.

    Returns:
        Dict mapping ``(data_size, num_layers)`` to a list with one boolean per
        repeat: True when the model memorized the subset.
    """
    results = {}
    # The loss module holds no state, so a single instance serves every run
    criterion = nn.CrossEntropyLoss()

    for data_size in tqdm(data_sizes, desc="Running experiments"):
        for repeat in range(num_repeats):
            # All depths are trained on the same subset for a given repeat
            subset_data, subset_targets = subsets[data_size][repeat]

            for num_layers in model_layers:
                model = SimpleNN(num_layers=num_layers).to(device)
                optimizer = optim.Adam(model.parameters(), lr=0.01)

                memorized = train_until_memorization(
                    model, subset_data, subset_targets,
                    criterion, optimizer,
                    tolerance=tolerance, patience=patience, delta=delta, max_epochs=max_epochs
                )

                # The loss may stall above the tolerance while every prediction is
                # already correct, so fall back to an exact accuracy check.
                if not memorized:
                    memorized = evaluate_memorization(model, subset_data, subset_targets)

                results.setdefault((data_size, num_layers), []).append(memorized)

                # Print summary
                print(f"Data size: {data_size}, Layers: {num_layers}, Repeat: {repeat+1}, Memorized: {memorized}")
    return results

# 5. Visualization
def visualize_results(results, data_sizes, model_layers, num_repeats=10):
    """
    Draw a heatmap of memorization success rate (in percent), with one row per
    layer count and one column per dataset size.

    Combinations that are missing from ``results`` are shown as 0%.
    """
    success_pct = np.zeros((len(model_layers), len(data_sizes)))

    for row, depth in enumerate(model_layers):
        # Share of the num_repeats runs that memorized, expressed as a percentage
        success_pct[row, :] = [
            (sum(results.get((size, depth), [])) / num_repeats) * 100
            for size in data_sizes
        ]

    plt.figure(figsize=(25, 12))
    sns.heatmap(
        success_pct,
        annot=False,
        fmt=".1f",
        xticklabels=data_sizes,
        yticklabels=model_layers,
        cmap="YlGnBu",
    )
    plt.xlabel("Dataset Size (Number of Examples)", fontsize=14)
    plt.ylabel("Model Size (Number of Layers)", fontsize=14)
    plt.title("Memorization Success Rate (%)", fontsize=16)
    plt.tight_layout()
    plt.show()

# 6. Main Execution
if __name__ == "__main__":
    # Experiment grid: dataset sizes 1-100 crossed with 1-10 layers,
    # each combination repeated 10 times
    num_repeats = 10
    model_layers = list(range(1, 11))
    data_sizes = list(range(1, 101))

    # Fetch the training split
    train_dataset = load_mnist()

    # Build every subset up front so the training loop only handles ready-made tensors
    print("Pre-generating all subsets and loading onto GPU...")
    subsets = create_unique_subsets(
        train_dataset=train_dataset,
        data_sizes=data_sizes,
        num_repeats=num_repeats,
    )
    print("All subsets are loaded onto GPU.")

    # Train every configuration, then plot the success rates
    results = run_experiment(subsets, data_sizes, model_layers, num_repeats=num_repeats)
    visualize_results(results, data_sizes, model_layers, num_repeats=num_repeats)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import Subset, DataLoader, TensorDataset
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import random
import warnings

# Seed the Python, NumPy and PyTorch generators with one fixed value for repeatable runs
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Run on CUDA if PyTorch can see it, else fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# 1. Data Preparation
def load_fashion_mnist():
    """
    Return the Fashion-MNIST training split with each image converted to a tensor.

    The files are read from ``./data`` and downloaded there when missing.
    """
    to_tensor = transforms.Compose([transforms.ToTensor()])
    return datasets.FashionMNIST(root='./data', train=True, download=True, transform=to_tensor)

def create_held_out_set(train_dataset, held_out_size=500):
    """
    Draw ``held_out_size`` random examples from a dataset to use for evaluation.

    NOTE: this function only selects examples; it does not remove them from
    ``train_dataset`` or report which indices were chosen. The evaluation set
    is therefore only truly held out if the caller passes a dataset that is
    never used for training.

    Args:
        train_dataset: Indexable, sized dataset whose items are
            ``(image_tensor, label)`` pairs.
        held_out_size: Number of examples to draw.

    Returns:
        Tuple ``(data, targets)``: the flattened images, one row per example,
        and the labels as ``torch.long``, both moved to the module-level
        ``device``.

    Raises:
        ValueError: If ``held_out_size`` exceeds the dataset size.
    """
    total_train = len(train_dataset)
    if held_out_size > total_train:
        raise ValueError("Held-out size exceeds the total training data size.")

    indices = list(range(total_train))
    random.shuffle(indices)
    held_out_indices = indices[:held_out_size]

    # Fetch each example once; a dataset lookup decodes the image and applies
    # the transform, so separate lookups for image and label double the cost.
    samples = [train_dataset[i] for i in held_out_indices]
    held_out_data = torch.stack([image.view(-1) for image, _ in samples]).to(device)
    held_out_targets = torch.tensor(
        [label for _, label in samples], dtype=torch.long
    ).to(device)

    return held_out_data, held_out_targets

def sample_subset(train_dataset, data_size):
    """
    Randomly sample ``data_size`` distinct examples from ``train_dataset``.

    Sampling covers the whole dataset: no indices are excluded. A request
    larger than the dataset is capped to the dataset size with a warning.

    Args:
        train_dataset: Indexable, sized dataset whose items are
            ``(image_tensor, label)`` pairs.
        data_size: Number of examples to draw.

    Returns:
        Tuple ``(data, targets)``: the flattened images, one row per example,
        and the labels as ``torch.long``, both moved to the module-level
        ``device``.
    """
    total_train = len(train_dataset)
    if data_size > total_train:
        warnings.warn(f"Requested data size ({data_size}) exceeds the available training data size ({total_train}). Capping to {total_train}.")
        data_size = total_train

    indices = random.sample(range(total_train), data_size)

    # Fetch each example once; a dataset lookup decodes the image and applies
    # the transform, so separate lookups for image and label double the cost.
    samples = [train_dataset[i] for i in indices]
    subset_data = torch.stack([image.view(-1) for image, _ in samples]).to(device)
    subset_targets = torch.tensor(
        [label for _, label in samples], dtype=torch.long
    ).to(device)

    return subset_data, subset_targets

# 2. Model Definition
class SimpleNN(nn.Module):
    """
    Multi-layer perceptron with ``num_layers`` ReLU-activated hidden layers of
    width ``hidden_size`` and a linear output layer of width ``output_size``.
    """

    def __init__(self, input_size=784, hidden_size=128, num_layers=1, output_size=10):
        super().__init__()
        modules = []
        in_features = input_size
        remaining = num_layers
        while remaining > 0:
            # Hidden stage: affine map followed by ReLU
            modules.extend((nn.Linear(in_features, hidden_size), nn.ReLU()))
            in_features = hidden_size
            remaining -= 1
        # Final projection to class logits, without an activation
        modules.append(nn.Linear(in_features, output_size))
        self.network = nn.Sequential(*modules)

    def forward(self, x):
        """Return the logits produced by the layer stack for input ``x``."""
        logits = self.network(x)
        return logits

# 3. Training and Evaluation
def train_until_memorization(model, data, targets, criterion, optimizer, 
                             tolerance=1e-4, patience=10, delta=1e-5, max_epochs=1000):
    """
    Run full-batch gradient steps until the loss drops below ``tolerance``.

    Training stops early once the loss has failed to beat its best value by
    more than ``delta`` for ``patience`` consecutive epochs, or after
    ``max_epochs`` epochs.

    Args:
        model: Network to train (modified in place).
        data: Input batch fed to ``model`` on every epoch.
        targets: Targets passed to ``criterion`` together with the outputs.
        criterion: Loss callable taking ``(outputs, targets)``.
        optimizer: Optimizer already bound to the model parameters.
        tolerance: Loss value below which the data counts as memorized.
        patience: Number of non-improving epochs tolerated before giving up.
        delta: Minimum loss decrease that counts as an improvement.
        max_epochs: Upper bound on the number of epochs.

    Returns:
        Tuple ``(memorized, final_loss)``. ``final_loss`` is the loss computed
        in the last executed epoch, before that epoch's optimizer step took
        effect. It is ``float('inf')`` when no epoch ran (``max_epochs <= 0``).
    """
    best_loss = float('inf')
    # Defined up front so the final return is valid even when the loop body never runs
    current_loss = float('inf')
    epochs_no_improve = 0

    for _ in range(max_epochs):
        model.train()
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        current_loss = loss.item()

        # Check for improvement
        if best_loss - current_loss > delta:
            best_loss = current_loss
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1

        # Memorization takes precedence over the stall check
        if current_loss < tolerance:
            return True, current_loss

        # Early stopping: the loss has plateaued
        if epochs_no_improve >= patience:
            return False, current_loss

    return False, current_loss

def evaluate_memorization(model, data, targets):
    """
    Measure how much of ``data`` the model classifies correctly.

    The model is switched to eval mode and run without gradient tracking.

    Args:
        model: Network producing one row of class scores per example.
        data: Input batch.
        targets: Expected class index per example.

    Returns:
        Tuple ``(memorized, accuracy)``. ``memorized`` is True when every
        prediction matches its target, and ``accuracy`` is the fraction of
        correct predictions. An empty batch is treated as vacuously memorized
        and yields ``(True, 1.0)``.
    """
    model.eval()
    total = targets.size(0)
    if total == 0:
        # Nothing to get wrong; this also avoids dividing by zero below
        return True, 1.0

    with torch.no_grad():
        outputs = model(data)
        _, predicted = torch.max(outputs, 1)
        correct = (predicted == targets).sum().item()

    # Compare the integer counts rather than testing the float ratio against 1.0
    return correct == total, correct / total

def evaluate_test_set(model, test_data, test_targets):
    """
    Compute the model's classification accuracy on ``test_data``.

    The model is switched to eval mode and run without gradient tracking.
    Returns the fraction of examples whose arg-max prediction equals the target.
    """
    model.eval()
    with torch.no_grad():
        predictions = torch.max(model(test_data), 1).indices
        hits = (predictions == test_targets).sum().item()
    return hits / test_targets.size(0)

# 4. Experiment Setup
def run_experiment(train_dataset, held_out_data, held_out_targets, data_sizes, model_layers, 
                   step, max_size, tolerance=1e-4, patience=10, delta=1e-5, max_epochs=1000):
    """
    Train one fresh model per (data size, layer count) pair and record its
    final loss, training accuracy and held-out accuracy.

    The data sizes evaluated are ``1`` followed by ``range(100, max_size + 1, step)``.

    Args:
        train_dataset: Dataset that training subsets are sampled from.
        held_out_data: Evaluation inputs.
        held_out_targets: Evaluation targets.
        data_sizes: Currently unused; the size grid is derived from ``step``
            and ``max_size``. Kept so existing callers keep working.
        model_layers: Hidden-layer counts to evaluate.
        step: Stride of the size grid.
        max_size: Largest size (inclusive) considered for the grid.
        tolerance: Loss threshold below which training counts as memorization.
        patience: Epochs without improvement before training stops early.
        delta: Minimum loss decrease that counts as an improvement.
        max_epochs: Upper bound on training epochs per model.

    Returns:
        Nested dict ``results[num_layers][data_size] =
        (final_loss, training_accuracy, test_accuracy)``.
    """
    results = {layers: {} for layers in model_layers}

    # The loss module holds no state, so a single instance serves every run
    criterion = nn.CrossEntropyLoss()

    for data_size in tqdm([1] + list(range(100, max_size + 1, step)), desc="Data Sizes"):
        for num_layers in model_layers:
            # Each layer count gets its own independently sampled subset
            subset_data, subset_targets = sample_subset(train_dataset, data_size)

            model = SimpleNN(num_layers=num_layers).to(device)
            optimizer = optim.Adam(model.parameters(), lr=0.01)

            loss_converged, final_loss = train_until_memorization(
                model, subset_data, subset_targets,
                criterion, optimizer,
                tolerance=tolerance, patience=patience, delta=delta, max_epochs=max_epochs
            )

            # Always measure the training accuracy instead of assuming 100% when
            # the loss threshold fired: the reported loss predates the last
            # optimizer step, and on large subsets a tiny mean loss does not
            # guarantee that every single example is classified correctly.
            all_correct, training_accuracy = evaluate_memorization(
                model, subset_data, subset_targets
            )
            memorized = loss_converged or all_correct

            # Evaluate on held-out test set
            test_accuracy = evaluate_test_set(model, held_out_data, held_out_targets)

            results[num_layers][data_size] = (final_loss, training_accuracy, test_accuracy)

            # Print summary
            print(f"Layers: {num_layers}, Data size: {data_size}, Memorized: {memorized}, "
                  f"Final Loss: {final_loss:.6f}, Training Acc: {training_accuracy*100:.2f}%, "
                  f"Test Acc: {test_accuracy*100:.2f}%")

    return results

# 5. Visualization
def visualize_results(results, data_sizes, model_layers, step, max_size):
    """
    Draw one figure per entry of ``model_layers`` showing final loss (left
    axis) plus training and test accuracy in percent (right axis) against
    dataset size.

    The sizes plotted are ``1`` followed by ``range(100, max_size + 1, step)``;
    each must be present in ``results[num_layers]``. ``data_sizes`` is not used.
    """
    for depth in model_layers:
        sizes = [1, *range(100, max_size + 1, step)]
        records = [results[depth][size] for size in sizes]
        losses = [loss for loss, _, _ in records]
        train_pct = [train_acc * 100 for _, train_acc, _ in records]
        test_pct = [test_acc * 100 for _, _, test_acc in records]

        fig, loss_axis = plt.subplots(figsize=(12, 6))

        # Left axis: final training loss
        loss_axis.set_xlabel('Dataset Size (Number of Examples)', fontsize=12)
        loss_axis.set_ylabel('Final Loss', color='tab:blue', fontsize=12)
        loss_axis.plot(sizes, losses, marker='o', color='tab:blue', label='Final Loss')
        loss_axis.tick_params(axis='y', labelcolor='tab:blue')
        loss_axis.set_title(f'Memorization Performance for {depth}-Layer Model', fontsize=14)

        # Right axis, sharing the x-axis: both accuracy curves
        acc_axis = loss_axis.twinx()
        acc_axis.set_ylabel('Accuracy (%)', color='tab:red', fontsize=12)
        acc_axis.plot(sizes, train_pct, marker='x', color='tab:green', label='Training Accuracy')
        acc_axis.plot(sizes, test_pct, marker='s', color='tab:red', label='Test Accuracy')
        acc_axis.tick_params(axis='y', labelcolor='tab:red')

        # Single legend combining the curves of both axes
        loss_handles, loss_labels = loss_axis.get_legend_handles_labels()
        acc_handles, acc_labels = acc_axis.get_legend_handles_labels()
        loss_axis.legend(loss_handles + acc_handles, loss_labels + acc_labels, loc='upper right')

        # Keeps the right-hand y-label from being clipped
        fig.tight_layout()
        plt.show()

# 6. Main Execution
if __name__ == "__main__":
    # Parameters
    model_layers = [1, 2]  # 1-2 layers
    step = 5000  # Jump size for data sizes
    max_size = 1000000  # Requested maximum data size (capped to the dataset size below)
    held_out_size = 5000  # Held-out test set size
    tolerance = 1e-4  # Loss tolerance for memorization
    patience = 10  # Early stopping patience
    delta = 1e-5  # Minimum loss improvement
    max_epochs = 1000  # Maximum number of epochs

    # Load data
    print("Loading Fashion-MNIST dataset...")
    train_dataset = load_fashion_mnist()

    # Sizes beyond the dataset would all be capped to the same full-dataset run
    # (with a warning each time), so clamp the grid to what actually exists.
    max_size = min(max_size, len(train_dataset))
    # The size grid that run_experiment and visualize_results derive from step/max_size
    data_sizes = [1] + list(range(100, max_size + 1, step))

    # Create held-out test set.
    # It is drawn from the official test split rather than from train_dataset:
    # training subsets are sampled from the whole training split, so a held-out
    # set carved out of that same split would leak into training.
    print("Creating held-out test set...")
    test_dataset = datasets.FashionMNIST(
        root='./data', train=False, download=True,
        transform=transforms.Compose([transforms.ToTensor()])
    )
    held_out_data, held_out_targets = create_held_out_set(test_dataset, held_out_size=held_out_size)
    print(f"Held-out test set created with {held_out_size} examples.\n")

    # Run experiment
    results = run_experiment(
        train_dataset,
        held_out_data,
        held_out_targets,
        data_sizes,
        model_layers,
        step,
        max_size,
        tolerance=tolerance,
        patience=patience,
        delta=delta,
        max_epochs=max_epochs
    )

    # Visualization
    visualize_results(results, data_sizes, model_layers, step, max_size)
