In [None]:
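# Uncomment these lines if you are running on Google Colab.
# The "git+" prefix tells pip to clone the repository; a bare GitHub page URL is not an installable archive.
# %pip install git+https://github.com/rcpaffenroth/generatedata
# %pip install git+https://github.com/rcpaffenroth/iterativennsimple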

<a href="https://colab.research.google.com/github/rcpaffenroth/iterativennsimple/blob/main/notebooks/6-rcp-Sequential-vs-Sequential2D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sequential vs Sequential2D Comparison

This notebook compares PyTorch's built-in Sequential container with our custom Sequential2D container, demonstrating:

1. **Functional Equivalence**: Cases where both produce identical results
2. **Architectural Differences**: Unique capabilities of Sequential2D

## Learning Objectives

- Understand the differences between linear sequential architectures and 2D block architectures
- Learn when Sequential2D provides advantages over standard Sequential
- Analyze performance trade-offs in different scenarios
- Gain insights into advanced neural network architecture design

In [None]:
# Import required libraries
import torch
import torch.nn as nn
import numpy as np
from iterativennsimple.Sequential2D import Sequential2D
from iterativennsimple.Sequential1D import Sequential1D

# Seed NumPy and PyTorch so that the randomly initialised weights and the
# random test batches are the same on every Restart & Run All
np.random.seed(42)
torch.manual_seed(42)

# Choose the compute device: the GPU when PyTorch can see one, the CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# device = torch.device("cpu")  # Force CPU for this example
print(f"Using device: {device}")

Using device: cuda


## Part 1: Functional Equivalence

Let's start by showing how Sequential2D can replicate the behavior of PyTorch's Sequential container.

### Understanding the Key Difference

- **Sequential**: A linear chain where data flows through layers one by one
- **Sequential2D**: A 2D block matrix where data can flow through multiple parallel paths

The magic is that when Sequential2D is configured properly, it can be mathematically equivalent to Sequential!

In [2]:
def create_equivalent_networks(input_size=784, hidden_size=128, output_size=10):
    """
    Create functionally equivalent networks using Sequential and Sequential2D

    The same 3-layer MLP is assembled twice: once as a plain ``nn.Sequential``
    chain and once as a 4x4 ``Sequential2D`` block matrix that has to be
    iterated three times to push an input all the way to the output slot.

    Both containers wrap the *same* three ``nn.Linear`` objects, so they share
    parameters: updating one network's weights also changes the other's.

    Args:
        input_size: Width of the input vector (default 784, a flattened
            28x28 image).
        hidden_size: Width of both hidden representations (default 128).
        output_size: Width of the output vector (default 10 classes).

    Returns:
        Tuple ``(sequential_net, sequential2d_net)``.

    Side effects:
        Prints the parameter count expected from the layer sizes.
    """
    # Create the three linear layers that both networks will use
    # This is where the linear algebra happens in our neural network
    # and dynamical systems.
    f1 = nn.Linear(input_size, hidden_size)   # First layer:  input  -> hidden
    f2 = nn.Linear(hidden_size, hidden_size)  # Second layer: hidden -> hidden
    f3 = nn.Linear(hidden_size, output_size)  # Output layer: hidden -> output

    # This is the standard way: layers are applied one after another
    # Data flow: input -> f1 -> ReLU -> f2 -> ReLU -> f3 -> output
    sequential_net = nn.Sequential(
        f1,         # Apply first linear transformation
        nn.ReLU(),  # Apply activation function
        f2,         # Apply second linear transformation
        nn.ReLU(),  # Apply activation function
        f3          # Apply final linear transformation (no activation)
    )

    # Sequential2D thinks of the network as a 2D block matrix that is iterated

    # Widths of the four state slots: input, hidden 1, hidden 2, output.
    # The same list is used on both sides, which keeps the block matrix square.
    in_features_list = [input_size, hidden_size, hidden_size, output_size]
    out_features_list = [input_size, hidden_size, hidden_size, output_size]

    # Sequential1D blocks: wrap regular PyTorch layers for use in Sequential2D
    # Note: Sequential1D is really just a wrapper for nn.Sequential that provides
    # the size information needed by Sequential2D.
    # Note: the ReLU activations are placed BEFORE the linear layers here, so
    # that chaining F1, F2, F3 gives f1 -> ReLU -> f2 -> ReLU -> f3, exactly
    # the order used in sequential_net above.
    F1 = Sequential1D(nn.Sequential(f1),            # Just f1, no activation
                      in_features=input_size,  out_features=hidden_size)
    F2 = Sequential1D(nn.Sequential(nn.ReLU(), f2), # ReLU then f2
                      in_features=hidden_size, out_features=hidden_size)
    F3 = Sequential1D(nn.Sequential(nn.ReLU(), f3), # ReLU then f3
                      in_features=hidden_size, out_features=output_size)
    # But, you could also do it the other way around:
    # F1 = Sequential1D(nn.Sequential(f1, nn.ReLU()),  # f1 *then* ReLU, the order matters!
    #                   in_features=input_size, out_features=hidden_size)
    # F2 = Sequential1D(nn.Sequential(f2, nn.ReLU()),  # f2 *then* ReLU
    #                   in_features=hidden_size, out_features=hidden_size)
    # F3 = Sequential1D(nn.Sequential(f3),             # No activation here!
    #                   in_features=hidden_size, out_features=output_size)

    # This matrix defines how data flows through the dynamical system
    # NOTE: The structure is transposed compared to typical matrix notation!
    # That is what PyTorch expects, since its layers act on row vectors
    # (y = x W^T + b). See https://docs.pytorch.org/docs/stable/generated/torch.nn.Linear.html
    #
    # Layer 0:   [None, F1,   None, None ]  <- Input layer
    # Layer 1:   [None, None, F2,   None ]  <- Hidden layer
    # Layer 2:   [None, None, None, F3   ]  <- Output layer
    # Layer 3:   [None, None, None, None ]  <- (unused, but makes the operator square, which is required)
    # Note: in Sequential2D, a None block is "free" and does not perform any operation.
    blocks = [[None,   F1, None, None],
              [None, None, F2,   None],
              [None, None, None, F3],
              [None, None, None, None]]

    # Calculate expected number of parameters for verification
    W_parameters = input_size * hidden_size + hidden_size * hidden_size + hidden_size * output_size
    b_parameters = hidden_size + hidden_size + output_size
    print(f"Theoretical number of parameters: {W_parameters + b_parameters}")

    # Create the Sequential2D network
    sequential2d_net = Sequential2D(in_features_list, out_features_list, blocks)
    return sequential_net, sequential2d_net

# Build the nn.Sequential / Sequential2D pair used by the following cells
seq_net, seq2d_net = create_equivalent_networks()

print("Network architectures created successfully!")

# Count trainable scalars in each container; the two totals should agree
seq_param_count = sum(p.numel() for p in seq_net.parameters())
print(f"Sequential parameters: {seq_param_count}")
seq2d_param_count = sum(p.numel() for p in seq2d_net.parameters())
print(f"Sequential2D parameters: {seq2d_param_count}")

Theoretical number of parameters: 118282
Network architectures created successfully!
Sequential parameters: 118282
Sequential2D parameters: 118282


In [3]:
def test_equivalence():
    """
    Check that Sequential and Sequential2D agree on a forward pass

    One random batch is fed through the notebook-level ``seq_net`` and
    ``seq2d_net``. The Sequential2D network is applied three times, and the
    last entry of its list-valued result is compared with the Sequential
    output. The largest absolute difference and a verdict are printed.

    Returns:
        Tuple ``(seq_output, seq2d_output)``: the Sequential output tensor and
        the full list returned by the last Sequential2D iteration.
    """
    # Random batch standing in for flattened images
    n_samples, n_features = 32, 784
    test_input = torch.randn(n_samples, n_features)

    with torch.no_grad():  # Evaluation only, no gradients required
        # ===== Sequential neural network =====
        # A single call runs every layer in order
        seq_output = seq_net(test_input)

        # ===== Sequential2D dynamical system =====
        # The input occupies slot 0; the other slots start out empty
        state = [test_input, None, None, None]

        # One application of the block matrix is one "time step", so three
        # steps correspond to the three layers of the Sequential network
        for _ in range(3):
            state = seq2d_net(state)
    seq2d_output = state

    # ===== VERIFICATION =====
    # The Sequential2D result is a list; its last slot holds the network output
    max_diff = (seq_output - seq2d_output[3]).abs().max().item()
    outputs_match = max_diff < 1e-6
    verdict = 'identical' if outputs_match else 'different'

    print(f"Maximum difference between outputs: {max_diff:.2e}")
    print(f"Outputs are {verdict}")

    if not outputs_match:
        print("✗ WARNING: Networks produce different results!")
    else:
        print("✓ SUCCESS: Both networks produce the same results!")

    return seq_output, seq2d_output

# Run the forward-pass check and keep what it returns: the Sequential output
# tensor and the list produced by the last Sequential2D iteration
seq_output, seq2d_output = test_equivalence()

Maximum difference between outputs: 0.00e+00
Outputs are identical
✓ SUCCESS: Both networks produce the same results!


## Part 2: Training Equivalence

Now we'll test something more advanced: **do both networks train the same way?**

This is a crucial test because:
- We've shown they produce the same outputs (forward pass equivalence)
- But do they also learn the same way during training? (backward pass equivalence)

### Key Concept: Independent but Identical Networks

To test training equivalence properly, we need to:
1. Create two networks with **identical starting weights**
2. Keep them **independent** (training one doesn't affect the other)
3. Train them on the **same data** with the **same optimizer settings**
4. Verify they learn identically

This tests whether Sequential2D's gradient computation matches Sequential's.

In [None]:
def test_training_equivalence():
    """
    Test that Sequential and Sequential2D train identically using the same 3-layer model

    Two independent networks are built from identical starting weights, one as
    ``nn.Sequential`` and one as ``Sequential2D``. Each takes a single SGD step
    on the same random batch. The test then checks two things:

    1. the losses of that step agree (forward-pass equivalence), and
    2. the outputs on fresh data agree *after* the weight update
       (backward-pass equivalence).

    The losses are computed before any weight is changed, so on their own they
    cannot say anything about gradients. Only the post-update comparison does,
    which is why the verdict requires both checks to pass.

    Returns:
        None. All findings are printed.
    """
    print("Testing training equivalence with 3-layer networks...")

    # Use the same architecture as our equivalence test
    input_size = 784
    hidden_size = 128
    output_size = 10
    num_iterations = 3    # One Sequential2D iteration per linear layer
    learning_rate = 0.01
    tolerance = 1e-6      # Largest difference still treated as "identical"
    layer_shapes = [(input_size, hidden_size),
                    (hidden_size, hidden_size),
                    (hidden_size, output_size)]

    def make_layers(source=None):
        """Create the three linear layers, copying weights and biases from `source` if given."""
        layers = [nn.Linear(n_in, n_out) for n_in, n_out in layer_shapes]
        if source is not None:
            for layer, source_layer in zip(layers, source):
                # Gives the new layer its own parameter tensors with the
                # same values, so the networks stay independent
                layer.load_state_dict(source_layer.state_dict())
        return layers

    # ===== STEP 1: Create base transformations that both will use =====
    # These provide the initial random weights that both networks will copy
    base_layers = make_layers()

    # ===== STEP 2: Create Sequential neural network =====
    f1_seq, f2_seq, f3_seq = make_layers(source=base_layers)
    seq_net = nn.Sequential(f1_seq, nn.ReLU(), f2_seq, nn.ReLU(), f3_seq)

    # ===== STEP 3: Create Sequential2D dynamical system =====
    # A second, separate set of layers starting from the SAME base weights
    f1_seq2d, f2_seq2d, f3_seq2d = make_layers(source=base_layers)

    # Build Sequential2D using the same block matrix structure as before
    in_features_list = [input_size, hidden_size, hidden_size, output_size]
    out_features_list = [input_size, hidden_size, hidden_size, output_size]

    F1 = Sequential1D(nn.Sequential(f1_seq2d), in_features=input_size, out_features=hidden_size)
    F2 = Sequential1D(nn.Sequential(nn.ReLU(), f2_seq2d), in_features=hidden_size, out_features=hidden_size)
    F3 = Sequential1D(nn.Sequential(nn.ReLU(), f3_seq2d), in_features=hidden_size, out_features=output_size)

    blocks = [[None, F1,   None, None],
              [None, None, F2,   None],
              [None, None, None, F3],
              [None, None, None, None]]

    seq2d_net = Sequential2D(in_features_list, out_features_list, blocks)

    def seq2d_forward(inputs):
        """Iterate the Sequential2D network and return the content of its output slot."""
        state = [inputs, None, None, None]
        # Note, the 3 iterations are important!
        # They make the dynamical system equivalent to the 3 layers in the Sequential network
        for _ in range(num_iterations):
            state = seq2d_net(state)
        return state[3]

    print(f"Networks created with {sum(p.numel() for p in seq_net.parameters())} parameters each")

    # ===== STEP 4: Prepare training data =====
    batch_size = 32
    x = torch.randn(batch_size, input_size)       # Random input data
    target = torch.randn(batch_size, output_size) # Random target data

    print("\n1. Before training - checking outputs are identical:")
    with torch.no_grad():
        diff = torch.max(torch.abs(seq_net(x) - seq2d_forward(x))).item()
        print(f"   Output difference: {diff:.2e} (should be ~0)")

    print("\n2. Training both networks for one step:")
    criterion = nn.MSELoss()  # Mean squared error loss

    # ===== Train Sequential neural network =====
    optimizer_seq = torch.optim.SGD(seq_net.parameters(), lr=learning_rate)
    optimizer_seq.zero_grad()                 # Clear any existing gradients
    loss_seq = criterion(seq_net(x), target)  # Compute loss
    loss_seq.backward()                       # Compute gradients via backpropagation
    optimizer_seq.step()                      # Update weights
    print(f"   Sequential loss: {loss_seq.item():.4f}")

    # ===== Train Sequential2D network =====
    optimizer_seq2d = torch.optim.SGD(seq2d_net.parameters(), lr=learning_rate)
    optimizer_seq2d.zero_grad()                        # Clear any existing gradients
    loss_seq2d = criterion(seq2d_forward(x), target)   # Compute loss
    loss_seq2d.backward()                              # Compute gradients via backpropagation
    optimizer_seq2d.step()                             # Update weights
    print(f"   Sequential2D loss: {loss_seq2d.item():.4f}")

    loss_diff = abs(loss_seq.item() - loss_seq2d.item())

    print("\n3. After training - checking if networks still behave similarly:")
    with torch.no_grad():
        # Fresh data: the outputs can only agree here if both weight updates
        # (and therefore both sets of gradients) were the same
        test_x = torch.randn(16, input_size)
        diff_after = torch.max(torch.abs(seq_net(test_x) - seq2d_forward(test_x))).item()
        print(f"   Output difference after training: {diff_after:.2e}")
        print(f"   Loss difference: {loss_diff:.2e}")

    # ===== Interpretation =====
    losses_match = loss_diff < tolerance
    updates_match = diff_after < tolerance
    if losses_match and updates_match:
        print("\n✓ SUCCESS: Both networks computed identical losses!")
        print("  This proves Sequential2D's gradient computation is correct.")
        print("\n✓ Training test complete! Both 3-layer networks train equivalently.")
    else:
        if not losses_match:
            print("\n✗ WARNING: Networks computed different losses!")
        if not updates_match:
            print("\n✗ WARNING: Networks produce different outputs after the weight update!")
        print("  This suggests a bug in Sequential2D's implementation.")
        print("\n✗ Training test complete, but the networks did NOT train equivalently.")

# Run the training test; the function reports everything through print()
# and has no return statement
test_training_equivalence()

Testing training equivalence with 3-layer networks...
Networks created with 118282 parameters each

1. Before training - checking outputs are identical:
   Output difference: 0.00e+00 (should be ~0)

2. Training both networks for one step:
   Sequential loss: 0.9985
   Sequential2D loss: 0.9985

3. After training - checking if networks still behave similarly:
   Output difference after training: 0.00e+00
   Loss difference: 0.00e+00

✓ SUCCESS: Both networks computed identical losses!
  This proves Sequential2D's gradient computation is correct.

✓ Training test complete! Both 3-layer networks train equivalently.


## Part 3: Unique Capabilities of Sequential2D

Now let's explore scenarios where Sequential2D offers capabilities that standard Sequential **cannot** provide.

### What Makes Sequential2D Special?

Sequential2D isn't just another way to implement Sequential networks. It enables entirely new architectures:

1. **Multiple Parallel Paths**: Data can flow through several paths simultaneously
2. **Skip Connections**: Information can jump across multiple layers
3. **Complex Connectivity**: Branching and merging (non-chain) data-flow patterns that are impossible with Sequential
4. **Block Matrix Structure**: Think of your network as a 2D grid of computational blocks

### Why This Matters

These capabilities allow you to implement architectures like:
- ResNet-style skip connections
- Multi-path networks (like Inception)
- Dynamical systems with feedback loops
- Networks with complex information flow patterns

Let's see this in action!

In [5]:
def create_complex_sequential2d():
    """
    Build a Sequential2D network whose block grid is not a simple chain

    The network is described by a configuration dictionary with three keys:
    'in_features_list', 'out_features_list' and 'block_types' (a 4x4 grid
    whose entries are either 'Linear' or None). The dictionary is handed to
    ``Sequential2D.from_config`` and the resulting network is returned.

    Several rows of the grid contain more than one 'Linear' block, a layout
    that a single nn.Sequential chain cannot express.
    """
    # Sizes of the four input slots and of the four output slots
    input_widths = [50, 100, 200, 150]
    output_widths = [100, 200, 150, 10]

    # Upper-triangular 4x4 grid of block types.
    # NOTE(review): by analogy with the block matrix in Part 1, row i appears
    # to hold the blocks fed by input slot i and column j the output slot they
    # contribute to - confirm against Sequential2D.from_config.
    block_grid = [
        ['Linear', 'Linear', None,     None],      # row 0: blocks in columns 0 and 1
        [None,     'Linear', 'Linear', 'Linear'],  # row 1: blocks in columns 1, 2 and 3
        [None,     None,     'Linear', 'Linear'],  # row 2: blocks in columns 2 and 3
        [None,     None,     None,     'Linear'],  # row 3: a single block in column 3
    ]

    return Sequential2D.from_config({
        'in_features_list': input_widths,
        'out_features_list': output_widths,
        'block_types': block_grid,
    })

# Build the network once at notebook level so that later cells can reuse it
complex_net = create_complex_sequential2d()

In [6]:
def demonstrate_complex_forward():
    """
    Run the notebook-level ``complex_net`` in its two calling modes

    1. A single tensor of width 500 (the sum of the four input sizes) is
       passed by calling the network directly.
    2. A list with one entry per input slot (only slot 0 filled, the rest
       None) is passed to ``complex_net.forward_list``.

    The shapes going in and coming out are printed for both modes.

    Returns:
        Tuple ``(output, output_list)`` holding the result of each mode.
    """
    def report_slots(slots, empty_note):
        # One line per slot: its shape, or a note that the slot is empty
        for idx, tensor in enumerate(slots):
            if tensor is None:
                print(f"  Path {idx}: None ({empty_note})")
            else:
                print(f"  Path {idx}: shape {tensor.shape}")

    n_samples = 16

    # ===== METHOD 1: SINGLE TENSOR INPUT =====
    # One wide tensor whose width is the sum of all input feature sizes
    # (presumably split across the input slots inside Sequential2D)
    combined_width = sum((50, 100, 200, 150))  # = 500
    test_input = torch.randn(n_samples, combined_width)

    print("=== Method 1: Single Tensor Input ===")
    print(f"Input shape: {test_input.shape}")

    output = complex_net(test_input)
    print(f"Final output shape: {output.shape}")

    # ===== METHOD 2: LIST-BASED INPUT =====
    # One list entry per input slot gives explicit control over each of them
    print("\n=== Method 2: List-Based Input ===")
    print("Now we'll provide input only to the first blocks and see what happens...")

    # Fill only slot 0; every other slot stays empty
    input_list = [torch.randn(n_samples, 50)] + [None] * 3
    print("Input list:")
    report_slots(input_list, "no input")

    # List inputs go through the dedicated forward_list method
    output_list = complex_net.forward_list(input_list)

    print(f"\nOutputs from each path:")
    report_slots(output_list, "no output")

    return output, output_list

# Run the demo and keep both results: the single-tensor output and the
# per-slot list returned by forward_list
output, output_list = demonstrate_complex_forward()

=== Method 1: Single Tensor Input ===
Input shape: torch.Size([16, 500])
Final output shape: torch.Size([16, 460])

=== Method 2: List-Based Input ===
Now we'll provide input only to the first blocks and see what happens...
Input list:
  Path 0: shape torch.Size([16, 50])
  Path 1: None (no input)
  Path 2: None (no input)
  Path 3: None (no input)

Outputs from each path:
  Path 0: shape torch.Size([16, 100])
  Path 1: shape torch.Size([16, 200])
  Path 2: None (no output)
  Path 3: None (no output)
