In [None]:
# 1. Download Fashion-MNIST
# 2. Normalize to mean=0, std=1 (or use standard transform)
# 3. Create train/test DataLoaders
# 4. Verify data shapes (should be [batch, 1, 28, 28])

import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
# 1. Download Fashion-MNIST
transform = transforms.Compose([
    transforms.ToTensor(),  # Convert images to PyTorch tensors, scales [0,255] to [0,1]
    transforms.Normalize((0.5,), (0.5,))  # Scale to [-1, 1], center at 0 (grayscale: single channel)
])
train_dataset = datasets.FashionMNIST(root='./data', train=True, download=True, transform=transform) # 60,000 images for training
test_dataset = datasets.FashionMNIST(root='./data', train=False, download=True, transform=transform) # 10,000 images for testing
# 3. Create train/test DataLoaders
batch_size = 1  # Part 1 baseline requires batch_size=1 (change to 32/64 for Part 2)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
# 4. Verify data shapes
# for images, labels in train_loader:
#     print(f'Batch of images shape: {images.shape}')  # Should be [batch_size, 1, 28, 28] for Fashion-MNIST
#     print(f'Batch of labels shape: {labels.shape}')  # Should be [batch_size]
#     break  # Just check the first batch


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 26.4M/26.4M [00:00<00:00, 114MB/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 29.5k/29.5k [00:00<00:00, 3.93MB/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4.42M/4.42M [00:00<00:00, 56.7MB/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5.15k/5.15k [00:00<00:00, 12.8MB/s]


In [None]:
# Build 2-Layer Network to prove that Fashion-MNIST requires deeper networks.
# **Purpose:** Prove that Fashion-MNIST needs deep learning

# **Architecture:**
# ```
# Input: 784 (28√ó28√ó1 flattened)
#   ‚Üì
# Linear(784 ‚Üí 256) + ReLU
#   ‚Üì
# Linear(256 ‚Üí 10) + Softmax
# ```

# **Implementation details:**
# - Use `torch.nn.Linear()` (allowed)
# - Implement sigmoid activation manually: `1 / (1 + torch.exp(-x))`
# - Implement softmax manually: `torch.exp(x) / torch.exp(x).sum()`
# - Implement cross-entropy loss manually

# **Training setup:**
# - Optimizer: SGD (implement manually)
# - Batch size: 1
# - Learning rate: 0.01 or 0.001
# - Epochs: 10-20

import torch
import torch.nn as nn
import torch.optim as optim

class TwoLayerNet(nn.Module):
    def sigmoid(self, x):
        return 1 / (1 + torch.exp(-x))
    
    def softmax(self, x):
        exp_x = torch.exp(x - x.max(dim=1, keepdim=True)[0])  # Subtract max for stability
        return exp_x / exp_x.sum(dim=1, keepdim=True)
    
    def cross_entropy_loss(self, outputs, labels):
        # Convert labels to one-hot encoding
        one_hot_labels = torch.zeros_like(outputs)
        one_hot_labels.scatter_(1, labels.view(-1, 1), 1)
        
        # Compute cross-entropy loss with numerical stability
        # Add small epsilon to prevent log(0)
        log_probs = torch.log(self.softmax(outputs) + 1e-10)
        loss = -torch.sum(one_hot_labels * log_probs) / outputs.size(0)
        return loss
    
    def SGD_Optimizer(self, params, lr):
        with torch.no_grad():
            for param in params:
                if param.grad is not None:
                    param.data = param.data - lr * param.grad
                    param.grad = None  # Reset gradient

    def __init__(self):
        super(TwoLayerNet, self).__init__()
        self.fc1 = nn.Linear(784, 256)   # Input: 28√ó28√ó1 = 784 (Fashion-MNIST)
        self.fc2 = nn.Linear(256, 10)    # Output: 10 classes

    def forward(self, x):
        x = x.view(x.size(0), -1)  # Flatten the input
        x = self.sigmoid(self.fc1(x))  # First layer + sigmoid
        x = self.fc2(x)  # Second layer (logits)
        return x
    


In [3]:
# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TwoLayerNet().to(device)
learning_rate = 0.005  # Reduced from 0.01 to prevent instability
num_epochs = 10



In [4]:
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    correct_train = 0
    total_train = 0
    
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        
        # Reset gradients before backward pass
        for param in model.parameters():
            param.grad = None
        
        # Forward pass
        outputs = model(images)
        loss = model.cross_entropy_loss(outputs, labels)
        total_loss += loss.item()
        
        # Calculate training accuracy
        _, predicted = torch.max(outputs.data, 1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()
        
        # Backward pass
        loss.backward()
        
        # Update weights using SGD
        model.SGD_Optimizer(model.parameters(), learning_rate)
    
    train_acc = 100 * correct_train / total_train
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Train Acc: {train_acc:.2f}%')

Epoch [1/10], Loss: 1.4423, Train Acc: 63.03%
Epoch [2/10], Loss: 0.8364, Train Acc: 73.64%
Epoch [3/10], Loss: 0.6978, Train Acc: 75.96%
Epoch [4/10], Loss: 0.6341, Train Acc: 77.65%
Epoch [5/10], Loss: 0.5933, Train Acc: 79.06%
Epoch [6/10], Loss: 0.5631, Train Acc: 80.16%
Epoch [7/10], Loss: 0.5405, Train Acc: 81.08%
Epoch [8/10], Loss: 0.5227, Train Acc: 81.70%
Epoch [9/10], Loss: 0.5083, Train Acc: 82.29%
Epoch [10/10], Loss: 0.4963, Train Acc: 82.61%


In [5]:
# test accuracy in percentage
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f'Test Accuracy: {accuracy:.2f}%')


Test Accuracy: 81.37%


# Part 2, Step 4: Activation Functions Comparison

**Previous Result (Part 1):** 5-Layer CNN with Sigmoid failed completely (10% accuracy - vanishing gradients)

**Purpose:** Test modern activation functions to solve vanishing gradient problem

**New Activations Tested:**
1. **Leaky ReLU** - f(x) = max(x, 0.1x)
   - No saturation for positive values (gradient = 1)
   - Small gradient (0.1) for negative values prevents dying neurons
   - Used for: Conv layers 1, 2, 3

2. **Tanh** - f(x) = (e^x - e^-x) / (e^x + e^-x)
   - Zero-centered output range: (-1, 1)
   - Max gradient = 1 (vs sigmoid's 0.25)
   - Used for: FC layer 1

**Architecture (unchanged):**
- 3 Convolutional layers with MaxPooling (Conv: 3‚Üí16‚Üí32‚Üí64)
- 2 Fully connected layers (FC: 1024‚Üí256‚Üí10)
- Total: 5 parameterized layers, 288,554 parameters

**Expected result:** 60-70% test accuracy (dramatic improvement from 10%)

---
## üî¨ MISSING EXPERIMENT 1: Activation Function Comparison (batch_size=1)

**Purpose:** Demonstrate the vanishing gradient problem with sigmoid vs modern activations

We'll train TWO 5-layer CNNs with **batch_size=1** to compare:
1. **Sigmoid activation** - Expected to fail due to vanishing gradients (~10-20% accuracy)
2. **Leaky ReLU + Tanh** - Expected to succeed (~75-85% accuracy)

This comparison demonstrates why modern activation functions are essential for deep learning.

In [20]:
# Set batch_size=1 for activation function comparison
print("="*70)
print("EXPERIMENT: Activation Function Comparison (batch_size=1)")
print("="*70)

# Create DataLoaders with batch_size=1
train_loader_bs1 = DataLoader(train_dataset, batch_size=1, shuffle=True)
test_loader_bs1 = DataLoader(test_dataset, batch_size=1, shuffle=False)

print(f"DataLoaders created with batch_size=1")
print(f"Training samples: {len(train_dataset)}")
print(f"Test samples: {len(test_dataset)}\n")

EXPERIMENT: Activation Function Comparison (batch_size=1)
DataLoaders created with batch_size=1
Training samples: 60000
Test samples: 10000



### Experiment 1a: 5-Layer CNN with SIGMOID (Vanishing Gradient Demo)

In [21]:
# 5-Layer CNN with SIGMOID activation (to demonstrate vanishing gradients)
class FiveLayerCNN_Sigmoid(nn.Module):
    def sigmoid(self, x):
        """Sigmoid activation: œÉ(x) = 1/(1+e^(-x))
        - Output range: (0, 1)
        - Max gradient: 0.25 (causes vanishing gradients!)
        """
        return 1 / (1 + torch.exp(-x))
    
    def softmax(self, x):
        exp_x = torch.exp(x - x.max(dim=1, keepdim=True)[0])
        return exp_x / exp_x.sum(dim=1, keepdim=True)
    
    def cross_entropy_loss(self, outputs, labels):
        one_hot_labels = torch.zeros_like(outputs)
        one_hot_labels.scatter_(1, labels.view(-1, 1), 1)
        log_probs = torch.log(self.softmax(outputs) + 1e-10)
        loss = -torch.sum(one_hot_labels * log_probs) / outputs.size(0)
        return loss
    
    def SGD_Optimizer(self, params, lr):
        with torch.no_grad():
            for param in params:
                if param.grad is not None:
                    param.data = param.data - lr * param.grad
                    param.grad = None
    
    def __init__(self):
        super(FiveLayerCNN_Sigmoid, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(64 * 3 * 3, 256)
        self.fc2 = nn.Linear(256, 10)
    
    def forward(self, x):
        # All layers use sigmoid activation
        x = self.sigmoid(self.conv1(x))
        x = self.pool(x)
        x = self.sigmoid(self.conv2(x))
        x = self.pool(x)
        x = self.sigmoid(self.conv3(x))
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        x = self.sigmoid(self.fc1(x))
        x = self.fc2(x)
        return x

# Create sigmoid model
model_sigmoid = FiveLayerCNN_Sigmoid().to(device)
print("‚úÖ 5-Layer CNN with SIGMOID created")
print(f"   Parameters: {sum(p.numel() for p in model_sigmoid.parameters())}")

‚úÖ 5-Layer CNN with SIGMOID created
   Parameters: 173578


In [None]:
# Train sigmoid model (batch_size=1)
import time

print("\n" + "="*70)
print("TRAINING: 5-Layer CNN with SIGMOID (batch_size=1)")
print("="*70)
print("‚ö†Ô∏è  WARNING: This should fail due to vanishing gradients!")
print("   Expected accuracy: 10-20% (near random guessing)\n")

learning_rate_sigmoid = 0.005
num_epochs_sigmoid = 10

start_time = time.time()

for epoch in range(num_epochs_sigmoid):
    model_sigmoid.train()
    total_loss = 0
    correct_train = 0
    total_train = 0
    
    for images, labels in train_loader_bs1:
        images, labels = images.to(device), labels.to(device)
        
        for param in model_sigmoid.parameters():
            param.grad = None
        
        outputs = model_sigmoid(images)
        loss = model_sigmoid.cross_entropy_loss(outputs, labels)
        total_loss += loss.item()
        
        _, predicted = torch.max(outputs.data, 1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()
        
        loss.backward()
        model_sigmoid.SGD_Optimizer(model_sigmoid.parameters(), learning_rate_sigmoid)
    
    train_acc = 100 * correct_train / total_train
    avg_loss = total_loss / len(train_loader_bs1)
    elapsed = time.time() - start_time
    print(f'Epoch [{epoch+1}/{num_epochs_sigmoid}], Loss: {avg_loss:.4f}, Train Acc: {train_acc:.2f}%, Time: {elapsed/60:.1f}min')

training_time_sigmoid = time.time() - start_time
print(f"\nTraining time: {training_time_sigmoid/60:.1f} minutes")

# Evaluate sigmoid model
model_sigmoid.eval()
correct = 0
total = 0

with torch.no_grad():
    for images, labels in test_loader_bs1:
        images, labels = images.to(device), labels.to(device)
        outputs = model_sigmoid(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy_sigmoid = 100 * correct / total

print("\n" + "="*70)
print("RESULTS: 5-Layer CNN with SIGMOID")
print("="*70)
print(f"Test Accuracy: {accuracy_sigmoid:.2f}%")
print(f"Training Time: {training_time_sigmoid/60:.1f} minutes")

if accuracy_sigmoid < 30:
    print("\n‚ùå VANISHING GRADIENT PROBLEM DEMONSTRATED!")
    print("   Sigmoid activations prevent effective learning in deep networks")
else:
    print("\n‚ö†Ô∏è  Unexpected: Sigmoid performed better than expected")
print("="*70)


TRAINING: 5-Layer CNN with SIGMOID (batch_size=1)
   Expected accuracy: 10-20% (near random guessing)



### Experiment 1b: 5-Layer CNN with LEAKY ReLU + TANH (batch_size=1)

In [None]:
# Create Leaky ReLU + Tanh model for batch_size=1 comparison
model_leakyrelu_bs1 = FiveLayerCNN().to(device)
print("\n‚úÖ 5-Layer CNN with LEAKY ReLU + TANH created")
print(f"   Parameters: {sum(p.numel() for p in model_leakyrelu_bs1.parameters())}")

In [None]:
# Train Leaky ReLU model (batch_size=1)
print("\n" + "="*70)
print("TRAINING: 5-Layer CNN with LEAKY ReLU + TANH (batch_size=1)")
print("="*70)
print("‚úÖ This should succeed with modern activation functions!")
print("   Expected accuracy: 75-85%\n")

learning_rate_leaky = 0.005
num_epochs_leaky = 10

start_time = time.time()

for epoch in range(num_epochs_leaky):
    model_leakyrelu_bs1.train()
    total_loss = 0
    correct_train = 0
    total_train = 0
    
    for images, labels in train_loader_bs1:
        images, labels = images.to(device), labels.to(device)
        
        for param in model_leakyrelu_bs1.parameters():
            param.grad = None
        
        outputs = model_leakyrelu_bs1(images)
        loss = model_leakyrelu_bs1.cross_entropy_loss(outputs, labels)
        total_loss += loss.item()
        
        _, predicted = torch.max(outputs.data, 1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()
        
        loss.backward()
        model_leakyrelu_bs1.SGD_Optimizer(model_leakyrelu_bs1.parameters(), learning_rate_leaky)
    
    train_acc = 100 * correct_train / total_train
    avg_loss = total_loss / len(train_loader_bs1)
    elapsed = time.time() - start_time
    print(f'Epoch [{epoch+1}/{num_epochs_leaky}], Loss: {avg_loss:.4f}, Train Acc: {train_acc:.2f}%, Time: {elapsed/60:.1f}min')

training_time_leaky = time.time() - start_time
print(f"\nTraining time: {training_time_leaky/60:.1f} minutes")

# Evaluate Leaky ReLU model
model_leakyrelu_bs1.eval()
correct = 0
total = 0

with torch.no_grad():
    for images, labels in test_loader_bs1:
        images, labels = images.to(device), labels.to(device)
        outputs = model_leakyrelu_bs1(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy_leaky = 100 * correct / total

print("\n" + "="*70)
print("RESULTS: 5-Layer CNN with LEAKY ReLU + TANH")
print("="*70)
print(f"Test Accuracy: {accuracy_leaky:.2f}%")
print(f"Training Time: {training_time_leaky/60:.1f} minutes")

if accuracy_leaky > accuracy_sigmoid + 20:
    print("\n‚úÖ SUCCESS! Modern activations dramatically outperform sigmoid")
    print(f"   Improvement: +{accuracy_leaky - accuracy_sigmoid:.2f}%")
else:
    print("\n‚ö†Ô∏è  Unexpected: Smaller improvement than expected")
print("="*70)

In [None]:
# Summary comparison
print("\n" + "="*70)
print("üìä ACTIVATION FUNCTION COMPARISON SUMMARY (batch_size=1)")
print("="*70)
print(f"{'Model':<40} {'Test Acc':<12} {'Time':<10}")
print("-"*70)
print(f"{'5-Layer CNN with SIGMOID':<40} {accuracy_sigmoid:<12.2f} {training_time_sigmoid/60:.1f} min")
print(f"{'5-Layer CNN with LEAKY ReLU + TANH':<40} {accuracy_leaky:<12.2f} {training_time_leaky/60:.1f} min")
print("="*70)
print(f"\nüí° Key Insight:")
print(f"   Modern activation functions (Leaky ReLU, Tanh) enable deep learning")
print(f"   Sigmoid causes vanishing gradients in deep networks")
print(f"   Improvement: {accuracy_leaky - accuracy_sigmoid:+.2f} percentage points\n")

---
## ‚úÖ Activation Function Experiment Complete!

Now continuing with batch_size=32 experiments...

In [None]:
# Reset to batch_size=32 for remaining experiments
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print("="*70)
print("Switched to batch_size=32 for remaining experiments")
print("="*70)

In [6]:
# 5-Layer CNN Architecture
class FiveLayerCNN(nn.Module):
    def leaky_relu(self, x, negative_slope=0.1):
        """
        Leaky ReLU activation: f(x) = max(x, 0.1*x)
        - For x > 0: output = x (gradient = 1, no vanishing)
        - For x < 0: output = 0.1*x (gradient = 0.1, prevents dying neurons)
        """
        return torch.maximum(x, negative_slope * x)
    
    def tanh(self, x):
        """
        Hyperbolic tangent: f(x) = (e^x - e^-x) / (e^x + e^-x)
        - Output range: (-1, 1)
        - Zero-centered (better than sigmoid)
        - Max gradient = 1 (vs sigmoid's 0.25)
        """
        return torch.tanh(x)
    
    def softmax(self, x):
        exp_x = torch.exp(x - x.max(dim=1, keepdim=True)[0])
        return exp_x / exp_x.sum(dim=1, keepdim=True)
    
    def cross_entropy_loss(self, outputs, labels):
        one_hot_labels = torch.zeros_like(outputs)
        one_hot_labels.scatter_(1, labels.view(-1, 1), 1)
        log_probs = torch.log(self.softmax(outputs) + 1e-10)
        loss = -torch.sum(one_hot_labels * log_probs) / outputs.size(0)
        return loss
    
    def SGD_Optimizer(self, params, lr, momentum=0.0):
        """
        SGD with momentum implementation following equation (8):
        m_{i+1} = Œ± * m_i + g_i
        Œ∏_{i+1} = Œ∏_i - Œ∑ * m_{i+1}
        
        Args:
            params: Model parameters
            lr: Learning rate (Œ∑)
            momentum: Momentum coefficient (Œ±), default=0.0 for vanilla SGD
        """
        # Initialize momentum buffer on first call
        if not hasattr(self, 'momentum_buffer'):
            self.momentum_buffer = {}
        
        with torch.no_grad():
            for param in params:
                if param.grad is not None:
                    # Get parameter id for momentum buffer
                    param_id = id(param)
                    
                    # Initialize momentum to zero if not exists
                    if param_id not in self.momentum_buffer:
                        self.momentum_buffer[param_id] = torch.zeros_like(param.data)
                    
                    # Get current gradient (g_i)
                    grad = param.grad
                    
                    # Update momentum: m_{i+1} = Œ± * m_i + g_i
                    self.momentum_buffer[param_id] = momentum * self.momentum_buffer[param_id] + grad
                    
                    # Update parameters: Œ∏_{i+1} = Œ∏_i - Œ∑ * m_{i+1}
                    param.data = param.data - lr * self.momentum_buffer[param_id]
                    
                    # Reset gradient
                    param.grad = None
    
    def __init__(self):
        super(FiveLayerCNN, self).__init__()
        # Convolutional layers (3 parameterized layers)
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)   # Layer 1: 1‚Üí16 channels (grayscale)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)          # MaxPool (not parameterized)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)  # Layer 2: 16‚Üí32 channels
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, padding=1)  # Layer 3: 32‚Üí64 channels
        
        # Fully connected layers (2 parameterized layers)
        self.fc1 = nn.Linear(64 * 3 * 3, 256)  # Layer 4: After 3 pooling ops, 28√ó28‚Üí14√ó14‚Üí7√ó7‚Üí3√ó3
        self.fc2 = nn.Linear(256, 10)           # Layer 5: Output layer
    
    def forward(self, x):
        # Input: [batch, 1, 28, 28] for Fashion-MNIST
        
        # Conv block 1 - using Leaky ReLU
        x = self.conv1(x)           # [batch, 16, 28, 28]
        x = self.leaky_relu(x)      # Leaky ReLU activation
        x = self.pool(x)            # [batch, 16, 14, 14]
        
        # Conv block 2 - using Leaky ReLU
        x = self.conv2(x)           # [batch, 32, 14, 14]
        x = self.leaky_relu(x)      # Leaky ReLU activation
        x = self.pool(x)            # [batch, 32, 7, 7]
        
        # Conv block 3 - using Leaky ReLU
        x = self.conv3(x)           # [batch, 64, 7, 7]
        x = self.leaky_relu(x)      # Leaky ReLU activation
        x = self.pool(x)            # [batch, 64, 3, 3]
        
        # Flatten
        x = x.view(x.size(0), -1)   # [batch, 576]
        
        # Fully connected layers - using Tanh
        x = self.fc1(x)             # [batch, 256]
        x = self.tanh(x)            # Tanh activation
        x = self.fc2(x)             # [batch, 10] - logits
        
        return x

# Create the model
cnn_model = FiveLayerCNN().to(device)
print(f"Model created on device: {device}")
print(f"Total parameters: {sum(p.numel() for p in cnn_model.parameters())}")

Model created on device: cuda
Total parameters: 173578


In [7]:
# Training setup for CNN
cnn_learning_rate = 0.005  # Increased from 0.005 - Leaky ReLU and Tanh have better gradient flow
cnn_num_epochs = 10  # More epochs for deeper network

print(f"Training Configuration:")
print(f"  Learning rate: {cnn_learning_rate}")
print(f"  Epochs: {cnn_num_epochs}")
print(f"  Batch size: {batch_size}")
print(f"  Device: {device}")
print(f"  Activations: Leaky ReLU (conv layers) + Tanh (FC layer)")
print(f"\nStarting training...")

Training Configuration:
  Learning rate: 0.005
  Epochs: 10
  Batch size: 32
  Device: cuda
  Activations: Leaky ReLU (conv layers) + Tanh (FC layer)

Starting training...


In [8]:
# Training loop for CNN
import time

start_time = time.time()

for epoch in range(cnn_num_epochs):
    cnn_model.train()
    total_loss = 0
    correct_train = 0
    total_train = 0
    
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        
        # Reset gradients
        for param in cnn_model.parameters():
            param.grad = None
        
        # Forward pass
        outputs = cnn_model(images)
        loss = cnn_model.cross_entropy_loss(outputs, labels)
        total_loss += loss.item()
        
        # Calculate training accuracy
        _, predicted = torch.max(outputs.data, 1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()
        
        # Backward pass
        loss.backward()
        
        # Update weights
        cnn_model.SGD_Optimizer(cnn_model.parameters(), cnn_learning_rate)
    
    train_acc = 100 * correct_train / total_train
    avg_loss = total_loss / len(train_loader)
    
    # Print progress every epoch
    elapsed_time = time.time() - start_time
    print(f'Epoch [{epoch+1}/{cnn_num_epochs}], Loss: {avg_loss:.4f}, Train Acc: {train_acc:.2f}%, Time: {elapsed_time/60:.1f}min')

print(f"\nTotal training time: {(time.time() - start_time)/60:.1f} minutes")

Epoch [1/10], Loss: 1.4667, Train Acc: 51.77%, Time: 0.3min
Epoch [2/10], Loss: 0.6559, Train Acc: 75.72%, Time: 0.7min
Epoch [3/10], Loss: 0.5499, Train Acc: 79.56%, Time: 1.0min
Epoch [4/10], Loss: 0.4942, Train Acc: 81.75%, Time: 1.3min
Epoch [5/10], Loss: 0.4532, Train Acc: 83.31%, Time: 1.7min
Epoch [6/10], Loss: 0.4210, Train Acc: 84.48%, Time: 2.0min
Epoch [7/10], Loss: 0.3961, Train Acc: 85.42%, Time: 2.3min
Epoch [8/10], Loss: 0.3763, Train Acc: 86.25%, Time: 2.7min
Epoch [9/10], Loss: 0.3593, Train Acc: 86.74%, Time: 3.0min
Epoch [10/10], Loss: 0.3444, Train Acc: 87.26%, Time: 3.3min

Total training time: 3.3 minutes


In [9]:
# Evaluate CNN on test set
cnn_model.eval()
correct = 0
total = 0

with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = cnn_model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

cnn_test_accuracy = 100 * correct / total
print(f'\n{"="*50}')
print(f'5-Layer CNN Test Accuracy: {cnn_test_accuracy:.2f}%')
print(f'{"="*50}')

# Compare with 2-layer network
print(f'\nüìä Comparison:')
print(f'  2-Layer Network: 48.04%')
print(f'  5-Layer CNN:     {cnn_test_accuracy:.2f}%')
print(f'  Improvement:     {cnn_test_accuracy - 48.04:.2f}%')

if cnn_test_accuracy > 50:
    print(f'\n‚úÖ SUCCESS: Deep network achieves >{50}% accuracy!')
    print(f'‚úÖ This proves CIFAR-10 requires deep learning!')


5-Layer CNN Test Accuracy: 85.57%

üìä Comparison:
  2-Layer Network: 48.04%
  5-Layer CNN:     85.57%
  Improvement:     37.53%

‚úÖ SUCCESS: Deep network achieves >50% accuracy!
‚úÖ This proves CIFAR-10 requires deep learning!


---
## ‚úÖ Part 1 Complete!

You now have:
1. ‚úÖ **2-Layer Network** - 48.04% test accuracy (proves shallow networks struggle)
2. ‚úÖ **5-Layer CNN** - Should get 50-60% test accuracy (proves depth helps)

### üéØ Expected Training Time:
- With batch_size=1 and 30 epochs: **~2-3 hours**
- Each epoch processes 50,000 images individually

### üí° Tips:
- The training will take a while - be patient!
- Loss should steadily decrease
- Accuracy should improve over 2-layer baseline
- You can reduce epochs to 20 if you're short on time

### üìù Next Steps (Part 2):
After this training completes, you'll:
1. Test different activation functions (Leaky ReLU, Tanh)
2. Implement mini-batch SGD (batch sizes: 16, 32, 64, 128)
3. Add momentum to the optimizer

---
# Part 2, Step 6: Mini-Batch SGD with Momentum

**Previous Results:**
- Vanilla SGD with batch_size=1: 65.24% test accuracy (29.9 min)
- Mini-batch SGD with batch_size=32: 59.17% test accuracy (2.8 min)

**Purpose:** Add momentum to accelerate learning and smooth optimization

**Momentum Algorithm (Equation 8):**
- **m‚ÇÅ = 0** (initialize momentum to zero)
- **g·µ¢ = (1/b) Œ£ ‚àáŒ∏·µ¢L‚Çñ** (compute gradient)
- **m·µ¢‚Çä‚ÇÅ = Œ±¬∑m·µ¢ + g·µ¢** (update momentum with Œ± = momentum coefficient)
- **Œ∏·µ¢‚Çä‚ÇÅ = Œ∏ - Œ∑¬∑m·µ¢‚Çä‚ÇÅ** (update parameters using momentum)

**How Momentum Helps:**
- Accumulates a moving average of gradients
- Dampens oscillations in directions with high curvature
- Accelerates progress along consistent descent directions
- Think: Rolling ball gaining speed downhill

**Configuration:**
- Architecture: 5-Layer CNN with Leaky ReLU + Tanh
- Batch size: 32 (from previous experiment)
- Learning rate: 0.005
- **Momentum (Œ±): Testing 3 values: 0.7, 0.9, 0.95**
- Epochs: 10

**Expected:** Faster convergence, smoother training, potentially higher accuracy than vanilla SGD. Higher Œ± should provide more smoothing but may overshoot.

In [None]:
# Test multiple momentum values
import time

# Test different momentum coefficients (Œ±) - UPDATED to include 0.5 and 0.99
alpha_values = [0.5, 0.7, 0.9, 0.95, 0.99]
momentum_results = {}

learning_rate = 0.005
num_epochs = 10

print(f"Testing momentum with Œ± values: {alpha_values}")
print(f"Each training will take ~3.3 minutes")
print(f"Total expected time: ~16.5 minutes\n")
print("="*70)

Testing momentum with Œ± values: [0.7, 0.9, 0.95]
Each training will take ~2.8 minutes
Total expected time: ~8.5 minutes



In [11]:
# Train and evaluate for each momentum value
for alpha in alpha_values:
    print(f"\n{'='*70}")
    print(f"üîπ Training with Momentum Œ± = {alpha}")
    print(f"{'='*70}\n")
    
    # Create fresh model for this alpha value
    model_momentum = FiveLayerCNN().to(device)
    print(f"Model created with {sum(p.numel() for p in model_momentum.parameters())} parameters")
    print(f"Configuration: lr={learning_rate}, Œ±={alpha}, epochs={num_epochs}, batch_size={batch_size}\n")
    
    # Training loop
    start_time = time.time()
    
    for epoch in range(num_epochs):
        model_momentum.train()
        total_loss = 0
        correct_train = 0
        total_train = 0
        
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            
            # Reset gradients
            for param in model_momentum.parameters():
                param.grad = None
            
            # Forward pass
            outputs = model_momentum(images)
            loss = model_momentum.cross_entropy_loss(outputs, labels)
            total_loss += loss.item()
            
            # Calculate training accuracy
            _, predicted = torch.max(outputs.data, 1)
            total_train += labels.size(0)
            correct_train += (predicted == labels).sum().item()
            
            # Backward pass
            loss.backward()
            
            # Update weights with momentum
            model_momentum.SGD_Optimizer(model_momentum.parameters(), 
                                         learning_rate, 
                                         momentum=alpha)
        
        train_acc = 100 * correct_train / total_train
        avg_loss = total_loss / len(train_loader)
        
        # Print progress every epoch
        elapsed_time = time.time() - start_time
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Train Acc: {train_acc:.2f}%, Time: {elapsed_time/60:.1f}min')
    
    training_time = time.time() - start_time
    print(f"\nTraining time: {training_time/60:.1f} minutes")
    
    # Evaluation
    model_momentum.eval()
    correct = 0
    total = 0
    
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model_momentum(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    test_accuracy = 100 * correct / total
    
    # Store results
    momentum_results[alpha] = {
        'test_accuracy': test_accuracy,
        'train_accuracy': train_acc,
        'final_loss': avg_loss,
        'training_time': training_time/60
    }
    
    print(f"\n‚úÖ Test Accuracy with Œ±={alpha}: {test_accuracy:.2f}%")
    print(f"{'='*70}\n")

print("\n" + "="*70)
print("üéØ MOMENTUM EXPERIMENTS COMPLETE")
print("="*70)


üîπ Training with Momentum Œ± = 0.7

Model created with 173578 parameters
Configuration: lr=0.005, Œ±=0.7, epochs=10, batch_size=32

Epoch [1/10], Loss: 0.8720, Train Acc: 68.57%, Time: 0.3min
Epoch [2/10], Loss: 0.4657, Train Acc: 82.77%, Time: 0.7min
Epoch [3/10], Loss: 0.3827, Train Acc: 85.98%, Time: 1.0min
Epoch [4/10], Loss: 0.3388, Train Acc: 87.42%, Time: 1.3min
Epoch [5/10], Loss: 0.3137, Train Acc: 88.42%, Time: 1.7min
Epoch [6/10], Loss: 0.2929, Train Acc: 89.11%, Time: 2.0min
Epoch [7/10], Loss: 0.2779, Train Acc: 89.58%, Time: 2.3min
Epoch [8/10], Loss: 0.2643, Train Acc: 90.26%, Time: 2.6min
Epoch [9/10], Loss: 0.2512, Train Acc: 90.65%, Time: 3.0min
Epoch [10/10], Loss: 0.2410, Train Acc: 91.00%, Time: 3.3min

Training time: 3.3 minutes

‚úÖ Test Accuracy with Œ±=0.7: 89.39%


üîπ Training with Momentum Œ± = 0.9

Model created with 173578 parameters
Configuration: lr=0.005, Œ±=0.9, epochs=10, batch_size=32

Epoch [1/10], Loss: 0.6181, Train Acc: 77.20%, Time: 0.3min
E

In [12]:
# Display comprehensive results comparison
print("\n" + "="*80)
print("üìä MOMENTUM EXPERIMENTS - COMPREHENSIVE RESULTS")
print("="*80)
print(f"{'Alpha (Œ±)':<12} {'Test Acc':<12} {'Train Acc':<12} {'Loss':<12} {'Time (min)':<12}")
print("-"*80)

best_alpha = None
best_accuracy = 0

for alpha in alpha_values:
    results = momentum_results[alpha]
    print(f"{alpha:<12.2f} {results['test_accuracy']:<12.2f} {results['train_accuracy']:<12.2f} " 
          f"{results['final_loss']:<12.4f} {results['training_time']:<12.1f}")
    
    if results['test_accuracy'] > best_accuracy:
        best_accuracy = results['test_accuracy']
        best_alpha = alpha

print("="*80)
print(f"\nüèÜ Best Momentum Value: Œ± = {best_alpha} with {best_accuracy:.2f}% test accuracy\n")

# Compare with baseline (mini-batch SGD without momentum)
print("="*80)
print("üìà COMPARISON WITH PREVIOUS EXPERIMENTS")
print("="*80)
print(f"{'Experiment':<45} {'Test Accuracy':<15} {'Training Time':<15}")
print("-"*80)
print(f"{'Vanilla SGD (batch_size=1)':<45} {65.24:<15.2f} {'29.9 min':<15}")
print(f"{'Mini-batch SGD (batch_size=32, no momentum)':<45} {cnn_test_accuracy:<15.2f} {'2.8 min':<15}")

for alpha in alpha_values:
    results = momentum_results[alpha]
    print(f"{f'Mini-batch SGD with Momentum (Œ±={alpha})':<45} "
          f"{results['test_accuracy']:<15.2f} {results['training_time']:.1f} min")

print("="*80)

# Analysis
print("\nüìù ANALYSIS:")
print(f"‚Ä¢ Best momentum value: Œ± = {best_alpha} with {best_accuracy:.2f}% test accuracy")
print(f"‚Ä¢ Improvement over no momentum: {best_accuracy - cnn_test_accuracy:+.2f}%")
print(f"‚Ä¢ Momentum helps by accumulating gradients in consistent directions")
print(f"‚Ä¢ Higher Œ± values preserve more history, lower values allow faster adaptation")
if best_accuracy > cnn_test_accuracy:
    print(f"‚Ä¢ ‚úÖ Momentum successfully improved performance!")
print("="*80)


üìä MOMENTUM EXPERIMENTS - COMPREHENSIVE RESULTS
Alpha (Œ±)    Test Acc     Train Acc    Loss         Time (min)  
--------------------------------------------------------------------------------
0.70         89.39        91.00        0.2410       3.3         
0.90         90.91        93.61        0.1698       3.3         
0.95         91.05        94.17        0.1516       3.3         

üèÜ Best Momentum Value: Œ± = 0.95 with 91.05% test accuracy

üìà COMPARISON WITH PREVIOUS EXPERIMENTS
Experiment                                    Test Accuracy   Training Time  
--------------------------------------------------------------------------------
Vanilla SGD (batch_size=1)                    65.24           29.9 min       
Mini-batch SGD (batch_size=32, no momentum)   85.57           2.8 min        
Mini-batch SGD with Momentum (Œ±=0.7)          89.39           3.3 min
Mini-batch SGD with Momentum (Œ±=0.9)          90.91           3.3 min
Mini-batch SGD with Momentum (Œ±=0.95)      

---

# PART 3: SKIP CONNECTIONS (Residual Learning)

## Motivation
As networks become deeper, training becomes more difficult due to:
1. **Vanishing gradients**: Gradients become exponentially small in early layers
2. **Degradation problem**: Deep networks can perform worse than shallow ones (not due to overfitting)
3. **Optimization difficulty**: Deeper networks are harder to optimize

**Skip connections (Residual connections)** solve this by:
- Creating shortcuts that bypass layers
- Allowing gradients to flow directly backward
- Learning residual functions F(x) = H(x) - x instead of H(x) directly
- Enabling very deep networks (50, 100, even 1000+ layers)

## Experimental Design

We will conduct **THREE experiments** with a 15-layer deep CNN:
1. **Extended model WITHOUT skip connections** (plain deep network) - Baseline to observe degradation
2. **Extended model WITH skip connections - Configuration 1** - First skip connection strategy (3 skips)
3. **Extended model WITH skip connections - Configuration 2** - Second skip connection strategy (3 skips)

**Assignment Requirement:** Add 10 layers to our 5-layer CNN (making it 15 layers total), then test with two different skip connection configurations, each using 3 skip connections.

**Hypothesis:** 
- The plain 15-layer network may suffer from degradation (perform worse than 5-layer)
- Skip connections will enable effective training and improve gradient flow
- Different skip configurations may yield different performance

### Architecture Design

**15-Layer CNN Structure (Extended Model):**
- **Block 1:** Conv 3‚Üí16, then 3√ó Conv 16‚Üí16 (same shape, ideal for skip connections)
- **Block 2:** Conv 16‚Üí32, then 3√ó Conv 32‚Üí32 (same shape, ideal for skip connections)
- **Block 3:** Conv 32‚Üí64, then 3√ó Conv 64‚Üí64 (same shape, ideal for skip connections)
- **Block 4:** FC 1024‚Üí256, FC 256‚Üí256, FC 256‚Üí10
- **Total:** 15 parameterized layers (12 conv + 3 FC) = 5-layer baseline + 10 additional layers

### Skip Connection Configurations

**Configuration 1: Short Skips (Length 1)** - ResNet-style, frequent gradient shortcuts
- Skip 1: conv1 output ‚Üí conv2 output (y = x ‚äï f(x), skips 1 layer)
- Skip 2: conv5 output ‚Üí conv6 output (y = x ‚äï f(x), skips 1 layer) 
- Skip 3: conv9 output ‚Üí conv10 output (y = x ‚äï f(x), skips 1 layer)

**Configuration 2: Longer Skips (Length 2-3)** - Deeper gradient shortcuts
- Skip 1: conv1 output ‚Üí conv3 output (y = x ‚äï g(f(x)), skips 2 layers)
- Skip 2: conv5 output ‚Üí conv8 output (y = x ‚äï h(g(f(x))), skips 3 layers)
- Skip 3: conv9 output ‚Üí conv12 output (y = x ‚äï h(g(f(x))), skips 3 layers)

**Training Configuration:**
- Use best hyperparameters from Part 2: batch_size=32, momentum=0.9, lr=0.005
- Train for 10 epochs for all three models
- Record average L1-norm of gradients during first epoch for analysis

In [13]:
# 15-Layer Deep CNN WITHOUT Skip Connections
import torch
import torch.nn as nn

class DeepCNN15_NoSkip(nn.Module):
    def __init__(self):
        super(DeepCNN15_NoSkip, self).__init__()
        
        # Block 1: 1‚Üí16‚Üí16‚Üí16‚Üí16 (grayscale input)
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(16, 16, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(16, 16, kernel_size=3, padding=1)
        self.conv4 = nn.Conv2d(16, 16, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool2d(2, 2)
        
        # Block 2: 16‚Üí32‚Üí32‚Üí32‚Üí32
        self.conv5 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.conv6 = nn.Conv2d(32, 32, kernel_size=3, padding=1)
        self.conv7 = nn.Conv2d(32, 32, kernel_size=3, padding=1)
        self.conv8 = nn.Conv2d(32, 32, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool2d(2, 2)
        
        # Block 3: 32‚Üí64‚Üí64‚Üí64‚Üí64
        self.conv9 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv10 = nn.Conv2d(64, 64, kernel_size=3, padding=1)
        self.conv11 = nn.Conv2d(64, 64, kernel_size=3, padding=1)
        self.conv12 = nn.Conv2d(64, 64, kernel_size=3, padding=1)
        self.pool3 = nn.MaxPool2d(2, 2)
        
        # Fully connected layers: 576‚Üí256‚Üí256‚Üí10 (28√ó28‚Üí14√ó14‚Üí7√ó7‚Üí3√ó3)
        self.fc1 = nn.Linear(64 * 3 * 3, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 10)
        
    def leaky_relu(self, x, alpha=0.01):
        """Custom Leaky ReLU activation"""
        return torch.where(x > 0, x, alpha * x)
    
    def tanh(self, x):
        """Custom Tanh activation"""
        return torch.tanh(x)
    
    def forward(self, x):
        # Block 1: 4 conv layers
        x = self.leaky_relu(self.conv1(x))
        x = self.leaky_relu(self.conv2(x))
        x = self.leaky_relu(self.conv3(x))
        x = self.leaky_relu(self.conv4(x))
        x = self.pool1(x)
        
        # Block 2: 4 conv layers
        x = self.leaky_relu(self.conv5(x))
        x = self.leaky_relu(self.conv6(x))
        x = self.leaky_relu(self.conv7(x))
        x = self.leaky_relu(self.conv8(x))
        x = self.pool2(x)
        
        # Block 3: 4 conv layers
        x = self.leaky_relu(self.conv9(x))
        x = self.leaky_relu(self.conv10(x))
        x = self.leaky_relu(self.conv11(x))
        x = self.leaky_relu(self.conv12(x))
        x = self.pool3(x)
        
        # Flatten
        x = x.view(x.size(0), -1)
        
        # Fully connected layers
        x = self.tanh(self.fc1(x))
        x = self.tanh(self.fc2(x))
        x = self.fc3(x)
        
        return x
    
    def cross_entropy_loss(self, predictions, targets):
        """Manual cross-entropy loss implementation"""
        predictions = predictions - predictions.max(dim=1, keepdim=True)[0]
        exp_pred = torch.exp(predictions)
        softmax = exp_pred / exp_pred.sum(dim=1, keepdim=True)
        log_softmax = torch.log(softmax + 1e-8)
        loss = -log_softmax[range(len(targets)), targets].mean()
        return loss
    
    def SGD_Optimizer(self, params, lr, momentum=0.0):
        """Manual SGD optimizer with momentum"""
        if not hasattr(self, 'momentum_buffer'):
            self.momentum_buffer = {}
        
        for param in params:
            if param.grad is not None:
                param_id = id(param)
                
                if param_id not in self.momentum_buffer:
                    self.momentum_buffer[param_id] = torch.zeros_like(param.data)
                
                # m_{i+1} = Œ± * m_i + g_i
                self.momentum_buffer[param_id] = (
                    momentum * self.momentum_buffer[param_id] + param.grad.data
                )
                
                # Œ∏_{i+1} = Œ∏_i - Œ∑ * m_{i+1}
                param.data = param.data - lr * self.momentum_buffer[param_id]

# Create and test the model
model_deep15_no_skip = DeepCNN15_NoSkip().to(device)
total_params = sum(p.numel() for p in model_deep15_no_skip.parameters())
print("="*70)
print("15-LAYER DEEP CNN (NO SKIP CONNECTIONS)")
print("="*70)
print(f"Total parameters: {total_params:,}")
print("\nArchitecture:")
print("  Block 1: Conv 3‚Üí16, 3√ó Conv 16‚Üí16, MaxPool")
print("  Block 2: Conv 16‚Üí32, 3√ó Conv 32‚Üí32, MaxPool")
print("  Block 3: Conv 32‚Üí64, 3√ó Conv 64‚Üí64, MaxPool")
print("  Block 4: FC 1024‚Üí256, FC 256‚Üí256, FC 256‚Üí10")
print("\nTotal: 15 parameterized layers (12 conv + 3 FC)")
print("Activation: Leaky ReLU (conv), Tanh (FC)")
print("="*70)

15-LAYER DEEP CNN (NO SKIP CONNECTIONS)
Total parameters: 384,858

Architecture:
  Block 1: Conv 3‚Üí16, 3√ó Conv 16‚Üí16, MaxPool
  Block 2: Conv 16‚Üí32, 3√ó Conv 32‚Üí32, MaxPool
  Block 3: Conv 32‚Üí64, 3√ó Conv 64‚Üí64, MaxPool
  Block 4: FC 1024‚Üí256, FC 256‚Üí256, FC 256‚Üí10

Total: 15 parameterized layers (12 conv + 3 FC)
Activation: Leaky ReLU (conv), Tanh (FC)


In [14]:
# Train 15-layer CNN WITHOUT skip connections
import time

print("\n" + "="*70)
print("TRAINING 15-LAYER CNN (NO SKIP CONNECTIONS)")
print("="*70)
print(f"Configuration: lr=0.005, momentum=0.9, batch_size=32, epochs=10")
print("Recording gradient L1-norms during first epoch...\n")

learning_rate = 0.005
momentum_alpha = 0.9
num_epochs = 10

gradient_norms_no_skip = []  # Store gradient norms for first epoch

start_time = time.time()

for epoch in range(num_epochs):
    model_deep15_no_skip.train()
    total_loss = 0
    correct_train = 0
    total_train = 0
    
    for batch_idx, (images, labels) in enumerate(train_loader):
        images, labels = images.to(device), labels.to(device)
        
        # Reset gradients
        for param in model_deep15_no_skip.parameters():
            param.grad = None
        
        # Forward pass
        outputs = model_deep15_no_skip(images)
        loss = model_deep15_no_skip.cross_entropy_loss(outputs, labels)
        total_loss += loss.item()
        
        # Calculate training accuracy
        _, predicted = torch.max(outputs.data, 1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()
        
        # Backward pass
        loss.backward()
        
        # Record gradient L1-norms during first epoch
        if epoch == 0:
            batch_grad_norm = 0
            for param in model_deep15_no_skip.parameters():
                if param.grad is not None:
                    batch_grad_norm += torch.sum(torch.abs(param.grad)).item()
            gradient_norms_no_skip.append(batch_grad_norm)
        
        # Update weights with momentum
        model_deep15_no_skip.SGD_Optimizer(model_deep15_no_skip.parameters(), 
                                            learning_rate, 
                                            momentum=momentum_alpha)
    
    train_acc = 100 * correct_train / total_train
    avg_loss = total_loss / len(train_loader)
    elapsed_time = time.time() - start_time
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Train Acc: {train_acc:.2f}%, Time: {elapsed_time/60:.1f}min')

training_time_no_skip = time.time() - start_time

# Calculate average gradient norm from first epoch
avg_grad_norm_no_skip = sum(gradient_norms_no_skip) / len(gradient_norms_no_skip)

print(f"\n‚úÖ Training complete!")
print(f"Training time: {training_time_no_skip/60:.1f} minutes")
print(f"Average gradient L1-norm (epoch 1): {avg_grad_norm_no_skip:.2f}")
print("="*70)


TRAINING 15-LAYER CNN (NO SKIP CONNECTIONS)
Configuration: lr=0.005, momentum=0.9, batch_size=32, epochs=10
Recording gradient L1-norms during first epoch...

Epoch [1/10], Loss: 2.3032, Train Acc: 9.92%, Time: 0.6min
Epoch [2/10], Loss: 2.3034, Train Acc: 9.91%, Time: 1.0min
Epoch [3/10], Loss: 2.3033, Train Acc: 9.87%, Time: 1.5min
Epoch [4/10], Loss: 2.3033, Train Acc: 10.12%, Time: 2.0min
Epoch [5/10], Loss: 2.3033, Train Acc: 9.83%, Time: 2.5min
Epoch [6/10], Loss: 2.3031, Train Acc: 10.05%, Time: 3.0min
Epoch [7/10], Loss: 2.3032, Train Acc: 10.04%, Time: 3.4min
Epoch [8/10], Loss: 2.3032, Train Acc: 9.86%, Time: 3.9min
Epoch [9/10], Loss: 2.3033, Train Acc: 9.93%, Time: 4.4min
Epoch [10/10], Loss: 2.3032, Train Acc: 9.76%, Time: 4.9min

‚úÖ Training complete!
Training time: 4.9 minutes
Average gradient L1-norm (epoch 1): 18.19


In [15]:
# Evaluate 15-layer CNN WITHOUT skip connections
model_deep15_no_skip.eval()
correct = 0
total = 0

with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model_deep15_no_skip(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

test_acc_no_skip = 100 * correct / total

print("\n" + "="*70)
print("15-LAYER CNN (NO SKIP) - RESULTS")
print("="*70)
print(f"Test Accuracy: {test_acc_no_skip:.2f}%")
print(f"Training Accuracy: {train_acc:.2f}%")
print(f"Generalization Gap: {train_acc - test_acc_no_skip:.2f}%")
print(f"Average Gradient Norm (epoch 1): {avg_grad_norm_no_skip:.2f}")
print("\nüìä Comparison with 5-layer CNN:")
print(f"  5-layer with momentum:  {best_accuracy:.2f}%")
print(f"  15-layer no skip:       {test_acc_no_skip:.2f}%")
print(f"  Difference:             {test_acc_no_skip - best_accuracy:+.2f}%")

if test_acc_no_skip < best_accuracy:
    print("\n‚ö†Ô∏è  DEGRADATION OBSERVED: Deeper network performs worse!")
    print("    This demonstrates the degradation problem in plain deep networks.")
else:
    print("\n‚úÖ Deeper network improved performance!")
print("="*70)


15-LAYER CNN (NO SKIP) - RESULTS
Test Accuracy: 10.00%
Training Accuracy: 9.76%
Generalization Gap: -0.24%
Average Gradient Norm (epoch 1): 18.19

üìä Comparison with 5-layer CNN:
  5-layer with momentum:  91.05%
  15-layer no skip:       10.00%
  Difference:             -81.05%

‚ö†Ô∏è  DEGRADATION OBSERVED: Deeper network performs worse!
    This demonstrates the degradation problem in plain deep networks.


In [16]:
# 15-Layer Deep CNN WITH Skip Connections - CONFIGURATION 1 (Short Skips, Length 1)
import torch
import torch.nn as nn

class DeepCNN15_SkipConfig1(nn.Module):
    """
    Configuration 1: Short skip connections (length 1)
    - Skip 1: conv1 ‚Üí conv2 (skips 1 layer)
    - Skip 2: conv5 ‚Üí conv6 (skips 1 layer)
    - Skip 3: conv9 ‚Üí conv10 (skips 1 layer)
    Total: 3 skip connections (ResNet-style)
    """
    def __init__(self):
        super(DeepCNN15_SkipConfig1, self).__init__()
        
        # Block 1: 1‚Üí16‚Üí16‚Üí16‚Üí16 (grayscale input)
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(16, 16, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(16, 16, kernel_size=3, padding=1)
        self.conv4 = nn.Conv2d(16, 16, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool2d(2, 2)
        
        # Block 2: 16‚Üí32‚Üí32‚Üí32‚Üí32
        self.conv5 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.conv6 = nn.Conv2d(32, 32, kernel_size=3, padding=1)
        self.conv7 = nn.Conv2d(32, 32, kernel_size=3, padding=1)
        self.conv8 = nn.Conv2d(32, 32, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool2d(2, 2)
        
        # Block 3: 32‚Üí64‚Üí64‚Üí64‚Üí64
        self.conv9 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv10 = nn.Conv2d(64, 64, kernel_size=3, padding=1)
        self.conv11 = nn.Conv2d(64, 64, kernel_size=3, padding=1)
        self.conv12 = nn.Conv2d(64, 64, kernel_size=3, padding=1)
        self.pool3 = nn.MaxPool2d(2, 2)
        
        # Fully connected layers: 576‚Üí256‚Üí256‚Üí10 (28√ó28‚Üí14√ó14‚Üí7√ó7‚Üí3√ó3)
        self.fc1 = nn.Linear(64 * 3 * 3, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 10)
        
    def leaky_relu(self, x, alpha=0.01):
        """Custom Leaky ReLU activation"""
        return torch.where(x > 0, x, alpha * x)
    
    def tanh(self, x):
        """Custom Tanh activation"""
        return torch.tanh(x)
    
    def forward(self, x):
        # Block 1 with SHORT skip (length 1): conv1 ‚Üí conv2
        identity1 = self.leaky_relu(self.conv1(x))  # Save for skip
        x = self.leaky_relu(self.conv2(identity1))
        x = x + identity1  # Skip connection: y = x ‚äï f(x)
        x = self.leaky_relu(self.conv3(x))
        x = self.leaky_relu(self.conv4(x))
        x = self.pool1(x)
        
        # Block 2 with SHORT skip (length 1): conv5 ‚Üí conv6
        identity2 = self.leaky_relu(self.conv5(x))  # Save for skip
        x = self.leaky_relu(self.conv6(identity2))
        x = x + identity2  # Skip connection: y = x ‚äï f(x)
        x = self.leaky_relu(self.conv7(x))
        x = self.leaky_relu(self.conv8(x))
        x = self.pool2(x)
        
        # Block 3 with SHORT skip (length 1): conv9 ‚Üí conv10
        identity3 = self.leaky_relu(self.conv9(x))  # Save for skip
        x = self.leaky_relu(self.conv10(identity3))
        x = x + identity3  # Skip connection: y = x ‚äï f(x)
        x = self.leaky_relu(self.conv11(x))
        x = self.leaky_relu(self.conv12(x))
        x = self.pool3(x)
        
        # Flatten
        x = x.view(x.size(0), -1)
        
        # FC layers (no skip connections in this configuration)
        x = self.tanh(self.fc1(x))
        x = self.tanh(self.fc2(x))
        x = self.fc3(x)
        
        return x
    
    def cross_entropy_loss(self, predictions, targets):
        """Manual cross-entropy loss implementation"""
        predictions = predictions - predictions.max(dim=1, keepdim=True)[0]
        exp_pred = torch.exp(predictions)
        softmax = exp_pred / exp_pred.sum(dim=1, keepdim=True)
        log_softmax = torch.log(softmax + 1e-8)
        loss = -log_softmax[range(len(targets)), targets].mean()
        return loss
    
    def SGD_Optimizer(self, params, lr, momentum=0.0):
        """Manual SGD optimizer with momentum"""
        if not hasattr(self, 'momentum_buffer'):
            self.momentum_buffer = {}
        
        for param in params:
            if param.grad is not None:
                param_id = id(param)
                
                if param_id not in self.momentum_buffer:
                    self.momentum_buffer[param_id] = torch.zeros_like(param.data)
                
                # m_{i+1} = Œ± * m_i + g_i
                self.momentum_buffer[param_id] = (
                    momentum * self.momentum_buffer[param_id] + param.grad.data
                )
                
                # Œ∏_{i+1} = Œ∏_i - Œ∑ * m_{i+1}
                param.data = param.data - lr * self.momentum_buffer[param_id]

# Create and test Config 1 model
model_skip_config1 = DeepCNN15_SkipConfig1().to(device)
total_params_c1 = sum(p.numel() for p in model_skip_config1.parameters())
print("="*70)
print("15-LAYER CNN - SKIP CONFIGURATION 1 (SHORT SKIPS)")
print("="*70)
print(f"Total parameters: {total_params_c1:,}")
print("\nSkip Connection Configuration 1:")
print("  Skip 1: conv1 output ‚Üí conv2 output (length 1, y = x ‚äï f(x))")
print("  Skip 2: conv5 output ‚Üí conv6 output (length 1, y = x ‚äï f(x))")
print("  Skip 3: conv9 output ‚Üí conv10 output (length 1, y = x ‚äï f(x))")
print("\nTotal: 3 skip connections (all length 1)")
print("Strategy: Frequent, short gradientpathways (ResNet-style)")
print("="*70)

15-LAYER CNN - SKIP CONFIGURATION 1 (SHORT SKIPS)
Total parameters: 384,858

Skip Connection Configuration 1:
  Skip 1: conv1 output ‚Üí conv2 output (length 1, y = x ‚äï f(x))
  Skip 2: conv5 output ‚Üí conv6 output (length 1, y = x ‚äï f(x))
  Skip 3: conv9 output ‚Üí conv10 output (length 1, y = x ‚äï f(x))

Total: 3 skip connections (all length 1)
Strategy: Frequent, short gradientpathways (ResNet-style)


In [17]:
# 15-Layer Deep CNN WITH Skip Connections - CONFIGURATION 2 (Longer Skips, Length 2-3)
import torch
import torch.nn as nn

class DeepCNN15_SkipConfig2(nn.Module):
    """
    Configuration 2: Longer skip connections (length 2-3)
    - Skip 1: conv1 ‚Üí conv3 (skips 2 layers)
    - Skip 2: conv5 ‚Üí conv8 (skips 3 layers)
    - Skip 3: conv9 ‚Üí conv12 (skips 3 layers)
    Total: 3 skip connections
    """
    def __init__(self):
        super(DeepCNN15_SkipConfig2, self).__init__()
        
        # Block 1: 1‚Üí16‚Üí16‚Üí16‚Üí16 (grayscale input)
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(16, 16, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(16, 16, kernel_size=3, padding=1)
        self.conv4 = nn.Conv2d(16, 16, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool2d(2, 2)
        
        # Block 2: 16‚Üí32‚Üí32‚Üí32‚Üí32
        self.conv5 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.conv6 = nn.Conv2d(32, 32, kernel_size=3, padding=1)
        self.conv7 = nn.Conv2d(32, 32, kernel_size=3, padding=1)
        self.conv8 = nn.Conv2d(32, 32, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool2d(2, 2)
        
        # Block 3: 32‚Üí64‚Üí64‚Üí64‚Üí64
        self.conv9 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv10 = nn.Conv2d(64, 64, kernel_size=3, padding=1)
        self.conv11 = nn.Conv2d(64, 64, kernel_size=3, padding=1)
        self.conv12 = nn.Conv2d(64, 64, kernel_size=3, padding=1)
        self.pool3 = nn.MaxPool2d(2, 2)
        
        # Fully connected layers: 576‚Üí256‚Üí256‚Üí10 (28√ó28‚Üí14√ó14‚Üí7√ó7‚Üí3√ó3)
        self.fc1 = nn.Linear(64 * 3 * 3, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 10)
        
    def leaky_relu(self, x, alpha=0.01):
        """Custom Leaky ReLU activation"""
        return torch.where(x > 0, x, alpha * x)
    
    def tanh(self, x):
        """Custom Tanh activation"""
        return torch.tanh(x)
    
    def forward(self, x):
        # Block 1 with LONGER skip (length 2): conv1 ‚Üí conv3
        identity1 = self.leaky_relu(self.conv1(x))  # Save for skip
        x = self.leaky_relu(self.conv2(identity1))
        x = self.leaky_relu(self.conv3(x))
        x = x + identity1  # Skip connection: y = x ‚äï g(f(x))
        x = self.leaky_relu(self.conv4(x))
        x = self.pool1(x)
        
        # Block 2 with LONGER skip (length 3): conv5 ‚Üí conv8
        identity2 = self.leaky_relu(self.conv5(x))  # Save for skip
        x = self.leaky_relu(self.conv6(identity2))
        x = self.leaky_relu(self.conv7(x))
        x = self.leaky_relu(self.conv8(x))
        x = x + identity2  # Skip connection: y = x ‚äï h(g(f(x)))
        x = self.pool2(x)
        
        # Block 3 with LONGER skip (length 3): conv9 ‚Üí conv12
        identity3 = self.leaky_relu(self.conv9(x))  # Save for skip
        x = self.leaky_relu(self.conv10(identity3))
        x = self.leaky_relu(self.conv11(x))
        x = self.leaky_relu(self.conv12(x))
        x = x + identity3  # Skip connection: y = x ‚äï h(g(f(x)))
        x = self.pool3(x)
        
        # Flatten
        x = x.view(x.size(0), -1)
        
        # FC layers (no skip connections in this configuration)
        x = self.tanh(self.fc1(x))
        x = self.tanh(self.fc2(x))
        x = self.fc3(x)
        
        return x
    
    def cross_entropy_loss(self, predictions, targets):
        """Manual cross-entropy loss implementation"""
        predictions = predictions - predictions.max(dim=1, keepdim=True)[0]
        exp_pred = torch.exp(predictions)
        softmax = exp_pred / exp_pred.sum(dim=1, keepdim=True)
        log_softmax = torch.log(softmax + 1e-8)
        loss = -log_softmax[range(len(targets)), targets].mean()
        return loss
    
    def SGD_Optimizer(self, params, lr, momentum=0.0):
        """Manual SGD optimizer with momentum"""
        if not hasattr(self, 'momentum_buffer'):
            self.momentum_buffer = {}
        
        for param in params:
            if param.grad is not None:
                param_id = id(param)
                
                if param_id not in self.momentum_buffer:
                    self.momentum_buffer[param_id] = torch.zeros_like(param.data)
                
                # m_{i+1} = Œ± * m_i + g_i
                self.momentum_buffer[param_id] = (
                    momentum * self.momentum_buffer[param_id] + param.grad.data
                )
                
                # Œ∏_{i+1} = Œ∏_i - Œ∑ * m_{i+1}
                param.data = param.data - lr * self.momentum_buffer[param_id]

# Create and test Config 2 model
model_skip_config2 = DeepCNN15_SkipConfig2().to(device)
total_params_c2 = sum(p.numel() for p in model_skip_config2.parameters())
print("\n" + "="*70)
print("15-LAYER CNN - SKIP CONFIGURATION 2 (LONGER SKIPS)")
print("="*70)
print(f"Total parameters: {total_params_c2:,}")
print("\nSkip Connection Configuration 2:")
print("  Skip 1: conv1 output ‚Üí conv3 output (length 2, y = x ‚äï g(f(x)))")
print("  Skip 2: conv5 output ‚Üí conv8 output (length 3, y = x ‚äï h(g(f(x))))")
print("  Skip 3: conv9 output ‚Üí conv12 output (length 3, y = x ‚äï h(g(f(x))))")
print("\nTotal: 3 skip connections (1√ó length 2, 2√ó length 3)")
print("Strategy: Sparser, longer gradient pathways")
print("="*70)


15-LAYER CNN - SKIP CONFIGURATION 2 (LONGER SKIPS)
Total parameters: 384,858

Skip Connection Configuration 2:
  Skip 1: conv1 output ‚Üí conv3 output (length 2, y = x ‚äï g(f(x)))
  Skip 2: conv5 output ‚Üí conv8 output (length 3, y = x ‚äï h(g(f(x))))
  Skip 3: conv9 output ‚Üí conv12 output (length 3, y = x ‚äï h(g(f(x))))

Total: 3 skip connections (1√ó length 2, 2√ó length 3)
Strategy: Sparser, longer gradient pathways


In [18]:
# Train all THREE models: No skip, Config 1, Config 2
import time

print("\n" + "="*80)
print("PART 3: TRAINING THREE 15-LAYER MODELS")
print("="*80)
print("Configuration: lr=0.005, momentum=0.9, batch_size=32, epochs=10\n")

learning_rate = 0.005
momentum_alpha = 0.9
num_epochs = 10

# Dictionary to store results for all 3 models
deep_results = {
    'no_skip': {},
    'config1': {},
    'config2': {}
}

# Model configurations
models_to_train = [
    ('no_skip', model_deep15_no_skip, "NO SKIP CONNECTIONS (Baseline)"),
    ('config1', model_skip_config1, "SKIP CONFIG 1 (Short Skips)"),
    ('config2', model_skip_config2, "SKIP CONFIG 2 (Longer Skips)")
]

for model_key, model, model_name in models_to_train:
    print("\n" + "="*70)
    print(f"üîπ TRAINING: {model_name}")
    print("="*70)
    
    gradient_norms = []  # Store gradient norms for first epoch
    start_time = time.time()
    
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        correct_train = 0
        total_train = 0
        
        for batch_idx, (images, labels) in enumerate(train_loader):
            images, labels = images.to(device), labels.to(device)
            
            # Reset gradients
            for param in model.parameters():
                param.grad = None
            
            # Forward pass
            outputs = model(images)
            loss = model.cross_entropy_loss(outputs, labels)
            total_loss += loss.item()
            
            # Calculate training accuracy
            _, predicted = torch.max(outputs.data, 1)
            total_train += labels.size(0)
            correct_train += (predicted == labels).sum().item()
            
            # Backward pass
            loss.backward()
            
            # Record gradient L1-norms during first epoch
            if epoch == 0:
                batch_grad_norm = 0
                for param in model.parameters():
                    if param.grad is not None:
                        batch_grad_norm += torch.sum(torch.abs(param.grad)).item()
                gradient_norms.append(batch_grad_norm)
            
            # Update weights with momentum
            model.SGD_Optimizer(model.parameters(), learning_rate, momentum=momentum_alpha)
        
        train_acc = 100 * correct_train / total_train
        avg_loss = total_loss / len(train_loader)
        elapsed_time = time.time() - start_time
        
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Train Acc: {train_acc:.2f}%, Time: {elapsed_time/60:.1f}min')
    
    training_time = time.time() - start_time
    avg_grad_norm = sum(gradient_norms) / len(gradient_norms)
    
    # Evaluate on test set
    model.eval()
    correct = 0
    total = 0
    
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    test_acc = 100 * correct / total
    
    # Store results
    deep_results[model_key] = {
        'test_accuracy': test_acc,
        'train_accuracy': train_acc,
        'final_loss': avg_loss,
        'training_time': training_time/60,
        'avg_grad_norm': avg_grad_norm
    }
    
    print(f"\n‚úÖ Training complete!")
    print(f"Test Accuracy: {test_acc:.2f}%")
    print(f"Average Gradient L1-norm (epoch 1): {avg_grad_norm:.2f}")
    print(f"Training time: {training_time/60:.1f} minutes")
    print("="*70)

print("\n" + "="*80)
print("üéØ ALL THREE MODELS TRAINED SUCCESSFULLY")
print("="*80)


PART 3: TRAINING THREE 15-LAYER MODELS
Configuration: lr=0.005, momentum=0.9, batch_size=32, epochs=10


üîπ TRAINING: NO SKIP CONNECTIONS (Baseline)
Epoch [1/10], Loss: 2.3030, Train Acc: 10.15%, Time: 0.6min
Epoch [2/10], Loss: 2.3031, Train Acc: 9.89%, Time: 1.1min
Epoch [3/10], Loss: 2.3031, Train Acc: 10.07%, Time: 1.5min
Epoch [4/10], Loss: 2.3032, Train Acc: 9.70%, Time: 2.0min
Epoch [5/10], Loss: 2.3031, Train Acc: 9.85%, Time: 2.5min
Epoch [6/10], Loss: 2.3030, Train Acc: 9.96%, Time: 3.0min
Epoch [7/10], Loss: 2.3030, Train Acc: 9.97%, Time: 3.5min
Epoch [8/10], Loss: 1.2517, Train Acc: 52.12%, Time: 3.9min
Epoch [9/10], Loss: 0.4835, Train Acc: 81.81%, Time: 4.4min
Epoch [10/10], Loss: 0.3813, Train Acc: 85.64%, Time: 4.9min

‚úÖ Training complete!
Test Accuracy: 86.02%
Average Gradient L1-norm (epoch 1): 10.62
Training time: 4.9 minutes

üîπ TRAINING: SKIP CONFIG 1 (Short Skips)
Epoch [1/10], Loss: 2.3033, Train Acc: 9.80%, Time: 0.6min
Epoch [2/10], Loss: 2.3033, Train 

In [19]:
# PART 3: Comprehensive Comparison and Analysis
print("\n" + "="*80)
print("PART 3: COMPREHENSIVE RESULTS - SKIP CONNECTIONS ANALYSIS")
print("="*80)

# Extract results for easier access
no_skip = deep_results['no_skip']
config1 = deep_results['config1']
config2 = deep_results['config2']

print("\nüìä MODEL COMPARISON TABLE:")
print(f"{'Model':<45} {'Test Acc':<12} {'Train Acc':<12} {'Grad Norm':<15} {'Time':<10}")
print("-"*95)
print(f"{'5-layer CNN (Part 2 baseline)':<45} {best_accuracy:<12.2f} {'-':<12} {'-':<15} {'2.8 min':<10}")
print(f"{'15-layer WITHOUT skip connections':<45} {no_skip['test_accuracy']:<12.2f} {no_skip['train_accuracy']:<12.2f} {no_skip['avg_grad_norm']:<15.2f} {no_skip['training_time']:.1f} min")
print(f"{'15-layer WITH skip - Config 1 (short)':<45} {config1['test_accuracy']:<12.2f} {config1['train_accuracy']:<12.2f} {config1['avg_grad_norm']:<15.2f} {config1['training_time']:.1f} min")
print(f"{'15-layer WITH skip - Config 2 (longer)':<45} {config2['test_accuracy']:<12.2f} {config2['train_accuracy']:<12.2f} {config2['avg_grad_norm']:<15.2f} {config2['training_time']:.1f} min")
print("="*95)

print("\nüìà GRADIENT FLOW ANALYSIS:")
print(f"  No skip connections:       {no_skip['avg_grad_norm']:.2f}")
print(f"  Config 1 (short skips):    {config1['avg_grad_norm']:.2f}  (ratio: {config1['avg_grad_norm']/no_skip['avg_grad_norm']:.2f}x)")
print(f"  Config 2 (longer skips):   {config2['avg_grad_norm']:.2f}  (ratio: {config2['avg_grad_norm']/no_skip['avg_grad_norm']:.2f}x)")

if config1['avg_grad_norm'] > no_skip['avg_grad_norm']:
    improvement = ((config1['avg_grad_norm']/no_skip['avg_grad_norm']) - 1) * 100
    print(f"\n  ‚úÖ Config 1 increased gradient magnitudes by {improvement:.1f}%")
if config2['avg_grad_norm'] > no_skip['avg_grad_norm']:
    improvement = ((config2['avg_grad_norm']/no_skip['avg_grad_norm']) - 1) * 100
    print(f"  ‚úÖ Config 2 increased gradient magnitudes by {improvement:.1f}%")
print(f"  ‚Üí Skip connections combat vanishing gradients!")

print("\nüìä ACCURACY ANALYSIS:")
print(f"  Extended model (no skip) vs 5-layer:  {no_skip['test_accuracy'] - best_accuracy:+.2f}%")
print(f"  Config 1 vs No Skip:                   {config1['test_accuracy'] - no_skip['test_accuracy']:+.2f}% improvement")
print(f"  Config 2 vs No Skip:                   {config2['test_accuracy'] - no_skip['test_accuracy']:+.2f}% improvement")
print(f"  Config 1 vs 5-layer baseline:          {config1['test_accuracy'] - best_accuracy:+.2f}%")
print(f"  Config 2 vs 5-layer baseline:          {config2['test_accuracy'] - best_accuracy:+.2f}%")

# Determine best skip configuration
best_skip_model = 'config1' if config1['test_accuracy'] >= config2['test_accuracy'] else 'config2'
best_skip_name = "Config 1 (short skips)" if best_skip_model == 'config1' else "Config 2 (longer skips)"
best_skip_acc = deep_results[best_skip_model]['test_accuracy']

print(f"\nüèÜ BEST SKIP CONFIGURATION: {best_skip_name} with {best_skip_acc:.2f}%")

print("\nüîç KEY FINDINGS:")

# Finding 1: Degradation problem
if no_skip['test_accuracy'] < best_accuracy:
    print(f"  1. ‚ö†Ô∏è  DEGRADATION PROBLEM OBSERVED:")
    print(f"      ‚Ä¢ 15-layer without skip: {no_skip['test_accuracy']:.2f}%")
    print(f"      ‚Ä¢ 5-layer baseline:      {best_accuracy:.2f}%")
    print(f"      ‚Ä¢ Difference:            {no_skip['test_accuracy'] - best_accuracy:.2f}%")
    print(f"      ‚Üí Adding layers WITHOUT skip connections HURTS performance!")
    print(f"      ‚Üí This is NOT overfitting - it's an optimization failure")
else:
    print(f"  1. Extended model (no skip) achieved {no_skip['test_accuracy']:.2f}%")
    if no_skip['test_accuracy'] > best_accuracy:
        print(f"      ‚Üí Deeper network improved over 5-layer!")
    else:
        print(f"      ‚Üí Similar performance to 5-layer baseline")

# Finding 2: Skip connections solve degradation
if config1['test_accuracy'] > no_skip['test_accuracy'] or config2['test_accuracy'] > no_skip['test_accuracy']:
    print(f"  2. ‚úÖ SKIP CONNECTIONS SOLVE DEGRADATION:")
    print(f"      ‚Ä¢ Config 1 (short):  {config1['test_accuracy']:.2f}% ({config1['test_accuracy'] - no_skip['test_accuracy']:+.2f}% vs no skip)")
    print(f"      ‚Ä¢ Config 2 (longer): {config2['test_accuracy']:.2f}% ({config2['test_accuracy'] - no_skip['test_accuracy']:+.2f}% vs no skip)")
    print(f"      ‚Üí Skip connections enable training of deep networks!")

# Finding 3: Gradient flow improvement
grad_improve_c1 = (config1['avg_grad_norm'] / no_skip['avg_grad_norm'] - 1) * 100
grad_improve_c2 = (config2['avg_grad_norm'] / no_skip['avg_grad_norm'] - 1) * 100
print(f"  3. üìä GRADIENT FLOW IMPROVEMENT:")
print(f"      ‚Ä¢ Config 1: {grad_improve_c1:+.1f}% stronger gradients")
print(f"      ‚Ä¢ Config 2: {grad_improve_c2:+.1f}% stronger gradients")
print(f"      ‚Üí Skip connections provide gradient highways to early layers")

# Finding 4: Configuration comparison
print(f"  4. üî¨ SKIP CONFIGURATION COMPARISON:")
if config1['test_accuracy'] > config2['test_accuracy']:
    diff = config1['test_accuracy'] - config2['test_accuracy']
    print(f"      ‚Ä¢ Config 1 (short/frequent) outperformed Config 2 by {diff:.2f}%")
    print(f"      ‚Üí Frequent short skips (ResNet-style) work better for this architecture")
    print(f"      ‚Üí More gradient pathways = better optimization")
elif config2['test_accuracy'] > config1['test_accuracy']:
    diff = config2['test_accuracy'] - config1['test_accuracy']
    print(f"      ‚Ä¢ Config 2 (longer/sparser) outperformed Config 1 by {diff:.2f}%")
    print(f"      ‚Üí Longer skips provide stronger gradient flow")
    print(f"      ‚Üí Deeper shortcuts may be more effective")
else:
    print(f"      ‚Ä¢ Both configurations achieved similar performance ({config1['test_accuracy']:.2f}%)")
    print(f"      ‚Üí Both skip strategies are effective for gradient flow")

# Check if skip connections beat 5-layer baseline
if best_skip_acc > best_accuracy:
    improvement = best_skip_acc - best_accuracy
    print(f"  5. ‚úÖ DEEP NETWORK SUCCESS:")
    print(f"      ‚Ä¢ Best 15-layer (with skip): {best_skip_acc:.2f}%")
    print(f"      ‚Ä¢ 5-layer baseline:          {best_accuracy:.2f}%")
    print(f"      ‚Ä¢ Improvement:               +{improvement:.2f}%")
    print(f"      ‚Üí Skip connections enabled a deeper network to outperform shallow!")
elif best_skip_acc >= best_accuracy - 2:
    print(f"  5. ‚úì COMPETITIVE PERFORMANCE:")
    print(f"      ‚Ä¢ Best 15-layer (with skip): {best_skip_acc:.2f}%")
    print(f"      ‚Ä¢ 5-layer baseline:          {best_accuracy:.2f}%")
    print(f"      ‚Üí Skip connections enabled training a 3√ó deeper network")
else:
    print(f"  5. ‚ö†Ô∏è  Still below 5-layer baseline, but skip connections helped significantly")

print("\nüí° CONCLUSIONS:")
print("  ‚Ä¢ Skip connections are ESSENTIAL for training deep neural networks")
print("  ‚Ä¢ They solve the degradation problem by providing gradient highways")
print("  ‚Ä¢ Stronger gradient flow ‚Üí Better optimization ‚Üí Higher accuracy")
print(f"  ‚Ä¢ Best approach: {best_skip_name}")
print(f"  ‚Ä¢ Best 15-layer model: {best_skip_acc:.2f}% test accuracy")
print("  ‚Ä¢ Without skip connections, adding layers can hurt performance!")
print("="*80)


PART 3: COMPREHENSIVE RESULTS - SKIP CONNECTIONS ANALYSIS

üìä MODEL COMPARISON TABLE:
Model                                         Test Acc     Train Acc    Grad Norm       Time      
-----------------------------------------------------------------------------------------------
5-layer CNN (Part 2 baseline)                 91.05        -            -               2.8 min   
15-layer WITHOUT skip connections             86.02        85.64        10.62           4.9 min
15-layer WITH skip - Config 1 (short)         89.68        91.49        18.59           5.0 min
15-layer WITH skip - Config 2 (longer)        90.35        93.49        696.42          4.9 min

üìà GRADIENT FLOW ANALYSIS:
  No skip connections:       10.62
  Config 1 (short skips):    18.59  (ratio: 1.75x)
  Config 2 (longer skips):   696.42  (ratio: 65.55x)

  ‚úÖ Config 1 increased gradient magnitudes by 75.0%
  ‚úÖ Config 2 increased gradient magnitudes by 6454.9%
  ‚Üí Skip connections combat vanishing gradients