In [11]:
# 1. Download CIFAR-10
# 2. Normalize to mean=0, std=1 (or use standard transform)
# 3. Create train/test DataLoaders
# 4. Verify data shapes (should be [batch, 3, 32, 32])

import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
# 1-2. Download CIFAR-10 and normalize every channel
transform = transforms.Compose([
    transforms.ToTensor(),  # Convert images to PyTorch tensors, scales [0,255] to [0,1]
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))  # (x - 0.5) / 0.5 per channel: maps [0,1] to [-1,1]
])
train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)  # predefined 50,000 images for training
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)  # predefined 10,000 images for testing

# 3. Create train/test DataLoaders
batch_size = 1  # Part 1 baseline requires batch_size=1 (change to 32/64 for Part 2)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# 4. Verify data shapes on one training batch: images must be
#    [batch_size, 3, 32, 32] and labels [batch_size]. Failing here is much
#    cheaper than failing inside the training loop.
sample_images, sample_labels = next(iter(train_loader))
assert sample_images.shape == (batch_size, 3, 32, 32), (
    f'Unexpected image batch shape: {tuple(sample_images.shape)}'
)
assert sample_labels.shape == (batch_size,), (
    f'Unexpected label batch shape: {tuple(sample_labels.shape)}'
)
print(f'Batch of images shape: {tuple(sample_images.shape)}, labels shape: {tuple(sample_labels.shape)}')


In [12]:
# Build 2-Layer Network to prove that cifar10 requires deeper networks.
# **Purpose:** Prove that CIFAR-10 needs deep learning

# **Architecture:**
# ```
# Input: 3072 (32×32×3 flattened)
#   ↓
# Linear(3072 → 128) + Sigmoid
#   ↓
# Linear(128 → 10) + Softmax
# ```

# **Implementation details:**
# - Use `torch.nn.Linear()` (allowed)
# - Implement sigmoid activation manually: `1 / (1 + torch.exp(-x))`
# - Implement softmax manually: `torch.exp(x) / torch.exp(x).sum()`
# - Implement cross-entropy loss manually

# **Training setup:**
# - Optimizer: SGD (implement manually)
# - Batch size: 1
# - Learning rate: 0.01 or 0.001
# - Epochs: 10-20

import torch
import torch.nn as nn
import torch.optim as optim

class TwoLayerNet(nn.Module):
    """Two-layer fully connected baseline classifier.

    Architecture: flatten -> Linear(3072, 128) -> sigmoid -> Linear(128, 10).
    ``forward`` returns raw logits; the softmax is applied inside
    ``cross_entropy_loss``. The activation, softmax, loss and SGD step are
    written by hand (only ``nn.Linear`` comes from the library).
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(3072, 128)  # Input: 32×32×3 = 3072
        self.fc2 = nn.Linear(128, 10)    # Output: 10 classes

    def sigmoid(self, x):
        """Element-wise logistic sigmoid, 1 / (1 + exp(-x)), computed stably.

        The naive formula overflows ``exp(-x)`` to ``inf`` for very negative
        ``x``; the forward value is still 0 but the backward pass becomes
        ``0 * inf = NaN``. Only ``exp`` of a non-positive number is evaluated
        here, so both the value and the gradient stay finite.
        """
        non_negative = x >= 0
        # -|x| built with where() so the gradient at x == 0 is well defined.
        neg_abs = torch.where(non_negative, -x, x)
        z = torch.exp(neg_abs)  # always in (0, 1]
        # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- same function.
        return torch.where(non_negative, 1 / (1 + z), z / (1 + z))

    def softmax(self, x):
        """Row-wise softmax over dim 1 (row max subtracted for stability)."""
        exp_x = torch.exp(x - x.max(dim=1, keepdim=True)[0])
        return exp_x / exp_x.sum(dim=1, keepdim=True)

    def cross_entropy_loss(self, outputs, labels):
        """Mean cross-entropy between logits and integer class labels.

        Args:
            outputs: logits, indexed as [sample, class].
            labels: integer class index per sample (reshaped to a column here).

        Returns:
            Scalar tensor: summed negative log-probability of the true
            classes divided by ``outputs.size(0)``.

        The log-probabilities come from the log-sum-exp identity
        ``log_softmax(x) = (x - max) - log(sum(exp(x - max)))`` rather than
        ``log(softmax(x) + eps)``: the epsilon version caps the loss near
        ``-log(1e-10)`` and distorts gradients once a probability underflows.
        """
        shifted = outputs - outputs.max(dim=1, keepdim=True)[0]
        # The sum includes exp(0) == 1 for the max entry, so log() is safe.
        log_probs = shifted - torch.log(torch.exp(shifted).sum(dim=1, keepdim=True))
        # Select the log-probability of the true class for every sample.
        true_class_log_probs = log_probs.gather(1, labels.view(-1, 1))
        return -true_class_log_probs.sum() / outputs.size(0)

    def SGD_Optimizer(self, params, lr):
        """Apply one plain SGD step, ``p <- p - lr * p.grad``, then clear grads.

        Args:
            params: iterable of tensors to update; entries whose ``grad`` is
                ``None`` are skipped.
            lr: learning rate.
        """
        with torch.no_grad():
            for param in params:
                if param.grad is None:
                    continue
                # In-place update keeps the same tensor object (no .data rebinding).
                param.sub_(lr * param.grad)
                param.grad = None  # Reset gradient

    def forward(self, x):
        """Return class logits for a batch; ``x`` is flattened to [batch, -1]."""
        x = x.view(x.size(0), -1)      # Flatten the input
        x = self.sigmoid(self.fc1(x))  # First layer + sigmoid
        x = self.fc2(x)                # Second layer (logits)
        return x
    


In [13]:
# Baseline setup: choose the compute device, build the 2-layer model on it,
# and fix the hyperparameters used by the training cell below.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = TwoLayerNet()
model = model.to(device)
num_epochs = 10
learning_rate = 0.005  # lowered from 0.01 to prevent instability



In [14]:
# Train the 2-layer baseline: one manual SGD step per batch, with running
# loss / accuracy statistics reported once per epoch.
for epoch in range(num_epochs):
    model.train()
    total_loss, correct_train, total_train = 0, 0, 0

    for images, labels in train_loader:
        images = images.to(device)
        labels = labels.to(device)

        # Start every step from empty gradients
        for param in model.parameters():
            param.grad = None

        # Forward pass and loss bookkeeping
        outputs = model(images)
        loss = model.cross_entropy_loss(outputs, labels)
        total_loss += loss.item()

        # Training accuracy: index of the largest logit is the predicted class
        predicted = torch.max(outputs.detach(), 1)[1]
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()

        # Backward pass followed by the hand-written SGD update
        loss.backward()
        model.SGD_Optimizer(model.parameters(), learning_rate)

    # Epoch summary: accuracy in percent, loss averaged over batches
    train_acc = 100 * correct_train / total_train
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Train Acc: {train_acc:.2f}%')

Epoch [1/10], Loss: 1.7889, Train Acc: 36.52%
Epoch [2/10], Loss: 1.6314, Train Acc: 43.13%
Epoch [3/10], Loss: 1.5447, Train Acc: 46.24%
Epoch [4/10], Loss: 1.4780, Train Acc: 48.73%
Epoch [5/10], Loss: 1.4221, Train Acc: 50.33%
Epoch [6/10], Loss: 1.3684, Train Acc: 52.39%
Epoch [7/10], Loss: 1.3216, Train Acc: 54.13%
Epoch [8/10], Loss: 1.2791, Train Acc: 55.42%
Epoch [9/10], Loss: 1.2420, Train Acc: 56.61%
Epoch [10/10], Loss: 1.2081, Train Acc: 58.00%


In [16]:
# Evaluate the 2-layer model on the test set and report accuracy in percent
model.eval()
correct, total = 0, 0
with torch.no_grad():  # inference only, no gradient tracking needed
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        # Predicted class = position of the largest logit in each row
        predicted = torch.max(outputs.detach(), 1)[1]
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f'Test Accuracy: {accuracy:.2f}%')


Test Accuracy: 48.04%


# Part 2, Step 4: Activation Functions Comparison

**Previous Result (Part 1):** 5-Layer CNN with Sigmoid failed completely (10% accuracy - vanishing gradients)

**Purpose:** Test modern activation functions to solve vanishing gradient problem

**New Activations Tested:**
1. **Leaky ReLU** - f(x) = max(x, 0.1x)
   - No saturation for positive values (gradient = 1)
   - Small gradient (0.1) for negative values prevents dying neurons
   - Used for: Conv layers 1, 2, 3

2. **Tanh** - f(x) = (e^x - e^-x) / (e^x + e^-x)
   - Zero-centered output range: (-1, 1)
   - Max gradient = 1 (vs sigmoid's 0.25)
   - Used for: FC layer 1

**Architecture (unchanged):**
- 3 Convolutional layers with MaxPooling (Conv: 3→16→32→64)
- 2 Fully connected layers (FC: 1024→256→10)
- Total: 5 parameterized layers, 288,554 parameters

**Expected result:** 60-70% test accuracy (dramatic improvement from 10%)

In [None]:
# 5-Layer CNN Architecture
class FiveLayerCNN(nn.Module):
    """Five-layer CNN: three conv + max-pool blocks followed by two FC layers.

    Conv blocks use the hand-written Leaky ReLU, the first fully connected
    layer uses tanh, and ``forward`` returns raw logits (the softmax is applied
    inside ``cross_entropy_loss``). The loss and the SGD step are written by
    hand as well.
    """

    def __init__(self):
        super().__init__()
        # Convolutional layers (3 parameterized layers)
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)   # Layer 1: 3→16 channels
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)          # MaxPool (not parameterized)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)  # Layer 2: 16→32 channels
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, padding=1)  # Layer 3: 32→64 channels

        # Fully connected layers (2 parameterized layers)
        self.fc1 = nn.Linear(64 * 4 * 4, 256)  # Layer 4: After 3 pooling ops, 32×32→4×4
        self.fc2 = nn.Linear(256, 10)           # Layer 5: Output layer

    def leaky_relu(self, x, negative_slope=0.1):
        """Leaky ReLU: ``x`` where ``x > 0``, ``negative_slope * x`` elsewhere.

        - For x > 0: output = x (gradient = 1, no vanishing)
        - For x <= 0: output = negative_slope * x (small non-zero gradient)

        Written with ``torch.where`` instead of ``max(x, slope * x)`` because
        the max form is only a leaky ReLU for slopes in [0, 1]; this version
        follows the definition for any slope.
        """
        return torch.where(x > 0, x, negative_slope * x)

    def tanh(self, x):
        """Hyperbolic tangent, f(x) = (e^x - e^-x) / (e^x + e^-x).

        - Output range: (-1, 1), zero-centered
        - Max gradient = 1 (vs sigmoid's 0.25)

        Delegates to ``torch.tanh``.
        """
        return torch.tanh(x)

    def softmax(self, x):
        """Row-wise softmax over dim 1 (row max subtracted for stability)."""
        exp_x = torch.exp(x - x.max(dim=1, keepdim=True)[0])
        return exp_x / exp_x.sum(dim=1, keepdim=True)

    def cross_entropy_loss(self, outputs, labels):
        """Mean cross-entropy between logits and integer class labels.

        Args:
            outputs: logits, indexed as [sample, class].
            labels: integer class index per sample (reshaped to a column here).

        Returns:
            Scalar tensor: summed negative log-probability of the true
            classes divided by ``outputs.size(0)``.

        Uses ``log_softmax(x) = (x - max) - log(sum(exp(x - max)))`` instead of
        ``log(softmax(x) + eps)``: the epsilon version caps the loss near
        ``-log(1e-10)`` and distorts gradients once a probability underflows.
        """
        shifted = outputs - outputs.max(dim=1, keepdim=True)[0]
        # The sum includes exp(0) == 1 for the max entry, so log() is safe.
        log_probs = shifted - torch.log(torch.exp(shifted).sum(dim=1, keepdim=True))
        true_class_log_probs = log_probs.gather(1, labels.view(-1, 1))
        return -true_class_log_probs.sum() / outputs.size(0)

    def SGD_Optimizer(self, params, lr):
        """Apply one plain SGD step, ``p <- p - lr * p.grad``, then clear grads.

        Args:
            params: iterable of tensors to update; entries whose ``grad`` is
                ``None`` are skipped.
            lr: learning rate.
        """
        with torch.no_grad():
            for param in params:
                if param.grad is None:
                    continue
                # In-place update keeps the same tensor object (no .data rebinding).
                param.sub_(lr * param.grad)
                param.grad = None

    def forward(self, x):
        """Return class logits; shape comments assume input [batch, 3, 32, 32]."""
        # Conv block 1 - using Leaky ReLU
        x = self.conv1(x)           # [batch, 16, 32, 32]
        x = self.leaky_relu(x)      # Leaky ReLU activation
        x = self.pool(x)            # [batch, 16, 16, 16]

        # Conv block 2 - using Leaky ReLU
        x = self.conv2(x)           # [batch, 32, 16, 16]
        x = self.leaky_relu(x)      # Leaky ReLU activation
        x = self.pool(x)            # [batch, 32, 8, 8]

        # Conv block 3 - using Leaky ReLU
        x = self.conv3(x)           # [batch, 64, 8, 8]
        x = self.leaky_relu(x)      # Leaky ReLU activation
        x = self.pool(x)            # [batch, 64, 4, 4]

        # Flatten
        x = x.view(x.size(0), -1)   # [batch, 1024]

        # Fully connected layers - using Tanh
        x = self.fc1(x)             # [batch, 256]
        x = self.tanh(x)            # Tanh activation
        x = self.fc2(x)             # [batch, 10] - logits

        return x

# Build the CNN, move it to the selected device, and report its size
cnn_model = FiveLayerCNN()
cnn_model = cnn_model.to(device)
print(f"Model created on device: {device}")
print(f"Total parameters: {sum(weight.numel() for weight in cnn_model.parameters())}")

Model created on device: cuda
Total parameters: 288554


In [None]:
# Hyperparameters for the CNN run
cnn_learning_rate = 0.01  # raised from the 0.005 used for the 2-layer baseline
cnn_num_epochs = 10       # same epoch count as the 2-layer baseline

# Echo the configuration before the long-running training cell starts
print(
    "Training Configuration:",
    f"  Learning rate: {cnn_learning_rate}",
    f"  Epochs: {cnn_num_epochs}",
    f"  Batch size: {batch_size}",
    f"  Device: {device}",
    "  Activations: Leaky ReLU (conv layers) + Tanh (FC layer)",
    "\nStarting training...",
    sep="\n",
)

Training Configuration:
  Learning rate: 0.005
  Epochs: 10
  Batch size: 1
  Device: cuda

Starting training...


In [19]:
# Training loop for CNN
import time

# Wall-clock reference for the per-epoch and total timing reports
start_time = time.time()

for epoch in range(cnn_num_epochs):
    cnn_model.train()
    total_loss, correct_train, total_train = 0, 0, 0

    for images, labels in train_loader:
        images = images.to(device)
        labels = labels.to(device)

        # Start every step from empty gradients
        for param in cnn_model.parameters():
            param.grad = None

        # Forward pass and loss bookkeeping
        outputs = cnn_model(images)
        loss = cnn_model.cross_entropy_loss(outputs, labels)
        total_loss += loss.item()

        # Training accuracy: index of the largest logit is the predicted class
        predicted = torch.max(outputs.detach(), 1)[1]
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()

        # Backward pass followed by the hand-written SGD update
        loss.backward()
        cnn_model.SGD_Optimizer(cnn_model.parameters(), cnn_learning_rate)

    # Epoch summary: accuracy in percent, loss averaged over batches
    train_acc = 100 * correct_train / total_train
    avg_loss = total_loss / len(train_loader)

    # Cumulative time since training started, reported in minutes
    elapsed_time = time.time() - start_time
    print(f'Epoch [{epoch+1}/{cnn_num_epochs}], Loss: {avg_loss:.4f}, Train Acc: {train_acc:.2f}%, Time: {elapsed_time/60:.1f}min')

print(f"\nTotal training time: {(time.time() - start_time)/60:.1f} minutes")

Epoch [1/10], Loss: 2.3099, Train Acc: 9.99%, Time: 3.1min
Epoch [2/10], Loss: 2.3042, Train Acc: 9.97%, Time: 6.3min
Epoch [3/10], Loss: 2.3039, Train Acc: 9.80%, Time: 9.4min
Epoch [4/10], Loss: 2.3041, Train Acc: 9.75%, Time: 12.5min
Epoch [5/10], Loss: 2.3038, Train Acc: 10.16%, Time: 15.6min
Epoch [6/10], Loss: 2.3039, Train Acc: 9.99%, Time: 18.7min
Epoch [7/10], Loss: 2.3038, Train Acc: 10.07%, Time: 21.8min
Epoch [8/10], Loss: 2.3038, Train Acc: 10.15%, Time: 25.0min
Epoch [9/10], Loss: 2.3038, Train Acc: 9.87%, Time: 28.1min
Epoch [10/10], Loss: 2.3038, Train Acc: 10.00%, Time: 31.3min

Total training time: 31.3 minutes


In [20]:
# Evaluate CNN on test set
cnn_model.eval()
correct = 0
total = 0

with torch.no_grad():  # inference only, no gradient tracking needed
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = cnn_model(images)
        # Predicted class = position of the largest logit in each row
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

cnn_test_accuracy = 100 * correct / total
print(f'\n{"="*50}')
print(f'5-Layer CNN Test Accuracy: {cnn_test_accuracy:.2f}%')
print(f'{"="*50}')

# Compare with 2-layer network.
# `accuracy` is the test accuracy computed by the 2-layer evaluation cell
# earlier in this notebook; reading it (rather than a pasted number) keeps the
# comparison correct when the baseline is retrained.
baseline_accuracy = accuracy
success_threshold = 50  # percent test accuracy the deep model is expected to exceed

print('\n📊 Comparison:')
print(f'  2-Layer Network: {baseline_accuracy:.2f}%')
print(f'  5-Layer CNN:     {cnn_test_accuracy:.2f}%')
print(f'  Improvement:     {cnn_test_accuracy - baseline_accuracy:.2f}%')

if cnn_test_accuracy > success_threshold:
    print(f'\n✅ SUCCESS: Deep network achieves >{success_threshold}% accuracy!')
    print('✅ This proves CIFAR-10 requires deep learning!')
else:
    # Make the failure case explicit instead of printing nothing.
    print(f'\n❌ CNN did not exceed {success_threshold}% test accuracy - check the training log above.')


5-Layer CNN Test Accuracy: 10.00%

üìä Comparison:
  2-Layer Network: 48.04%
  5-Layer CNN:     10.00%
  Improvement:     -38.04%


---
## ✅ Part 1 Complete!

You now have:
1. ✅ **2-Layer Network** - 48.04% test accuracy (proves shallow networks struggle)
2. ✅ **5-Layer CNN** - Should get 50-60% test accuracy (proves depth helps)

### 🎯 Expected Training Time:
- With batch_size=1 and 30 epochs: **~2-3 hours**
- Each epoch processes 50,000 images individually

### 💡 Tips:
- The training will take a while - be patient!
- Loss should steadily decrease
- Accuracy should improve over 2-layer baseline
- You can reduce epochs to 20 if you're short on time

### 📝 Next Steps (Part 2):
After this training completes, you'll:
1. Test different activation functions (Leaky ReLU, Tanh)
2. Implement mini-batch SGD (batch sizes: 16, 32, 64, 128)
3. Add momentum to the optimizer