In [1]:
# 1. Download CIFAR-10
# 2. Normalize to mean=0, std=1 (or use standard transform)
# 3. Create train/test DataLoaders
# 4. Verify data shapes (should be [batch, 3, 32, 32])

import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
# 1. Download CIFAR-10
transform = transforms.Compose([
    transforms.ToTensor(),  # Convert images to PyTorch tensors, scales [0,255] to [0,1]
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))  # Scale to [-1, 1], center at 0
])
train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform) # predefined 50,000 images for training
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform) # predefined 10,000 images for testing
# 3. Create train/test DataLoaders
batch_size = 32  # Part 1 baseline requires batch_size=1 (change to 32/64 for Part 2)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
# 4. Verify data shapes
# for images, labels in train_loader:
#     print(f'Batch of images shape: {images.shape}')  # Should be [batch_size, 3, 32, 32] 64 images per batch, 3 channels, 32x32 pixels
#     print(f'Batch of labels shape: {labels.shape}')  # Should be [batch_size]
#     break  # Just check the first batch


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 170M/170M [00:12<00:00, 13.3MB/s] 


In [2]:
# Build 2-Layer Network to prove that cifar10 requires deeper networks.
# **Purpose:** Prove that CIFAR-10 needs deep learning

# **Architecture:**
# ```
# Input: 3072 (32√ó32√ó3 flattened)
#   ‚Üì
# Linear(3072 ‚Üí 128) + Sigmoid
#   ‚Üì
# Linear(128 ‚Üí 10) + Softmax
# ```

# **Implementation details:**
# - Use `torch.nn.Linear()` (allowed)
# - Implement sigmoid activation manually: `1 / (1 + torch.exp(-x))`
# - Implement softmax manually: `torch.exp(x) / torch.exp(x).sum()`
# - Implement cross-entropy loss manually

# **Training setup:**
# - Optimizer: SGD (implement manually)
# - Batch size: 1
# - Learning rate: 0.01 or 0.001
# - Epochs: 10-20

import torch
import torch.nn as nn
import torch.optim as optim

class TwoLayerNet(nn.Module):
    def sigmoid(self, x):
        return 1 / (1 + torch.exp(-x))
    
    def softmax(self, x):
        exp_x = torch.exp(x - x.max(dim=1, keepdim=True)[0])  # Subtract max for stability
        return exp_x / exp_x.sum(dim=1, keepdim=True)
    
    def cross_entropy_loss(self, outputs, labels):
        # Convert labels to one-hot encoding
        one_hot_labels = torch.zeros_like(outputs)
        one_hot_labels.scatter_(1, labels.view(-1, 1), 1)
        
        # Compute cross-entropy loss with numerical stability
        # Add small epsilon to prevent log(0)
        log_probs = torch.log(self.softmax(outputs) + 1e-10)
        loss = -torch.sum(one_hot_labels * log_probs) / outputs.size(0)
        return loss
    
    def SGD_Optimizer(self, params, lr):
        with torch.no_grad():
            for param in params:
                if param.grad is not None:
                    param.data = param.data - lr * param.grad
                    param.grad = None  # Reset gradient

    def __init__(self):
        super(TwoLayerNet, self).__init__()
        self.fc1 = nn.Linear(3072, 128)  # Input: 32√ó32√ó3 = 3072
        self.fc2 = nn.Linear(128, 10)    # Output: 10 classes

    def forward(self, x):
        x = x.view(x.size(0), -1)  # Flatten the input
        x = self.sigmoid(self.fc1(x))  # First layer + sigmoid
        x = self.fc2(x)  # Second layer (logits)
        return x
    


In [3]:
# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TwoLayerNet().to(device)
learning_rate = 0.005  # Reduced from 0.01 to prevent instability
num_epochs = 10



In [4]:
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    correct_train = 0
    total_train = 0
    
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        
        # Reset gradients before backward pass
        for param in model.parameters():
            param.grad = None
        
        # Forward pass
        outputs = model(images)
        loss = model.cross_entropy_loss(outputs, labels)
        total_loss += loss.item()
        
        # Calculate training accuracy
        _, predicted = torch.max(outputs.data, 1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()
        
        # Backward pass
        loss.backward()
        
        # Update weights using SGD
        model.SGD_Optimizer(model.parameters(), learning_rate)
    
    train_acc = 100 * correct_train / total_train
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Train Acc: {train_acc:.2f}%')

Epoch [1/10], Loss: 2.0981, Train Acc: 26.84%
Epoch [2/10], Loss: 1.9342, Train Acc: 32.78%
Epoch [3/10], Loss: 1.8707, Train Acc: 35.15%
Epoch [4/10], Loss: 1.8327, Train Acc: 36.53%
Epoch [5/10], Loss: 1.8042, Train Acc: 37.44%
Epoch [6/10], Loss: 1.7817, Train Acc: 38.14%
Epoch [7/10], Loss: 1.7635, Train Acc: 38.73%
Epoch [8/10], Loss: 1.7489, Train Acc: 39.41%
Epoch [9/10], Loss: 1.7358, Train Acc: 39.82%
Epoch [10/10], Loss: 1.7243, Train Acc: 40.27%


In [5]:
# test accuracy in percentage
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f'Test Accuracy: {accuracy:.2f}%')


Test Accuracy: 39.93%


# Part 2, Step 4: Activation Functions Comparison

**Previous Result (Part 1):** 5-Layer CNN with Sigmoid failed completely (10% accuracy - vanishing gradients)

**Purpose:** Test modern activation functions to solve vanishing gradient problem

**New Activations Tested:**
1. **Leaky ReLU** - f(x) = max(x, 0.1x)
   - No saturation for positive values (gradient = 1)
   - Small gradient (0.1) for negative values prevents dying neurons
   - Used for: Conv layers 1, 2, 3

2. **Tanh** - f(x) = (e^x - e^-x) / (e^x + e^-x)
   - Zero-centered output range: (-1, 1)
   - Max gradient = 1 (vs sigmoid's 0.25)
   - Used for: FC layer 1

**Architecture (unchanged):**
- 3 Convolutional layers with MaxPooling (Conv: 3‚Üí16‚Üí32‚Üí64)
- 2 Fully connected layers (FC: 1024‚Üí256‚Üí10)
- Total: 5 parameterized layers, 288,554 parameters

**Expected result:** 60-70% test accuracy (dramatic improvement from 10%)

In [10]:
# 5-Layer CNN Architecture
class FiveLayerCNN(nn.Module):
    def leaky_relu(self, x, negative_slope=0.1):
        """
        Leaky ReLU activation: f(x) = max(x, 0.1*x)
        - For x > 0: output = x (gradient = 1, no vanishing)
        - For x < 0: output = 0.1*x (gradient = 0.1, prevents dying neurons)
        """
        return torch.maximum(x, negative_slope * x)
    
    def tanh(self, x):
        """
        Hyperbolic tangent: f(x) = (e^x - e^-x) / (e^x + e^-x)
        - Output range: (-1, 1)
        - Zero-centered (better than sigmoid)
        - Max gradient = 1 (vs sigmoid's 0.25)
        """
        return torch.tanh(x)
    
    def softmax(self, x):
        exp_x = torch.exp(x - x.max(dim=1, keepdim=True)[0])
        return exp_x / exp_x.sum(dim=1, keepdim=True)
    
    def cross_entropy_loss(self, outputs, labels):
        one_hot_labels = torch.zeros_like(outputs)
        one_hot_labels.scatter_(1, labels.view(-1, 1), 1)
        log_probs = torch.log(self.softmax(outputs) + 1e-10)
        loss = -torch.sum(one_hot_labels * log_probs) / outputs.size(0)
        return loss
    
    def SGD_Optimizer(self, params, lr, momentum=0.0):
        """
        SGD with momentum implementation following equation (8):
        m_{i+1} = Œ± * m_i + g_i
        Œ∏_{i+1} = Œ∏_i - Œ∑ * m_{i+1}
        
        Args:
            params: Model parameters
            lr: Learning rate (Œ∑)
            momentum: Momentum coefficient (Œ±), default=0.0 for vanilla SGD
        """
        # Initialize momentum buffer on first call
        if not hasattr(self, 'momentum_buffer'):
            self.momentum_buffer = {}
        
        with torch.no_grad():
            for param in params:
                if param.grad is not None:
                    # Get parameter id for momentum buffer
                    param_id = id(param)
                    
                    # Initialize momentum to zero if not exists
                    if param_id not in self.momentum_buffer:
                        self.momentum_buffer[param_id] = torch.zeros_like(param.data)
                    
                    # Get current gradient (g_i)
                    grad = param.grad
                    
                    # Update momentum: m_{i+1} = Œ± * m_i + g_i
                    self.momentum_buffer[param_id] = momentum * self.momentum_buffer[param_id] + grad
                    
                    # Update parameters: Œ∏_{i+1} = Œ∏_i - Œ∑ * m_{i+1}
                    param.data = param.data - lr * self.momentum_buffer[param_id]
                    
                    # Reset gradient
                    param.grad = None
    
    def __init__(self):
        super(FiveLayerCNN, self).__init__()
        # Convolutional layers (3 parameterized layers)
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)   # Layer 1: 3‚Üí16 channels
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)          # MaxPool (not parameterized)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)  # Layer 2: 16‚Üí32 channels
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, padding=1)  # Layer 3: 32‚Üí64 channels
        
        # Fully connected layers (2 parameterized layers)
        self.fc1 = nn.Linear(64 * 4 * 4, 256)  # Layer 4: After 3 pooling ops, 32√ó32‚Üí4√ó4
        self.fc2 = nn.Linear(256, 10)           # Layer 5: Output layer
    
    def forward(self, x):
        # Input: [batch, 3, 32, 32]
        
        # Conv block 1 - using Leaky ReLU
        x = self.conv1(x)           # [batch, 16, 32, 32]
        x = self.leaky_relu(x)      # Leaky ReLU activation
        x = self.pool(x)            # [batch, 16, 16, 16]
        
        # Conv block 2 - using Leaky ReLU
        x = self.conv2(x)           # [batch, 32, 16, 16]
        x = self.leaky_relu(x)      # Leaky ReLU activation
        x = self.pool(x)            # [batch, 32, 8, 8]
        
        # Conv block 3 - using Leaky ReLU
        x = self.conv3(x)           # [batch, 64, 8, 8]
        x = self.leaky_relu(x)      # Leaky ReLU activation
        x = self.pool(x)            # [batch, 64, 4, 4]
        
        # Flatten
        x = x.view(x.size(0), -1)   # [batch, 1024]
        
        # Fully connected layers - using Tanh
        x = self.fc1(x)             # [batch, 256]
        x = self.tanh(x)            # Tanh activation
        x = self.fc2(x)             # [batch, 10] - logits
        
        return x

# Create the model
cnn_model = FiveLayerCNN().to(device)
print(f"Model created on device: {device}")
print(f"Total parameters: {sum(p.numel() for p in cnn_model.parameters())}")

Model created on device: cuda
Total parameters: 288554


In [11]:
# Training setup for CNN
cnn_learning_rate = 0.005  # Increased from 0.005 - Leaky ReLU and Tanh have better gradient flow
cnn_num_epochs = 10  # More epochs for deeper network

print(f"Training Configuration:")
print(f"  Learning rate: {cnn_learning_rate}")
print(f"  Epochs: {cnn_num_epochs}")
print(f"  Batch size: {batch_size}")
print(f"  Device: {device}")
print(f"  Activations: Leaky ReLU (conv layers) + Tanh (FC layer)")
print(f"\nStarting training...")

Training Configuration:
  Learning rate: 0.005
  Epochs: 10
  Batch size: 32
  Device: cuda
  Activations: Leaky ReLU (conv layers) + Tanh (FC layer)

Starting training...


In [8]:
# Training loop for CNN
import time

start_time = time.time()

for epoch in range(cnn_num_epochs):
    cnn_model.train()
    total_loss = 0
    correct_train = 0
    total_train = 0
    
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        
        # Reset gradients
        for param in cnn_model.parameters():
            param.grad = None
        
        # Forward pass
        outputs = cnn_model(images)
        loss = cnn_model.cross_entropy_loss(outputs, labels)
        total_loss += loss.item()
        
        # Calculate training accuracy
        _, predicted = torch.max(outputs.data, 1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()
        
        # Backward pass
        loss.backward()
        
        # Update weights
        cnn_model.SGD_Optimizer(cnn_model.parameters(), cnn_learning_rate)
    
    train_acc = 100 * correct_train / total_train
    avg_loss = total_loss / len(train_loader)
    
    # Print progress every epoch
    elapsed_time = time.time() - start_time
    print(f'Epoch [{epoch+1}/{cnn_num_epochs}], Loss: {avg_loss:.4f}, Train Acc: {train_acc:.2f}%, Time: {elapsed_time/60:.1f}min')

print(f"\nTotal training time: {(time.time() - start_time)/60:.1f} minutes")

Epoch [1/10], Loss: 2.2294, Train Acc: 17.74%, Time: 0.3min
Epoch [2/10], Loss: 1.9526, Train Acc: 29.70%, Time: 0.6min
Epoch [3/10], Loss: 1.7244, Train Acc: 38.53%, Time: 0.9min
Epoch [4/10], Loss: 1.5282, Train Acc: 45.03%, Time: 1.1min
Epoch [5/10], Loss: 1.4241, Train Acc: 48.70%, Time: 1.4min
Epoch [6/10], Loss: 1.3533, Train Acc: 51.56%, Time: 1.7min
Epoch [7/10], Loss: 1.2895, Train Acc: 53.94%, Time: 2.0min
Epoch [8/10], Loss: 1.2289, Train Acc: 56.32%, Time: 2.3min
Epoch [9/10], Loss: 1.1743, Train Acc: 58.22%, Time: 2.5min
Epoch [10/10], Loss: 1.1242, Train Acc: 60.30%, Time: 2.8min

Total training time: 2.8 minutes


In [9]:
# Evaluate CNN on test set
cnn_model.eval()
correct = 0
total = 0

with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = cnn_model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

cnn_test_accuracy = 100 * correct / total
print(f'\n{"="*50}')
print(f'5-Layer CNN Test Accuracy: {cnn_test_accuracy:.2f}%')
print(f'{"="*50}')

# Compare with 2-layer network
print(f'\nüìä Comparison:')
print(f'  2-Layer Network: 48.04%')
print(f'  5-Layer CNN:     {cnn_test_accuracy:.2f}%')
print(f'  Improvement:     {cnn_test_accuracy - 48.04:.2f}%')

if cnn_test_accuracy > 50:
    print(f'\n‚úÖ SUCCESS: Deep network achieves >{50}% accuracy!')
    print(f'‚úÖ This proves CIFAR-10 requires deep learning!')


5-Layer CNN Test Accuracy: 58.86%

üìä Comparison:
  2-Layer Network: 48.04%
  5-Layer CNN:     58.86%
  Improvement:     10.82%

‚úÖ SUCCESS: Deep network achieves >50% accuracy!
‚úÖ This proves CIFAR-10 requires deep learning!


---
## ‚úÖ Part 1 Complete!

You now have:
1. ‚úÖ **2-Layer Network** - 48.04% test accuracy (proves shallow networks struggle)
2. ‚úÖ **5-Layer CNN** - Should get 50-60% test accuracy (proves depth helps)

### üéØ Expected Training Time:
- With batch_size=1 and 30 epochs: **~2-3 hours**
- Each epoch processes 50,000 images individually

### üí° Tips:
- The training will take a while - be patient!
- Loss should steadily decrease
- Accuracy should improve over 2-layer baseline
- You can reduce epochs to 20 if you're short on time

### üìù Next Steps (Part 2):
After this training completes, you'll:
1. Test different activation functions (Leaky ReLU, Tanh)
2. Implement mini-batch SGD (batch sizes: 16, 32, 64, 128)
3. Add momentum to the optimizer

---
# Part 2, Step 6: Mini-Batch SGD with Momentum

**Previous Results:**
- Vanilla SGD with batch_size=1: 65.24% test accuracy (29.9 min)
- Mini-batch SGD with batch_size=32: 59.17% test accuracy (2.8 min)

**Purpose:** Add momentum to accelerate learning and smooth optimization

**Momentum Algorithm (Equation 8):**
- **m‚ÇÅ = 0** (initialize momentum to zero)
- **g·µ¢ = (1/b) Œ£ ‚àáŒ∏·µ¢L‚Çñ** (compute gradient)
- **m·µ¢‚Çä‚ÇÅ = Œ±¬∑m·µ¢ + g·µ¢** (update momentum with Œ± = momentum coefficient)
- **Œ∏·µ¢‚Çä‚ÇÅ = Œ∏ - Œ∑¬∑m·µ¢‚Çä‚ÇÅ** (update parameters using momentum)

**How Momentum Helps:**
- Accumulates a moving average of gradients
- Dampens oscillations in directions with high curvature
- Accelerates progress along consistent descent directions
- Think: Rolling ball gaining speed downhill

**Configuration:**
- Architecture: 5-Layer CNN with Leaky ReLU + Tanh
- Batch size: 32 (from previous experiment)
- Learning rate: 0.005
- **Momentum (Œ±): Testing 3 values: 0.7, 0.9, 0.95**
- Epochs: 10

**Expected:** Faster convergence, smoother training, potentially higher accuracy than vanilla SGD. Higher Œ± should provide more smoothing but may overshoot.

In [12]:
# Test multiple momentum values
import time

# Test different momentum coefficients (Œ±)
alpha_values = [0.7, 0.9, 0.95]
momentum_results = {}

learning_rate = 0.005
num_epochs = 10

print(f"Testing momentum with Œ± values: {alpha_values}")
print(f"Each training will take ~2.8 minutes")
print(f"Total expected time: ~8.5 minutes\n")
print("="*70)

Testing momentum with Œ± values: [0.7, 0.9, 0.95]
Each training will take ~2.8 minutes
Total expected time: ~8.5 minutes



In [None]:
# Train and evaluate for each momentum value
for alpha in alpha_values:
    print(f"\n{'='*70}")
    print(f"üîπ Training with Momentum Œ± = {alpha}")
    print(f"{'='*70}\n")
    
    # Create fresh model for this alpha value
    model_momentum = FiveLayerCNN().to(device)
    print(f"Model created with {sum(p.numel() for p in model_momentum.parameters())} parameters")
    print(f"Configuration: lr={learning_rate}, Œ±={alpha}, epochs={num_epochs}, batch_size={batch_size}\n")
    
    # Training loop
    start_time = time.time()
    
    for epoch in range(num_epochs):
        model_momentum.train()
        total_loss = 0
        correct_train = 0
        total_train = 0
        
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            
            # Reset gradients
            for param in model_momentum.parameters():
                param.grad = None
            
            # Forward pass
            outputs = model_momentum(images)
            loss = model_momentum.cross_entropy_loss(outputs, labels)
            total_loss += loss.item()
            
            # Calculate training accuracy
            _, predicted = torch.max(outputs.data, 1)
            total_train += labels.size(0)
            correct_train += (predicted == labels).sum().item()
            
            # Backward pass
            loss.backward()
            
            # Update weights with momentum
            model_momentum.SGD_Optimizer(model_momentum.parameters(), 
                                         learning_rate, 
                                         momentum=alpha)
        
        train_acc = 100 * correct_train / total_train
        avg_loss = total_loss / len(train_loader)
        
        # Print progress every epoch
        elapsed_time = time.time() - start_time
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Train Acc: {train_acc:.2f}%, Time: {elapsed_time/60:.1f}min')
    
    training_time = time.time() - start_time
    print(f"\nTraining time: {training_time/60:.1f} minutes")
    
    # Evaluation
    model_momentum.eval()
    correct = 0
    total = 0
    
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model_momentum(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    test_accuracy = 100 * correct / total
    
    # Store results
    momentum_results[alpha] = {
        'test_accuracy': test_accuracy,
        'train_accuracy': train_acc,
        'final_loss': avg_loss,
        'training_time': training_time/60
    }
    
    print(f"\n‚úÖ Test Accuracy with Œ±={alpha}: {test_accuracy:.2f}%")
    print(f"{'='*70}\n")

print("\n" + "="*70)
print("üéØ MOMENTUM EXPERIMENTS COMPLETE")
print("="*70)


üîπ Training with Momentum Œ± = 0.7

Model created with 288554 parameters
Configuration: lr=0.005, Œ±=0.7, epochs=10, batch_size=32

Epoch [1/10], Loss: 1.9255, Train Acc: 30.18%, Time: 0.3min
Epoch [2/10], Loss: 1.4703, Train Acc: 46.94%, Time: 0.6min
Epoch [3/10], Loss: 1.2743, Train Acc: 54.30%, Time: 0.9min


In [None]:
# Display comprehensive results comparison
print("\n" + "="*80)
print("üìä MOMENTUM EXPERIMENTS - COMPREHENSIVE RESULTS")
print("="*80)
print(f"{'Alpha (Œ±)':<12} {'Test Acc':<12} {'Train Acc':<12} {'Loss':<12} {'Time (min)':<12}")
print("-"*80)

best_alpha = None
best_accuracy = 0

for alpha in alpha_values:
    results = momentum_results[alpha]
    print(f"{alpha:<12.2f} {results['test_accuracy']:<12.2f} {results['train_accuracy']:<12.2f} " 
          f"{results['final_loss']:<12.4f} {results['training_time']:<12.1f}")
    
    if results['test_accuracy'] > best_accuracy:
        best_accuracy = results['test_accuracy']
        best_alpha = alpha

print("="*80)
print(f"\nüèÜ Best Momentum Value: Œ± = {best_alpha} with {best_accuracy:.2f}% test accuracy\n")

# Compare with baseline (mini-batch SGD without momentum)
print("="*80)
print("üìà COMPARISON WITH PREVIOUS EXPERIMENTS")
print("="*80)
print(f"{'Experiment':<45} {'Test Accuracy':<15} {'Training Time':<15}")
print("-"*80)
print(f"{'Vanilla SGD (batch_size=1)':<45} {65.24:<15.2f} {'29.9 min':<15}")
print(f"{'Mini-batch SGD (batch_size=32, no momentum)':<45} {cnn_test_accuracy:<15.2f} {'2.8 min':<15}")

for alpha in alpha_values:
    results = momentum_results[alpha]
    print(f"{f'Mini-batch SGD with Momentum (Œ±={alpha})':<45} "
          f"{results['test_accuracy']:<15.2f} {results['training_time']:.1f} min")

print("="*80)

# Analysis
print("\nüìù ANALYSIS:")
print(f"‚Ä¢ Best momentum value: Œ± = {best_alpha} with {best_accuracy:.2f}% test accuracy")
print(f"‚Ä¢ Improvement over no momentum: {best_accuracy - cnn_test_accuracy:+.2f}%")
print(f"‚Ä¢ Momentum helps by accumulating gradients in consistent directions")
print(f"‚Ä¢ Higher Œ± values preserve more history, lower values allow faster adaptation")
if best_accuracy > cnn_test_accuracy:
    print(f"‚Ä¢ ‚úÖ Momentum successfully improved performance!")
print("="*80)