# Module 3 - Exercise 1: Data Pipeline & Training Loop

## Learning Objectives
- Create simple datasets and split them into train/validation/test sets
- Use DataLoader with different batch sizes
- Implement a complete training loop from scratch
- Compare different optimizers (SGD, Adam)
- Evaluate models on train, validation, and test sets
- Analyze the effect of different learning rates

*Master the fundamentals of PyTorch training pipelines with simple examples.*

In [None]:
# Clone the course test repository into /tmp/tests.
# stderr is discarded and `|| true` forces a zero exit status, so re-running
# this cell when the directory already exists does not raise.
# NOTE(review): a genuinely failed clone is silenced too; it would presumably
# only surface as an ImportError on the imports below.
!git clone https://github.com/racousin/data_science_practice.git /tmp/tests 2>/dev/null || true

# Put the deep-learning test package of the clone on the import path
import sys
sys.path.append('/tmp/tests/tests/python_deep_learning')

# Test utilities and the validator/section table for this exercise
# (resolved through the path appended above)
from test_utils import NotebookTestRunner, create_inline_test
from module3.test_exercise1 import Exercise1Validator, EXERCISE1_SECTIONS

# Runner and validator shared by every "Test Section N" cell in this notebook
test_runner = NotebookTestRunner("module3", 1)
validator = Exercise1Validator()

## Environment Setup

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy and PyTorch so repeated runs of the notebook draw the same numbers
np.random.seed(42)
torch.manual_seed(42)

# Use the GPU when PyTorch can see one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

## Section 1: Creating a Simple Dataset

Let's start by creating a simple synthetic dataset for regression. We'll generate data from a linear relationship with some noise:
$$y = 2x + 1 + \epsilon$$
where $\epsilon \sim \mathcal{N}(0, 0.1)$

In [None]:
# TODO: Build a small synthetic dataset of 100 samples
# X_train: tensor of shape (100, 1) whose values range from -1 to 1
# y_train: generated from the relationship y = 2*x + 1 + noise

X_train = None  # Shape: (100, 1)
y_train = None  # Shape: (100,) or (100, 1)

# Plot the data, but only once both tensors have been filled in
if not (X_train is None or y_train is None):
    plt.figure(figsize=(8, 5))
    plt.scatter(X_train.numpy(), y_train.numpy(), alpha=0.5)
    plt.title('Synthetic Dataset: y = 2x + 1 + noise')
    plt.xlabel('X')
    plt.ylabel('y')
    plt.grid(True)
    plt.show()
    print(f"Dataset created: X_train shape={X_train.shape}, y_train shape={y_train.shape}")

In [None]:
# TODO: Split the dataset into train (60%), validation (20%), and test (20%)
# Use indices 0:60 for train, 60:80 for validation, 80:100 for test

X_train_split = None  # First 60 samples
y_train_split = None  # First 60 labels

X_val = None  # Next 20 samples
y_val = None  # Next 20 labels

X_test = None  # Last 20 samples
y_test = None  # Last 20 labels

# Report each split on its own: a partially completed cell then prints the
# splits that exist instead of failing with TypeError on len(None).
if X_train_split is not None:
    print(f"Train set: {len(X_train_split)} samples")
if X_val is not None:
    print(f"Validation set: {len(X_val)} samples")
if X_test is not None:
    print(f"Test set: {len(X_test)} samples")

In [None]:
# Run the validator checks registered for Section 1 against this notebook's variables
section_tests = [
    (getattr(validator, method_name), description)
    for method_name, description in EXERCISE1_SECTIONS["Section 1: Creating a Simple Dataset"]
]
test_runner.test_section(
    "Section 1: Creating a Simple Dataset",
    validator,
    section_tests,
    locals(),
)

## Section 2: DataLoader with Different Batch Sizes

DataLoaders are essential for:
- Batching data for efficient training
- Shuffling data between epochs
- Parallel data loading

Let's create DataLoaders with different batch sizes to see their effect.

In [None]:
# TODO: Wrap the training split in a TensorDataset
train_dataset = None

if train_dataset is not None:
    print(f"TensorDataset created with {len(train_dataset)} samples")

    # Look at the first (input, target) pair of the dataset
    sample_x, sample_y = train_dataset[0]
    print(f"Sample input shape: {sample_x.shape}, Sample target shape: {sample_y.shape}")

In [None]:
# TODO: Create DataLoaders with different batch sizes
# Create two DataLoaders: one with batch_size=8 and one with batch_size=16
# Set shuffle=True for training

train_loader_8 = None  # Batch size 8
train_loader_16 = None  # Batch size 16

if train_loader_8 is not None:
    print("DataLoader with batch size 8:")
    print(f"  Number of batches: {len(train_loader_8)}")

if train_loader_16 is not None:
    print("\nDataLoader with batch size 16:")
    print(f"  Number of batches: {len(train_loader_16)}")

# Show how batch size affects training. The counts are read from the loaders
# themselves: with the default drop_last=False a DataLoader keeps the final,
# smaller batch, so a hard-coded 60/8 = 7.5 would not match the number of
# batches an epoch actually iterates over. Both loaders are required because
# the comparison reports on both.
if train_loader_8 is not None and train_loader_16 is not None:
    print("\nBatch size comparison:")
    print(f"  Batch size 8: {len(train_loader_8)} batches per epoch")
    print(f"  Batch size 16: {len(train_loader_16)} batches per epoch")

In [None]:
# Run the validator checks registered for Section 2 against this notebook's variables
section_tests = [
    (getattr(validator, method_name), description)
    for method_name, description in EXERCISE1_SECTIONS["Section 2: DataLoader with Different Batch Sizes"]
]
test_runner.test_section(
    "Section 2: DataLoader with Different Batch Sizes",
    validator,
    section_tests,
    locals(),
)

## Section 3: Model and Loss Function

For this simple regression task, we'll use:
- A simple linear model (single linear layer)
- Mean Squared Error (MSE) loss

In [None]:
# TODO: Build a simple linear model
# It must map 1 input feature to 1 output
# Either use nn.Linear directly or write a small custom module

model = None

if model is not None:
    print(f"Model created: {model}")

    # Total number of scalar parameters held by the model
    total_params = sum(param.numel() for param in model.parameters())
    print(f"Total parameters: {total_params}")

    # Push one random sample through the model to check the output shape
    test_input = torch.randn(1, 1)
    test_output = model(test_input)
    print(f"Test forward pass: input shape={test_input.shape}, output shape={test_output.shape}")

In [None]:
# TODO: Choose the loss function
# Regression task, so use Mean Squared Error (MSE) loss

loss_fn = None

if loss_fn is not None:
    print(f"Loss function: {loss_fn}")

    # Sanity-check the loss on a single prediction/target pair
    target = torch.tensor([[2.0]])
    pred = torch.tensor([[1.0]])
    test_loss = loss_fn(pred, target)
    print(f"Test MSE loss: pred={pred.item():.2f}, target={target.item():.2f}, loss={test_loss.item():.2f}")

In [None]:
# Run the validator checks registered for Section 3 against this notebook's variables
section_tests = [
    (getattr(validator, method_name), description)
    for method_name, description in EXERCISE1_SECTIONS["Section 3: Model and Loss Function"]
]
test_runner.test_section(
    "Section 3: Model and Loss Function",
    validator,
    section_tests,
    locals(),
)

## Section 4: Optimizers

Let's create two different optimizers to compare:
- **SGD (Stochastic Gradient Descent)**: Simple, reliable, but can be slow
- **Adam**: Adaptive learning rates, faster convergence, more complex

In [None]:
# Create a fresh model for training (only when Section 3 defined one)
if model is not None:
    model = nn.Linear(1, 1)  # Reset model

# TODO: Create SGD optimizer with learning rate 0.01
optimizer_sgd = None

# TODO: Create Adam optimizer with learning rate 0.001
optimizer_adam = None

# Report the learning rate stored in the optimizer itself rather than a
# hard-coded value, so the message reflects what was actually configured.
if optimizer_sgd is not None:
    print(f"SGD optimizer created with lr={optimizer_sgd.param_groups[0]['lr']}")

if optimizer_adam is not None:
    print(f"Adam optimizer created with lr={optimizer_adam.param_groups[0]['lr']}")

print("\nOptimizer comparison:")
print("  SGD: Simple gradient descent, fixed learning rate")
print("  Adam: Adaptive moments, per-parameter learning rates")

In [None]:
# Run the validator checks registered for Section 4 against this notebook's variables
section_tests = [
    (getattr(validator, method_name), description)
    for method_name, description in EXERCISE1_SECTIONS["Section 4: Optimizers"]
]
test_runner.test_section(
    "Section 4: Optimizers",
    validator,
    section_tests,
    locals(),
)

## Section 5: Training Loop

Now let's implement a complete training loop. After zeroing the gradients left over from the previous batch, each iteration of the standard PyTorch training loop consists of:
1. Forward pass: compute predictions
2. Compute loss
3. Backward pass: compute gradients
4. Update weights

In [None]:
# TODO: Write a basic training loop
# Run 10 epochs with the SGD optimizer
# Record the average loss of every epoch

# Start again from freshly initialised weights and a matching optimizer
model = nn.Linear(1, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

num_epochs = 10
train_losses = []  # One average loss value per epoch

# TODO: Implement the training loop here
# for epoch in range(num_epochs):
#     epoch_loss = 0.0
#     for batch_x, batch_y in train_loader_8:
#         # 1. Zero gradients
#         # 2. Forward pass
#         # 3. Compute loss
#         # 4. Backward pass
#         # 5. Update weights
#         # 6. Track loss
#     # Calculate and store average epoch loss
#     train_losses.append(average_loss)

# Plot the loss curve and summarise it once the loop has produced values
if train_losses:
    plt.figure(figsize=(10, 5))
    plt.plot(train_losses, marker='o')
    plt.title('Training Progress')
    plt.xlabel('Epoch')
    plt.ylabel('Average Loss')
    plt.grid(True)
    plt.show()

    print(f"Initial loss: {train_losses[0]:.4f}")
    print(f"Final loss: {train_losses[-1]:.4f}")
    print(f"Loss reduction: {(1 - train_losses[-1]/train_losses[0]) * 100:.1f}%")

In [None]:
# Run the validator checks registered for Section 5 against this notebook's variables
section_tests = [
    (getattr(validator, method_name), description)
    for method_name, description in EXERCISE1_SECTIONS["Section 5: Training Loop"]
]
test_runner.test_section(
    "Section 5: Training Loop",
    validator,
    section_tests,
    locals(),
)

## Section 6: Evaluation on Train/Val/Test

It's crucial to evaluate your model on different data splits:
- **Training set**: How well the model fits the training data
- **Validation set**: How well the model generalizes (used for hyperparameter tuning)
- **Test set**: Final evaluation (only used once at the end)

In [None]:
# TODO: Create an evaluation function
def evaluate_model(model, dataloader, loss_fn):
    """
    Compute the average loss of a model over a dataset (exercise stub).

    Args:
        model: The neural network model to evaluate
        dataloader: DataLoader that yields the evaluation data
        loss_fn: Loss function used to score the predictions

    Returns:
        Average loss over the dataset once implemented; the unimplemented
        placeholder returns None
    """
    # TODO: Implement evaluation
    # 1. Switch the model to eval mode
    # 2. Turn off gradient computation
    # 3. Loop over the batches of the dataloader
    # 4. Compute the predictions and their loss
    # 5. Return the average loss

    average_loss = None  # Placeholder until the steps above are implemented
    return average_loss

# Smoke-test the function. The call is skipped while its prerequisites from
# the earlier sections are still None, so that an implemented evaluate_model
# does not crash on a missing DataLoader or loss function.
if train_loader_8 is not None and loss_fn is not None:
    if evaluate_model(model, train_loader_8, loss_fn) is not None:
        print("Evaluation function implemented!")

In [None]:
# TODO: Train with train/validation monitoring
# Train for 20 epochs and track both training and validation losses

# Create fresh model and optimizer
model = nn.Linear(1, 1)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Create validation dataloader.
# Both tensors must exist before building the dataset (TensorDataset cannot
# take None), and the loader guard uses an explicit None check rather than
# truthiness, which would go through the dataset's length.
val_dataset = TensorDataset(X_val, y_val) if X_val is not None and y_val is not None else None
val_loader = DataLoader(val_dataset, batch_size=20, shuffle=False) if val_dataset is not None else None

final_train_losses = []
final_val_losses = []
num_epochs = 20

# TODO: Implement training with validation monitoring
# for epoch in range(num_epochs):
#     # Training phase
#     model.train()
#     # ... training loop ...
#     
#     # Evaluation phase
#     train_loss = evaluate_model(model, train_loader_8, loss_fn)
#     val_loss = evaluate_model(model, val_loader, loss_fn)
#     
#     final_train_losses.append(train_loss)
#     final_val_losses.append(val_loss)

# Visualize train vs validation losses
if final_train_losses and final_val_losses:
    plt.figure(figsize=(10, 5))
    plt.plot(final_train_losses, label='Train Loss', marker='o')
    plt.plot(final_val_losses, label='Validation Loss', marker='s')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training vs Validation Loss')
    plt.legend()
    plt.grid(True)
    plt.show()

    print(f"Final training loss: {final_train_losses[-1]:.4f}")
    print(f"Final validation loss: {final_val_losses[-1]:.4f}")

In [None]:
# TODO: Evaluate on test set
# Create test dataloader and evaluate the final model

# Both tensors must exist before building the dataset (TensorDataset cannot
# take None), and the loader guard uses an explicit None check rather than
# truthiness, which would go through the dataset's length.
test_dataset = TensorDataset(X_test, y_test) if X_test is not None and y_test is not None else None
test_loader = DataLoader(test_dataset, batch_size=20, shuffle=False) if test_dataset is not None else None

test_loss = None  # TODO: Evaluate model on test set

if test_loss is not None:
    print(f"Test set loss: {test_loss:.4f}")

    # Compare all three
    if final_train_losses and final_val_losses:
        print("\nFinal comparison:")
        print(f"  Train loss: {final_train_losses[-1]:.4f}")
        print(f"  Val loss: {final_val_losses[-1]:.4f}")
        print(f"  Test loss: {test_loss:.4f}")

In [None]:
# Run the validator checks registered for Section 6 against this notebook's variables
section_tests = [
    (getattr(validator, method_name), description)
    for method_name, description in EXERCISE1_SECTIONS["Section 6: Evaluation on Train/Val/Test"]
]
test_runner.test_section(
    "Section 6: Evaluation on Train/Val/Test",
    validator,
    section_tests,
    locals(),
)

## Section 7: Learning Rate Comparison

The learning rate is one of the most important hyperparameters. Let's compare different learning rates to see their effect on training.

In [None]:
# TODO: Compare several learning rates
# Train one model per learning rate: 0.0001, 0.001, 0.01, 0.1
# Keep the loss history of every run

learning_rates = [0.0001, 0.001, 0.01, 0.1]
lr_results = {}  # Maps each learning rate to its list of losses

# TODO: For each learning rate:
# 1. Create a fresh model
# 2. Create optimizer with that learning rate
# 3. Train for a few epochs (e.g., 10)
# 4. Store the loss history

# for lr in learning_rates:
#     model = nn.Linear(1, 1)
#     optimizer = torch.optim.SGD(model.parameters(), lr=lr)
#     losses = []
#     
#     # Training loop...
#     
#     lr_results[lr] = losses

# Plot and summarise the runs once results have been collected
if lr_results:
    plt.figure(figsize=(12, 6))

    for lr, losses in lr_results.items():
        plt.plot(losses, label=f'LR={lr}', marker='o')

    plt.title('Learning Rate Comparison')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.yscale('log')  # Log scale for better visualization
    plt.legend()
    plt.grid(True)
    plt.show()

    print("Learning rate analysis:")
    for lr in learning_rates:
        # Skip learning rates that were not run or produced no losses
        if not lr_results.get(lr):
            continue
        initial = lr_results[lr][0]
        final = lr_results[lr][-1]
        print(f"  LR={lr}: Initial loss={initial:.4f}, Final loss={final:.4f}")

In [None]:
# Run the validator checks registered for Section 7 against this notebook's variables
section_tests = [
    (getattr(validator, method_name), description)
    for method_name, description in EXERCISE1_SECTIONS["Section 7: Learning Rate Comparison"]
]
test_runner.test_section(
    "Section 7: Learning Rate Comparison",
    validator,
    section_tests,
    locals(),
)

In [None]:
# Display the final summary of all section tests, using the test runner
# created in the setup cell at the top of the notebook
test_runner.final_summary()