# Module 4 - Exercise 1: Model Resource Profiling

## Learning Objectives
- Understand CPU vs GPU tensor operations and memory management
- Profile model training performance across different devices
- Analyze the impact of batch size on training speed
- Monitor memory usage during model creation and training
- Use PyTorch profiler to identify bottlenecks
- Apply optimization techniques to improve performance

In [None]:
# Clone the test repository
!git clone https://github.com/racousin/data_science_practice.git /tmp/tests 2>/dev/null || true

# Import required modules
import sys
sys.path.append('/tmp/tests/tests/python_deep_learning')

# Import the improved test utilities
from test_utils import NotebookTestRunner, create_inline_test
from module4.test_exercise1 import Exercise1Validator, EXERCISE1_SECTIONS

# Create test runner and validator
# NOTE(review): the arguments presumably identify module "module4", exercise 1 —
# confirm against NotebookTestRunner in test_utils.
test_runner = NotebookTestRunner("module4", 1)
# The per-section test cells look up check methods on this object by name.
validator = Exercise1Validator()

## Environment Setup

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import time
import psutil
import os
from torch.profiler import profile, record_function, ProfilerActivity

# Seed NumPy and PyTorch so the random data is repeatable between runs
np.random.seed(42)
torch.manual_seed(42)

# Report whether a CUDA device is usable, plus its details when one is found
device_available = torch.cuda.is_available()
print(f"CUDA available: {device_available}")
if device_available:
    print(
        f"GPU: {torch.cuda.get_device_name(0)}",
        f"CUDA version: {torch.version.cuda}",
        sep="\n",
    )

## Section 1: Device Management

In this section, you'll explore the differences between CPU and GPU tensor operations, including creation, manipulation, and data transfer between devices.

In [None]:
# TODO: Create a tensor of size (1000, 1000) on CPU filled with random values
cpu_tensor = None

# Echo where the tensor lives and its shape, or a placeholder until it exists
if cpu_tensor is None:
    print("CPU tensor device: Not created")
    print("CPU tensor shape: Not created")
else:
    print(f"CPU tensor device: {cpu_tensor.device}")
    print(f"CPU tensor shape: {cpu_tensor.shape}")

In [None]:
# TODO: If CUDA is available, create the same tensor on GPU
# If not available, set gpu_tensor to None or a message string
gpu_tensor = None

# Only a real tensor has a device/shape to show; anything else counts as missing
if not torch.cuda.is_available():
    print("No GPU available")
elif isinstance(gpu_tensor, torch.Tensor):
    print(f"GPU tensor device: {gpu_tensor.device}")
    print(f"GPU tensor shape: {gpu_tensor.shape}")
else:
    print("GPU tensor device: Not created")
    print("GPU tensor shape: Not created")

In [None]:
# TODO: Measure the time to transfer a tensor from CPU to GPU (or vice versa)
# Store the transfer time in seconds
transfer_time = None

# Hint: Use time.time() to measure the transfer
# (time.perf_counter() offers finer resolution when the copy is very fast).
# On GPU, call torch.cuda.synchronize() before reading the clock, because
# CUDA work may still be running when the Python call returns.
# If no GPU, measure CPU to CPU copy time

# Compare against None rather than truthiness: a very fast copy can measure
# exactly 0.0 seconds and must still be reported as a measurement.
if transfer_time is not None:
    print(f"Transfer time: {transfer_time:.6f} seconds")
else:
    print("Not measured")

In [None]:
# Test Section 1: Device Management
# Resolve each (method name, description) pair registered for this section to
# the matching validator method, then run them against the current variables.
section_tests = [
    (getattr(validator, method_name), description)
    for method_name, description in EXERCISE1_SECTIONS["Section 1: Device Management"]
]
test_runner.test_section(
    "Section 1: Device Management", validator, section_tests, locals()
)

## Section 2: Model Training Performance

Now let's compare the training performance of a simple neural network on CPU vs GPU.

In [None]:
# TODO: Define a simple 3-layer MLP model
# Input: 784 features (like flattened MNIST)
# Hidden layers: 256 and 128 neurons
# Output: 10 classes
# Use ReLU activations

class SimpleModel(nn.Module):
    """Exercise skeleton for a 3-layer MLP (784 -> 256 -> 128 -> 10, ReLU).

    The three layers are placeholders until the TODOs are completed; as
    written, ``forward`` hands its input back unchanged.
    """

    def __init__(self):
        super().__init__()
        # TODO: Define layers fc1, fc2, fc3
        self.fc1 = None
        self.fc2 = None
        self.fc3 = None

    def forward(self, x):
        # TODO: Implement forward pass
        return x

# Test the model
# Build one instance and push a random batch of 32 samples with 784 features
# through it, then print the shape that comes out.
test_model = SimpleModel()
test_input = torch.randn(32, 784)
test_output = test_model(test_input)
print(f"Model output shape: {test_output.shape}")

In [None]:
# Create synthetic dataset
# 1000 random samples with 784 features each, and integer labels drawn from [0, 10)
X_train = torch.randn(1000, 784)
y_train = torch.randint(0, 10, (1000,))
# Pair inputs with labels and serve them in shuffled mini-batches of 32
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

def train_one_epoch(model, device, loader, optimizer, criterion):
    """Train model for one epoch.

    Args:
        model: Module to optimise; switched to training mode before the loop.
            It is not moved to ``device`` here.
        device: Device each batch is moved to before the forward pass.
        loader: Iterable yielding ``(data, target)`` batches.
        optimizer: Optimizer stepped once per batch.
        criterion: Callable ``criterion(output, target)`` returning the loss.

    Returns:
        None. The optimizer updates the model's parameters in place.
    """
    model.train()
    for data, target in loader:
        data, target = data.to(device), target.to(device)
        # Clear gradients left over from the previous batch before backprop
        optimizer.zero_grad()
        loss = criterion(model(data), target)
        loss.backward()
        optimizer.step()

In [None]:
# TODO: Train the model on CPU for 3 epochs and measure the time
cpu_model = SimpleModel()
cpu_optimizer = optim.Adam(cpu_model.parameters())
criterion = nn.CrossEntropyLoss()

# TODO: Measure training time on CPU
cpu_train_time = None

# Check against None explicitly so that a measured time of 0.0 is still reported
if cpu_train_time is not None:
    print(f"CPU training time: {cpu_train_time:.4f} seconds")
else:
    print("Not measured")

In [None]:
# TODO: If GPU is available, train the same model on GPU and measure the time
# If not available, set gpu_train_time to None or a message
gpu_train_time = None

if torch.cuda.is_available():
    # TODO: Create model on GPU and train
    pass

# gpu_train_time may hold a number of seconds, a message string, or None.
# Only None / an empty message count as "missing", so a measured 0 still prints.
if gpu_train_time is None or gpu_train_time == "":
    print("Not measured or no GPU")
else:
    print(f"GPU training time: {gpu_train_time}")

# Compare if both times are available; a zero GPU time is skipped because the
# ratio would divide by zero
if (
    isinstance(cpu_train_time, (float, int))
    and isinstance(gpu_train_time, (float, int))
    and gpu_train_time > 0
):
    speedup = cpu_train_time / gpu_train_time
    print(f"GPU speedup: {speedup:.2f}x")

In [None]:
# Test Section 2: Model Training Performance
# Resolve each (method name, description) pair registered for this section to
# the matching validator method, then run them against the current variables.
section_tests = [
    (getattr(validator, method_name), description)
    for method_name, description in EXERCISE1_SECTIONS["Section 2: Model Training Performance"]
]
test_runner.test_section(
    "Section 2: Model Training Performance", validator, section_tests, locals()
)

## Section 3: Batch Size Impact

Explore how different batch sizes affect training performance.

In [None]:
# TODO: Test different batch sizes and measure training time per batch
# Test batch sizes: 16, 32, 64, 128, 256
# Store results in a dictionary with batch size as key and time as value

batch_times = {}
batch_sizes = [16, 32, 64, 128, 256]

# TODO: For each batch size, create a DataLoader and measure time for 10 batches
# Store average time per batch in batch_times dictionary

# Visualize results
if batch_times:
    import matplotlib.pyplot as plt
    plt.figure(figsize=(10, 6))
    plt.plot(list(batch_times.keys()), list(batch_times.values()), 'bo-')
    plt.xlabel('Batch Size')
    plt.ylabel('Time per Batch (seconds)')
    plt.title('Batch Size vs Training Time')
    plt.grid(True)
    plt.show()

    # The loop variable must not be called `time`: at notebook top level that
    # would replace the imported `time` module and break later time.time() calls.
    for size, seconds in batch_times.items():
        print(f"Batch size {size}: {seconds:.6f} seconds/batch")

In [None]:
# Test Section 3: Batch Size Impact
# Resolve each (method name, description) pair registered for this section to
# the matching validator method, then run them against the current variables.
section_tests = [
    (getattr(validator, method_name), description)
    for method_name, description in EXERCISE1_SECTIONS["Section 3: Batch Size Impact"]
]
test_runner.test_section(
    "Section 3: Batch Size Impact", validator, section_tests, locals()
)

## Section 4: Memory Profiling

Monitor the process's memory usage before and after model creation; the same measurement can be repeated around a training loop.

In [None]:
# TODO: Measure memory usage before and after creating a large model
# Use psutil to get current process memory usage

# Handle on the running Python process, looked up by its own PID
process = psutil.Process(os.getpid())

# TODO: Get memory usage before model creation (in MB)
# NOTE(review): the hint divides rss by 1024 twice, i.e. it assumes rss is in bytes.
memory_before = None  # Hint: process.memory_info().rss / 1024 / 1024

# Create a larger model
class LargeModel(nn.Module):
    """Five-layer fully connected network: 1000 -> 2000 -> 2000 -> 1000 -> 500 -> 100.

    ReLU follows each of the first four linear layers; the last layer's
    output is returned without an activation.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(in_features=1000, out_features=2000)
        self.fc2 = nn.Linear(in_features=2000, out_features=2000)
        self.fc3 = nn.Linear(in_features=2000, out_features=1000)
        self.fc4 = nn.Linear(in_features=1000, out_features=500)
        self.fc5 = nn.Linear(in_features=500, out_features=100)

    def forward(self, x):
        h1 = F.relu(self.fc1(x))
        h2 = F.relu(self.fc2(h1))
        h3 = F.relu(self.fc3(h2))
        h4 = F.relu(self.fc4(h3))
        return self.fc5(h4)

large_model = LargeModel()

# TODO: Get memory usage after model creation (in MB)
memory_after = None

# Nothing is reported until both measurements have been filled in
if memory_before and memory_after:
    memory_increase = memory_after - memory_before
    print(
        f"Memory before: {memory_before:.2f} MB",
        f"Memory after: {memory_after:.2f} MB",
        f"Memory increase: {memory_increase:.2f} MB",
        sep="\n",
    )

    # Calculate model size: element count, and bytes of parameter storage in MB
    param_count = sum(param.numel() for param in large_model.parameters())
    param_size = sum(
        param.numel() * param.element_size() for param in large_model.parameters()
    ) / 1024 / 1024
    print(
        f"Model parameters: {param_count:,}",
        f"Model size (parameters only): {param_size:.2f} MB",
        sep="\n",
    )

In [None]:
# Test Section 4: Memory Profiling
# Resolve each (method name, description) pair registered for this section to
# the matching validator method, then run them against the current variables.
section_tests = [
    (getattr(validator, method_name), description)
    for method_name, description in EXERCISE1_SECTIONS["Section 4: Memory Profiling"]
]
test_runner.test_section(
    "Section 4: Memory Profiling", validator, section_tests, locals()
)

## Section 5: Operation-Level Profiling

Profile individual operations to identify bottlenecks.

In [None]:
# TODO: Compare the execution time of different operations
# Operations to test: matmul, conv2d, relu, softmax
# Store times in op_times dictionary

op_times = {}
num_iterations = 100

# Test data
test_tensor = torch.randn(100, 100)
test_image = torch.randn(1, 3, 32, 32)
conv_layer = nn.Conv2d(3, 16, 3)

# TODO: Measure time for matrix multiplication
# op_times['matmul'] = ...

# TODO: Measure time for 2D convolution
# op_times['conv2d'] = ...

# TODO: Measure time for ReLU activation
# op_times['relu'] = ...

# TODO: Measure time for softmax
# op_times['softmax'] = ...

if op_times:
    # Report the iteration count actually configured above
    print(f"Operation timing (average over {num_iterations} iterations):")
    # The loop variable must not be called `time`: at notebook top level that
    # would replace the imported `time` module and break later time.time() calls.
    for op, seconds in sorted(op_times.items(), key=lambda item: item[1]):
        print(f"  {op}: {seconds*1000:.4f} ms")

In [None]:
# TODO: Use PyTorch profiler to analyze a forward pass
model = SimpleModel()
inputs = torch.randn(32, 784)

# TODO: Profile the model forward pass and store the table output
profile_table = None

# Hint: Use torch.profiler.profile with activities=[ProfilerActivity.CPU]
# Call prof.key_averages().table() to get the profiler output

# Show the table only once it has been produced
if profile_table:
    print("PyTorch Profiler Results:", profile_table, sep="\n")

In [None]:
# Test Section 5: Operation-Level Profiling
# Resolve each (method name, description) pair registered for this section to
# the matching validator method, then run them against the current variables.
section_tests = [
    (getattr(validator, method_name), description)
    for method_name, description in EXERCISE1_SECTIONS["Section 5: Operation-Level Profiling"]
]
test_runner.test_section(
    "Section 5: Operation-Level Profiling", validator, section_tests, locals()
)

## Section 6: Optimization Techniques

Apply optimization techniques to improve performance.

In [None]:
# TODO: Compare baseline vs optimized implementations
# Store results in optimization_results dictionary with keys 'baseline' and 'optimized'

optimization_results = {}

# Baseline: Regular model forward pass
baseline_model = SimpleModel()
baseline_model.eval()  # Set to evaluation mode
test_batch = torch.randn(100, 784)

# TODO: Measure baseline inference time (100 iterations)
# optimization_results['baseline'] = ...

# Optimized: Use torch.no_grad() and potentially torch.jit.script
# TODO: Measure optimized inference time
# optimization_results['optimized'] = ...

# Hint: Use @torch.no_grad() decorator or with torch.no_grad(): context
# Optional: Try torch.jit.script(model) for additional optimization

if optimization_results:
    # A missing key falls back to 0, which also disables the comparison below
    baseline_time = optimization_results.get('baseline', 0)
    optimized_time = optimization_results.get('optimized', 0)

    print(
        f"Baseline inference time: {baseline_time:.4f} seconds",
        f"Optimized inference time: {optimized_time:.4f} seconds",
        sep="\n",
    )

    # Relative figures only make sense when both timings are positive
    if baseline_time > 0 and optimized_time > 0:
        speedup = baseline_time / optimized_time
        improvement = (baseline_time - optimized_time) / baseline_time * 100
        print(
            f"Performance improvement: {improvement:.1f}%",
            f"Speedup: {speedup:.2f}x",
            sep="\n",
        )

In [None]:
# Test Section 6: Optimization Techniques
# Resolve each (method name, description) pair registered for this section to
# the matching validator method, then run them against the current variables.
section_tests = [
    (getattr(validator, method_name), description)
    for method_name, description in EXERCISE1_SECTIONS["Section 6: Optimization Techniques"]
]
test_runner.test_section(
    "Section 6: Optimization Techniques", validator, section_tests, locals()
)

In [None]:
# Display final summary of all tests
# NOTE(review): presumably aggregates the results of the test_section calls made
# in the cells above — confirm in test_utils.
test_runner.final_summary()

## Summary

In this exercise, you've learned:
- How to manage tensors and models across CPU and GPU devices
- The performance differences between CPU and GPU training
- How batch size affects training performance
- How to monitor memory usage during model creation
- How to profile individual operations to find bottlenecks
- Optimization techniques to improve inference performance

These profiling skills are essential for:
- Optimizing model training and inference speed
- Managing memory constraints in production environments
- Identifying and resolving performance bottlenecks
- Making informed decisions about hardware requirements