# Profiling and Benchmarking

**Module 07 | Notebook 03**

---

## Objective
By the end of this notebook, you will master:
- Timing code execution accurately
- Using %timeit magic
- Profiling with cProfile and line_profiler
- Memory profiling
- Identifying bottlenecks

In [None]:
import numpy as np
import time
np.set_printoptions(precision=3)

---
## 1. Basic Timing with the `time` Module

In [None]:
# Simple timing
# A large array so that the summation takes a measurable amount of time.
arr = np.random.rand(10000000)

# Bracket the operation with two clock reads; the difference is the elapsed time.
start = time.time()
result = arr.sum()
end = time.time()

print(f"Elapsed: {end - start:.6f} seconds")

In [None]:
# Better: time.perf_counter (higher resolution)
# Reuses `arr` from the previous cell.
start = time.perf_counter()
result = arr.sum()
elapsed = time.perf_counter() - start

# Printed with nine decimal places, i.e. down to nanoseconds.
print(f"Elapsed: {elapsed:.9f} seconds")

In [None]:
# Run multiple times for stable measurement
def benchmark(func, *args, n_runs=10):
    """Time repeated calls of ``func(*args)`` and summarise the durations.

    Parameters
    ----------
    func : callable
        Function to time.
    *args
        Positional arguments forwarded to ``func`` on every call.
    n_runs : int, keyword-only
        Number of timed calls. Must be at least 1.

    Returns
    -------
    dict
        Keys ``'mean'``, ``'std'``, ``'min'`` and ``'max'``; each value is a
        duration in seconds measured with ``time.perf_counter``.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1 (there would be nothing to summarise).
    """
    # Fail early with a clear message instead of letting min()/max() blow up
    # on an empty array after emitting NaN warnings.
    if n_runs < 1:
        raise ValueError(f"n_runs must be at least 1, got {n_runs}")

    times = np.empty(n_runs)
    for run in range(n_runs):
        start = time.perf_counter()
        func(*args)
        times[run] = time.perf_counter() - start

    return {
        'mean': times.mean(),
        'std': times.std(),
        'min': times.min(),
        'max': times.max()
    }

# Time np.sum on `arr` with the default number of runs; durations are
# converted from seconds to milliseconds for display.
stats = benchmark(np.sum, arr)
print(f"Mean: {stats['mean']*1000:.3f}ms +/- {stats['std']*1000:.3f}ms")

---
## 2. IPython %timeit Magic

In [None]:
# %timeit: runs code many times; recent IPython versions report the
# mean +/- std. dev. across runs (older versions reported the best run)
arr = np.random.rand(1000000)

%timeit arr.sum()

In [None]:
# Compare operations
%timeit arr.sum()     # ndarray method
%timeit np.sum(arr)   # module-level NumPy function

In [None]:
# Control number of runs
# -n sets the number of loops per run, -r the number of runs
%timeit -n 100 -r 5 arr ** 2  # 100 loops, 5 runs

In [None]:
# %%timeit for multi-line code
%%timeit
a = np.random.rand(1000)
b = np.random.rand(1000)
c = a + b

In [None]:
# Get result as object
# -o makes %timeit hand back its result object in addition to printing
result = %timeit -o arr.sum()
print(f"Best: {result.best*1000:.4f}ms")
print(f"Worst: {result.worst*1000:.4f}ms")
# Note: despite the "All times" label, only the first three entries of
# result.timings are shown ([:3]).
print(f"All times: {[f'{t*1000:.4f}ms' for t in result.timings[:3]]}")

---
## 3. cProfile for Function Profiling

In [None]:
import cProfile
import pstats
from io import StringIO

def complex_operation():
    """Run three unrelated NumPy workloads so a profiler has something to show.

    A random 1000x1000 matrix is drawn, multiplied by its own transpose, the
    eigenvalues of the top-left 10x10 corner of that product are computed,
    and the FFT of the matrix's first row is taken.

    Returns
    -------
    numpy.ndarray
        The FFT of the first row. The matrix product and the eigenvalues are
        computed only for their cost and then discarded.
    """
    matrix = np.random.rand(1000, 1000)

    # Workload 1: matrix product with the transpose.
    gram = np.dot(matrix, matrix.T)
    # Workload 2: eigenvalues of a small corner of the product (result unused).
    np.linalg.eigvals(gram[:10, :10])
    # Workload 3: FFT of the first row; this is what the caller receives.
    spectrum = np.fft.fft(matrix[0])

    return spectrum

# Profile
profiler = cProfile.Profile()
profiler.enable()
try:
    complex_operation()
finally:
    # Always stop collecting, even if the profiled call raises; otherwise the
    # profiler would stay enabled for whatever runs next.
    profiler.disable()

# Print stats, ordered by cumulative time
stats = pstats.Stats(profiler)
stats.sort_stats('cumulative')
stats.print_stats(10)  # Top 10 functions

In [None]:
# IPython magic: %prun
# Runs the statement under the profiler and shows function-level statistics
%prun complex_operation()

---
## 4. Line Profiler (if installed)

In [None]:
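# Line profiler shows time per line
# Install: pip install line_profiler
# Usage in IPython: %load_ext line_profiler, then e.g.
#   %lprun -f analyze_data analyze_data(1000)
# The manual timing further below approximates the same view without the package.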

def analyze_data(n):
    """Return the eigenvalues of the row-correlation matrix of random data.

    Draws an ``n`` x ``n`` random array, standardises every row (subtract the
    row mean, divide by the row standard deviation), builds the correlation
    matrix with ``np.corrcoef`` and returns its eigenvalues as computed by
    ``np.linalg.eigvalsh``.
    """
    samples = np.random.rand(n, n)

    # Standardise each row; keepdims keeps the statistics broadcastable.
    centered = samples - samples.mean(axis=1, keepdims=True)
    standardized = centered / samples.std(axis=1, keepdims=True)

    # Correlation matrix of the rows, then its eigenvalues.
    return np.linalg.eigvalsh(np.corrcoef(standardized))

# Manual line timing
import time

n = 1000

# Every step is bracketed by two perf_counter() reads; t1..t6 hold the
# elapsed seconds of the corresponding step.
start = time.perf_counter()
data = np.random.rand(n, n)
t1 = time.perf_counter() - start

start = time.perf_counter()
row_means = data.mean(axis=1, keepdims=True)
t2 = time.perf_counter() - start

start = time.perf_counter()
row_stds = data.std(axis=1, keepdims=True)
t3 = time.perf_counter() - start

start = time.perf_counter()
normalized = (data - row_means) / row_stds
t4 = time.perf_counter() - start

start = time.perf_counter()
corr = np.corrcoef(normalized)
t5 = time.perf_counter() - start

start = time.perf_counter()
eigenvalues = np.linalg.eigvalsh(corr)
t6 = time.perf_counter() - start

# Report each step in milliseconds.
for label, seconds in (
    ("Create data", t1),
    ("Row means", t2),
    ("Row stds", t3),
    ("Normalize", t4),
    ("Correlation", t5),
    ("Eigenvalues", t6),
):
    print(f"{label}: {seconds*1000:.2f}ms")

---
## 5. Memory Profiling

In [None]:
# Check array memory usage
arr = np.random.rand(1000, 1000)

print(f"Array shape: {arr.shape}")
print(f"Dtype: {arr.dtype}")
print(f"Size: {arr.size} elements")
# nbytes is a byte count; dividing by 1e6 reports decimal megabytes.
print(f"Memory: {arr.nbytes / 1e6:.2f} MB")

In [None]:
# Estimate memory for operations
def estimate_memory(shape, dtype, n_copies=1):
    """Estimate the memory needed for ``n_copies`` arrays of one shape/dtype.

    Parameters
    ----------
    shape : int or sequence of int
        Array shape; the element count is the product of its entries.
    dtype : dtype-like
        Anything accepted by ``np.dtype`` (e.g. ``'float64'``, ``np.float32``).
    n_copies : int, optional
        How many such arrays are held at the same time. Defaults to 1.

    Returns
    -------
    float
        Size in megabytes, with 1 MB taken as 1e6 bytes.
    """
    itemsize = np.dtype(dtype).itemsize
    # Accumulate the element count in int64 and convert to a Python int, so
    # neither the product nor the byte count below can wrap around where
    # NumPy's default integer is only 32 bits wide.
    n_elements = int(np.prod(shape, dtype=np.int64))
    total_bytes = n_elements * itemsize * n_copies
    return total_bytes / 1e6  # MB

# Matrix multiply: for two n x n inputs, A @ B allocates a result of the same
# size as each input, so three n x n arrays are held at once.
n = 5000
print(f"Two {n}x{n} float64 matrices: {estimate_memory((n,n), 'float64', 2):.0f} MB")
print(f"Their product: {estimate_memory((n,n), 'float64', 1):.0f} MB")
print(f"Total: {estimate_memory((n,n), 'float64', 3):.0f} MB")

In [None]:
# Track memory with tracemalloc
import tracemalloc

# Begin recording allocations made from this point on.
tracemalloc.start()

# Memory-intensive operation
arr = np.random.rand(2000, 2000)
result = arr @ arr.T

# Read the (current, peak) traced sizes in bytes before tracing is stopped.
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()

# Byte counts are converted to decimal megabytes for display.
print(f"Current memory: {current / 1e6:.1f} MB")
print(f"Peak memory: {peak / 1e6:.1f} MB")

In [None]:
# Memory-profiler (if installed)
# pip install memory_profiler
# Use @profile decorator and run with: python -m memory_profiler script.py

---
## 6. Comparing Implementations

In [None]:
# Framework for comparing implementations
def compare_implementations(*funcs, args=(), n_runs=10):
    """Time several callables on the same arguments.

    Parameters
    ----------
    *funcs : callable
        Implementations to compare. Each is called as ``func(*args)``.
    args : tuple, keyword-only
        Positional arguments passed to every implementation.
    n_runs : int, keyword-only
        Number of timed calls per implementation. Must be at least 1.

    Returns
    -------
    dict
        Maps each implementation's label to ``{'mean': ..., 'std': ...}``
        (seconds). The label is ``func.__name__`` when available and
        ``repr(func)`` otherwise. Implementations sharing a label (for
        example two lambdas) overwrite each other; the last one wins.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1.
    """
    # With zero runs the mean/std of an empty list would silently be NaN.
    if n_runs < 1:
        raise ValueError(f"n_runs must be at least 1, got {n_runs}")

    results = {}

    for func in funcs:
        times = []
        for _ in range(n_runs):
            start = time.perf_counter()
            func(*args)
            times.append(time.perf_counter() - start)

        # functools.partial objects and callable instances have no __name__.
        label = getattr(func, '__name__', None) or repr(func)
        results[label] = {
            'mean': np.mean(times),
            'std': np.std(times)
        }

    return results

In [None]:
# Example: Compare normalization approaches
arr = np.random.rand(1000, 1000)

def normalize_loop(arr):
    """Standardise each row of ``arr`` with an explicit Python loop.

    Every row has its mean subtracted and is divided by its standard
    deviation. This is the slow, row-by-row baseline for the comparison.

    Parameters
    ----------
    arr : numpy.ndarray
        Input array; rows are taken along the first axis.

    Returns
    -------
    numpy.ndarray
        Array of the same shape. Floating-point and complex inputs keep
        their dtype; any other dtype (e.g. integers) yields ``float64`` so
        the normalized values are not truncated.
    """
    # np.empty_like(arr) alone would give an integer buffer for integer
    # input, silently rounding every normalized value towards zero.
    out_dtype = arr.dtype if np.issubdtype(arr.dtype, np.inexact) else np.float64
    result = np.empty_like(arr, dtype=out_dtype)
    for i, row in enumerate(arr):
        result[i] = (row - row.mean()) / row.std()
    return result

def normalize_vectorized(arr):
    """Standardise each row of ``arr`` using whole-array operations.

    The per-row mean and standard deviation are computed along axis 1 with
    ``keepdims=True`` so they broadcast against ``arr``.
    """
    centered = arr - arr.mean(axis=1, keepdims=True)
    return centered / arr.std(axis=1, keepdims=True)

def normalize_einsum(arr):
    """Standardise each row of ``arr`` using ``np.einsum`` for the reductions.

    Row sums (for the mean) and row sums of squared deviations (for the
    variance) are both computed with ``einsum``; the result is the centered
    data divided by the square root of the per-row variance.
    """
    # Alternative using einsum
    n = arr.shape[1]
    mean = np.einsum('ij->i', arr)[:, None] / n
    # Compute the centered data once and reuse it for both the variance and
    # the final division instead of rebuilding `arr - mean` three times.
    centered = arr - mean
    variance = np.einsum('ij,ij->i', centered, centered)[:, None] / n
    return centered / np.sqrt(variance)

# Time all three normalizers on the same array with the default number of runs.
results = compare_implementations(
    normalize_loop,
    normalize_vectorized,
    normalize_einsum,
    args=(arr,)
)

# Report mean +/- standard deviation in milliseconds.
# Note: the loop variable rebinds the notebook-level name `stats`.
for name, stats in results.items():
    print(f"{name}: {stats['mean']*1000:.2f}ms +/- {stats['std']*1000:.2f}ms")

---
## 7. Identifying Bottlenecks

In [None]:
# Pattern: Incremental profiling
import time

class Timer:
    """Context manager that prints how long its ``with`` block took.

    Attributes
    ----------
    name : str
        Label printed in front of the measurement.
    start : float
        ``time.perf_counter()`` reading taken on entry.
    elapsed : float
        Duration of the block in seconds; available once the block has exited.
    """

    def __init__(self, name=""):
        self.name = name

    def __enter__(self):
        # Return the timer itself so `with Timer(...) as t` exposes t.elapsed.
        self.start = time.perf_counter()
        return self

    def __exit__(self, *args):
        # Nothing is returned, so an exception raised inside the block
        # still propagates after the timing line has been printed.
        finished = time.perf_counter()
        self.elapsed = finished - self.start
        millis = self.elapsed * 1000
        print(f"{self.name}: {millis:.2f}ms")

# Usage
# Each block prints "<name>: <elapsed>ms" when it exits.
with Timer("Data creation"):
    data = np.random.rand(2000, 2000)

with Timer("Matrix multiply"):
    result = data @ data.T

# Only the top-left 100x100 corner of the product is decomposed.
# NOTE(review): np.linalg.svd returns the third factor already transposed
# (usually named `vh`), so `v` here holds V^H rather than V.
with Timer("SVD"):
    u, s, v = np.linalg.svd(result[:100, :100])

In [None]:
# Cumulative timer
class CumulativeTimer:
    """Accumulate elapsed time per label across many start()/stop() pairs.

    All state lives on the class, so there is one shared timer: only a
    single label can be measured at a time, and calling ``start()`` again
    before ``stop()`` replaces the pending measurement.
    """
    # label -> {'total': accumulated seconds, 'count': completed measurements}
    times = {}
    # Label and perf_counter reading of the measurement in progress
    # (both None while idle).
    _current_name = None
    _current_start = None

    @classmethod
    def start(cls, name):
        """Begin timing ``name``, registering the label on first use."""
        if name not in cls.times:
            cls.times[name] = {'total': 0, 'count': 0}
        cls._current_start = time.perf_counter()
        cls._current_name = name

    @classmethod
    def stop(cls):
        """Finish the measurement begun by the most recent ``start()``.

        Raises
        ------
        RuntimeError
            If no measurement is in progress (``stop()`` without ``start()``,
            or ``stop()`` called twice in a row).
        """
        now = time.perf_counter()
        if cls._current_name is None:
            raise RuntimeError(
                "CumulativeTimer.stop() called without a matching start()"
            )
        # setdefault: `times` may have been replaced (reset) between start()
        # and stop(), in which case the label has to be registered again.
        entry = cls.times.setdefault(cls._current_name, {'total': 0, 'count': 0})
        entry['total'] += now - cls._current_start
        entry['count'] += 1
        # Mark the timer idle so a stray second stop() cannot double count.
        cls._current_name = None
        cls._current_start = None

    @classmethod
    def report(cls):
        """Print total time, average time and call count for every label."""
        for name, data in cls.times.items():
            count = data['count']
            # A label that was started but never stopped has count 0;
            # report a 0.00ms average instead of dividing by zero.
            avg = data['total'] / count * 1000 if count else 0.0
            print(f"{name}: {data['total']*1000:.2f}ms total, {avg:.2f}ms avg ({data['count']} calls)")

# Simulate workload
CumulativeTimer.times = {}  # Reset

# Ten iterations, each timing the data generation and the SVD under
# separate labels so report() can show totals and per-call averages.
for i in range(10):
    CumulativeTimer.start('random')
    data = np.random.rand(500, 500)
    CumulativeTimer.stop()
    
    CumulativeTimer.start('compute')
    result = np.linalg.svd(data)
    CumulativeTimer.stop()

CumulativeTimer.report()

---
## 8. Best Practices Checklist

In [None]:
# Performance checklist
# Kept as one multi-line string so it prints exactly as laid out below.
checklist = """
BEFORE OPTIMIZING:
[ ] Is the code correct?
[ ] Is optimization necessary? (80/20 rule)
[ ] Have you profiled to find the bottleneck?

COMMON OPTIMIZATIONS:
[ ] Using vectorized operations (no Python loops)
[ ] Avoiding temporary arrays (in-place, out=)
[ ] Using specialized functions (np.dot, np.linalg)
[ ] Choosing appropriate dtype
[ ] Ensuring contiguous memory layout
[ ] Processing in cache-friendly order

AFTER OPTIMIZING:
[ ] Is the code still correct? (test!)
[ ] Is the speedup significant?
[ ] Is the code still readable?
"""
print(checklist)

---
## Key Points Summary

**Timing:**
- Use `time.perf_counter()` for high resolution
- Run multiple times for stable measurements
- Use `%timeit` in Jupyter for convenience

**Profiling:**
- `cProfile` / `%prun`: function-level timing
- `line_profiler`: line-by-line timing
- `tracemalloc`: memory tracking

**Workflow:**
1. Write correct code first
2. Profile to find bottleneck
3. Optimize the bottleneck
4. Verify correctness
5. Measure improvement

---
## Interview Tips

**Q1: How do you profile NumPy code?**
> Use `%timeit` for quick timing, `cProfile` for function-level analysis, `line_profiler` for line-by-line, and `tracemalloc` for memory. Always run multiple times for stable results.

**Q2: What's the optimization workflow?**
> 1. Write working code first
> 2. Profile to find actual bottleneck (not assumed)
> 3. Optimize the hot path
> 4. Verify correctness with tests
> 5. Measure actual speedup

**Q3: Why profile before optimizing?**
> Most of the runtime is usually spent in a small portion of the code. Optimizing the wrong part wastes effort; profiling identifies the actual bottlenecks.

**Q4: How do you measure memory usage?**
> Check `arr.nbytes` for array size, use `tracemalloc` for runtime tracking, or `memory_profiler` for line-by-line memory usage.

---
## Practice Exercises

### Exercise 1: Profile and optimize

In [None]:
# Profile this function and identify the bottleneck
def slow_function(n):
    """Fill an ``n`` x ``n`` grid with ``sqrt(i**2 + j**2)``, element by element.

    This is the exercise's slow baseline: every entry is computed by its own
    ``np.sqrt`` call inside a double Python loop.
    """
    grid = np.zeros((n, n))
    for row in range(n):
        for col in range(n):
            grid[row, col] = np.sqrt(row**2 + col**2)
    return grid

# Time it
# Single run on a 200x200 grid; the elapsed time is printed in milliseconds.
start = time.perf_counter()
slow_function(200)
print(f"Slow version: {(time.perf_counter()-start)*1000:.1f}ms")

In [None]:
# Solution: Vectorize
def fast_function(n):
    """Vectorised computation of the ``n`` x ``n`` grid ``sqrt(i**2 + j**2)``.

    The squared indices are computed once, then broadcast as a column against
    a row so the whole grid is produced by a single ``np.sqrt`` call.
    """
    squares = np.arange(n) ** 2
    return np.sqrt(squares[:, np.newaxis] + squares[np.newaxis, :])

# Same 200x200 workload as the slow version, timed with a single run.
start = time.perf_counter()
fast_function(200)
print(f"Fast version: {(time.perf_counter()-start)*1000:.3f}ms")

# Verify
# Both implementations are compared on a smaller 50x50 grid.
print(f"Results match: {np.allclose(slow_function(50), fast_function(50))}")

### Exercise 2: Compare memory usage

In [None]:
# Compare memory of float64 vs float32 for a large array
# NOTE: with n = 10000 this cell really allocates both arrays
# (10000 * 10000 * 8 bytes = 0.8 GB plus a 0.4 GB float32 copy).
n = 10000

arr64 = np.random.rand(n, n)
arr32 = arr64.astype(np.float32)

# Sizes are reported in decimal gigabytes (bytes / 1e9).
print(f"float64: {arr64.nbytes / 1e9:.2f} GB")
print(f"float32: {arr32.nbytes / 1e9:.2f} GB")
print(f"Savings: {(arr64.nbytes - arr32.nbytes) / 1e9:.2f} GB")

---
## Module 07 Complete!

You have mastered Performance Optimization:
- Memory Layout and Views
- Vectorization Best Practices
- Profiling and Benchmarking

**Next Module:** 08_practice_problems - Put it all together with exercises and challenges!