# Vectorization Best Practices

**Module 07 | Notebook 02**

---

## Objective
By the end of this notebook, you will master:
- Advanced vectorization patterns
- Common pitfalls and how to avoid them
- When NOT to vectorize
- Combining operations efficiently
- Real-world optimization examples

In [None]:
import numpy as np
import time
np.set_printoptions(precision=3)

---
## 1. Avoid Temporary Arrays

In [None]:
# Benchmark: chained arithmetic that allocates new arrays vs. reusing one
# preallocated output buffer through the ufunc `out=` argument.
a = np.random.rand(10000000)
b = np.random.rand(10000000)
c = np.random.rand(10000000)

# Baseline: the expression is evaluated left to right, so conceptually there
# are two intermediate arrays plus a freshly allocated result on every pass.
# NOTE(review): NumPy may reuse an intermediate buffer in some builds, so the
# real allocation count can be lower -- trust the printed timings.
start = time.perf_counter()
for _ in range(10):
    result = a + b + c + 1  # temp1=a+b, temp2=temp1+c, result=temp2+1
bad_time = time.perf_counter() - start

# Better: allocate the output once, outside the timed region, and let every
# step write into it.
result = np.empty_like(a)
start = time.perf_counter()
for _ in range(10):
    np.add(a, b, out=result)
    np.add(result, c, out=result)
    np.add(result, 1, out=result)
good_time = time.perf_counter() - start

# Both timings cover 10 repetitions; the ratio is baseline / in-place.
print(f"With temporaries: {bad_time:.3f}s")
print(f"In-place: {good_time:.3f}s")
print(f"Speedup: {bad_time/good_time:.2f}x")

In [None]:
# numexpr: evaluates the whole expression from a string in one call.
# Depends on a, b, c and bad_time defined by the previous cell.
# NOTE(review): presumably numexpr resolves a, b, c by name from the calling
# scope -- confirm against the numexpr documentation.
try:
    import numexpr as ne
    
    start = time.perf_counter()
    for _ in range(10):
        result = ne.evaluate('a + b + c + 1')
    ne_time = time.perf_counter() - start
    
    print(f"numexpr: {ne_time:.3f}s")
    print(f"Speedup vs basic: {bad_time/ne_time:.2f}x")
except ImportError:
    # numexpr is optional; the notebook keeps running without it.
    print("numexpr not installed")

---
## 2. Use Specialized Functions

In [None]:
# Benchmark: three spellings of element-wise squaring on the same array.
arr = np.random.rand(10000000)

# Variant 1: power operator.
# NOTE(review): NumPy may special-case an exponent of 2, so this is not
# necessarily slower than the alternatives -- check the printed timings.
start = time.perf_counter()
for _ in range(10):
    result = arr ** 2
pow_time = time.perf_counter() - start

# Variant 2: dedicated square ufunc.
start = time.perf_counter()
for _ in range(10):
    result = np.square(arr)
square_time = time.perf_counter() - start

# Variant 3: plain self-multiplication.
start = time.perf_counter()
for _ in range(10):
    result = arr * arr
mult_time = time.perf_counter() - start

# Each figure is the total for 10 repetitions.
print(f"arr ** 2: {pow_time:.4f}s")
print(f"np.square: {square_time:.4f}s")
print(f"arr * arr: {mult_time:.4f}s")

In [None]:
# Benchmark: hand-written Euclidean distance vs. np.linalg.norm.
# NOTE(review): each 10000 x 10000 float64 operand is 800 MB and `a - b`
# allocates another array of that size -- shrink the shape on machines with
# limited RAM.
a = np.random.rand(10000, 10000)
b = np.random.rand(10000, 10000)

# Manual: difference, square, sum, sqrt -- each step is a separate pass.
start = time.perf_counter()
result = np.sqrt(np.sum((a - b) ** 2))
manual_time = time.perf_counter() - start

# Library routine; note that `a - b` is still materialized before the call.
start = time.perf_counter()
result = np.linalg.norm(a - b)
norm_time = time.perf_counter() - start

# Single-shot timings (no repetition), so expect run-to-run noise.
print(f"Manual: {manual_time:.4f}s")
print(f"np.linalg.norm: {norm_time:.4f}s")

In [None]:
# Cheat sheet of purpose-built routines to prefer over hand-rolled code.
# One print call; sep="\n" puts every entry on its own line.
print(
    "Specialized functions:",
    "- np.dot instead of (a * b).sum()",
    "- np.einsum for complex tensor ops",
    "- np.linalg for linear algebra",
    "- np.fft for Fourier transforms",
    "- scipy.special for special functions",
    sep="\n",
)

---
## 3. Boolean Operations Efficiently

In [None]:
# Benchmark: two ways to count the elements greater than 0.5.
arr = np.random.rand(10000000)

# Both variants first build the full boolean mask `arr > 0.5`; they differ
# only in how that mask is reduced to a count.

# Variant 1: sum the boolean mask.
start = time.perf_counter()
for _ in range(10):
    count = (arr > 0.5).sum()
bool_time = time.perf_counter() - start

# Variant 2: count the True entries with the dedicated routine.
start = time.perf_counter()
for _ in range(10):
    count = np.count_nonzero(arr > 0.5)
count_time = time.perf_counter() - start

# Each figure is the total for 10 repetitions.
print(f"sum of bool: {bool_time:.4f}s")
print(f"count_nonzero: {count_time:.4f}s")

In [None]:
# Benchmark: function form vs. method form of `any` on a comparison result.
arr = np.random.rand(10000000)

# np.any(arr > 0.5): Python evaluates the argument first, so the complete
# boolean mask is built before np.any runs -- the comparison is never skipped.
# NOTE(review): an earlier comment said this form "short-circuits"; the code
# does not support that, since both loops perform the identical comparison.
start = time.perf_counter()
for _ in range(1000):
    has_big = np.any(arr > 0.5)
any_time = time.perf_counter() - start

# Method form: same mask, same reduction, different spelling.
start = time.perf_counter()
for _ in range(1000):
    has_big = (arr > 0.5).any()
create_time = time.perf_counter() - start

# Each figure is the total for 1000 repetitions over 10M elements, so this
# cell runs far longer than its neighbours.
print(f"np.any(condition): {any_time:.4f}s")
print(f"(condition).any(): {create_time:.4f}s")

---
## 4. Reduce Memory Footprint

In [None]:
# Memory footprint of the same 10M values stored at three float precisions.
n = 10_000_000

# np.random.rand yields float64; the narrower copies are derived from it.
arr64 = np.random.rand(n)
arr32 = arr64.astype(np.float32)  # often precise enough for computation
arr16 = arr64.astype(np.float16)  # mainly useful for storage/transfer

# nbytes is reported in megabytes (1 MB = 1e6 bytes).
print(f"float64: {arr64.nbytes / 1e6:.1f} MB")
print(f"float32: {arr32.nbytes / 1e6:.1f} MB")
print(f"float16: {arr16.nbytes / 1e6:.1f} MB")

In [None]:
# Benchmark: summing the same values stored as float64 vs. float32.
n = 10000000
arr64 = np.random.rand(n)
arr32 = arr64.astype(np.float32)  # half the bytes per element

# Reduction over the 8-byte elements.
start = time.perf_counter()
for _ in range(100):
    _ = arr64.sum()
time64 = time.perf_counter() - start

# Same reduction over the 4-byte elements.
# NOTE(review): a float32 sum presumably carries less precision than the
# float64 one -- compare the values, not just the timings, before switching.
start = time.perf_counter()
for _ in range(100):
    _ = arr32.sum()
time32 = time.perf_counter() - start

# Totals for 100 repetitions each; the ratio is float64 / float32.
print(f"float64 sum: {time64:.4f}s")
print(f"float32 sum: {time32:.4f}s")
print(f"Speedup: {time64/time32:.2f}x")

In [None]:
# Signed integer widths and the value range each can hold.
# Pick the narrowest dtype whose range covers the data.
print("Integer type ranges:")
for dtype in (np.int8, np.int16, np.int32, np.int64):
    # np.iinfo exposes the machine limits of an integer dtype.
    info = np.iinfo(dtype)
    print(f"{dtype.__name__}: {info.min} to {info.max}")

---
## 5. Chunked Processing

In [None]:
# For very large data, process in chunks
def process_chunked(data, chunk_size, func):
    """Apply ``func`` to consecutive slices of ``data`` and join the results.

    Args:
        data: Array or sequence supporting ``len()`` and slicing.
        chunk_size: Maximum number of leading-axis items per slice. Must be
            a positive integer.
        func: Callable invoked once per slice. It may return an array or a
            scalar; scalars are promoted to length-1 arrays so per-chunk
            reductions (e.g. ``np.sum``) can be combined too.

    Returns:
        np.ndarray: The per-chunk results concatenated along axis 0. For
        empty ``data`` an empty array is returned and ``func`` is not called.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        # range() would reject 0 and silently yield nothing for negatives;
        # fail early with a message that names the real problem.
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )
    if len(data) == 0:
        # np.concatenate refuses an empty list, so short-circuit here.
        return np.asarray(data)[:0].copy()
    results = [
        np.atleast_1d(func(data[i:i + chunk_size]))
        for i in range(0, len(data), chunk_size)
    ]
    return np.concatenate(results)

# Example: z-score normalization of a very large array.
# 50M float64 values occupy about 400 MB.
large = np.random.rand(50000000)

# Whole-array normalization: `large - large.mean()` and the division each
# produce a full-size array, so peak memory is a multiple of the input size.
start = time.perf_counter()
normalized = (large - large.mean()) / large.std()
full_time = time.perf_counter() - start

# NOTE(review): the chunked path is not exercised here -- process_chunked is
# never called in this cell. A real chunked normalization would compute the
# global mean/std first and then apply them chunk by chunk.
# Drop the reference to the large result before the next cell runs.
del normalized
print(f"Full processing: {full_time:.3f}s")

---
## 6. Avoiding Common Pitfalls

In [None]:
# Pitfall 1: Appending in loop
# BAD: creates new array each time
def bad_append():
    """Build 0..999 by growing an array with ``np.append`` (anti-pattern).

    Kept deliberately inefficient as the baseline for the timing comparison.

    Returns:
        np.ndarray: 1-D float64 array holding 0.0 through 999.0.
    """
    grown = np.array([])
    for value in range(1000):
        # np.append never extends in place: it hands back a new array.
        grown = np.append(grown, value)
    return grown

# GOOD: preallocate
def good_preallocate():
    """Build 0..999 by filling a buffer that is allocated once up front.

    Returns:
        np.ndarray: 1-D float64 array holding 0.0 through 999.0.
    """
    slots = np.empty(1000)  # uninitialized; every slot is overwritten below
    for position in range(1000):
        slots[position] = position
    return slots

# BEST: list then convert
def best_list():
    """Build 0..999 in a Python list, then convert to an array once.

    Returns:
        np.ndarray: 1-D integer array holding 0 through 999 (the dtype is
        inferred from the Python ints collected in the list).
    """
    collected = []
    for value in range(1000):
        collected.append(value)
    # Single conversion at the end instead of one array per element.
    return np.asarray(collected)

# Time each construction strategy once and report milliseconds.
# The label is printed before the clock starts, so printing is not measured.
for _label, _builder in (
    ("Bad append: ", bad_append),
    ("Preallocate: ", good_preallocate),
    ("List convert: ", best_list),
):
    print(_label, end="")
    start = time.perf_counter()
    _builder()
    print(f"{(time.perf_counter()-start)*1000:.2f}ms")

In [None]:
# Pitfall 2: copying a slice that is only going to be read.
arr = np.random.rand(1000000)

# With .copy(): every pass allocates and fills a new ~900k-element array
# before summing it.
start = time.perf_counter()
for _ in range(100):
    subset = arr[100:900000].copy()  # Unnecessary copy
    _ = subset.sum()
copy_time = time.perf_counter() - start

# Without .copy(): a basic slice is a view onto arr's buffer, so only the
# sum touches the data.
start = time.perf_counter()
for _ in range(100):
    subset = arr[100:900000]  # View
    _ = subset.sum()
view_time = time.perf_counter() - start

# Totals for 100 repetitions each.
print(f"With copy: {copy_time:.4f}s")
print(f"With view: {view_time:.4f}s")

In [None]:
# Pitfall 3: reducing along an axis without considering its cost.
arr = np.random.rand(1000, 100)

# axis=0 collapses the 1000 rows, leaving one value per column.
# NOTE(review): assumes the default row-major layout, where axis=1 walks
# contiguous memory and axis=0 strides across rows -- confirm if arr is
# ever created differently.
start = time.perf_counter()
for _ in range(1000):
    _ = arr.sum(axis=0)  # Sum columns (100 results)
axis0_time = time.perf_counter() - start

# axis=1 collapses the 100 columns, leaving one value per row.
start = time.perf_counter()
for _ in range(1000):
    _ = arr.sum(axis=1)  # Sum rows (1000 results)
axis1_time = time.perf_counter() - start

# Totals for 1000 repetitions each.
print(f"Sum axis=0: {axis0_time:.4f}s")
print(f"Sum axis=1: {axis1_time:.4f}s")

---
## 7. When NOT to Vectorize

In [None]:
# Case 1: very small arrays -- per-call overhead dominates the actual work.
small = np.array([1, 2, 3])

# NumPy reduction: fixed dispatch cost on every call, only 3 elements to add.
start = time.perf_counter()
for _ in range(100000):
    _ = np.sum(small)
numpy_time = time.perf_counter() - start

# Built-in sum: iterates the array element by element in the interpreter.
start = time.perf_counter()
for _ in range(100000):
    _ = sum(small)
python_time = time.perf_counter() - start

# Totals for 100000 calls each.
print(f"np.sum on [1,2,3]: {numpy_time:.4f}s")
print(f"Python sum: {python_time:.4f}s")

In [None]:
# Case 2: early termination.
# When the match sits near the start, a search that can stop early does far
# less work than a whole-array expression.

arr = np.random.rand(10000000)
arr[100] = 999  # Target near beginning; rand() values are below 1

# Vectorized: `arr > 998` compares all 10M elements and builds the full mask
# before argmax ever looks for the first True.
start = time.perf_counter()
for _ in range(100):
    idx = np.argmax(arr > 998)
vec_time = time.perf_counter() - start

# Only the vectorized variant is timed; the loop alternative is described,
# not measured.
print(f"Vectorized: {vec_time:.5f}s")
print("Note: Loop could stop at element 100!")

In [None]:
# Case 3: Memory-bound operations on huge arrays
# Vectorization creates temporaries that don't fit in cache
# Consider numba/cython for truly memory-bound cases

---
## Key Points Summary

**Optimize by:**
- Reducing temporary arrays (`out=`, in-place ops)
- Using specialized functions (np.dot, np.linalg)
- Choosing appropriate dtypes
- Processing in chunks for huge data

**Avoid:**
- Appending in loops (preallocate!)
- Unnecessary copies
- Ignoring memory layout

**Don't vectorize when:**
- Arrays are tiny (overhead)
- Early termination helps
- Algorithm is inherently sequential

---
## Interview Tips

**Q1: How do you reduce memory in NumPy computations?**
> Use in-place operations, the `out` parameter, smaller dtypes, and chunked processing. Use `del` to drop references to intermediate results so their memory can be reclaimed once nothing else refers to them.

**Q2: Why is np.append in a loop bad?**
> It creates a new array each iteration, copying all previous data. O(n^2) complexity. Use list.append then np.array, or preallocate.

**Q3: When might a Python loop outperform NumPy?**
> - Tiny arrays (NumPy overhead dominates)
> - Early termination possible
> - Complex conditional logic per element

**Q4: How do you handle arrays too large for memory?**
> Use memory-mapped files (np.memmap), process in chunks, use libraries like Dask, or reduce precision (float32).

---
## Practice Exercises

### Exercise 1: Optimize this computation

In [None]:
# Exercise input: optimize the computation of ((a + b) * c) ** 2.
a = np.random.rand(1000000)
b = np.random.rand(1000000)
c = np.random.rand(1000000)

# Slow version: conceptually two intermediate arrays (a + b, then * c) plus
# a newly allocated result for the final power.
result = ((a + b) * c) ** 2


In [None]:
# Solution: run every step through one preallocated buffer.
a = np.random.rand(1000000)
b = np.random.rand(1000000)
c = np.random.rand(1000000)

# Each call reads the previous step's output from `result` and overwrites it
# in place, so the order of the three calls matters.
result = np.empty_like(a)
np.add(a, b, out=result)
np.multiply(result, c, out=result)
np.square(result, out=result)

# Verify against the straightforward expression (allclose tolerates
# floating-point rounding differences).
expected = ((a + b) * c) ** 2
print(f"Match: {np.allclose(result, expected)}")

### Exercise 2: Fix the growing array problem

In [None]:
# Fix this code that builds array in loop
def slow_build(n):
    """Return the squares of 0..n-1, built by growing an array (exercise input).

    Intentionally slow: this is the code the exercise asks you to fix.

    Args:
        n: Number of squares to generate.

    Returns:
        np.ndarray: 1-D float64 array ``[0, 1, 4, ..., (n-1)**2]``.
    """
    squares = np.array([])
    for k in range(n):
        # Each np.append call returns a new, one-element-longer array.
        squares = np.append(squares, k * k)
    return squares

# Time one call of the slow implementation and report milliseconds.
start = time.perf_counter()
slow_build(5000)
print(f"Slow: {(time.perf_counter()-start)*1000:.1f}ms")

In [None]:
# Solution 1: Preallocate
def fast_preallocate(n):
    """Return the squares of 0..n-1, written into a buffer allocated once.

    Args:
        n: Number of squares to generate.

    Returns:
        np.ndarray: 1-D float64 array ``[0, 1, 4, ..., (n-1)**2]``.
    """
    squares = np.empty(n)  # uninitialized; every slot is assigned below
    for k in range(n):
        squares[k] = k * k
    return squares

# Solution 2: Vectorize entirely
def fastest_vectorized(n):
    """Return the squares of 0..n-1 using one array expression, no Python loop.

    Args:
        n: Number of squares to generate.

    Returns:
        np.ndarray: 1-D array ``[0, 1, 4, ..., (n-1)**2]`` with the dtype
        that ``np.arange(n)`` produces.
    """
    indices = np.arange(n)
    return indices * indices

# Time one call of each fixed implementation and report milliseconds.
start = time.perf_counter()
fast_preallocate(5000)
print(f"Preallocate: {(time.perf_counter()-start)*1000:.1f}ms")

# Three decimals here: the vectorized call is expected to be much shorter.
start = time.perf_counter()
fastest_vectorized(5000)
print(f"Vectorized: {(time.perf_counter()-start)*1000:.3f}ms")

---
## Next Notebook
**03_profiling_and_benchmarking.ipynb** - Measure and identify performance bottlenecks.