# 🚀 Day 2: Element-wise Vector Operations

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sdodlapati3/cuda-lab/blob/main/learning-path/week-03/day-2-elementwise-ops.ipynb)

## Learning Objectives
- Implement arithmetic operations on vectors (add, sub, mul, div)
- Apply math functions (sqrt, exp, log, sin, cos)
- Build neural network activation functions
- Combine operations efficiently

> **Primary Focus:** CUDA C++ code examples first, Python/Numba backup for interactive testing

---

In [None]:
# ⚙️ Colab/Local Setup - Run this first!
import subprocess, sys

# google.colab only imports inside a Colab runtime, so an ImportError here is
# how a local run is detected.
try:
    import google.colab
    print("🔧 Running on Google Colab - Installing dependencies...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "numba"])
    print("✅ Setup complete!")
except ImportError:
    print("💻 Running locally - make sure you have: pip install numba numpy")

import numpy as np
from numba import cuda
import math
import time

# Report whether Numba can see a CUDA device before any kernel is launched.
print(f"\nCUDA available: {cuda.is_available()}")
if cuda.is_available():
    device = cuda.get_current_device()
    print(f"Device: {device.name}")

---

## Part 1: Basic Arithmetic Operations

### CUDA C++ Implementation (Primary)

```cpp
// elementwise_ops.cu - Basic vector operations
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Vector Addition: out[i] = a[i] + b[i] for every i in [0, n)
__global__ void vectorAdd(const float* a, const float* b, float* out, int n) {
    // Grid-stride traversal: start at this thread's global index and hop by
    // the total number of launched threads until the end of the array.
    const int step = blockDim.x * gridDim.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    while (idx < n) {
        out[idx] = a[idx] + b[idx];
        idx += step;
    }
}

// Vector Subtraction: out[i] = a[i] - b[i] for every i in [0, n)
__global__ void vectorSub(const float* a, const float* b, float* out, int n) {
    // Grid-stride traversal so any launch size covers all n elements.
    const int step = blockDim.x * gridDim.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    while (idx < n) {
        out[idx] = a[idx] - b[idx];
        idx += step;
    }
}

// Element-wise Multiplication (Hadamard product): out[i] = a[i] * b[i]
__global__ void vectorMul(const float* a, const float* b, float* out, int n) {
    // Grid-stride traversal so any launch size covers all n elements.
    const int step = blockDim.x * gridDim.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    while (idx < n) {
        out[idx] = a[idx] * b[idx];
        idx += step;
    }
}

// Element-wise Division: out[i] = a[i] / b[i]
// No check is made for b[i] == 0; the caller is responsible for the divisor.
__global__ void vectorDiv(const float* a, const float* b, float* out, int n) {
    // Grid-stride traversal so any launch size covers all n elements.
    const int step = blockDim.x * gridDim.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    while (idx < n) {
        out[idx] = a[idx] / b[idx];
        idx += step;
    }
}

// Scalar operations: out[i] = a[i] * scalar
__global__ void scalarMul(const float* a, float scalar, float* out, int n) {
    // Grid-stride traversal so any launch size covers all n elements.
    const int step = blockDim.x * gridDim.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    while (idx < n) {
        out[idx] = a[idx] * scalar;
        idx += step;
    }
}

int main() {
    int n = 1000000;
    size_t size = n * sizeof(float);
    
    // Allocate and initialize host arrays
    float *h_a = (float*)malloc(size);
    float *h_b = (float*)malloc(size);
    float *h_out = (float*)malloc(size);
    
    for (int i = 0; i < n; i++) {
        h_a[i] = 1.0f;
        h_b[i] = 2.0f;
    }
    
    // Allocate device arrays
    float *d_a, *d_b, *d_out;
    cudaMalloc(&d_a, size);
    cudaMalloc(&d_b, size);
    cudaMalloc(&d_out, size);
    
    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);
    
    // Launch
    int threads = 256;
    int blocks = 256;
    
    vectorAdd<<<blocks, threads>>>(d_a, d_b, d_out, n);
    cudaDeviceSynchronize();
    
    cudaMemcpy(h_out, d_out, size, cudaMemcpyDeviceToHost);
    printf("Add: %f + %f = %f\n", h_a[0], h_b[0], h_out[0]);
    
    // Cleanup
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_out);
    free(h_a); free(h_b); free(h_out);
    return 0;
}
```

### Python/Numba (Optional - Interactive Testing)

In [None]:
# Python equivalents for interactive testing
@cuda.jit
def vector_add(a, b, out, n):
    """Store a[i] + b[i] in out[i] for every index below n."""
    # Grid-stride traversal: begin at this thread's global index and advance
    # by the total number of launched threads.
    idx = cuda.grid(1)
    step = cuda.gridsize(1)

    while idx < n:
        out[idx] = a[idx] + b[idx]
        idx += step

@cuda.jit
def vector_sub(a, b, out, n):
    """Store a[i] - b[i] in out[i] for every index below n."""
    # Grid-stride traversal so any launch size covers all n elements.
    idx = cuda.grid(1)
    step = cuda.gridsize(1)

    while idx < n:
        out[idx] = a[idx] - b[idx]
        idx += step

@cuda.jit
def vector_mul(a, b, out, n):
    """Store a[i] * b[i] in out[i] for every index below n."""
    # Grid-stride traversal so any launch size covers all n elements.
    idx = cuda.grid(1)
    step = cuda.gridsize(1)

    while idx < n:
        out[idx] = a[idx] * b[idx]
        idx += step

@cuda.jit
def vector_div(a, b, out, n):
    """Store a[i] / b[i] in out[i] for every index below n.

    The divisor is not checked; callers supply a non-zero b.
    """
    # Grid-stride traversal so any launch size covers all n elements.
    idx = cuda.grid(1)
    step = cuda.gridsize(1)

    while idx < n:
        out[idx] = a[idx] / b[idx]
        idx += step

In [None]:
# Test basic operations
n = 1_000_000
a = np.random.rand(n).astype(np.float32)
b = np.random.rand(n).astype(np.float32) + 0.1  # Avoid div by zero
out = np.zeros(n, dtype=np.float32)

# Device copies are reused by the later test cells in this notebook.
d_a = cuda.to_device(a)
d_b = cuda.to_device(b)
d_out = cuda.to_device(out)

blocks, threads = 256, 256

# Each entry: (label, GPU kernel, NumPy reference for the same operation)
ops = [
    ('Add', vector_add, np.add),
    ('Sub', vector_sub, np.subtract),
    ('Mul', vector_mul, np.multiply),
    ('Div', vector_div, np.divide),
]

print(f"Testing with {n:,} elements\n")
for name, kernel, np_op in ops:
    kernel[blocks, threads](d_a, d_b, d_out, n)
    result = d_out.copy_to_host()
    expected = np_op(a, b)
    match = np.allclose(result, expected)
    print(f"{name}: {'✓' if match else '✗'}")

---

## Part 2: Scalar Operations

### CUDA C++ Implementation (Primary)

```cpp
// Scalar Add: out[i] = a[i] + scalar
__global__ void scalarAdd(const float* a, float scalar, float* out, int n) {
    // Grid-stride traversal so any launch size covers all n elements.
    const int step = blockDim.x * gridDim.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    while (idx < n) {
        out[idx] = a[idx] + scalar;
        idx += step;
    }
}

// Scalar Multiply: out[i] = a[i] * scalar
__global__ void scalarMul(const float* a, float scalar, float* out, int n) {
    // Grid-stride traversal so any launch size covers all n elements.
    const int step = blockDim.x * gridDim.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    while (idx < n) {
        out[idx] = a[idx] * scalar;
        idx += step;
    }
}

// Scalar Power: out[i] = a[i] raised to `power` (computed with powf)
__global__ void scalarPow(const float* a, float power, float* out, int n) {
    // Grid-stride traversal so any launch size covers all n elements.
    const int step = blockDim.x * gridDim.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    while (idx < n) {
        out[idx] = powf(a[idx], power);
        idx += step;
    }
}
```

### Python/Numba (Optional)

In [None]:
@cuda.jit
def scalar_add(a, scalar, out, n):
    """Store a[i] + scalar in out[i] for every index below n."""
    # Grid-stride traversal so any launch size covers all n elements.
    idx = cuda.grid(1)
    step = cuda.gridsize(1)

    while idx < n:
        out[idx] = a[idx] + scalar
        idx += step

@cuda.jit
def scalar_mul(a, scalar, out, n):
    """Store a[i] * scalar in out[i] for every index below n."""
    # Grid-stride traversal so any launch size covers all n elements.
    idx = cuda.grid(1)
    step = cuda.gridsize(1)

    while idx < n:
        out[idx] = a[idx] * scalar
        idx += step

@cuda.jit
def scalar_pow(a, power, out, n):
    """Store a[i] ** power in out[i] for every index below n."""
    # Grid-stride traversal so any launch size covers all n elements.
    idx = cuda.grid(1)
    step = cuda.gridsize(1)

    while idx < n:
        out[idx] = a[idx] ** power
        idx += step

In [None]:
# Test scalar operations
# Reuses a, d_a, d_out, n, blocks and threads from the basic-operations test cell.
scalar_mul[blocks, threads](d_a, 2.5, d_out, n)
result = d_out.copy_to_host()
expected = a * 2.5
print(f"Scalar multiply by 2.5: {'✓' if np.allclose(result, expected) else '✗'}")

scalar_pow[blocks, threads](d_a, 2.0, d_out, n)
result = d_out.copy_to_host()
expected = a ** 2.0
print(f"Scalar power of 2: {'✓' if np.allclose(result, expected) else '✗'}")

---

## Part 3: Math Functions

### CUDA C++ Math Functions

| CUDA C++ | Python/Numba | Description |
|----------|--------------|-------------|
| `sqrtf(x)` | `math.sqrt(x)` | Square root |
| `sinf(x)` | `math.sin(x)` | Sine |
| `cosf(x)` | `math.cos(x)` | Cosine |
| `tanf(x)` | `math.tan(x)` | Tangent |
| `expf(x)` | `math.exp(x)` | Exponential e^x |
| `logf(x)` | `math.log(x)` | Natural log |
| `log10f(x)` | `math.log10(x)` | Log base 10 |
| `fabsf(x)` | `math.fabs(x)` | Absolute value |
| `floorf(x)` | `math.floor(x)` | Floor |
| `ceilf(x)` | `math.ceil(x)` | Ceiling |
| `powf(x,y)` | `x ** y` | Power |

### CUDA C++ Implementation (Primary)

```cpp
// math_functions.cu
#include <math.h>

// Element-wise square root: out[i] = sqrtf(a[i])
__global__ void vectorSqrt(const float* a, float* out, int n) {
    // Grid-stride traversal so any launch size covers all n elements.
    const int step = blockDim.x * gridDim.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    while (idx < n) {
        out[idx] = sqrtf(a[idx]);
        idx += step;
    }
}

// Element-wise exponential: out[i] = expf(a[i])
__global__ void vectorExp(const float* a, float* out, int n) {
    // Grid-stride traversal so any launch size covers all n elements.
    const int step = blockDim.x * gridDim.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    while (idx < n) {
        out[idx] = expf(a[idx]);
        idx += step;
    }
}

// Element-wise natural logarithm: out[i] = logf(a[i])
__global__ void vectorLog(const float* a, float* out, int n) {
    // Grid-stride traversal so any launch size covers all n elements.
    const int step = blockDim.x * gridDim.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    while (idx < n) {
        out[idx] = logf(a[idx]);
        idx += step;
    }
}

// Element-wise sine: out[i] = sinf(a[i])
__global__ void vectorSin(const float* a, float* out, int n) {
    // Grid-stride traversal so any launch size covers all n elements.
    const int step = blockDim.x * gridDim.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    while (idx < n) {
        out[idx] = sinf(a[idx]);
        idx += step;
    }
}

// Element-wise cosine: out[i] = cosf(a[i])
__global__ void vectorCos(const float* a, float* out, int n) {
    // Grid-stride traversal so any launch size covers all n elements.
    const int step = blockDim.x * gridDim.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    while (idx < n) {
        out[idx] = cosf(a[idx]);
        idx += step;
    }
}
```

### Python/Numba (Optional)

In [None]:
@cuda.jit
def vector_sqrt(a, out, n):
    """Store sqrt(a[i]) in out[i] for every index below n."""
    # Grid-stride traversal so any launch size covers all n elements.
    idx = cuda.grid(1)
    step = cuda.gridsize(1)

    while idx < n:
        out[idx] = math.sqrt(a[idx])
        idx += step

@cuda.jit
def vector_exp(a, out, n):
    """Store exp(a[i]) in out[i] for every index below n."""
    # Grid-stride traversal so any launch size covers all n elements.
    idx = cuda.grid(1)
    step = cuda.gridsize(1)

    while idx < n:
        out[idx] = math.exp(a[idx])
        idx += step

@cuda.jit
def vector_log(a, out, n):
    """Store the natural log of a[i] in out[i] for every index below n."""
    # Grid-stride traversal so any launch size covers all n elements.
    idx = cuda.grid(1)
    step = cuda.gridsize(1)

    while idx < n:
        out[idx] = math.log(a[idx])
        idx += step

@cuda.jit
def vector_sin(a, out, n):
    """Store sin(a[i]) in out[i] for every index below n."""
    # Grid-stride traversal so any launch size covers all n elements.
    idx = cuda.grid(1)
    step = cuda.gridsize(1)

    while idx < n:
        out[idx] = math.sin(a[idx])
        idx += step

@cuda.jit
def vector_cos(a, out, n):
    """Store cos(a[i]) in out[i] for every index below n."""
    # Grid-stride traversal so any launch size covers all n elements.
    idx = cuda.grid(1)
    step = cuda.gridsize(1)

    while idx < n:
        out[idx] = math.cos(a[idx])
        idx += step

In [None]:
# Test math functions
a_pos = np.abs(a) + 0.01  # Positive values for sqrt/log
d_a_pos = cuda.to_device(a_pos)

# The exp kernel must see the same scaled-down input that the NumPy reference
# uses; otherwise the check compares exp(a) against exp(0.1 * a) and fails.
a_scaled = a * 0.1
d_a_scaled = cuda.to_device(a_scaled)

# Each entry: (label, GPU kernel, NumPy reference, device input, host input).
# The device and host inputs of a row must hold the same values.
math_ops = [
    ('sqrt', vector_sqrt, np.sqrt, d_a_pos, a_pos),
    ('exp', vector_exp, np.exp, d_a_scaled, a_scaled),
    ('log', vector_log, np.log, d_a_pos, a_pos),
    ('sin', vector_sin, np.sin, d_a, a),
    ('cos', vector_cos, np.cos, d_a, a),
]

print("Math function tests:")
for name, kernel, np_fn, d_input, h_input in math_ops:
    kernel[blocks, threads](d_input, d_out, n)
    result = d_out.copy_to_host()
    expected = np_fn(h_input)
    match = np.allclose(result, expected, rtol=1e-5)
    print(f"  {name}: {'✓' if match else '✗'}")

---

## Part 4: Compound Operations

### Combining Multiple Operations

In [None]:
@cuda.jit
def vector_normalize(a, out, n):
    """Apply the logistic sigmoid element-wise: out[i] = 1 / (1 + exp(-a[i])).

    Despite the name, no min/max rescaling is performed; each value is
    squashed independently into the range 0..1 by the sigmoid.
    """
    # Grid-stride traversal so any launch size covers all n elements.
    idx = cuda.grid(1)
    step = cuda.gridsize(1)

    while idx < n:
        decay = math.exp(-a[idx])
        out[idx] = 1.0 / (1.0 + decay)
        idx += step

@cuda.jit
def vector_relu(a, out, n):
    """ReLU activation: store max(0, a[i]) in out[i] for every index below n."""
    # Grid-stride traversal so any launch size covers all n elements.
    idx = cuda.grid(1)
    step = cuda.gridsize(1)

    while idx < n:
        out[idx] = max(0.0, a[idx])
        idx += step

@cuda.jit
def vector_leaky_relu(a, out, alpha, n):
    """Leaky ReLU: positive inputs pass through, the rest are scaled by alpha.

    Note the argument order: out comes before alpha.
    """
    # Grid-stride traversal so any launch size covers all n elements.
    idx = cuda.grid(1)
    step = cuda.gridsize(1)

    while idx < n:
        value = a[idx]
        if value > 0:
            out[idx] = value
        else:
            out[idx] = alpha * value
        idx += step

@cuda.jit
def vector_tanh(a, out, n):
    """Tanh activation: store tanh(a[i]) in out[i] for every index below n."""
    # Grid-stride traversal so any launch size covers all n elements.
    idx = cuda.grid(1)
    step = cuda.gridsize(1)

    while idx < n:
        out[idx] = math.tanh(a[idx])
        idx += step

In [None]:
# Test activation functions
a_centered = (a - 0.5) * 4  # Values around 0
d_a_centered = cuda.to_device(a_centered)

# Sigmoid (implemented by vector_normalize)
vector_normalize[blocks, threads](d_a_centered, d_out, n)
result = d_out.copy_to_host()
expected = 1 / (1 + np.exp(-a_centered))
print(f"Sigmoid: {'✓' if np.allclose(result, expected, rtol=1e-5) else '✗'}")

# ReLU
vector_relu[blocks, threads](d_a_centered, d_out, n)
result = d_out.copy_to_host()
expected = np.maximum(0, a_centered)
print(f"ReLU: {'✓' if np.allclose(result, expected) else '✗'}")

# Leaky ReLU (argument order is a, out, alpha, n)
leaky_alpha = 0.01
vector_leaky_relu[blocks, threads](d_a_centered, d_out, leaky_alpha, n)
result = d_out.copy_to_host()
expected = np.where(a_centered > 0, a_centered, leaky_alpha * a_centered)
print(f"Leaky ReLU: {'✓' if np.allclose(result, expected, rtol=1e-5) else '✗'}")

# Tanh
vector_tanh[blocks, threads](d_a_centered, d_out, n)
result = d_out.copy_to_host()
expected = np.tanh(a_centered)
print(f"Tanh: {'✓' if np.allclose(result, expected, rtol=1e-5) else '✗'}")

---

## Part 5: Performance Analysis

### Memory Bandwidth vs Compute Bound

```
Memory Bandwidth Bound:
━━━━━━━━━━━━━━━━━━━━━━━
• Simple ops like add, mul
• Read 2 values, write 1 → mostly waiting on memory
• GPU advantage: Higher memory bandwidth

Compute Bound:
━━━━━━━━━━━━━━
• Complex ops like exp, sin, sqrt
• Many cycles per element
• GPU advantage: Massive parallelism
```

In [None]:
def benchmark_operation(kernel, np_op, a, b, name, iterations=100):
    """Time a CUDA kernel against its NumPy counterpart.

    Args:
        kernel: Numba CUDA kernel taking (a, b, out, n), or (a, out, n) when
            b is None.
        np_op: NumPy callable taking (a, b), or (a) when b is None.
        a: Host input array; its length sets n.
        b: Second host input array, or None for a unary operation.
        name: Label for the operation (not used inside this function).
        iterations: Number of timed repetitions for each side.

    Returns:
        Tuple (gpu_ms, numpy_ms, speedup): mean milliseconds per iteration
        for each side and the ratio numpy_ms / gpu_ms.
    """
    n = len(a)
    is_binary = b is not None

    d_a = cuda.to_device(a)
    d_b = cuda.to_device(b) if is_binary else None
    d_out = cuda.to_device(np.zeros(n, dtype=np.float32))

    blocks, threads = 256, 256

    # Build the argument lists once; unary kernels and ops simply omit b.
    gpu_args = (d_a, d_b, d_out, n) if is_binary else (d_a, d_out, n)
    cpu_args = (a, b) if is_binary else (a,)

    # Warmup
    kernel[blocks, threads](*gpu_args)
    cuda.synchronize()

    # GPU benchmark: synchronize once after the loop so the timer includes
    # every queued launch.
    gpu_start = time.perf_counter()
    for _ in range(iterations):
        kernel[blocks, threads](*gpu_args)
    cuda.synchronize()
    gpu_time = (time.perf_counter() - gpu_start) / iterations * 1000

    # NumPy benchmark
    cpu_start = time.perf_counter()
    for _ in range(iterations):
        np_op(*cpu_args)
    numpy_time = (time.perf_counter() - cpu_start) / iterations * 1000

    return gpu_time, numpy_time, numpy_time / gpu_time

In [None]:
# Comprehensive benchmark
n = 10_000_000
a = np.random.rand(n).astype(np.float32)
b = np.random.rand(n).astype(np.float32) + 0.1

print(f"Benchmarking with N = {n:,} elements\n")
print(f"{'Operation':<15} | {'GPU (ms)':<10} | {'NumPy (ms)':<10} | {'Speedup':<10}")
print("-" * 55)

# Each entry: (label, GPU kernel, NumPy reference, second operand or None)
benchmarks = [
    ('Add', vector_add, lambda x, y: x + y, b),
    ('Mul', vector_mul, lambda x, y: x * y, b),
    ('Div', vector_div, lambda x, y: x / y, b),
    ('Sqrt', vector_sqrt, np.sqrt, None),
    ('Exp', vector_exp, np.exp, None),
    ('Log', vector_log, np.log, None),
    ('Sin', vector_sin, np.sin, None),
    ('Cos', vector_cos, np.cos, None),
]

for name, kernel, np_op, b_arr in benchmarks:
    # sqrt and log get a strictly positive input
    a_input = np.abs(a) + 0.01 if name in ['Sqrt', 'Log'] else a
    gpu_t, np_t, speedup = benchmark_operation(kernel, np_op, a_input, b_arr, name)
    # Keep the "x" suffix attached to the number; padding the number first
    # would print e.g. "12.3      x".
    print(f"{name:<15} | {gpu_t:<10.3f} | {np_t:<10.3f} | {speedup:.1f}x")

### Observations

```
Memory-bound ops (add, mul):
• Moderate speedup (5-10x)
• Limited by memory bandwidth
• GPU has higher bandwidth than CPU

Compute-bound ops (exp, sin, sqrt):
• Higher speedup (10-50x)
• GPU excels at parallel math
• More compute per memory access
```

---

## Part 6: In-Place Operations

In [None]:
@cuda.jit
def inplace_add(a, b, n):
    """Add b[i] to a[i] for every index below n, overwriting a."""
    # Grid-stride traversal so any launch size covers all n elements.
    idx = cuda.grid(1)
    step = cuda.gridsize(1)

    while idx < n:
        a[idx] += b[idx]
        idx += step

@cuda.jit
def inplace_scale(a, scalar, n):
    """Multiply a[i] by scalar for every index below n, overwriting a."""
    # Grid-stride traversal so any launch size covers all n elements.
    idx = cuda.grid(1)
    step = cuda.gridsize(1)

    while idx < n:
        a[idx] *= scalar
        idx += step

@cuda.jit
def inplace_clamp(a, min_val, max_val, n):
    """Limit a[i] to [min_val, max_val] for every index below n, overwriting a."""
    # Grid-stride traversal so any launch size covers all n elements.
    idx = cuda.grid(1)
    step = cuda.gridsize(1)

    while idx < n:
        # Apply the upper bound first, then the lower bound.
        capped = min(max_val, a[idx])
        a[idx] = max(min_val, capped)
        idx += step

In [None]:
# Test in-place operations
test_a = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32)
test_b = np.array([0.5] * 5, dtype=np.float32)

# Upload a copy of test_a so the host array still shows the original values.
d_test_a = cuda.to_device(test_a.copy())
d_test_b = cuda.to_device(test_b)

count = len(test_a)

print(f"Original a: {test_a}")

# Each kernel mutates d_test_a, so the three steps build on one another.
inplace_add[1, 32](d_test_a, d_test_b, count)
after_add = d_test_a.copy_to_host()
print(f"After a += b: {after_add}")

inplace_scale[1, 32](d_test_a, 2.0, count)
after_scale = d_test_a.copy_to_host()
print(f"After a *= 2: {after_scale}")

inplace_clamp[1, 32](d_test_a, 2.0, 8.0, count)
after_clamp = d_test_a.copy_to_host()
print(f"After clamp[2,8]: {after_clamp}")

---

## Exercises

### Exercise 1: Vector Absolute Value

In [None]:
# TODO: Implement vector absolute value
@cuda.jit
def vector_abs(a, out, n):
    """Exercise stub: should store |a[i]| in out[i] for every index below n.

    The body is currently `pass`, so out is left untouched until implemented.
    """
    # Hint: Use math.fabs(x)
    # Hint: reuse the tid/stride loop from the other kernels in this notebook.
    pass

# Test with [-3, -1, 0, 1, 3]
# Expected: [3, 1, 0, 1, 3]

### Exercise 2: Softplus Activation

In [None]:
# TODO: Implement softplus: log(1 + exp(x))
@cuda.jit
def vector_softplus(a, out, n):
    """Exercise stub: softplus activation, out[i] = log(1 + exp(a[i])).

    The body is currently `pass`, so out is left untouched until implemented.
    """
    # Hint: For numerical stability, with x = a[i]:
    # if x > 20: out[i] = x (avoid exp overflow)
    # else: out[i] = log(1 + exp(x))
    # A kernel cannot return a value, so the result is always stored in out[i].
    pass

### Exercise 3: Polynomial Evaluation

In [None]:
# TODO: Evaluate polynomial a*x^2 + b*x + c
@cuda.jit
def polynomial_eval(x, a_coef, b_coef, c_coef, out, n):
    """Exercise stub: should store a_coef*x[i]^2 + b_coef*x[i] + c_coef in out[i].

    The body is currently `pass`, so out is left untouched until implemented.
    """
    pass

# Test: x = [0, 1, 2, 3], a=1, b=2, c=1
# Expected (x^2 + 2x + 1): [1, 4, 9, 16]

### Exercise 4: Distance from Origin (2D vectors)

In [None]:
# TODO: Compute distance from origin for 2D points
@cuda.jit
def vector_distance_2d(x, y, dist, n):
    """Exercise stub: should store sqrt(x[i]^2 + y[i]^2) in dist[i].

    The body is currently `pass`, so dist is left untouched until implemented.
    """
    pass

# Test: x = [3, 0, 4], y = [4, 5, 3]
# Expected: [5, 5, 5] (two 3-4-5 triangles and one point on the y-axis)

---

## Summary

### Element-wise Operation Template

```python
@cuda.jit
def vector_operation(a, b, out, n):  # or (a, out, n) for unary
    # Template only: `operation` is a placeholder for the per-element expression.
    tid = cuda.grid(1)  # this thread's global index
    stride = cuda.gridsize(1)  # total number of threads in the launch
    
    # Each thread handles indices tid, tid + stride, ... below n.
    for i in range(tid, n, stride):
        out[i] = operation(a[i], b[i])  # or operation(a[i])
```

### Math Functions Available
- Basic: `+`, `-`, `*`, `/`, `**`, `%`
- Math: `sqrt`, `exp`, `log`, `log10`
- Trig: `sin`, `cos`, `tan`, `asin`, `acos`, `atan`
- Other: `fabs`, `floor`, `ceil`, `fmod`, `tanh`

### Performance Notes
- Simple ops: Memory bandwidth bound
- Complex ops: Compute bound, higher speedups
- Always use grid-stride for flexibility

---

## Next Steps

📋 **Day 3:** SAXPY and BLAS-like operations

We'll combine multiple operations and learn about the BLAS standard!