# Exercise 03: Vector Addition 🧮

Your first real computation on the GPU - adding two arrays!

## Learning Goals
- Pass data to and from the GPU (`cudaMalloc`, `cudaMemcpy`)
- Calculate global thread index
- Handle array sizes that are not a multiple of the block size (ceiling division + bounds check)
- Verify GPU results

## 🚀 Setup

**Enable GPU**: Runtime → Change runtime type → T4 GPU → Save

In [None]:
# Verify CUDA: print the nvcc compiler version, then query the GPU's
# name and total memory (CSV output, header row suppressed).
!nvcc --version
!nvidia-smi --query-gpu=name,memory.total --format=csv,noheader

## 📚 Key Concepts

### Memory Management Pattern
```cpp
// 1. Allocate on GPU
float *d_array;
cudaMalloc(&d_array, size);

// 2. Copy data TO GPU
cudaMemcpy(d_array, h_array, size, cudaMemcpyHostToDevice);

// 3. Run kernel
kernel<<<blocks, threads>>>(d_array);

// 4. Copy result FROM GPU
cudaMemcpy(h_result, d_array, size, cudaMemcpyDeviceToHost);

// 5. Free GPU memory
cudaFree(d_array);
```

### Grid/Block Calculation
```cpp
int threadsPerBlock = 256;
int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;  // Ceiling division
```

### Global Thread Index
```cpp
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n) {  // Bounds check!
    c[i] = a[i] + b[i];
}
```

## Step 1: Complete the Exercise

Fill in the TODOs below:

In [None]:
%%writefile vector_add.cu
/**
 * Exercise 03: Vector Addition
 */

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

/*
 * CUDA_CHECK(call): evaluate a CUDA runtime call exactly once. If the result
 * is not cudaSuccess, print the source file, line and the CUDA error string
 * to stderr and terminate the process with EXIT_FAILURE.
 *
 * The argument is parenthesized and the temporary uses a macro-specific name,
 * so an invocation such as CUDA_CHECK(err) with a caller variable named `err`
 * is not shadowed and self-initialized inside the macro.
 */
#define CUDA_CHECK(call) \
    do { \
        cudaError_t cuda_check_err_ = (call); \
        if (cuda_check_err_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", \
                    __FILE__, __LINE__, cudaGetErrorString(cuda_check_err_)); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)

// TODO 1: Write the vector addition kernel
// One thread handles one element: c[idx] = a[idx] + b[idx].
// Threads whose global index is n or greater do nothing, so the launch may
// contain more threads than there are elements.
__global__ void vectorAdd(float *a, float *b, float *c, int n) {
    // Global index = offset of this block + position inside the block
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    // Guard: surplus threads in the last block exit early
    if (idx >= n) {
        return;
    }

    c[idx] = a[idx] + b[idx];
}

/**
 * Host driver for the vector-addition exercise.
 *
 * Fills two host arrays with 1.0f and 2.0f, copies them to the device,
 * launches vectorAdd with one thread per element, copies the sum back and
 * checks that every element equals 3.0f.
 *
 * Returns EXIT_SUCCESS when verification passes, EXIT_FAILURE when a host
 * allocation fails or an element is wrong. CUDA_CHECK exits the process on
 * any CUDA runtime error.
 */
int main() {
    int n = 1000000;  // 1 million elements
    size_t size = (size_t)n * sizeof(float);

    printf("Vector Addition: %d elements\n", n);

    // Allocate host memory
    float *h_a = (float*)malloc(size);
    float *h_b = (float*)malloc(size);
    float *h_c = (float*)malloc(size);
    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        // free(NULL) is a no-op, so releasing all three is safe here
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        free(h_a); free(h_b); free(h_c);
        return EXIT_FAILURE;
    }

    // Initialize input arrays
    for (int i = 0; i < n; i++) {
        h_a[i] = 1.0f;
        h_b[i] = 2.0f;
    }

    // TODO 2: Allocate device memory
    float *d_a, *d_b, *d_c;
    CUDA_CHECK(cudaMalloc(&d_a, size));
    CUDA_CHECK(cudaMalloc(&d_b, size));
    CUDA_CHECK(cudaMalloc(&d_c, size));

    // TODO 3: Copy input data to device
    CUDA_CHECK(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice));

    // TODO 4: Calculate grid dimensions and launch kernel
    int threadsPerBlock = 256;
    // Ceiling division: enough blocks that blocks * threads >= n
    int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;

    printf("Launching: %d blocks, %d threads/block\n", blocksPerGrid, threadsPerBlock);

    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, n);

    // The launch itself returns nothing: query the launch error first,
    // then synchronize to surface errors raised while the kernel ran.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // TODO 5: Copy result back to host
    CUDA_CHECK(cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost));

    // Verify result: stop at the first mismatch
    bool success = true;
    for (int i = 0; i < n; i++) {
        if (h_c[i] != 3.0f) {
            printf("Error at %d: expected 3.0, got %.2f\n", i, h_c[i]);
            success = false;
            break;
        }
    }

    printf("%s\n", success ? "‚úÖ PASSED!" : "‚ùå FAILED!");

    // TODO 6: Free device memory
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));

    free(h_a); free(h_b); free(h_c);

    // Report verification failure through the exit status as well
    return success ? EXIT_SUCCESS : EXIT_FAILURE;
}

In [None]:
# Compile
!nvcc -arch=sm_75 vector_add.cu -o vector_add
print("‚úÖ Compiled!")

In [None]:
# Run the vector_add executable from the current directory
!./vector_add

## 🧪 Experiments

Try these variations:

In [None]:
%%writefile vector_add_timed.cu
// Timed version - compare GPU vs CPU
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <time.h>

// Element-wise add: each thread writes c[gid] = a[gid] + b[gid] for its own
// global index; threads with gid >= n return without touching memory.
__global__ void vectorAdd(float *a, float *b, float *c, int n) {
    int gid = blockDim.x * blockIdx.x + threadIdx.x;
    if (gid >= n) {
        return;
    }
    c[gid] = a[gid] + b[gid];
}

// CPU reference: sequentially stores a[k] + b[k] into c[k] for k in [0, n).
// Does nothing when n <= 0.
void vectorAddCPU(float *a, float *b, float *c, int n) {
    int k = 0;
    while (k < n) {
        c[k] = a[k] + b[k];
        ++k;
    }
}

// Exit with a message on stderr if a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/**
 * Compares CPU and GPU vector addition on 10 million floats.
 *
 * The CPU run is timed with clock(). The GPU run is timed with CUDA events
 * recorded around the two host-to-device copies, the kernel launch and the
 * device-to-host copy, so the reported GPU time includes the transfers.
 * Device allocation happens before the first event and is not timed.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
 * CUDA call fails.
 */
int main() {
    int n = 10000000;  // 10 million elements
    size_t size = (size_t)n * sizeof(float);

    float *h_a = (float*)malloc(size);
    float *h_b = (float*)malloc(size);
    float *h_c = (float*)malloc(size);
    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        // free(NULL) is a no-op, so releasing all three is safe here
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        free(h_a); free(h_b); free(h_c);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < n; i++) { h_a[i] = 1.0f; h_b[i] = 2.0f; }

    // CPU timing (clock ticks converted to milliseconds)
    clock_t start = clock();
    vectorAddCPU(h_a, h_b, h_c, n);
    double cpu_time = (double)(clock() - start) / CLOCKS_PER_SEC * 1000;
    printf("CPU time: %.3f ms\n", cpu_time);

    // GPU timing (including transfers)
    float *d_a, *d_b, *d_c;
    checkCuda(cudaMalloc(&d_a, size), "cudaMalloc(d_a)");
    checkCuda(cudaMalloc(&d_b, size), "cudaMalloc(d_b)");
    checkCuda(cudaMalloc(&d_c, size), "cudaMalloc(d_c)");

    cudaEvent_t start_gpu, stop_gpu;
    checkCuda(cudaEventCreate(&start_gpu), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop_gpu), "cudaEventCreate(stop)");

    checkCuda(cudaEventRecord(start_gpu), "cudaEventRecord(start)");
    checkCuda(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice), "copy a to device");
    checkCuda(cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice), "copy b to device");
    vectorAdd<<<(n+255)/256, 256>>>(d_a, d_b, d_c, n);
    // A kernel launch returns nothing; launch errors must be queried explicitly
    checkCuda(cudaGetLastError(), "vectorAdd launch");
    checkCuda(cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost), "copy c to host");
    checkCuda(cudaEventRecord(stop_gpu), "cudaEventRecord(stop)");
    // Wait for the stop event before reading the elapsed time
    checkCuda(cudaEventSynchronize(stop_gpu), "cudaEventSynchronize(stop)");

    float gpu_time;
    checkCuda(cudaEventElapsedTime(&gpu_time, start_gpu, stop_gpu), "cudaEventElapsedTime");
    printf("GPU time (with transfers): %.3f ms\n", gpu_time);
    printf("Speedup: %.2fx\n", cpu_time / gpu_time);

    // Release the events as well as the device and host buffers
    checkCuda(cudaEventDestroy(start_gpu), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop_gpu), "cudaEventDestroy(stop)");
    checkCuda(cudaFree(d_a), "cudaFree(d_a)");
    checkCuda(cudaFree(d_b), "cudaFree(d_b)");
    checkCuda(cudaFree(d_c), "cudaFree(d_c)");
    free(h_a); free(h_b); free(h_c);
    return 0;
}

In [None]:
# Compile the timed version and, only if compilation succeeds (&&), run it
!nvcc -arch=sm_75 vector_add_timed.cu -o vector_add_timed && ./vector_add_timed

---

## 🔄 Python Comparison (Optional)

Here's the same operation in Python/Numba:

In [None]:
!pip install numba -q

from numba import cuda
import numpy as np
import time

@cuda.jit
def vector_add_python(a, b, c):
    # Each thread handles one element; cuda.grid(1) is the 1D global index
    # (same as blockIdx.x * blockDim.x + threadIdx.x).
    idx = cuda.grid(1)
    # Threads past the end of the output array do nothing
    if idx >= c.size:
        return
    c[idx] = a[idx] + b[idx]

# Test data: n float32 elements; a is all ones, b is all 2.0, c starts as zeros
n = 1000000
a = np.ones(n, dtype=np.float32)
b = np.full(n, 2.0, dtype=np.float32)
c = np.zeros(n, dtype=np.float32)

# Move to GPU: to_device copies each host array to the device
d_a = cuda.to_device(a)
d_b = cuda.to_device(b)
d_c = cuda.to_device(c)

# Launch: ceiling division so that blocks * threads >= n
threads = 256
blocks = (n + threads - 1) // threads
vector_add_python[blocks, threads](d_a, d_b, d_c)  # Note: [blocks, threads] syntax

# Get result: copy the output back to the host, then print the first
# 10 values and whether every element is close to 3.0
result = d_c.copy_to_host()
print(f"Python result: {result[:10]}")
print(f"‚úÖ All 3.0? {np.allclose(result, 3.0)}")

### Key Difference: C++ vs Python

| C++ | Python (Numba) |
|-----|----------------|
| `cudaMalloc(&d_a, size)` | `d_a = cuda.to_device(a)` |
| `cudaMemcpy(..., HostToDevice)` | (automatic with `to_device`) |
| `kernel<<<blocks, threads>>>()` | `kernel[blocks, threads]()` |
| `cudaMemcpy(..., DeviceToHost)` | `d_a.copy_to_host()` |
| `cudaFree(d_a)` | (automatic garbage collection) |

C++ gives you **more control**; Python is **more convenient**.

---

## ➡️ Next Exercise

[Exercise 04: 2D Grid Indexing](../ex04-2d-indexing/colab-2d-indexing.ipynb) - Work with matrices!