In [None]:
# ⚙️ Setup
import subprocess, sys

# google.colab is imported only to detect a Colab runtime; when it is
# missing the ImportError is deliberately ignored and the install is skipped.
try:
    import google.colab
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "numba"])
except ImportError:
    pass

import numpy as np
from numba import cuda

print("⚠️  CUDA C++ is PRIMARY. Python/Numba for quick testing only.")
if cuda.is_available():
    gpu_name = cuda.get_current_device().name
    # Some numba releases expose the device name as bytes; decode it so the
    # message shows the plain name rather than a b'...' repr.
    if isinstance(gpu_name, bytes):
        gpu_name = gpu_name.decode()
    print(f"GPU: {gpu_name}")
else:
    print("No CUDA GPU detected; the Numba examples cannot run in this session.")

---

## Capstone Project: GPU-Accelerated Image Convolution

### Project Requirements

Build a complete image convolution system that:
1. Loads images from disk
2. Applies various filters (blur, sharpen, edge detect)
3. Uses optimized CUDA kernels
4. Demonstrates performance gains over CPU
5. Uses streams for overlapped processing of multiple images

### Skills Demonstrated

```
✓ Week 1-2: Thread indexing, 2D grids
✓ Week 3-4: Shared memory, memory coalescing
✓ Week 5:   Scan for histogram normalization
✓ Week 6:   2D tiled algorithms
✓ Week 7:   Occupancy and memory optimization
✓ Week 8:   Profiling and roofline analysis
✓ Week 9:   Streams for multi-image processing
✓ Week 10:  CUDA Graphs for repeated filters
✓ Week 11:  Cooperative groups for reductions
✓ Week 12:  Multi-GPU for large images
```

---

## Part 1: Core Convolution Kernel

### 🔷 CUDA C++ Implementation (Primary)

```cpp
// convolution.cu - Complete image convolution system
#include <stdio.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 16
#define FILTER_RADIUS 2
#define FILTER_SIZE (2 * FILTER_RADIUS + 1)

// Constant memory for filter coefficients
__constant__ float c_filter[FILTER_SIZE * FILTER_SIZE];

// ============================================================
// Naive Convolution (Baseline)
// ============================================================
// Baseline convolution: one thread per output pixel, every tap read directly
// from `input` and weighted by the matching c_filter coefficient. Taps that
// fall outside the image are clamped to the nearest edge pixel.
__global__ void convNaive(
    float* output, const float* input,
    int width, int height
) {
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads of partial blocks that fall outside the image do nothing.
    if (x >= width || y >= height) return;

    float acc = 0.0f;

    for (int dy = -FILTER_RADIUS; dy <= FILTER_RADIUS; ++dy) {
        // The clamped row and the filter row depend only on dy.
        const int row = min(max(y + dy, 0), height - 1);
        const int coeffBase = (dy + FILTER_RADIUS) * FILTER_SIZE;

        for (int dx = -FILTER_RADIUS; dx <= FILTER_RADIUS; ++dx) {
            const int col = min(max(x + dx, 0), width - 1);
            acc += input[row * width + col] *
                   c_filter[coeffBase + (dx + FILTER_RADIUS)];
        }
    }

    output[y * width + x] = acc;
}

// ============================================================
// Optimized Convolution with Shared Memory
// ============================================================
// Tiled convolution through shared memory.
//
// Each block stages a (BLOCK_SIZE + 2*FILTER_RADIUS)^2 tile of the input (its
// own pixels plus the surrounding apron) in shared memory; every thread then
// reads its FILTER_SIZE x FILTER_SIZE neighbourhood from that tile.
//
// Taps outside the image are clamped to the nearest edge pixel, the same
// border rule convNaive applies, so both kernels produce the same image
// (zero padding here would darken the border and make the comparison unfair).
//
// Must be launched with blockDim == (BLOCK_SIZE, BLOCK_SIZE): the tile layout
// and the apron loads are derived from BLOCK_SIZE. width and height must be
// at least 1.
__global__ void convShared(
    float* output, const float* input,
    int width, int height
) {
    // Shared memory tile with apron
    const int TILE_W = BLOCK_SIZE + 2 * FILTER_RADIUS;
    __shared__ float smem[TILE_W][TILE_W];

    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int x = blockIdx.x * BLOCK_SIZE + tx;
    int y = blockIdx.y * BLOCK_SIZE + ty;

    // Image coordinates of the tile cell owned by this thread (shifted up and
    // left by the apron) and of the cell one block further right / down.
    // Clamping keeps every load in bounds and replicates the edge pixels.
    int srcX = min(max(x - FILTER_RADIUS, 0), width - 1);
    int srcY = min(max(y - FILTER_RADIUS, 0), height - 1);
    int aprX = min(max(x - FILTER_RADIUS + BLOCK_SIZE, 0), width - 1);
    int aprY = min(max(y - FILTER_RADIUS + BLOCK_SIZE, 0), height - 1);

    // Main BLOCK_SIZE x BLOCK_SIZE part of the tile
    smem[ty][tx] = input[srcY * width + srcX];

    // Right strip of the apron
    if (tx < 2 * FILTER_RADIUS) {
        smem[ty][tx + BLOCK_SIZE] = input[srcY * width + aprX];
    }

    // Bottom strip of the apron
    if (ty < 2 * FILTER_RADIUS) {
        smem[ty + BLOCK_SIZE][tx] = input[aprY * width + srcX];
    }

    // Bottom-right corner of the apron
    if (tx < 2 * FILTER_RADIUS && ty < 2 * FILTER_RADIUS) {
        smem[ty + BLOCK_SIZE][tx + BLOCK_SIZE] = input[aprY * width + aprX];
    }

    // The whole tile must be populated before any thread reads neighbours.
    __syncthreads();

    // Compute convolution from shared memory
    if (x < width && y < height) {
        float sum = 0.0f;

        #pragma unroll
        for (int fy = 0; fy < FILTER_SIZE; fy++) {
            #pragma unroll
            for (int fx = 0; fx < FILTER_SIZE; fx++) {
                sum += smem[ty + fy][tx + fx] *
                       c_filter[fy * FILTER_SIZE + fx];
            }
        }

        output[y * width + x] = sum;
    }
}
```

### 🔷 CUDA C++ Implementation (Primary)

In [None]:
%%writefile convolution.cu
// convolution.cu - Complete image convolution system
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 16
#define FILTER_RADIUS 2
#define FILTER_SIZE (2 * FILTER_RADIUS + 1)

// Constant memory for filter coefficients
__constant__ float c_filter[FILTER_SIZE * FILTER_SIZE];

// ============================================================
// Naive Convolution (Baseline)
// ============================================================
// Baseline convolution: one thread per output pixel, every tap read directly
// from `input` and weighted by the matching c_filter coefficient. Taps that
// fall outside the image are clamped to the nearest edge pixel.
__global__ void convNaive(
    float* output, const float* input,
    int width, int height
) {
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads of partial blocks that fall outside the image do nothing.
    if (x >= width || y >= height) return;

    float acc = 0.0f;

    for (int dy = -FILTER_RADIUS; dy <= FILTER_RADIUS; ++dy) {
        // The clamped row and the filter row depend only on dy.
        const int row = min(max(y + dy, 0), height - 1);
        const int coeffBase = (dy + FILTER_RADIUS) * FILTER_SIZE;

        for (int dx = -FILTER_RADIUS; dx <= FILTER_RADIUS; ++dx) {
            const int col = min(max(x + dx, 0), width - 1);
            acc += input[row * width + col] *
                   c_filter[coeffBase + (dx + FILTER_RADIUS)];
        }
    }

    output[y * width + x] = acc;
}

// ============================================================
// Optimized Convolution with Shared Memory
// ============================================================
// Tiled convolution through shared memory.
//
// Each block stages a (BLOCK_SIZE + 2*FILTER_RADIUS)^2 tile of the input (its
// own pixels plus the surrounding apron) in shared memory; every thread then
// reads its FILTER_SIZE x FILTER_SIZE neighbourhood from that tile.
//
// Taps outside the image are clamped to the nearest edge pixel, the same
// border rule convNaive applies, so both kernels produce the same image
// (zero padding here would darken the border and make the comparison unfair).
//
// Must be launched with blockDim == (BLOCK_SIZE, BLOCK_SIZE): the tile layout
// and the apron loads are derived from BLOCK_SIZE. width and height must be
// at least 1.
__global__ void convShared(
    float* output, const float* input,
    int width, int height
) {
    // Shared memory tile with apron
    const int TILE_W = BLOCK_SIZE + 2 * FILTER_RADIUS;
    __shared__ float smem[TILE_W][TILE_W];

    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int x = blockIdx.x * BLOCK_SIZE + tx;
    int y = blockIdx.y * BLOCK_SIZE + ty;

    // Image coordinates of the tile cell owned by this thread (shifted up and
    // left by the apron) and of the cell one block further right / down.
    // Clamping keeps every load in bounds and replicates the edge pixels.
    int srcX = min(max(x - FILTER_RADIUS, 0), width - 1);
    int srcY = min(max(y - FILTER_RADIUS, 0), height - 1);
    int aprX = min(max(x - FILTER_RADIUS + BLOCK_SIZE, 0), width - 1);
    int aprY = min(max(y - FILTER_RADIUS + BLOCK_SIZE, 0), height - 1);

    // Main BLOCK_SIZE x BLOCK_SIZE part of the tile
    smem[ty][tx] = input[srcY * width + srcX];

    // Right strip of the apron
    if (tx < 2 * FILTER_RADIUS) {
        smem[ty][tx + BLOCK_SIZE] = input[srcY * width + aprX];
    }

    // Bottom strip of the apron
    if (ty < 2 * FILTER_RADIUS) {
        smem[ty + BLOCK_SIZE][tx] = input[aprY * width + srcX];
    }

    // Bottom-right corner of the apron
    if (tx < 2 * FILTER_RADIUS && ty < 2 * FILTER_RADIUS) {
        smem[ty + BLOCK_SIZE][tx + BLOCK_SIZE] = input[aprY * width + aprX];
    }

    // The whole tile must be populated before any thread reads neighbours.
    __syncthreads();

    // Compute convolution from shared memory
    if (x < width && y < height) {
        float sum = 0.0f;

        #pragma unroll
        for (int fy = 0; fy < FILTER_SIZE; fy++) {
            #pragma unroll
            for (int fx = 0; fx < FILTER_SIZE; fx++) {
                sum += smem[ty + fy][tx + fx] *
                       c_filter[fy * FILTER_SIZE + fx];
            }
        }

        output[y * width + x] = sum;
    }
}

// ============================================================
// Filter Definitions
// ============================================================

void createGaussianBlur(float* filter) {
    float kernel[] = {
        1.0f/256, 4.0f/256,  7.0f/256,  4.0f/256, 1.0f/256,
        4.0f/256, 16.0f/256, 26.0f/256, 16.0f/256, 4.0f/256,
        7.0f/256, 26.0f/256, 41.0f/256, 26.0f/256, 7.0f/256,
        4.0f/256, 16.0f/256, 26.0f/256, 16.0f/256, 4.0f/256,
        1.0f/256, 4.0f/256,  7.0f/256,  4.0f/256, 1.0f/256
    };
    memcpy(filter, kernel, FILTER_SIZE * FILTER_SIZE * sizeof(float));
}

// Write the 5x5 sharpening kernel into `filter` (row-major). The table has a
// +13 centre surrounded by a diamond of -1 taps, so the coefficients sum to 1.
void createSharpen(float* filter) {
    const float coeffs[] = {
         0.0f,  0.0f, -1.0f,  0.0f,  0.0f,
         0.0f, -1.0f, -1.0f, -1.0f,  0.0f,
        -1.0f, -1.0f, 13.0f, -1.0f, -1.0f,
         0.0f, -1.0f, -1.0f, -1.0f,  0.0f,
         0.0f,  0.0f, -1.0f,  0.0f,  0.0f
    };
    // Copy FILTER_SIZE * FILTER_SIZE coefficients to the caller's buffer.
    for (int i = 0; i < FILTER_SIZE * FILTER_SIZE; ++i) {
        filter[i] = coeffs[i];
    }
}

// Write the 5x5 edge-detection kernel into `filter` (row-major): every tap is
// -1 except the +24 centre, so the coefficients sum to 0.
void createEdgeDetect(float* filter) {
    const float coeffs[] = {
        -1.0f, -1.0f, -1.0f, -1.0f, -1.0f,
        -1.0f, -1.0f, -1.0f, -1.0f, -1.0f,
        -1.0f, -1.0f, 24.0f, -1.0f, -1.0f,
        -1.0f, -1.0f, -1.0f, -1.0f, -1.0f,
        -1.0f, -1.0f, -1.0f, -1.0f, -1.0f
    };
    // Copy FILTER_SIZE * FILTER_SIZE coefficients to the caller's buffer.
    for (int i = 0; i < FILTER_SIZE * FILTER_SIZE; ++i) {
        filter[i] = coeffs[i];
    }
}

// ============================================================
// Main Program
// ============================================================

// Stop main() with a diagnostic when a CUDA runtime call fails.
// Only usable inside main(): on failure it returns 1 from the enclosing
// function, so a broken launch cannot silently produce meaningless timings.
#define CHECK_CUDA(call)                                              \
    do {                                                              \
        cudaError_t status_ = (call);                                 \
        if (status_ != cudaSuccess) {                                 \
            fprintf(stderr, "CUDA error at %s:%d: %s\n",              \
                    __FILE__, __LINE__, cudaGetErrorString(status_)); \
            return 1;                                                 \
        }                                                             \
    } while (0)

// Benchmark convNaive against convShared on a synthetic 4096x4096 image with
// the Gaussian blur filter and print per-launch times, speedup and the
// effective bandwidth of the shared-memory kernel.
// Returns 0 on success, 1 if any CUDA call fails.
int main(int argc, char** argv) {
    const int WIDTH = 4096;
    const int HEIGHT = 4096;
    size_t imageBytes = (size_t)WIDTH * HEIGHT * sizeof(float);

    // Allocate host memory (pinned for async transfers).
    // h_output is allocated but this benchmark never copies results into it.
    float *h_input, *h_output;
    CHECK_CUDA(cudaMallocHost(&h_input, imageBytes));
    CHECK_CUDA(cudaMallocHost(&h_output, imageBytes));

    // Initialize with test pattern
    for (int i = 0; i < WIDTH * HEIGHT; i++) {
        h_input[i] = (float)(i % 256) / 255.0f;
    }

    // Allocate device memory
    float *d_input, *d_output;
    CHECK_CUDA(cudaMalloc(&d_input, imageBytes));
    CHECK_CUDA(cudaMalloc(&d_output, imageBytes));

    // Create and upload Gaussian blur filter
    float h_filter[FILTER_SIZE * FILTER_SIZE];
    createGaussianBlur(h_filter);
    CHECK_CUDA(cudaMemcpyToSymbol(c_filter, h_filter,
                                  FILTER_SIZE * FILTER_SIZE * sizeof(float)));

    // Copy input to device
    CHECK_CUDA(cudaMemcpy(d_input, h_input, imageBytes, cudaMemcpyHostToDevice));

    // Setup kernel launch parameters
    dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid((WIDTH + BLOCK_SIZE - 1) / BLOCK_SIZE,
              (HEIGHT + BLOCK_SIZE - 1) / BLOCK_SIZE);

    // Warmup. cudaGetLastError reports launch failures (for example a binary
    // built for the wrong architecture); the synchronize reports faults
    // raised while the kernel runs.
    convShared<<<grid, block>>>(d_output, d_input, WIDTH, HEIGHT);
    CHECK_CUDA(cudaGetLastError());
    CHECK_CUDA(cudaDeviceSynchronize());

    // =========================================
    // Benchmark: Naive vs Optimized
    // =========================================
    cudaEvent_t start, stop;
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    const int RUNS = 100;
    float ms;

    // Naive kernel
    CHECK_CUDA(cudaEventRecord(start));
    for (int i = 0; i < RUNS; i++) {
        convNaive<<<grid, block>>>(d_output, d_input, WIDTH, HEIGHT);
    }
    CHECK_CUDA(cudaEventRecord(stop));
    CHECK_CUDA(cudaGetLastError());
    CHECK_CUDA(cudaEventSynchronize(stop));
    CHECK_CUDA(cudaEventElapsedTime(&ms, start, stop));

    float naiveTime = ms / RUNS;
    printf("Naive convolution:     %.2f ms\n", naiveTime);

    // Optimized kernel
    CHECK_CUDA(cudaEventRecord(start));
    for (int i = 0; i < RUNS; i++) {
        convShared<<<grid, block>>>(d_output, d_input, WIDTH, HEIGHT);
    }
    CHECK_CUDA(cudaEventRecord(stop));
    CHECK_CUDA(cudaGetLastError());
    CHECK_CUDA(cudaEventSynchronize(stop));
    CHECK_CUDA(cudaEventElapsedTime(&ms, start, stop));

    float optTime = ms / RUNS;
    printf("Optimized convolution: %.2f ms\n", optTime);
    printf("Speedup: %.2fx\n", naiveTime / optTime);

    // Calculate effective bandwidth: one image read plus one image written
    float bandwidth = 2.0f * imageBytes / (optTime / 1000.0f) / 1e9;
    printf("Effective bandwidth:   %.2f GB/s\n", bandwidth);

    // Cleanup
    CHECK_CUDA(cudaFree(d_input));
    CHECK_CUDA(cudaFree(d_output));
    CHECK_CUDA(cudaFreeHost(h_input));
    CHECK_CUDA(cudaFreeHost(h_output));
    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));

    printf("\nCapstone project complete!\n");
    return 0;
}

In [None]:
!nvcc -arch=sm_75 -O3 -o convolution convolution.cu
!./convolution

---

## Part 2: Filter Definitions and Host Code

### 🔷 CUDA C++ Implementation (Primary)

```cpp
// ============================================================
// Filter Definitions
// ============================================================

void createGaussianBlur(float* filter) {
    float kernel[] = {
        1.0f/256, 4.0f/256,  7.0f/256,  4.0f/256, 1.0f/256,
        4.0f/256, 16.0f/256, 26.0f/256, 16.0f/256, 4.0f/256,
        7.0f/256, 26.0f/256, 41.0f/256, 26.0f/256, 7.0f/256,
        4.0f/256, 16.0f/256, 26.0f/256, 16.0f/256, 4.0f/256,
        1.0f/256, 4.0f/256,  7.0f/256,  4.0f/256, 1.0f/256
    };
    memcpy(filter, kernel, FILTER_SIZE * FILTER_SIZE * sizeof(float));
}

// Write the 5x5 sharpening kernel into `filter` (row-major). The table has a
// +13 centre surrounded by a diamond of -1 taps, so the coefficients sum to 1.
void createSharpen(float* filter) {
    const float coeffs[] = {
         0.0f,  0.0f, -1.0f,  0.0f,  0.0f,
         0.0f, -1.0f, -1.0f, -1.0f,  0.0f,
        -1.0f, -1.0f, 13.0f, -1.0f, -1.0f,
         0.0f, -1.0f, -1.0f, -1.0f,  0.0f,
         0.0f,  0.0f, -1.0f,  0.0f,  0.0f
    };
    // Copy FILTER_SIZE * FILTER_SIZE coefficients to the caller's buffer.
    for (int i = 0; i < FILTER_SIZE * FILTER_SIZE; ++i) {
        filter[i] = coeffs[i];
    }
}

// Write the 5x5 edge-detection kernel into `filter` (row-major): every tap is
// -1 except the +24 centre, so the coefficients sum to 0.
void createEdgeDetect(float* filter) {
    const float coeffs[] = {
        -1.0f, -1.0f, -1.0f, -1.0f, -1.0f,
        -1.0f, -1.0f, -1.0f, -1.0f, -1.0f,
        -1.0f, -1.0f, 24.0f, -1.0f, -1.0f,
        -1.0f, -1.0f, -1.0f, -1.0f, -1.0f,
        -1.0f, -1.0f, -1.0f, -1.0f, -1.0f
    };
    // Copy FILTER_SIZE * FILTER_SIZE coefficients to the caller's buffer.
    for (int i = 0; i < FILTER_SIZE * FILTER_SIZE; ++i) {
        filter[i] = coeffs[i];
    }
}

// ============================================================
// Stream-based Multi-Image Processing
// ============================================================

// Filter `numImages` images of width x height floats with convShared,
// distributing them round-robin over up to NUM_STREAMS CUDA streams.
//
//   h_inputs  - numImages host pointers to the source images
//   h_outputs - numImages host pointers that receive the filtered images
//
// Each stream owns one device input and one device output buffer; images
// mapped to the same stream reuse them, which is safe because work within a
// stream executes in issue order. The function returns only after all
// streams have finished.
//
// NOTE(review): the copies can only overlap with kernels when the host
// buffers are page-locked (cudaMallocHost / cudaHostAlloc); with pageable
// memory the code is still correct but the transfers are staged - confirm
// that callers pass pinned buffers.
void processMultipleImages(
    float** h_inputs, float** h_outputs,
    int numImages, int width, int height
) {
    // Nothing to do for an empty batch or a degenerate image.
    if (numImages <= 0 || width <= 0 || height <= 0) return;

    const int NUM_STREAMS = 4;
    // Widen before multiplying: width * height in int overflows for large images.
    size_t imageBytes = (size_t)width * (size_t)height * sizeof(float);

    // Do not create streams or buffers that no image would ever use.
    const int activeStreams = numImages < NUM_STREAMS ? numImages : NUM_STREAMS;

    cudaStream_t streams[NUM_STREAMS];
    float* d_input[NUM_STREAMS];
    float* d_output[NUM_STREAMS];

    // Create streams and allocate device memory
    for (int i = 0; i < activeStreams; i++) {
        cudaStreamCreate(&streams[i]);
        cudaMalloc(&d_input[i], imageBytes);
        cudaMalloc(&d_output[i], imageBytes);
    }

    dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid((width + BLOCK_SIZE - 1) / BLOCK_SIZE,
              (height + BLOCK_SIZE - 1) / BLOCK_SIZE);

    // Process images with overlapping
    for (int img = 0; img < numImages; img++) {
        int streamIdx = img % activeStreams;

        // H2D
        cudaMemcpyAsync(d_input[streamIdx], h_inputs[img],
                        imageBytes, cudaMemcpyHostToDevice,
                        streams[streamIdx]);

        // Compute
        convShared<<<grid, block, 0, streams[streamIdx]>>>(
            d_output[streamIdx], d_input[streamIdx], width, height);

        // D2H
        cudaMemcpyAsync(h_outputs[img], d_output[streamIdx],
                        imageBytes, cudaMemcpyDeviceToHost,
                        streams[streamIdx]);
    }

    // Sync all streams
    for (int i = 0; i < activeStreams; i++) {
        cudaStreamSynchronize(streams[i]);
    }

    // The function returns void, so the best it can do is report a failure
    // (allocation, launch or copy) instead of dropping it silently.
    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        fprintf(stderr, "processMultipleImages: CUDA error: %s\n",
                cudaGetErrorString(status));
    }

    // Cleanup
    for (int i = 0; i < activeStreams; i++) {
        cudaStreamDestroy(streams[i]);
        cudaFree(d_input[i]);
        cudaFree(d_output[i]);
    }
}
```

### 🔷 CUDA C++ Implementation (Primary)

In [None]:
%%writefile multi_image_conv.cu
// multi_image_conv.cu - Stream-based multi-image processing
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 16
#define FILTER_RADIUS 2
#define FILTER_SIZE (2 * FILTER_RADIUS + 1)

__constant__ float c_filter[FILTER_SIZE * FILTER_SIZE];

// Tiled convolution through shared memory.
//
// Each block stages a (BLOCK_SIZE + 2*FILTER_RADIUS)^2 tile of the input (its
// own pixels plus the surrounding apron) in shared memory; every thread then
// reads its FILTER_SIZE x FILTER_SIZE neighbourhood from that tile.
//
// Taps outside the image are clamped to the nearest edge pixel, so the border
// of a blurred image is not darkened by implicit zero padding.
//
// Must be launched with blockDim == (BLOCK_SIZE, BLOCK_SIZE): the tile layout
// and the apron loads are derived from BLOCK_SIZE. width and height must be
// at least 1.
__global__ void convShared(
    float* output, const float* input,
    int width, int height
) {
    const int TILE_W = BLOCK_SIZE + 2 * FILTER_RADIUS;
    __shared__ float smem[TILE_W][TILE_W];

    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int x = blockIdx.x * BLOCK_SIZE + tx;
    int y = blockIdx.y * BLOCK_SIZE + ty;

    // Image coordinates of the tile cell owned by this thread (shifted up and
    // left by the apron) and of the cell one block further right / down.
    // Clamping keeps every load in bounds and replicates the edge pixels.
    int srcX = min(max(x - FILTER_RADIUS, 0), width - 1);
    int srcY = min(max(y - FILTER_RADIUS, 0), height - 1);
    int aprX = min(max(x - FILTER_RADIUS + BLOCK_SIZE, 0), width - 1);
    int aprY = min(max(y - FILTER_RADIUS + BLOCK_SIZE, 0), height - 1);

    // Main BLOCK_SIZE x BLOCK_SIZE part of the tile
    smem[ty][tx] = input[srcY * width + srcX];

    // Right strip of the apron
    if (tx < 2 * FILTER_RADIUS) {
        smem[ty][tx + BLOCK_SIZE] = input[srcY * width + aprX];
    }

    // Bottom strip of the apron
    if (ty < 2 * FILTER_RADIUS) {
        smem[ty + BLOCK_SIZE][tx] = input[aprY * width + srcX];
    }

    // Bottom-right corner of the apron
    if (tx < 2 * FILTER_RADIUS && ty < 2 * FILTER_RADIUS) {
        smem[ty + BLOCK_SIZE][tx + BLOCK_SIZE] = input[aprY * width + aprX];
    }

    // The whole tile must be populated before any thread reads neighbours.
    __syncthreads();

    if (x < width && y < height) {
        float sum = 0.0f;

        #pragma unroll
        for (int fy = 0; fy < FILTER_SIZE; fy++) {
            #pragma unroll
            for (int fx = 0; fx < FILTER_SIZE; fx++) {
                sum += smem[ty + fy][tx + fx] *
                       c_filter[fy * FILTER_SIZE + fx];
            }
        }

        output[y * width + x] = sum;
    }
}

void createGaussianBlur(float* filter) {
    float kernel[] = {
        1.0f/256, 4.0f/256,  7.0f/256,  4.0f/256, 1.0f/256,
        4.0f/256, 16.0f/256, 26.0f/256, 16.0f/256, 4.0f/256,
        7.0f/256, 26.0f/256, 41.0f/256, 26.0f/256, 7.0f/256,
        4.0f/256, 16.0f/256, 26.0f/256, 16.0f/256, 4.0f/256,
        1.0f/256, 4.0f/256,  7.0f/256,  4.0f/256, 1.0f/256
    };
    memcpy(filter, kernel, FILTER_SIZE * FILTER_SIZE * sizeof(float));
}

// Stop main() with a diagnostic when a CUDA runtime call fails.
// Only usable inside main(): on failure it returns 1 from the enclosing
// function, so a failed copy or launch cannot be reported as a valid timing.
#define CHECK_CUDA(call)                                              \
    do {                                                              \
        cudaError_t status_ = (call);                                 \
        if (status_ != cudaSuccess) {                                 \
            fprintf(stderr, "CUDA error at %s:%d: %s\n",              \
                    __FILE__, __LINE__, cudaGetErrorString(status_)); \
            return 1;                                                 \
        }                                                             \
    } while (0)

// Blur NUM_IMAGES synthetic 2048x2048 images, distributing them round-robin
// over NUM_STREAMS streams (copy in, convolve, copy out per image), and print
// total time, average time per image and throughput.
// Returns 0 on success, 1 if any CUDA call fails.
int main() {
    const int WIDTH = 2048;
    const int HEIGHT = 2048;
    const int NUM_IMAGES = 8;
    const int NUM_STREAMS = 4;
    size_t imageBytes = (size_t)WIDTH * HEIGHT * sizeof(float);

    printf("Processing %d images of size %dx%d using %d streams\n",
           NUM_IMAGES, WIDTH, HEIGHT, NUM_STREAMS);

    // Allocate pinned host memory for images
    float* h_inputs[NUM_IMAGES];
    float* h_outputs[NUM_IMAGES];
    for (int i = 0; i < NUM_IMAGES; i++) {
        CHECK_CUDA(cudaMallocHost(&h_inputs[i], imageBytes));
        CHECK_CUDA(cudaMallocHost(&h_outputs[i], imageBytes));
        // Initialize with different patterns
        for (int j = 0; j < WIDTH * HEIGHT; j++) {
            h_inputs[i][j] = (float)((i + j) % 256) / 255.0f;
        }
    }

    // Upload filter
    float h_filter[FILTER_SIZE * FILTER_SIZE];
    createGaussianBlur(h_filter);
    CHECK_CUDA(cudaMemcpyToSymbol(c_filter, h_filter,
                                  FILTER_SIZE * FILTER_SIZE * sizeof(float)));

    // Create streams and allocate one input/output buffer pair per stream
    cudaStream_t streams[NUM_STREAMS];
    float* d_input[NUM_STREAMS];
    float* d_output[NUM_STREAMS];

    for (int i = 0; i < NUM_STREAMS; i++) {
        CHECK_CUDA(cudaStreamCreate(&streams[i]));
        CHECK_CUDA(cudaMalloc(&d_input[i], imageBytes));
        CHECK_CUDA(cudaMalloc(&d_output[i], imageBytes));
    }

    dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid((WIDTH + BLOCK_SIZE - 1) / BLOCK_SIZE,
              (HEIGHT + BLOCK_SIZE - 1) / BLOCK_SIZE);

    // Benchmark
    cudaEvent_t start, stop;
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    CHECK_CUDA(cudaEventRecord(start));

    // Process images with overlapping streams. Images that share a stream
    // also share its device buffers; in-stream ordering keeps that safe.
    for (int img = 0; img < NUM_IMAGES; img++) {
        int streamIdx = img % NUM_STREAMS;

        // H2D
        CHECK_CUDA(cudaMemcpyAsync(d_input[streamIdx], h_inputs[img],
                                   imageBytes, cudaMemcpyHostToDevice,
                                   streams[streamIdx]));

        // Compute
        convShared<<<grid, block, 0, streams[streamIdx]>>>(
            d_output[streamIdx], d_input[streamIdx], WIDTH, HEIGHT);
        CHECK_CUDA(cudaGetLastError());

        // D2H
        CHECK_CUDA(cudaMemcpyAsync(h_outputs[img], d_output[streamIdx],
                                   imageBytes, cudaMemcpyDeviceToHost,
                                   streams[streamIdx]));
    }

    // Sync all streams
    for (int i = 0; i < NUM_STREAMS; i++) {
        CHECK_CUDA(cudaStreamSynchronize(streams[i]));
    }

    CHECK_CUDA(cudaEventRecord(stop));
    CHECK_CUDA(cudaEventSynchronize(stop));

    float ms;
    CHECK_CUDA(cudaEventElapsedTime(&ms, start, stop));
    printf("Total time for %d images: %.2f ms\n", NUM_IMAGES, ms);
    printf("Average time per image: %.2f ms\n", ms / NUM_IMAGES);
    printf("Throughput: %.2f images/sec\n", NUM_IMAGES / (ms / 1000.0f));

    // Cleanup
    for (int i = 0; i < NUM_STREAMS; i++) {
        CHECK_CUDA(cudaStreamDestroy(streams[i]));
        CHECK_CUDA(cudaFree(d_input[i]));
        CHECK_CUDA(cudaFree(d_output[i]));
    }

    for (int i = 0; i < NUM_IMAGES; i++) {
        CHECK_CUDA(cudaFreeHost(h_inputs[i]));
        CHECK_CUDA(cudaFreeHost(h_outputs[i]));
    }

    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));

    printf("Multi-image processing complete!\n");
    return 0;
}

In [None]:
!nvcc -arch=sm_75 -O3 -o multi_image_conv multi_image_conv.cu
!./multi_image_conv

---

## Part 3: CUDA Graph for Repeated Filters

### 🔷 CUDA C++ Implementation (Primary)

```cpp
// ============================================================
// CUDA Graph for Multi-Pass Filtering
// ============================================================

// Apply the current c_filter repeatedly: each pass runs convShared twice
// (source -> d_temp -> d_output), and every pass after the first starts from
// the previous pass's d_output, so the filter is applied 2 * numPasses times.
//
//   d_output  - receives the final image
//   d_input   - source image; read by the first pass only, never written
//   d_temp    - scratch image of the same size
//   numPasses - number of two-kernel passes; <= 0 leaves d_output untouched
//
// Launching one captured graph numPasses times on fixed input/temp/output
// pointers would recompute the identical result each time. Here the first
// pass is launched directly because it is the only one that reads d_input;
// the remaining passes are identical (d_output -> d_temp -> d_output), so
// they are captured once into a CUDA graph and replayed.
void multiPassFilter(
    float* d_output, float* d_input,
    float* d_temp,
    int width, int height,
    int numPasses
) {
    if (numPasses <= 0) return;

    dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid((width + BLOCK_SIZE - 1) / BLOCK_SIZE,
              (height + BLOCK_SIZE - 1) / BLOCK_SIZE);

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // First pass: input -> temp -> output
    convShared<<<grid, block, 0, stream>>>(
        d_temp, d_input, width, height);
    convShared<<<grid, block, 0, stream>>>(
        d_output, d_temp, width, height);

    if (numPasses > 1) {
        cudaGraph_t graph;
        cudaGraphExec_t instance;

        // Capture one follow-up pass: output -> temp -> output
        cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);

        convShared<<<grid, block, 0, stream>>>(
            d_temp, d_output, width, height);
        convShared<<<grid, block, 0, stream>>>(
            d_output, d_temp, width, height);

        cudaStreamEndCapture(stream, &graph);
        cudaGraphInstantiate(&instance, graph, NULL, NULL, 0);

        // Replay the captured pass; launches on one stream run in order.
        for (int pass = 1; pass < numPasses; pass++) {
            cudaGraphLaunch(instance, stream);
        }

        // Let the replays finish before the graph objects are destroyed.
        cudaStreamSynchronize(stream);

        cudaGraphExecDestroy(instance);
        cudaGraphDestroy(graph);
    }

    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);
}
```

---

## Part 4: Main Program and Benchmarking

### 🔷 CUDA C++ Implementation (Primary)

```cpp
// ============================================================
// Main Program
// ============================================================

// Stop main() with a diagnostic when a CUDA runtime call fails.
// Only usable inside main(): on failure it returns 1 from the enclosing
// function, so a broken launch cannot silently produce meaningless timings.
#define CHECK_CUDA(call)                                              \
    do {                                                              \
        cudaError_t status_ = (call);                                 \
        if (status_ != cudaSuccess) {                                 \
            fprintf(stderr, "CUDA error at %s:%d: %s\n",              \
                    __FILE__, __LINE__, cudaGetErrorString(status_)); \
            return 1;                                                 \
        }                                                             \
    } while (0)

// Benchmark convNaive against convShared on a synthetic 4096x4096 image with
// the Gaussian blur filter and print per-launch times, speedup and the
// effective bandwidth of the shared-memory kernel.
// Returns 0 on success, 1 if any CUDA call fails.
int main(int argc, char** argv) {
    const int WIDTH = 4096;
    const int HEIGHT = 4096;
    size_t imageBytes = (size_t)WIDTH * HEIGHT * sizeof(float);

    // Allocate host memory (pinned for async transfers).
    // h_output is allocated but this benchmark never copies results into it.
    float *h_input, *h_output;
    CHECK_CUDA(cudaMallocHost(&h_input, imageBytes));
    CHECK_CUDA(cudaMallocHost(&h_output, imageBytes));

    // Initialize with test pattern
    for (int i = 0; i < WIDTH * HEIGHT; i++) {
        h_input[i] = (float)(i % 256) / 255.0f;
    }

    // Allocate device memory
    float *d_input, *d_output;
    CHECK_CUDA(cudaMalloc(&d_input, imageBytes));
    CHECK_CUDA(cudaMalloc(&d_output, imageBytes));

    // Create and upload Gaussian blur filter
    float h_filter[FILTER_SIZE * FILTER_SIZE];
    createGaussianBlur(h_filter);
    CHECK_CUDA(cudaMemcpyToSymbol(c_filter, h_filter,
                                  FILTER_SIZE * FILTER_SIZE * sizeof(float)));

    // Copy input to device
    CHECK_CUDA(cudaMemcpy(d_input, h_input, imageBytes, cudaMemcpyHostToDevice));

    // Setup kernel launch parameters
    dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid((WIDTH + BLOCK_SIZE - 1) / BLOCK_SIZE,
              (HEIGHT + BLOCK_SIZE - 1) / BLOCK_SIZE);

    // Warmup. cudaGetLastError reports launch failures (for example a binary
    // built for the wrong architecture); the synchronize reports faults
    // raised while the kernel runs.
    convShared<<<grid, block>>>(d_output, d_input, WIDTH, HEIGHT);
    CHECK_CUDA(cudaGetLastError());
    CHECK_CUDA(cudaDeviceSynchronize());

    // =========================================
    // Benchmark: Naive vs Optimized
    // =========================================
    cudaEvent_t start, stop;
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    const int RUNS = 100;
    float ms;

    // Naive kernel
    CHECK_CUDA(cudaEventRecord(start));
    for (int i = 0; i < RUNS; i++) {
        convNaive<<<grid, block>>>(d_output, d_input, WIDTH, HEIGHT);
    }
    CHECK_CUDA(cudaEventRecord(stop));
    CHECK_CUDA(cudaGetLastError());
    CHECK_CUDA(cudaEventSynchronize(stop));
    CHECK_CUDA(cudaEventElapsedTime(&ms, start, stop));

    float naiveTime = ms / RUNS;
    printf("Naive convolution:     %.2f ms\n", naiveTime);

    // Optimized kernel
    CHECK_CUDA(cudaEventRecord(start));
    for (int i = 0; i < RUNS; i++) {
        convShared<<<grid, block>>>(d_output, d_input, WIDTH, HEIGHT);
    }
    CHECK_CUDA(cudaEventRecord(stop));
    CHECK_CUDA(cudaGetLastError());
    CHECK_CUDA(cudaEventSynchronize(stop));
    CHECK_CUDA(cudaEventElapsedTime(&ms, start, stop));

    float optTime = ms / RUNS;
    printf("Optimized convolution: %.2f ms\n", optTime);
    printf("Speedup: %.2fx\n", naiveTime / optTime);

    // Calculate effective bandwidth: one image read plus one image written
    float bandwidth = 2.0f * imageBytes / (optTime / 1000.0f) / 1e9;
    printf("Effective bandwidth:   %.2f GB/s\n", bandwidth);

    // Cleanup
    CHECK_CUDA(cudaFree(d_input));
    CHECK_CUDA(cudaFree(d_output));
    CHECK_CUDA(cudaFreeHost(h_input));
    CHECK_CUDA(cudaFreeHost(h_output));
    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));

    printf("\nCapstone project complete!\n");
    return 0;
}

// Compile: nvcc -O3 -arch=sm_75 convolution.cu -o convolution
// Profile: ncu --set full ./convolution
```

---

## 🎯 Capstone Project Exercises

### 🔷 CUDA C++ Exercises (Primary)

This comprehensive capstone project integrates all the CUDA concepts you've learned throughout the curriculum. You will implement a complete **Image Processing Pipeline** with multiple stages.

**Project Overview:**
Build an optimized image processing pipeline that includes:
1. **Grayscale Conversion** - RGB to grayscale with memory coalescing
2. **Gaussian Blur** - Shared memory tiled convolution
3. **Edge Detection** - Sobel filter implementation
4. **Histogram Equalization** - Parallel histogram and prefix scan
5. **Pipeline Integration** - Use streams for overlapping operations

**Grading Criteria:**
- ✅ Correctness of each kernel
- ✅ Proper memory management (no leaks)
- ✅ Use of shared memory where appropriate
- ✅ Memory coalescing optimization
- ✅ Stream-based pipeline execution

In [None]:
%%writefile capstone_project.cu
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

// Evaluate a CUDA runtime call once; if it does not return cudaSuccess, print
// the source file, line and CUDA error string, then exit with status 1.
#define CHECK_CUDA(call)                                             \
    do {                                                             \
        const cudaError_t cudaStatus_ = (call);                      \
        if (cudaStatus_ != cudaSuccess) {                            \
            printf("CUDA error at %s:%d: %s\n", __FILE__, __LINE__,  \
                   cudaGetErrorString(cudaStatus_));                 \
            exit(1);                                                 \
        }                                                            \
    } while (0)

// Image dimensions (simulated)
#define IMG_WIDTH 1024
#define IMG_HEIGHT 1024
#define IMG_CHANNELS 3

// =============================================================================
// STAGE 1: Grayscale Conversion
// TODO: Implement RGB to grayscale with coalesced memory access
// Formula: gray = 0.299*R + 0.587*G + 0.114*B
// =============================================================================
// Convert an interleaved 8-bit RGB image (3 bytes per pixel, row-major) to
// 8-bit grayscale using gray = 0.299*R + 0.587*G + 0.114*B, one thread per
// pixel on a 2D grid.
//
// The weighted sum is rounded to the nearest integer. A plain cast truncates,
// which biases every pixel downwards and can turn a sum that float rounding
// leaves just below an integer (e.g. 254.99998 for pure white) into the next
// lower value.
__global__ void rgbToGrayscale(unsigned char* gray, unsigned char* rgb, 
                                int width, int height) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    
    if (x < width && y < height) {
        int grayIdx = y * width + x;
        int rgbIdx = grayIdx * 3;  // three interleaved channels per pixel
        
        unsigned char r = rgb[rgbIdx + 0];
        unsigned char g = rgb[rgbIdx + 1];
        unsigned char b = rgb[rgbIdx + 2];
        
        // The weights sum to 1, so the luminance stays within [0, 255] and
        // adding 0.5 before the cast cannot overflow an unsigned char.
        float luminance = 0.299f * r + 0.587f * g + 0.114f * b;
        gray[grayIdx] = (unsigned char)(luminance + 0.5f);
    }
}

// =============================================================================
// STAGE 2: Gaussian Blur (3x3)
// TODO: Implement using shared memory for the tile
// =============================================================================
#define BLUR_TILE_SIZE 16
#define BLUR_RADIUS 1

__constant__ float d_gaussianKernel[9] = {
    1.0f/16, 2.0f/16, 1.0f/16,
    2.0f/16, 4.0f/16, 2.0f/16,
    1.0f/16, 2.0f/16, 1.0f/16
};

// 3x3 Gaussian blur of an 8-bit single-channel image using a shared-memory
// tile of (BLUR_TILE_SIZE + 2*BLUR_RADIUS)^2 pixels per block.
//
// Output pixel (x, y) is produced by thread (tx, ty) of block
// (x / BLUR_TILE_SIZE, y / BLUR_TILE_SIZE); only threads with
// tx, ty < BLUR_TILE_SIZE write a result. Pixels outside the image are
// clamped to the nearest edge.
//
// The tile is filled with a strided cooperative load, so every tile cell
// (halo included) is written exactly once whatever the block shape is -
// a 16x16 block and an 18x18 block both work. Giving each thread a single
// load addressed by `loadX + tx` would count tx twice and, with a 16x16
// block, never fill the last two tile rows and columns.
__global__ void gaussianBlur(unsigned char* out, unsigned char* in,
                              int width, int height) {
    // Shared memory with halo for boundary
    const int TILE_DIM = BLUR_TILE_SIZE + 2 * BLUR_RADIUS;
    __shared__ float tile[TILE_DIM][TILE_DIM];
    
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int x = blockIdx.x * BLUR_TILE_SIZE + tx;
    int y = blockIdx.y * BLUR_TILE_SIZE + ty;
    
    // Image coordinates of tile[0][0]: the block's first pixel shifted up
    // and left by the halo width.
    int originX = (int)(blockIdx.x * BLUR_TILE_SIZE) - BLUR_RADIUS;
    int originY = (int)(blockIdx.y * BLUR_TILE_SIZE) - BLUR_RADIUS;
    
    // Cooperative load: threads stride over the tile so that all
    // TILE_DIM x TILE_DIM cells are filled (clamp to edge at the border).
    for (int ly = ty; ly < TILE_DIM; ly += blockDim.y) {
        int srcY = min(max(originY + ly, 0), height - 1);
        for (int lx = tx; lx < TILE_DIM; lx += blockDim.x) {
            int srcX = min(max(originX + lx, 0), width - 1);
            tile[ly][lx] = (float)in[srcY * width + srcX];
        }
    }
    
    // The whole tile must be populated before any thread reads neighbours.
    __syncthreads();
    
    // Apply the 3x3 convolution; tile[ty + 1][tx + 1] is this thread's pixel.
    if (x < width && y < height && tx < BLUR_TILE_SIZE && ty < BLUR_TILE_SIZE) {
        float sum = 0.0f;
        
        for (int ky = 0; ky < 3; ky++) {
            for (int kx = 0; kx < 3; kx++) {
                sum += tile[ty + ky][tx + kx] * d_gaussianKernel[ky * 3 + kx];
            }
        }
        
        out[y * width + x] = (unsigned char)min(max(sum, 0.0f), 255.0f);
    }
}

// =============================================================================
// STAGE 3: Sobel Edge Detection
// TODO: Implement Sobel operator
// =============================================================================
__constant__ int d_sobelX[9] = {-1, 0, 1, -2, 0, 2, -1, 0, 1};
__constant__ int d_sobelY[9] = {-1, -2, -1, 0, 0, 0, 1, 2, 1};

// Sobel edge detector for an 8-bit single-channel image: writes the gradient
// magnitude sqrt(Gx^2 + Gy^2), saturated to 255, for every pixel.
//
// Each block stages its pixels plus a one-pixel halo in an 18x18
// shared-memory tile, so the block may be at most 16x16 threads. Pixels
// outside the image are clamped to the nearest edge.
//
// The tile is filled with a strided cooperative load so the halo cells are
// written as well: one load per thread fills only a blockDim-sized part of
// the tile, while the 3x3 stencil reads up to index blockDim + 1.
__global__ void sobelEdgeDetection(unsigned char* out, unsigned char* in,
                                    int width, int height) {
    __shared__ float tile[18][18];  // 16 + 1 halo pixel on each side
    
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int x = blockIdx.x * blockDim.x + tx;
    int y = blockIdx.y * blockDim.y + ty;
    
    // Image coordinates of tile[0][0] and the part of the tile this block
    // actually uses (its own pixels plus the halo).
    int originX = (int)(blockIdx.x * blockDim.x) - 1;
    int originY = (int)(blockIdx.y * blockDim.y) - 1;
    int tileW = (int)blockDim.x + 2;
    int tileH = (int)blockDim.y + 2;
    
    // Cooperative load of the tile including the halo (clamp to edge)
    for (int ly = ty; ly < tileH; ly += blockDim.y) {
        int srcY = min(max(originY + ly, 0), height - 1);
        for (int lx = tx; lx < tileW; lx += blockDim.x) {
            int srcX = min(max(originX + lx, 0), width - 1);
            tile[ly][lx] = (float)in[srcY * width + srcX];
        }
    }
    
    // The whole tile must be populated before any thread reads neighbours.
    __syncthreads();
    
    if (x < width && y < height) {
        // Accumulate both gradients in one sweep over the 3x3 window;
        // tile[ty + 1][tx + 1] is this thread's own pixel.
        float gx = 0.0f, gy = 0.0f;
        
        for (int ky = 0; ky < 3; ky++) {
            for (int kx = 0; kx < 3; kx++) {
                float val = tile[ty + ky][tx + kx];
                gx += val * d_sobelX[ky * 3 + kx];
                gy += val * d_sobelY[ky * 3 + kx];
            }
        }
        
        // Gradient magnitude
        float mag = sqrtf(gx * gx + gy * gy);
        out[y * width + x] = (unsigned char)min(mag, 255.0f);
    }
}

// =============================================================================
// STAGE 4: Histogram Computation
// TODO: Implement parallel histogram with atomics
// =============================================================================
#define NUM_BINS 256

// Builds a NUM_BINS-bin intensity histogram of an 8-bit image.
//
// histogram : global bin counters; this kernel only adds to them
// image     : width * height source pixels
// width, height : image dimensions in pixels
//
// Each thread handles at most one pixel. Counts are first gathered in a
// per-block shared-memory histogram and then added to the global one.
__global__ void computeHistogram(int* histogram, unsigned char* image,
                                  int width, int height) {
    __shared__ int localHist[NUM_BINS];

    const int lane = threadIdx.x;
    const int step = blockDim.x;
    const int pixel = blockIdx.x * blockDim.x + lane;
    const int pixelCount = width * height;

    // Zero the block-private bins, strided across the block's threads.
    int bin = lane;
    while (bin < NUM_BINS) {
        localHist[bin] = 0;
        bin += step;
    }
    __syncthreads();

    // One pixel per thread; threads past the end of the image contribute nothing.
    if (pixel < pixelCount) {
        atomicAdd(&localHist[image[pixel]], 1);
    }
    __syncthreads();

    // Fold non-empty block-private bins into the global histogram.
    for (bin = lane; bin < NUM_BINS; bin += step) {
        const int count = localHist[bin];
        if (count <= 0) {
            continue;
        }
        atomicAdd(&histogram[bin], count);
    }
}

// =============================================================================
// STAGE 5: Histogram Equalization (CDF computation + mapping)
// =============================================================================
// Turns a histogram into a normalised cumulative distribution.
//
// cdf       : output, one float per bin
// histogram : input bin counts
// numPixels : divisor used to normalise each bin
//
// Thread t owns bin t, so the kernel is written for a single block of
// NUM_BINS threads. The prefix sum is a Hillis-Steele inclusive scan: in each
// round every bin at index >= stride adds the value `stride` positions to
// its left, and the stride doubles.
__global__ void computeCDF(float* cdf, int* histogram, int numPixels) {
    __shared__ float temp[NUM_BINS];

    const int bin = threadIdx.x;

    // Normalised bin probability.
    temp[bin] = (float)histogram[bin] / numPixels;
    __syncthreads();

    int stride = 1;
    while (stride < NUM_BINS) {
        const bool active = (bin >= stride);

        // Read phase: fetch the partner value before anyone overwrites it.
        const float left = active ? temp[bin - stride] : 0.0f;
        __syncthreads();

        // Write phase.
        if (active) {
            temp[bin] += left;
        }
        __syncthreads();

        stride *= 2;
    }

    cdf[bin] = temp[bin];
}

// Remaps every pixel through the equalisation transfer function
//   out = (cdf[in] - cdfMin) / (1 - cdfMin) * 255, clamped to [0, 255].
//
// out    : destination image, width * height bytes
// in     : source image, width * height bytes
// cdf    : cumulative distribution indexed by pixel value
// cdfMin : smallest non-zero CDF value (computed by the caller)
// width, height : image dimensions in pixels
//
// One thread per pixel on a 1D grid. If 1 - cdfMin is not positive (for
// example a single-intensity image, where cdfMin == 1) the transfer function
// is undefined (0/0), so the pixel is copied through unchanged.
__global__ void applyEqualization(unsigned char* out, unsigned char* in,
                                   float* cdf, float cdfMin,
                                   int width, int height) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int numPixels = width * height;

    if (idx >= numPixels) {
        return;
    }

    unsigned char val = in[idx];
    float range = 1.0f - cdfMin;

    // Written as !(range > 0) so a NaN cdfMin also takes the pass-through path.
    if (!(range > 0.0f)) {
        out[idx] = val;
        return;
    }

    float equalized = (cdf[val] - cdfMin) / range * 255.0f;
    out[idx] = (unsigned char)min(max(equalized, 0.0f), 255.0f);
}

// =============================================================================
// FULL PIPELINE with Streams
// =============================================================================
// Runs the whole pipeline on one stream and reports its GPU time:
//   RGB -> grayscale -> Gaussian blur -> { Sobel edges, histogram -> CDF -> equalization }
//
// h_rgb    : host input, width * height * 3 bytes (interleaved RGB)
// h_output : host output, width * height bytes; receives the Sobel edge map
// width, height : image dimensions in pixels (must be positive)
//
// The equalized image is computed on the device but not copied back; only the
// edge map is returned. The timed interval starts after the input upload has
// been enqueued and ends after the equalization kernel; it includes the
// mid-pipeline host synchronisation used to read the CDF back.
void runPipeline(unsigned char* h_rgb, unsigned char* h_output,
                 int width, int height) {
    printf("Running full image processing pipeline...\n");

    if (!h_rgb || !h_output || width <= 0 || height <= 0) {
        fprintf(stderr, "runPipeline: invalid arguments (width=%d, height=%d)\n",
                width, height);
        return;
    }

    // Widen BEFORE multiplying so byte counts cannot overflow int.
    size_t graySize = (size_t)width * (size_t)height;
    size_t rgbSize = graySize * 3;

    // Device memory
    unsigned char *d_rgb, *d_gray, *d_blurred, *d_edges, *d_equalized;
    int *d_histogram;
    float *d_cdf;

    CHECK_CUDA(cudaMalloc(&d_rgb, rgbSize));
    CHECK_CUDA(cudaMalloc(&d_gray, graySize));
    CHECK_CUDA(cudaMalloc(&d_blurred, graySize));
    CHECK_CUDA(cudaMalloc(&d_edges, graySize));
    CHECK_CUDA(cudaMalloc(&d_equalized, graySize));
    CHECK_CUDA(cudaMalloc(&d_histogram, NUM_BINS * sizeof(int)));
    CHECK_CUDA(cudaMalloc(&d_cdf, NUM_BINS * sizeof(float)));

    // Create stream
    cudaStream_t stream;
    CHECK_CUDA(cudaStreamCreate(&stream));

    cudaEvent_t start, stop;
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    // Copy input and clear the histogram accumulator
    CHECK_CUDA(cudaMemcpyAsync(d_rgb, h_rgb, rgbSize, cudaMemcpyHostToDevice, stream));
    CHECK_CUDA(cudaMemsetAsync(d_histogram, 0, NUM_BINS * sizeof(int), stream));

    CHECK_CUDA(cudaEventRecord(start, stream));

    // Stage 1: Grayscale
    dim3 block2D(16, 16);
    dim3 grid2D((width + 15) / 16, (height + 15) / 16);
    rgbToGrayscale<<<grid2D, block2D, 0, stream>>>(d_gray, d_rgb, width, height);
    CHECK_CUDA(cudaGetLastError());   // catches bad launch configurations

    // Stage 2: Gaussian Blur (block carries a 1-pixel halo around each tile)
    dim3 blurGrid((width + BLUR_TILE_SIZE - 1) / BLUR_TILE_SIZE,
                  (height + BLUR_TILE_SIZE - 1) / BLUR_TILE_SIZE);
    dim3 blurBlock(BLUR_TILE_SIZE + 2, BLUR_TILE_SIZE + 2);
    gaussianBlur<<<blurGrid, blurBlock, 0, stream>>>(d_blurred, d_gray, width, height);
    CHECK_CUDA(cudaGetLastError());

    // Stage 3: Edge Detection
    sobelEdgeDetection<<<grid2D, block2D, 0, stream>>>(d_edges, d_blurred, width, height);
    CHECK_CUDA(cudaGetLastError());

    // Stage 4: Histogram of the blurred image (one thread per pixel)
    int pixelBlocks = (width * height + 255) / 256;
    computeHistogram<<<pixelBlocks, 256, 0, stream>>>(d_histogram, d_blurred, width, height);
    CHECK_CUDA(cudaGetLastError());

    // Stage 5: CDF and Equalization
    computeCDF<<<1, NUM_BINS, 0, stream>>>(d_cdf, d_histogram, width * height);
    CHECK_CUDA(cudaGetLastError());

    // Get cdfMin (simplified - normally you'd find this on GPU)
    float h_cdf[NUM_BINS];
    CHECK_CUDA(cudaMemcpyAsync(h_cdf, d_cdf, NUM_BINS * sizeof(float),
                                cudaMemcpyDeviceToHost, stream));
    CHECK_CUDA(cudaStreamSynchronize(stream));   // h_cdf is read on the host next

    // cdfMin is the first strictly positive CDF entry.
    float cdfMin = 0.0f;
    for (int i = 0; i < NUM_BINS; i++) {
        if (h_cdf[i] > 0) {
            cdfMin = h_cdf[i];
            break;
        }
    }

    applyEqualization<<<pixelBlocks, 256, 0, stream>>>(d_equalized, d_blurred,
                                                        d_cdf, cdfMin, width, height);
    CHECK_CUDA(cudaGetLastError());

    CHECK_CUDA(cudaEventRecord(stop, stream));

    // Copy results (edge map only)
    CHECK_CUDA(cudaMemcpyAsync(h_output, d_edges, graySize, cudaMemcpyDeviceToHost, stream));
    CHECK_CUDA(cudaStreamSynchronize(stream));

    float pipelineTime;
    CHECK_CUDA(cudaEventElapsedTime(&pipelineTime, start, stop));

    // pixels / (ms * 1000) == megapixels per second
    printf("Pipeline completed in %.3f ms\n", pipelineTime);
    printf("Throughput: %.2f Mpixels/sec\n",
           (float)graySize / (pipelineTime * 1000.0f));

    // Cleanup
    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));
    CHECK_CUDA(cudaStreamDestroy(stream));
    CHECK_CUDA(cudaFree(d_rgb));
    CHECK_CUDA(cudaFree(d_gray));
    CHECK_CUDA(cudaFree(d_blurred));
    CHECK_CUDA(cudaFree(d_edges));
    CHECK_CUDA(cudaFree(d_equalized));
    CHECK_CUDA(cudaFree(d_histogram));
    CHECK_CUDA(cudaFree(d_cdf));
}

// =============================================================================
// Test and Verification
// =============================================================================
// Smoke-tests each pipeline stage on a synthetic gradient image, then runs the
// full pipeline once.
//
// Grayscale, blur and edge stages are only checked for launch and execution
// errors; the histogram stage is additionally checked by comparing the sum of
// all bins against the pixel count.
void verifyStages() {
    printf("\n‚ïî‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó\n");
    printf("‚ïë            Capstone: Image Processing Pipeline               ‚ïë\n");
    printf("‚ïö‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïù\n\n");

    int width = IMG_WIDTH;
    int height = IMG_HEIGHT;
    size_t numPixels = (size_t)width * (size_t)height;   // widened before use as a byte count

    // Create test image (gradient pattern)
    unsigned char* h_rgb = (unsigned char*)malloc(numPixels * 3);
    unsigned char* h_output = (unsigned char*)malloc(numPixels);
    if (!h_rgb || !h_output) {
        fprintf(stderr, "verifyStages: host allocation failed\n");
        free(h_rgb);      // free(NULL) is a no-op
        free(h_output);
        return;
    }

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            size_t idx = ((size_t)y * width + x) * 3;
            h_rgb[idx + 0] = (x * 255) / width;      // R gradient
            h_rgb[idx + 1] = (y * 255) / height;     // G gradient
            h_rgb[idx + 2] = 128;                     // B constant
        }
    }

    printf("Test image: %dx%d RGB\n\n", width, height);

    // Run individual stage tests
    printf("=== Testing Individual Stages ===\n");

    // Test grayscale
    unsigned char *d_rgb, *d_gray;
    CHECK_CUDA(cudaMalloc(&d_rgb, numPixels * 3));
    CHECK_CUDA(cudaMalloc(&d_gray, numPixels));
    CHECK_CUDA(cudaMemcpy(d_rgb, h_rgb, numPixels * 3, cudaMemcpyHostToDevice));

    dim3 block(16, 16);
    dim3 grid((width + 15) / 16, (height + 15) / 16);
    rgbToGrayscale<<<grid, block>>>(d_gray, d_rgb, width, height);
    CHECK_CUDA(cudaGetLastError());          // launch-configuration errors
    CHECK_CUDA(cudaDeviceSynchronize());     // execution errors
    printf("‚úì Grayscale conversion: OK\n");

    // Test blur
    unsigned char* d_blurred;
    CHECK_CUDA(cudaMalloc(&d_blurred, numPixels));
    dim3 blurGrid((width + BLUR_TILE_SIZE - 1) / BLUR_TILE_SIZE,
                  (height + BLUR_TILE_SIZE - 1) / BLUR_TILE_SIZE);
    dim3 blurBlock(BLUR_TILE_SIZE + 2, BLUR_TILE_SIZE + 2);
    gaussianBlur<<<blurGrid, blurBlock>>>(d_blurred, d_gray, width, height);
    CHECK_CUDA(cudaGetLastError());
    CHECK_CUDA(cudaDeviceSynchronize());
    printf("‚úì Gaussian blur: OK\n");

    // Test edge detection
    unsigned char* d_edges;
    CHECK_CUDA(cudaMalloc(&d_edges, numPixels));
    sobelEdgeDetection<<<grid, block>>>(d_edges, d_blurred, width, height);
    CHECK_CUDA(cudaGetLastError());
    CHECK_CUDA(cudaDeviceSynchronize());
    printf("‚úì Edge detection: OK\n");

    // Test histogram: every pixel must land in exactly one bin
    int* d_histogram;
    CHECK_CUDA(cudaMalloc(&d_histogram, NUM_BINS * sizeof(int)));
    CHECK_CUDA(cudaMemset(d_histogram, 0, NUM_BINS * sizeof(int)));
    int histBlocks = (width * height + 255) / 256;
    computeHistogram<<<histBlocks, 256>>>(d_histogram, d_gray, width, height);
    CHECK_CUDA(cudaGetLastError());
    CHECK_CUDA(cudaDeviceSynchronize());

    int h_histogram[NUM_BINS];
    CHECK_CUDA(cudaMemcpy(h_histogram, d_histogram, NUM_BINS * sizeof(int),
                          cudaMemcpyDeviceToHost));
    int totalCount = 0;
    for (int i = 0; i < NUM_BINS; i++) totalCount += h_histogram[i];
    printf("‚úì Histogram (total count: %d, expected: %d): %s\n",
           totalCount, width * height,
           (totalCount == width * height) ? "OK" : "MISMATCH");

    CHECK_CUDA(cudaFree(d_rgb));
    CHECK_CUDA(cudaFree(d_gray));
    CHECK_CUDA(cudaFree(d_blurred));
    CHECK_CUDA(cudaFree(d_edges));
    CHECK_CUDA(cudaFree(d_histogram));

    printf("\n=== Running Full Pipeline ===\n");
    runPipeline(h_rgb, h_output, width, height);

    printf("\n=== Capstone Summary ===\n");
    printf("All stages implemented and tested!\n");
    printf("Next steps:\n");
    printf("  1. Profile with Nsight Compute\n");
    printf("  2. Optimize memory access patterns\n");
    printf("  3. Experiment with different tile sizes\n");
    printf("  4. Add multi-stream overlapping\n");
    printf("‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê\n");

    free(h_rgb);
    free(h_output);
}

int main() {
    verifyStages();
    return 0;
}

In [None]:
# Compile capstone_project.cu with nvcc (-O3, targeting sm_75) and run the
# resulting binary only if the build succeeds (&&).
# NOTE(review): assumes capstone_project.cu already exists in the working
# directory (presumably written by an earlier cell) - confirm. Adjust -arch
# if the GPU is not compute capability 7.5.
!nvcc -arch=sm_75 -O3 -o capstone_project capstone_project.cu && ./capstone_project

### 🔶 Python/Numba Capstone Exercises (Optional)

For those preferring Python, here's a Numba-based version of the image processing pipeline:

```python
import math

import numba
import numpy as np
from numba import cuda

# Exercise: Implement the pipeline stages in Numba

@cuda.jit
def rgb_to_grayscale(gray, rgb, width, height):
    """Write the weighted luminance of each RGB pixel into ``gray``.

    One thread per pixel on a 2D grid. ``rgb`` is indexed as
    ``rgb[pixel, channel]`` and ``gray`` as ``gray[pixel]``, where
    ``pixel = y * width + x``. Threads outside the image do nothing.
    """
    col, row = cuda.grid(2)
    if col >= width or row >= height:
        return

    pixel = row * width + col
    # Channel weights 0.299 / 0.587 / 0.114 applied to R / G / B.
    gray[pixel] = 0.299 * rgb[pixel, 0] + 0.587 * rgb[pixel, 1] + 0.114 * rgb[pixel, 2]

@cuda.jit
def gaussian_blur_3x3(out, inp, width, height):
    """Apply a 3x3 Gaussian blur using a shared-memory tile.

    ``inp`` and ``out`` are flat arrays indexed as ``y * width + x``.

    Launch contract: 2D blocks of at most 16x16 threads (the tile is 18x18:
    16 interior pixels plus a 1-pixel halo on every side) and a grid with one
    block per 16x16 image tile. Pixels outside the image are replaced by the
    nearest edge pixel.

    The weights are the binomial kernel [[1, 2, 1], [2, 4, 2], [1, 2, 1]] / 16.
    """
    tile = cuda.shared.array((18, 18), dtype=numba.float32)

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    x = cuda.blockIdx.x * 16 + tx
    y = cuda.blockIdx.y * 16 + ty

    # Image coordinates of tile[0, 0]: one pixel up/left of the block origin.
    origin_x = cuda.blockIdx.x * 16 - 1
    origin_y = cuda.blockIdx.y * 16 - 1

    # Cooperative load: the block has fewer threads than the tile has cells,
    # so each thread strides over the flattened 18x18 tile.
    threads_per_block = cuda.blockDim.x * cuda.blockDim.y
    cell = ty * cuda.blockDim.x + tx
    while cell < 18 * 18:
        local_y = cell // 18
        local_x = cell % 18
        src_x = min(max(origin_x + local_x, 0), width - 1)   # clamp to edge
        src_y = min(max(origin_y + local_y, 0), height - 1)
        tile[local_y, local_x] = inp[src_y * width + src_x]
        cell += threads_per_block

    # All threads must reach the barrier, so there is no early return above.
    cuda.syncthreads()

    # tile[ty + ky, tx + kx] is image pixel (x - 1 + kx, y - 1 + ky).
    if tx < 16 and ty < 16 and x < width and y < height:
        acc = (1.0 * tile[ty, tx] + 2.0 * tile[ty, tx + 1] + 1.0 * tile[ty, tx + 2] +
               2.0 * tile[ty + 1, tx] + 4.0 * tile[ty + 1, tx + 1] + 2.0 * tile[ty + 1, tx + 2] +
               1.0 * tile[ty + 2, tx] + 2.0 * tile[ty + 2, tx + 1] + 1.0 * tile[ty + 2, tx + 2])
        out[y * width + x] = acc / 16.0

@cuda.jit
def sobel_edge_detection(out, inp, width, height):
    """Sobel gradient magnitude, saturated at 255.

    ``inp`` and ``out`` are flat arrays indexed as ``y * width + x``; one
    thread per pixel on a 2D grid.

    Every in-image pixel is written. Neighbours that fall outside the image
    are replaced by the nearest edge pixel, so border pixels get a defined
    value instead of being left untouched. Interior results are unchanged.
    """
    x, y = cuda.grid(2)
    if x >= width or y >= height:
        return

    # Clamp the 3x3 neighbourhood to the image.
    left = max(x - 1, 0)
    right = min(x + 1, width - 1)
    row_up = max(y - 1, 0) * width
    row_mid = y * width
    row_down = min(y + 1, height - 1) * width

    # Horizontal gradient: columns weighted -1 / 0 / +1, rows 1 / 2 / 1.
    gx = (-1 * inp[row_up + left] + 1 * inp[row_up + right] +
          -2 * inp[row_mid + left] + 2 * inp[row_mid + right] +
          -1 * inp[row_down + left] + 1 * inp[row_down + right])

    # Vertical gradient: rows weighted -1 / 0 / +1, columns 1 / 2 / 1.
    gy = (-1 * inp[row_up + left] - 2 * inp[row_up + x] - 1 * inp[row_up + right] +
           1 * inp[row_down + left] + 2 * inp[row_down + x] + 1 * inp[row_down + right])

    mag = math.sqrt(gx * gx + gy * gy)
    out[row_mid + x] = min(mag, 255.0)

# Full pipeline
def run_pipeline(rgb_image):
    """Run grayscale conversion followed by Sobel edge detection on the GPU.

    Parameters
    ----------
    rgb_image : numpy.ndarray
        Image of shape (height, width, channels) with at least 3 channels;
        channels 0..2 are read as R, G, B.

    Returns
    -------
    numpy.ndarray
        Flat float32 array of length ``width * height`` holding the edge
        magnitude of pixel (x, y) at index ``y * width + x``.

    Raises
    ------
    ValueError
        If ``rgb_image`` is not 3-dimensional, has fewer than 3 channels,
        or has no pixels.
    """
    if rgb_image.ndim != 3 or rgb_image.shape[2] < 3:
        raise ValueError("rgb_image must have shape (height, width, >=3)")
    height, width = rgb_image.shape[0], rgb_image.shape[1]
    if width == 0 or height == 0:
        raise ValueError("rgb_image must contain at least one pixel")

    # The grayscale kernel indexes rgb[pixel, channel], so flatten the two
    # spatial axes into one before uploading.
    rgb_flat = np.ascontiguousarray(rgb_image).reshape(height * width, rgb_image.shape[2])

    # Allocate device memory
    d_rgb = cuda.to_device(rgb_flat)
    d_gray = cuda.device_array(width * height, dtype=np.float32)
    # Zero-initialised so any pixel a kernel does not write reads back as 0
    # instead of whatever was in the allocation.
    d_edges = cuda.to_device(np.zeros(width * height, dtype=np.float32))

    # Launch kernels: one thread per pixel, 16x16 threads per block
    block = (16, 16)
    grid = ((width + 15) // 16, (height + 15) // 16)

    rgb_to_grayscale[grid, block](d_gray, d_rgb, width, height)
    sobel_edge_detection[grid, block](d_edges, d_gray, width, height)

    return d_edges.copy_to_host()
```

**Challenge**: Extend with histogram equalization and stream-based overlapping!

---

## Capstone Extensions (Optional Challenges)

### Extension 1: Multi-GPU Large Image Processing
- Split large images across GPUs
- Handle halo exchange for overlapping regions

### Extension 2: Real-time Video Filtering
- Process video frames in a pipeline
- Use streams for frame-level parallelism

### Extension 3: Separable Filters
- Implement separable convolution (2 1D passes)
- Compare performance with 2D convolution

### Extension 4: cuDNN Comparison
- Compare your implementation with cuDNN
- Analyze where cuDNN does better

---

## 🎉 Curriculum Complete!

```
╔══════════════════════════════════════════════════════════════╗
║                                                              ║
║          🎓 CONGRATULATIONS! 🎓                              ║
║                                                              ║
║   You have completed the 12-week CUDA curriculum!           ║
║                                                              ║
║   Skills Mastered:                                          ║
║   ✓ GPU Architecture & Programming Model                    ║
║   ✓ Thread Indexing & Memory Hierarchies                    ║
║   ✓ Shared Memory & Synchronization                         ║
║   ✓ Parallel Patterns (Reduce, Scan, Stencil)               ║
║   ✓ Memory Optimization & Coalescing                        ║
║   ✓ Occupancy & Performance Analysis                        ║
║   ✓ Streams & Asynchronous Execution                        ║
║   ✓ CUDA Graphs                                             ║
║   ✓ Cooperative Groups & Dynamic Parallelism                ║
║   ✓ Multi-GPU Programming                                   ║
║   ✓ Profiling with Nsight                                   ║
║                                                              ║
║   Next Steps:                                                ║
║   • Build real projects with GPU acceleration               ║
║   • Explore CUDA libraries (cuBLAS, cuDNN, Thrust)          ║
║   • Learn about tensor cores and mixed precision            ║
║   • Contribute to open-source GPU projects                  ║
║                                                              ║
╚══════════════════════════════════════════════════════════════╝
```