In [None]:
# ⚙️ Setup
import subprocess, sys
try:
    # Importing google.colab only succeeds when running on Colab; in that case
    # install numba quietly into the interpreter that backs this kernel.
    import google.colab
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "numba"])
except ImportError:
    # Not on Colab: nothing is installed here, so numba must already be
    # available for the import below to succeed.
    pass

import numpy as np
from numba import cuda
import time

print("⚠️  CUDA C++ is the PRIMARY learning material!")

---

## Part 1: Timing Fundamentals

### CUDA Events (Gold Standard)

```cpp
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
kernel<<<...>>>();
cudaEventRecord(stop);

cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
```

### Common Pitfalls

```
❌ WRONG: Using CPU timers without sync
   auto t1 = std::chrono::steady_clock::now();
   kernel<<<...>>>();  // Async!
   auto t2 = std::chrono::steady_clock::now();  // Only measures launch overhead

✅ CORRECT: Use CUDA events or synchronize
   cudaDeviceSynchronize();
   auto t1 = std::chrono::steady_clock::now();
   kernel<<<...>>>();
   cudaDeviceSynchronize();
   auto t2 = std::chrono::steady_clock::now();
```

### 🔷 CUDA C++ Implementation (Primary)

In [None]:
%%writefile benchmark_framework.cu
// benchmark_framework.cu - Professional benchmarking utilities
#include <stdio.h>
#include <cuda_runtime.h>
#include <algorithm>
#include <vector>
#include <cmath>

// Summary statistics for one set of benchmark samples.
// Every field is in the same unit as the samples (main() feeds milliseconds).
struct BenchmarkStats {
    float mean;     // arithmetic mean
    float std_dev;  // sample standard deviation (n - 1 denominator)
    float median;   // middle value; mean of the two middle values for even n
    float min;      // smallest sample
    float max;      // largest sample
    float p95;      // 95th percentile
};

// Computes summary statistics for `times`.
//
// Parameters:
//   times - samples to summarize. NOTE: sorted in place (ascending) as a
//           side effect, so the caller's ordering is lost.
// Returns:
//   A BenchmarkStats filled from the samples. An empty input yields all
//   zeros, and a single sample yields std_dev == 0 (the n - 1 denominator
//   would otherwise produce NaN).
BenchmarkStats computeStats(std::vector<float>& times) {
    BenchmarkStats stats = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
    const size_t n = times.size();
    if (n == 0) {
        return stats;  // nothing to summarize; avoid 0/0 and times[0]
    }

    std::sort(times.begin(), times.end());

    // Mean (accumulated in double to limit rounding error over many samples)
    double sum = 0.0;
    for (float t : times) sum += t;
    const double mean = sum / n;
    stats.mean = (float)mean;

    // Sample standard deviation; undefined for a single sample, so stays 0
    if (n > 1) {
        double sq_sum = 0.0;
        for (float t : times) {
            const double diff = t - mean;
            sq_sum += diff * diff;
        }
        stats.std_dev = (float)std::sqrt(sq_sum / (n - 1));
    }

    // Median
    stats.median = (n % 2 == 0) ?
        (times[n/2 - 1] + times[n/2]) / 2 : times[n/2];

    // Min, Max
    stats.min = times.front();
    stats.max = times.back();

    // 95th percentile: n * 0.95 truncates to an index in [0, n - 1]
    stats.p95 = times[(size_t)(n * 0.95)];

    return stats;
}

void printStats(const char* name, BenchmarkStats& stats) {
    printf("%s:\n", name);
    printf("  Mean:   %.3f ms\n", stats.mean);
    printf("  Median: %.3f ms\n", stats.median);
    printf("  StdDev: %.3f ms (%.1f%%)\n", stats.std_dev, 100 * stats.std_dev / stats.mean);
    printf("  Min:    %.3f ms\n", stats.min);
    printf("  Max:    %.3f ms\n", stats.max);
    printf("  P95:    %.3f ms\n", stats.p95);
}

// Kernel under test: element-wise c = a + b, one element per thread.
__global__ void vector_add(float* a, float* b, float* c, int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    // Threads past the end of the arrays have nothing to do.
    if (i >= n) return;
    c[i] = a[i] + b[i];
}

// Evaluates a CUDA runtime call and, on failure, reports the failing
// expression with the runtime's error string and makes the enclosing
// function return 1. Only usable inside functions returning int (main).
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t cuda_check_err_ = (call);                              \
        if (cuda_check_err_ != cudaSuccess) {                              \
            fprintf(stderr, "CUDA error at %s:%d: %s failed: %s\n",        \
                    __FILE__, __LINE__, #call,                             \
                    cudaGetErrorString(cuda_check_err_));                  \
            return 1;                                                      \
        }                                                                  \
    } while (0)

// Benchmarks vector_add on 100M floats: warm-up launches, then per-launch
// CUDA-event timings summarized with computeStats/printStats, followed by an
// effective-vs-peak memory bandwidth comparison.
// Returns 0 on success, 1 if any CUDA call fails.
int main() {
    printf("=== Professional Benchmarking Framework ===\n\n");

    const int N = 100000000;  // 100M elements
    const int WARMUP = 10;
    const int ITERATIONS = 100;
    const size_t bytes = (size_t)N * sizeof(float);

    float *d_a, *d_b, *d_c;
    CUDA_CHECK(cudaMalloc(&d_a, bytes));
    CUDA_CHECK(cudaMalloc(&d_b, bytes));
    CUDA_CHECK(cudaMalloc(&d_c, bytes));

    // Give the inputs defined contents instead of reading whatever was left
    // in freshly allocated memory.
    CUDA_CHECK(cudaMemset(d_a, 0, bytes));
    CUDA_CHECK(cudaMemset(d_b, 0, bytes));

    int threads = 256;
    int blocks = (N + threads - 1) / threads;

    // Warmup (important!)
    printf("Warming up (%d iterations)...\n", WARMUP);
    for (int i = 0; i < WARMUP; i++) {
        vector_add<<<blocks, threads>>>(d_a, d_b, d_c, N);
    }
    // A bad launch configuration only shows up via cudaGetLastError;
    // catch it before any timing is collected.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Benchmark
    printf("Benchmarking (%d iterations)...\n\n", ITERATIONS);

    std::vector<float> times;
    times.reserve(ITERATIONS);
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    for (int i = 0; i < ITERATIONS; i++) {
        CUDA_CHECK(cudaEventRecord(start));
        vector_add<<<blocks, threads>>>(d_a, d_b, d_c, N);
        CUDA_CHECK(cudaEventRecord(stop));
        // Block the host until `stop` has been reached so the elapsed time
        // covers the whole kernel.
        CUDA_CHECK(cudaEventSynchronize(stop));

        float ms = 0.0f;
        CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));
        times.push_back(ms);
    }

    auto stats = computeStats(times);
    printStats("Vector Addition (100M elements)", stats);

    // Calculate bandwidth
    float gb = 3.0 * bytes / 1e9;  // Read a, b; write c
    float bandwidth = gb / (stats.median / 1000);  // GB/s
    printf("\nBandwidth Analysis:\n");
    printf("  Data moved: %.2f GB\n", gb);
    printf("  Effective bandwidth: %.1f GB/s\n", bandwidth);

    // Get theoretical bandwidth:
    // memory clock (kHz -> Hz) * bus width (bits -> bytes) * 2 transfers per clock
    cudaDeviceProp prop;
    CUDA_CHECK(cudaGetDeviceProperties(&prop, 0));
    float peak_bw = prop.memoryClockRate * 1e3 * (prop.memoryBusWidth / 8) * 2 / 1e9;
    printf("  Peak bandwidth: %.1f GB/s\n", peak_bw);
    printf("  Efficiency: %.1f%%\n", 100 * bandwidth / peak_bw);

    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    return 0;
}

In [None]:
# Compile benchmark_framework.cu (written by the %%writefile cell above) for
# the sm_75 architecture, then run the resulting binary.
# NOTE(review): sm_75 presumably matches the course's target GPU - adjust -arch for other hardware.
!nvcc -arch=sm_75 -o benchmark_framework benchmark_framework.cu
!./benchmark_framework

---

## Part 2: Performance Metrics

### Key Metrics

```
1. Memory Bandwidth (GB/s)
   Bandwidth = Bytes Transferred / Time
   
   For memory-bound kernels, compare to peak:
   - 80%+ efficiency = excellent
   - 60-80% = good
   - <60% = needs optimization

2. Compute Throughput (FLOPS/TFLOPS)
   FLOPS = Floating-point Operations / Time
   
   For compute-bound kernels:
   - Compare to theoretical peak
   - Consider Tensor Core vs CUDA Core

3. Occupancy
   Occupancy = Active Warps / Max Warps
   
   Limited by:
   - Registers per thread
   - Shared memory per block
   - Threads per block
```

### 🔷 CUDA C++ Implementation (Primary)

In [None]:
%%writefile roofline_analysis.cu
// roofline_analysis.cu - Roofline model analysis
#include <stdio.h>
#include <cuda_runtime.h>

// Memory-bound kernel (low arithmetic intensity):
// each thread loads one float, doubles it, and stores one float.
__global__ void memory_bound_kernel(float* in, float* out, int n) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    // Out-of-range threads do nothing.
    if (gid >= n) return;
    out[gid] = in[gid] * 2.0f;  // 1 FLOP per 8 bytes
}

// Compute-bound kernel (high arithmetic intensity):
// one load and one store per element with 100 rounds of arithmetic in between.
__global__ void compute_bound_kernel(float* in, float* out, int n) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    // Out-of-range threads do nothing.
    if (gid >= n) return;

    float acc = in[gid];
    // Lots of compute for the same memory access
    for (int remaining = 100; remaining > 0; --remaining) {
        acc = acc * acc + acc;
        acc = sqrtf(fabsf(acc)) + 1.0f;
    }
    out[gid] = acc;
}

// Times `kernel` and returns the mean time per launch in milliseconds.
//
// Parameters:
//   kernel - __global__ function with signature (float*, float*, int)
//   d_in   - passed through as the kernel's first argument
//   d_out  - passed through as the kernel's second argument
//   n      - element count; passed to the kernel and used to size the grid
//            (256 threads per block, rounded up)
//   name   - label used only when reporting a CUDA error
//
// Runs 5 untimed warm-up launches, then brackets 100 back-to-back launches
// with a single pair of CUDA events and divides the elapsed time by 100.
// If the CUDA runtime recorded an error, it is printed to stderr and the
// returned time should not be trusted.
float benchmark(void (*kernel)(float*, float*, int), float* d_in, float* d_out, int n, const char* name) {
    const int kWarmupRuns = 5;
    const int kTimedRuns = 100;
    const int threads = 256;
    const int blocks = (n + threads - 1) / threads;

    // Warmup
    for (int i = 0; i < kWarmupRuns; i++) {
        kernel<<<blocks, threads>>>(d_in, d_out, n);
    }
    cudaDeviceSynchronize();

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    for (int i = 0; i < kTimedRuns; i++) {
        kernel<<<blocks, threads>>>(d_in, d_out, n);
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms = 0.0f;  // stays 0 if the elapsed-time query fails
    cudaEventElapsedTime(&ms, start, stop);

    // Events are created on every call, so release them on every call.
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    // Launch failures are not returned by the <<<>>> syntax; surface them here.
    const cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "%s: CUDA error during benchmark: %s\n",
                name, cudaGetErrorString(err));
    }

    return ms / kTimedRuns;  // Average per iteration
}

// Evaluates a CUDA runtime call and, on failure, reports the failing
// expression with the runtime's error string and makes the enclosing
// function return 1. Only usable inside functions returning int (main).
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t cuda_check_err_ = (call);                              \
        if (cuda_check_err_ != cudaSuccess) {                              \
            fprintf(stderr, "CUDA error at %s:%d: %s failed: %s\n",        \
                    __FILE__, __LINE__, #call,                             \
                    cudaGetErrorString(cuda_check_err_));                  \
            return 1;                                                      \
        }                                                                  \
    } while (0)

// FP32 cores per SM for a given compute capability, used for the peak-FLOPS
// estimate. Unknown capabilities fall back to 128.
static int fp32CoresPerSM(int major, int minor) {
    switch (major) {
        case 3:  return 192;                      // 3.x
        case 5:  return 128;                      // 5.x
        case 6:  return (minor == 0) ? 64 : 128;  // 6.0 vs 6.1 / 6.2
        case 7:  return 64;                       // 7.0 / 7.2 / 7.5
        case 8:  return (minor == 0) ? 64 : 128;  // 8.0 vs 8.6 / 8.7 / 8.9
        default: return 128;
    }
}

// Runs the memory-bound and compute-bound kernels through benchmark() and
// prints their bandwidth, throughput and arithmetic intensity next to the
// device's estimated peaks and roofline ridge point.
// Returns 0 on success, 1 if a checked CUDA call fails.
int main() {
    printf("=== Roofline Model Analysis ===\n\n");

    const int N = 10000000;  // 10M elements
    const size_t bytes = (size_t)N * sizeof(float);

    float *d_in, *d_out;
    CUDA_CHECK(cudaMalloc(&d_in, bytes));
    CUDA_CHECK(cudaMalloc(&d_out, bytes));
    // Give the input defined contents instead of reading whatever was left
    // in freshly allocated memory.
    CUDA_CHECK(cudaMemset(d_in, 0, bytes));

    // Get device properties
    // NOTE(review): clockRate / memoryClockRate are reported as deprecated by
    // recent CUDA toolkits - confirm they exist in the toolkit you build with.
    cudaDeviceProp prop;
    CUDA_CHECK(cudaGetDeviceProperties(&prop, 0));

    // memory clock (kHz -> Hz) * bus width (bits -> bytes) * 2 transfers per clock
    float peak_bw = prop.memoryClockRate * 1e3 * (prop.memoryBusWidth / 8) * 2 / 1e9;
    // core clock (kHz -> Hz) * SMs * FP32 cores per SM * 2 FLOPs per FMA
    float peak_flops = prop.clockRate * 1e3 * prop.multiProcessorCount *
                       fp32CoresPerSM(prop.major, prop.minor) * 2 / 1e12;  // Approximate FP32 peak TFLOPS

    printf("Device: %s\n", prop.name);
    printf("Peak Memory BW: %.0f GB/s\n", peak_bw);
    printf("Peak FP32: ~%.1f TFLOPS\n\n", peak_flops);

    // Memory-bound kernel
    float mem_time = benchmark(memory_bound_kernel, d_in, d_out, N, "Memory-bound");
    float mem_bytes = 2.0f * bytes;  // Read + Write
    float mem_bw = mem_bytes / (mem_time * 1e6);  // GB/s
    float mem_flops = N;  // 1 multiply per element
    float mem_ai = mem_flops / mem_bytes;  // Arithmetic intensity

    printf("Memory-Bound Kernel:\n");
    printf("  Time: %.3f ms\n", mem_time);
    printf("  Bandwidth: %.1f GB/s (%.1f%% of peak)\n", mem_bw, 100 * mem_bw / peak_bw);
    printf("  Arithmetic Intensity: %.4f FLOP/Byte\n\n", mem_ai);

    // Compute-bound kernel
    float comp_time = benchmark(compute_bound_kernel, d_in, d_out, N, "Compute-bound");
    float comp_bytes = 2.0f * bytes;
    // Evaluated in floating point: N * 100 * 4 is 4e9, which overflows int.
    float comp_flops = 4.0f * 100.0f * N;  // ~4 ops per iteration * 100 iterations
    float comp_gflops = comp_flops / (comp_time * 1e6);
    float comp_ai = comp_flops / comp_bytes;

    printf("Compute-Bound Kernel:\n");
    printf("  Time: %.3f ms\n", comp_time);
    printf("  Throughput: %.1f GFLOPS\n", comp_gflops);
    printf("  Arithmetic Intensity: %.1f FLOP/Byte\n\n", comp_ai);

    // Roofline ridge point
    float ridge = peak_flops * 1e3 / peak_bw;  // GFLOPS/GB/s = FLOP/Byte
    printf("Roofline Analysis:\n");
    printf("  Ridge Point: %.1f FLOP/Byte\n", ridge);
    printf("  Memory-bound kernel: AI < ridge (limited by BW)\n");
    printf("  Compute-bound kernel: AI > ridge (limited by compute)\n");

    CUDA_CHECK(cudaFree(d_in));
    CUDA_CHECK(cudaFree(d_out));
    return 0;
}

In [None]:
# Compile roofline_analysis.cu (written by the %%writefile cell above) for
# the sm_75 architecture, then run the resulting binary.
# NOTE(review): sm_75 presumably matches the course's target GPU - adjust -arch for other hardware.
!nvcc -arch=sm_75 -o roofline_analysis roofline_analysis.cu
!./roofline_analysis

---

## Part 3: Profiling Tools

### NVIDIA Tools

```bash
# Nsight Compute (kernel analysis)
ncu --target-processes all ./my_program
ncu --set full -o profile ./my_program  # Full analysis

# Nsight Systems (timeline)
nsys profile ./my_program
nsys profile --cuda-memory-usage=true ./my_program

# Legacy nvprof (if available)
nvprof ./my_program
nvprof --metrics all ./my_program
```

### Key Metrics to Check

```
From ncu:
  - SM Efficiency: Are SMs busy?
  - Memory Throughput: How much BW used?
  - Achieved Occupancy: Warps active?
  - Warp Stall Reasons: What's blocking?
  
From nsys:
  - Timeline: Overlap of compute/memory
  - API calls: Host-side overhead
  - Memory transfers: H2D/D2H bottlenecks
```

In [None]:
# Check available profiling tools:
# list where ncu, nsys and nvprof are found, then print the ncu version.
# Each command hides its stderr and falls back to a short message if it fails.
!which ncu nsys nvprof 2>/dev/null || echo "Profiling tools not in PATH"
!ncu --version 2>/dev/null || echo "ncu not available"

---

## 🎯 Exercises

### 🔷 CUDA C++ Exercises (Primary)

In [None]:
%%writefile benchmarking_exercises.cu
// CUDA C++ Exercises - Benchmarking
#include <cuda_runtime.h>
#include <stdio.h>

// Exercise 1: Implement proper CUDA event timing wrapper
// TODO: Add your implementation here

// Exercise 2: Create bandwidth calculation utility
// TODO: Add your implementation here

// Exercise 3: Build statistical analysis for benchmark runs
// TODO: Add your implementation here

int main() {
    // Placeholder entry point: prints the banner and a reminder until the
    // exercises above are implemented.
    fputs("=== Benchmarking Exercises ===\n", stdout);
    fputs("Implement the exercises above and run!\n", stdout);
    return 0;
}

In [None]:
# Compile benchmarking_exercises.cu for the sm_75 architecture and run it only if the build succeeds.
!nvcc -arch=sm_75 -o benchmarking_exercises benchmarking_exercises.cu && ./benchmarking_exercises

### 🔶 Python/Numba Exercises (Optional)

The following exercises use Python and Numba for rapid prototyping. Complete the CUDA C++ exercises above first for the primary learning objectives.

---

## Summary

### Benchmarking Checklist

✅ **Always**:
1. Use CUDA events for timing
2. Run warmup iterations first
3. Collect multiple samples
4. Report median AND variance
5. Calculate relevant metrics (BW or FLOPS)

❌ **Never**:
1. Use CPU timers without sync
2. Report single-run results
3. Ignore cold start effects
4. Compare raw timings across different GPUs (compare efficiency against each GPU's own peak instead)

### Curriculum Complete! 🎉

You've completed the 14-week CUDA curriculum covering:
- GPU fundamentals and memory hierarchy
- Kernel optimization techniques
- Advanced topics (streams, multi-GPU)
- Tensor Cores and mixed precision
- Real-world applications and profiling