In [None]:
# ⚙️ Setup
import subprocess, sys

# On Google Colab, install numba into the interpreter that runs this kernel.
# Anywhere else `import google.colab` raises ImportError and nothing is
# installed; that is deliberate, so the error is swallowed.
try:
    import google.colab
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "numba"])
except ImportError:
    pass

import numpy as np
from numba import cuda
import time

# The banner previously printed mis-encoded bytes instead of the warning sign.
print("⚠️  CUDA C++ is the PRIMARY learning material!")

---

## Part 1: Why Kernel Fusion?

### Memory Bandwidth is the Bottleneck

```
Unfused (3 kernels):
  Input → [Load] → Kernel1 → [Store] → Global Memory
                                            ↓
  Global Memory → [Load] → Kernel2 → [Store] → Global Memory
                                                     ↓
  Global Memory → [Load] → Kernel3 → [Store] → Output

  Total global memory access: 6 loads/stores!

Fused (1 kernel):
  Input → [Load] → Kernel(1+2+3) → [Store] → Output
  
  Total global memory access: 2 loads/stores!
  
  Speedup: Up to 3x for memory-bound operations
```

### 🔷 CUDA C++ Implementation (Primary)

In [None]:
%%writefile fusion_demo.cu
// fusion_demo.cu - Demonstrating fusion benefits
#include <stdio.h>
#include <cuda_runtime.h>

// ========== UNFUSED VERSION ==========
// 3 separate kernels

// Element-wise sum: out[i] = a[i] + b[i] for every i in [0, n).
// Each thread handles the single element selected by its global thread index;
// threads whose index falls at or beyond n do nothing.
__global__ void add_kernel(float* a, float* b, float* out, int n) {
    const int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i >= n) {
        return;
    }
    out[i] = a[i] + b[i];
}

// Element-wise scaling: out[i] = in[i] * scalar for every i in [0, n).
// Each thread handles the single element selected by its global thread index;
// threads whose index falls at or beyond n do nothing.
__global__ void mul_kernel(float* in, float scalar, float* out, int n) {
    const int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i >= n) {
        return;
    }
    out[i] = in[i] * scalar;
}

// Element-wise ReLU: out[i] = max(in[i], 0) for every i in [0, n).
// Each thread handles the single element selected by its global thread index;
// threads whose index falls at or beyond n do nothing.
__global__ void relu_kernel(float* in, float* out, int n) {
    const int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i >= n) {
        return;
    }
    const float x = in[i];
    out[i] = fmaxf(x, 0.0f);
}

// ========== FUSED VERSION ==========
// Single kernel doing add + mul + relu

// Fused add + multiply + ReLU: out[i] = max((a[i] + b[i]) * scalar, 0).
// The intermediate sum and product live in local variables, so each element
// costs two global loads and one global store, with no temporary arrays.
__global__ void fused_add_mul_relu(float* a, float* b, float scalar, float* out, int n) {
    const int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i >= n) {
        return;
    }
    const float summed = a[i] + b[i];      // Add
    const float scaled = summed * scalar;  // Multiply
    out[i] = fmaxf(scaled, 0.0f);          // ReLU
}

int main() {
    printf("=== Kernel Fusion Demonstration ===\n\n");
    
    const int N = 100000000;  // 100M elements
    const float scalar = 2.0f;
    
    float *d_a, *d_b, *d_temp1, *d_temp2, *d_out;
    cudaMalloc(&d_a, N * sizeof(float));
    cudaMalloc(&d_b, N * sizeof(float));
    cudaMalloc(&d_temp1, N * sizeof(float));
    cudaMalloc(&d_temp2, N * sizeof(float));
    cudaMalloc(&d_out, N * sizeof(float));
    
    // Initialize
    cudaMemset(d_a, 1, N * sizeof(float));
    cudaMemset(d_b, 1, N * sizeof(float));
    
    int threads = 256;
    int blocks = (N + threads - 1) / threads;
    
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    
    // ===== Unfused benchmark =====
    cudaEventRecord(start);
    for (int i = 0; i < 100; i++) {
        add_kernel<<<blocks, threads>>>(d_a, d_b, d_temp1, N);
        mul_kernel<<<blocks, threads>>>(d_temp1, scalar, d_temp2, N);
        relu_kernel<<<blocks, threads>>>(d_temp2, d_out, N);
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    
    float unfused_ms;
    cudaEventElapsedTime(&unfused_ms, start, stop);
    
    // ===== Fused benchmark =====
    cudaEventRecord(start);
    for (int i = 0; i < 100; i++) {
        fused_add_mul_relu<<<blocks, threads>>>(d_a, d_b, scalar, d_out, N);
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    
    float fused_ms;
    cudaEventElapsedTime(&fused_ms, start, stop);
    
    printf("Array size: %d elements (%.0f MB)\n\n", N, N * 4.0 / 1e6);
    printf("Unfused (3 kernels): %.2f ms\n", unfused_ms);
    printf("Fused (1 kernel):    %.2f ms\n", fused_ms);
    printf("Speedup: %.2fx\n", unfused_ms / fused_ms);
    
    // Memory analysis
    printf("\nMemory Traffic Analysis:\n");
    printf("  Unfused: 6 * %.0f MB = %.0f MB\n", N * 4.0 / 1e6, 6 * N * 4.0 / 1e6);
    printf("  Fused:   3 * %.0f MB = %.0f MB\n", N * 4.0 / 1e6, 3 * N * 4.0 / 1e6);
    printf("  Bandwidth saved: 50%%\n");
    
    cudaFree(d_a); cudaFree(d_b);
    cudaFree(d_temp1); cudaFree(d_temp2); cudaFree(d_out);
    
    return 0;
}

In [None]:
# Build for the sm_75 architecture and run the binary only if compilation
# succeeded, so a failed build never executes a stale executable.
!nvcc -arch=sm_75 -o fusion_demo fusion_demo.cu && ./fusion_demo

---

## Part 2: Fused Softmax

### Softmax Formula

```
softmax(x_i) = exp(x_i - max(x)) / sum(exp(x - max(x)))

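Naive implementation (4 logical passes; the comparison code below folds
steps 2-3 into one kernel, giving 3 kernel launches):
  1. Find max
  2. Subtract max and exp
  3. Sum
  4. Divide

Fused implementation (1 kernel launch):
  - Compute max, sum, output together inside the same kernel
  - Use warp reductions (register shuffles) instead of storing the
    intermediate max and sum to global memory between kernels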
```

### 🔷 CUDA C++ Implementation (Primary)

In [None]:
%%writefile fused_softmax.cu
// fused_softmax.cu - Fused softmax kernel
#include <stdio.h>
#include <cuda_runtime.h>
#include <float.h>

// Warp-level max reduction using register shuffles.
// All 32 lanes must call this together (full 0xffffffff mask). After the
// five halving steps (offsets 16, 8, 4, 2, 1) lane 0 holds the maximum of
// every lane's input; the values left in the other lanes are partial results.
__inline__ __device__ float warpReduceMax(float val) {
    int offset = 16;
    while (offset > 0) {
        const float from_upper_lane = __shfl_down_sync(0xffffffff, val, offset);
        val = fmaxf(val, from_upper_lane);
        offset >>= 1;
    }
    return val;
}

// Warp-level sum reduction using register shuffles.
// All 32 lanes must call this together (full 0xffffffff mask). After the
// five halving steps (offsets 16, 8, 4, 2, 1) lane 0 holds the sum of every
// lane's input; the values left in the other lanes are partial results.
__inline__ __device__ float warpReduceSum(float val) {
    int offset = 16;
    while (offset > 0) {
        const float from_upper_lane = __shfl_down_sync(0xffffffff, val, offset);
        val = val + from_upper_lane;
        offset >>= 1;
    }
    return val;
}

// Fused softmax: one warp per row, max / sum / normalize in a single kernel.
//
// Each lane walks its row with a stride of 32 columns, so any row length is
// handled; for cols <= 32 every lane touches at most one element and the
// result is the same as the one-element-per-lane formulation.
//
// Parameters:
//   input, output - row-major rows x cols matrices in device memory
//   rows, cols    - matrix dimensions
//
// Launch requirements: blockDim.x must be a multiple of 32 (the shuffles use
// a full-warp mask) and the grid must supply at least `rows` warps in total.
__global__ void fused_softmax_warp(
    float* input, float* output,
    int rows, int cols
) {
    const int kWarpSize = 32;

    int row = blockIdx.x * (blockDim.x / kWarpSize) + threadIdx.x / kWarpSize;
    int lane = threadIdx.x % kWarpSize;

    // `row` is the same for all lanes of a warp, so a warp either exits as a
    // whole here or takes part in the full-mask shuffles below as a whole.
    if (row >= rows) return;

    // size_t offset: row * cols overflows int once the matrix exceeds 2^31 elements.
    const size_t row_offset = (size_t)row * cols;
    float* row_in = input + row_offset;
    float* row_out = output + row_offset;

    // Step 1: Find max (per-lane partial max, then warp reduction)
    float lane_max = -FLT_MAX;
    for (int c = lane; c < cols; c += kWarpSize) {
        lane_max = fmaxf(lane_max, row_in[c]);
    }
    float max_val = warpReduceMax(lane_max);
    max_val = __shfl_sync(0xffffffff, max_val, 0);  // Broadcast lane 0's result

    // Steps 2-3: Sum of exp(x - max) (per-lane partial sum, then warp reduction)
    float lane_sum = 0.0f;
    for (int c = lane; c < cols; c += kWarpSize) {
        lane_sum += expf(row_in[c] - max_val);
    }
    float sum_exp = warpReduceSum(lane_sum);
    sum_exp = __shfl_sync(0xffffffff, sum_exp, 0);  // Broadcast lane 0's result

    // Step 4: Divide. Each lane reads and writes only its own columns, so the
    // kernel stays correct when input and output are the same buffer.
    for (int c = lane; c < cols; c += kWarpSize) {
        row_out[c] = expf(row_in[c] - max_val) / sum_exp;
    }
}

// Naive softmax for comparison (multiple passes)
// Naive pass 1: max_vals[row] = maximum of row `row` of the row-major input.
// The row is chosen by blockIdx.x alone and scanned serially; threadIdx is
// not used, so every thread of a block would repeat the same scan.
__global__ void naive_softmax_pass1(float* input, float* max_vals, int rows, int cols) {
    const int r = blockIdx.x;
    if (r < rows) {
        const int base = r * cols;
        float running_max = -FLT_MAX;
        for (int c = 0; c < cols; ++c) {
            running_max = fmaxf(running_max, input[base + c]);
        }
        max_vals[r] = running_max;
    }
}

// Naive pass 2: sum_vals[row] = sum over the row of exp(x - max_vals[row]).
// The row is chosen by blockIdx.x alone and scanned serially; threadIdx is
// not used, so every thread of a block would repeat the same scan.
__global__ void naive_softmax_pass2(float* input, float* max_vals, float* sum_vals, int rows, int cols) {
    const int r = blockIdx.x;
    if (r < rows) {
        const int base = r * cols;
        const float row_max = max_vals[r];
        float acc = 0.0f;
        for (int c = 0; c < cols; ++c) {
            acc += expf(input[base + c] - row_max);
        }
        sum_vals[r] = acc;
    }
}

// Naive pass 3: output = exp(input - max_vals[row]) / sum_vals[row] for each
// element of the row. The row is chosen by blockIdx.x alone and written
// serially; threadIdx is not used.
__global__ void naive_softmax_pass3(float* input, float* output, float* max_vals, float* sum_vals, int rows, int cols) {
    const int r = blockIdx.x;
    if (r < rows) {
        const int base = r * cols;
        const float row_max = max_vals[r];
        const float row_sum = sum_vals[r];
        for (int c = 0; c < cols; ++c) {
            const float shifted = input[base + c] - row_max;
            output[base + c] = expf(shifted) / row_sum;
        }
    }
}

int main() {
    printf("=== Fused vs Naive Softmax ===\n\n");
    
    const int rows = 100000;
    const int cols = 32;  // Common for attention heads
    
    float *d_input, *d_output, *d_max, *d_sum;
    cudaMalloc(&d_input, rows * cols * sizeof(float));
    cudaMalloc(&d_output, rows * cols * sizeof(float));
    cudaMalloc(&d_max, rows * sizeof(float));
    cudaMalloc(&d_sum, rows * sizeof(float));
    
    // Random init
    float* h_input = new float[rows * cols];
    for (int i = 0; i < rows * cols; i++) h_input[i] = (rand() % 1000) / 100.0f - 5.0f;
    cudaMemcpy(d_input, h_input, rows * cols * sizeof(float), cudaMemcpyHostToDevice);
    
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    
    // Naive benchmark
    cudaEventRecord(start);
    for (int i = 0; i < 100; i++) {
        naive_softmax_pass1<<<rows, 1>>>(d_input, d_max, rows, cols);
        naive_softmax_pass2<<<rows, 1>>>(d_input, d_max, d_sum, rows, cols);
        naive_softmax_pass3<<<rows, 1>>>(d_input, d_output, d_max, d_sum, rows, cols);
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float naive_ms;
    cudaEventElapsedTime(&naive_ms, start, stop);
    
    // Fused benchmark
    int warps_per_block = 8;
    int blocks = (rows + warps_per_block - 1) / warps_per_block;
    
    cudaEventRecord(start);
    for (int i = 0; i < 100; i++) {
        fused_softmax_warp<<<blocks, warps_per_block * 32>>>(d_input, d_output, rows, cols);
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float fused_ms;
    cudaEventElapsedTime(&fused_ms, start, stop);
    
    printf("Matrix: %d rows x %d cols\n\n", rows, cols);
    printf("Naive (3 passes):  %.2f ms\n", naive_ms);
    printf("Fused (1 pass):    %.2f ms\n", fused_ms);
    printf("Speedup: %.2fx\n", naive_ms / fused_ms);
    
    delete[] h_input;
    cudaFree(d_input); cudaFree(d_output);
    cudaFree(d_max); cudaFree(d_sum);
    
    return 0;
}

In [None]:
# Build for the sm_75 architecture and run the binary only if compilation
# succeeded, so a failed build never executes a stale executable.
!nvcc -arch=sm_75 -o fused_softmax fused_softmax.cu && ./fused_softmax

---

## Part 3: Fused Layer Normalization

### LayerNorm Formula

```
LayerNorm(x) = gamma * (x - mean) / sqrt(var + eps) + beta

Requires:
  - Mean computation (reduction)
  - Variance computation (reduction)
  - Normalization (elementwise)
```

### 🔷 CUDA C++ Implementation (Primary)

In [None]:
%%writefile fused_layernorm.cu
// fused_layernorm.cu - Fused layer normalization
#include <stdio.h>
#include <cuda_runtime.h>

// Welford's online algorithm for mean and variance
// Fold one sample into a running Welford accumulator, in place.
//   val   - the new sample
//   mean  - running mean of the samples seen so far
//   m2    - running sum of squared deviations from the mean
//   count - number of samples seen so far (kept as a float)
__inline__ __device__ void welford_update(
    float val, float& mean, float& m2, float& count
) {
    count = count + 1.0f;
    const float dev_from_old_mean = val - mean;
    mean = mean + dev_from_old_mean / count;
    // The second deviation is measured against the mean AFTER it was updated.
    const float dev_from_new_mean = val - mean;
    m2 = m2 + dev_from_old_mean * dev_from_new_mean;
}

// Merge two Welford accumulators (a and b) into one.
//   mean_x, m2_x, count_x - mean, sum of squared deviations, and sample
//                           count of partition x
//   mean, m2, count       - outputs: statistics of the union of a and b
//
// Merging two EMPTY partitions yields an empty accumulator (all zeros).
// Without that guard the formulas divide 0 by 0 and the resulting NaN mean
// contaminates every later merge in a reduction.
__inline__ __device__ void welford_combine(
    float mean_a, float m2_a, float count_a,
    float mean_b, float m2_b, float count_b,
    float& mean, float& m2, float& count
) {
    count = count_a + count_b;
    if (count == 0.0f) {
        mean = 0.0f;
        m2 = 0.0f;
        return;
    }
    float delta = mean_b - mean_a;
    mean = mean_a + delta * count_b / count;
    m2 = m2_a + m2_b + delta * delta * count_a * count_b / count;
}

// Fused LayerNorm: one block per row.
//
// Every thread accumulates Welford statistics over a strided slice of the
// row, the block merges them with a shared-memory tree reduction, and every
// thread then normalizes its slice: out = gamma * (x - mean) * inv_std + beta.
//
// Parameters:
//   input, output - row-major rows x cols matrices in device memory
//   gamma, beta   - per-column scale and shift, cols elements each
//   rows, cols    - matrix dimensions
//   eps           - added to the variance before the reciprocal square root
//
// Launch requirements: one block per row and 3 * blockDim.x * sizeof(float)
// bytes of dynamic shared memory. Any block size works, not only powers of two.
__global__ void fused_layernorm(
    float* input, float* output,
    float* gamma, float* beta,
    int rows, int cols, float eps
) {
    extern __shared__ float shared[];
    float* s_mean = shared;
    float* s_m2 = shared + blockDim.x;
    float* s_count = shared + 2 * blockDim.x;

    int row = blockIdx.x;
    int tid = threadIdx.x;
    const int nthreads = blockDim.x;

    // Both conditions are uniform across the block, so either every thread
    // returns here or every thread reaches the __syncthreads() calls below.
    // An empty row (cols <= 0) has no statistics and nothing to write.
    if (row >= rows || cols <= 0) return;

    // size_t offset: row * cols overflows int once the matrix exceeds 2^31 elements.
    const size_t row_offset = (size_t)row * cols;
    float* row_in = input + row_offset;
    float* row_out = output + row_offset;

    // Each thread processes multiple elements
    float local_mean = 0.0f, local_m2 = 0.0f, local_count = 0.0f;

    for (int i = tid; i < cols; i += nthreads) {
        welford_update(row_in[i], local_mean, local_m2, local_count);
    }

    s_mean[tid] = local_mean;
    s_m2[tid] = local_m2;
    s_count[tid] = local_count;
    __syncthreads();

    // Reduction. Start from the largest power of two below the block size so
    // that no partial result is dropped when blockDim.x is not a power of two
    // (for power-of-two sizes this is the usual blockDim.x / 2).
    int stride = 1;
    while (stride < nthreads) stride <<= 1;
    for (stride >>= 1; stride > 0; stride >>= 1) {
        // Skip partners outside the block and partners that saw no samples:
        // merging an empty partition changes nothing, and merging two empty
        // ones would divide by a zero count.
        if (tid < stride && tid + stride < nthreads && s_count[tid + stride] > 0.0f) {
            float new_mean, new_m2, new_count;
            welford_combine(
                s_mean[tid], s_m2[tid], s_count[tid],
                s_mean[tid + stride], s_m2[tid + stride], s_count[tid + stride],
                new_mean, new_m2, new_count
            );
            s_mean[tid] = new_mean;
            s_m2[tid] = new_m2;
            s_count[tid] = new_count;
        }
        __syncthreads();
    }

    // Slot 0 now holds the statistics of the whole row (count == cols > 0).
    float mean = s_mean[0];
    float var = s_m2[0] / s_count[0];  // population variance
    float inv_std = rsqrtf(var + eps);

    // Apply normalization
    for (int i = tid; i < cols; i += nthreads) {
        float normalized = (row_in[i] - mean) * inv_std;
        row_out[i] = gamma[i] * normalized + beta[i];
    }
}

int main() {
    printf("=== Fused Layer Normalization ===\n\n");
    
    const int rows = 10000;
    const int cols = 768;  // BERT hidden size
    const float eps = 1e-5f;
    
    float *d_input, *d_output, *d_gamma, *d_beta;
    cudaMalloc(&d_input, rows * cols * sizeof(float));
    cudaMalloc(&d_output, rows * cols * sizeof(float));
    cudaMalloc(&d_gamma, cols * sizeof(float));
    cudaMalloc(&d_beta, cols * sizeof(float));
    
    // Initialize
    float* h_gamma = new float[cols];
    float* h_beta = new float[cols];
    for (int i = 0; i < cols; i++) { h_gamma[i] = 1.0f; h_beta[i] = 0.0f; }
    cudaMemcpy(d_gamma, h_gamma, cols * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_beta, h_beta, cols * sizeof(float), cudaMemcpyHostToDevice);
    
    int threads = 256;
    int shared_mem = 3 * threads * sizeof(float);
    
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    
    cudaEventRecord(start);
    for (int i = 0; i < 100; i++) {
        fused_layernorm<<<rows, threads, shared_mem>>>(
            d_input, d_output, d_gamma, d_beta, rows, cols, eps
        );
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    
    float ms;
    cudaEventElapsedTime(&ms, start, stop);
    
    printf("Configuration: %d rows x %d cols (BERT-like)\n", rows, cols);
    printf("Time: %.2f ms (100 iterations)\n", ms);
    printf("Throughput: %.2f GB/s\n", 
           2.0 * rows * cols * 4 * 100 / (ms * 1e6));
    
    delete[] h_gamma; delete[] h_beta;
    cudaFree(d_input); cudaFree(d_output);
    cudaFree(d_gamma); cudaFree(d_beta);
    
    return 0;
}

In [None]:
# Build for the sm_75 architecture and run the binary only if compilation
# succeeded, so a failed build never executes a stale executable.
!nvcc -arch=sm_75 -o fused_layernorm fused_layernorm.cu && ./fused_layernorm

---

## 🎯 Exercises

### 🔷 CUDA C++ Exercises (Primary)

In [None]:
%%writefile fused_kernels_exercises.cu
// CUDA C++ Exercises - Kernel Fusion
#include <cuda_runtime.h>
#include <stdio.h>

// Exercise 1: Implement a fused bias + ReLU kernel
// TODO: Add your implementation here

// Exercise 2: Implement a fused normalize + scale kernel  
// TODO: Add your implementation here

// Exercise 3: Compare fused vs unfused performance
// TODO: Add benchmarking code here

// Placeholder entry point for the exercise file: prints a banner and a
// reminder to implement the exercises, then exits successfully.
int main() {
    puts("=== Kernel Fusion Exercises ===");
    puts("Implement the exercises above and run!");
    return 0;
}

In [None]:
# Compile the exercise file for the sm_75 architecture; the binary runs only
# when the build succeeds (the two commands are chained with &&).
!nvcc -arch=sm_75 -o fused_kernels_exercises fused_kernels_exercises.cu && ./fused_kernels_exercises

### 🔶 Python/Numba Exercises (Optional)

The following exercises use Python and Numba for rapid prototyping. Complete the CUDA C++ exercises above first for the primary learning objectives.

---

## Summary

### Kernel Fusion Benefits

| Metric | Unfused | Fused |
|--------|---------|-------|
| Kernel launches | N | 1 |
| Global memory access | High | Minimal |
| Launch overhead | Significant | Once |

### When to Fuse

✅ **Good candidates**:
- Element-wise chains (add → mul → activation)
- Reduction + normalize (softmax, layernorm)
- Bias + activation

❌ **Poor candidates**:
- Large GEMM (already compute-bound)
- Operations with dependencies between rows