In [None]:
# ⚙️ Setup
import subprocess, sys
try:
    import google.colab
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "numba"])
except ImportError:
    pass

import numpy as np
from numba import cuda
import math
import time

print("⚠️  CUDA C++ is the PRIMARY learning material!")

---

## Part 1: Matrix Multiplication Basics

### The Algorithm

```
C = A × B

Where:
  A is M × K
  B is K × N
  C is M × N

Each element:
  C[i,j] = Σ A[i,k] * B[k,j]  for k = 0 to K-1

Example (2×3 @ 3×2 = 2×2):

A = [1 2 3]    B = [1 4]    C = [1*1+2*2+3*3  1*4+2*5+3*6] = [14 32]
    [4 5 6]        [2 5]        [4*1+5*2+6*3  4*4+5*5+6*6]   [32 77]
                   [3 6]
```

### Parallelization Strategy

```
Key insight: Each C[i,j] is INDEPENDENT!

GPU mapping:
• One thread per output element
• Thread (i, j) computes C[i,j]
• 2D grid/block structure natural fit

Grid dimensions:
  gridDim.x = ceil(N / blockDim.x)
  gridDim.y = ceil(M / blockDim.y)
```

### CUDA C++ Implementation (Primary)

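### Python/Numba (Optional)

The CUDA C++ program and its build step come first; the optional Numba port of the same kernel follows them.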

In [None]:
%%writefile naive_matmul.cu
// naive_matmul.cu - Basic matrix multiplication
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 16

// Naive matrix multiplication, C = A × B, one thread per element of C.
// Row-major storage: A is M×K, B is K×N, C is M×N.
__global__ void matmul_naive(const float* A, const float* B, float* C,
                              int M, int N, int K) {
    // Global 2D coordinates: y selects the row of C, x selects the column
    const int row = threadIdx.y + blockDim.y * blockIdx.y;
    const int col = threadIdx.x + blockDim.x * blockIdx.x;

    // Threads of partial edge blocks that land outside C have nothing to do
    if (row >= M || col >= N) {
        return;
    }

    // Walk along row `row` of A and down column `col` of B
    const float* a_row = A + row * K;
    float acc = 0.0f;
    for (int k = 0; k < K; ++k) {
        acc += a_row[k] * B[k * N + col];
    }

    C[row * N + col] = acc;
}

// Abort with file/line context when a CUDA runtime call fails, so a failed
// allocation, copy or launch is never silently reported as a timing result.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                               \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

// Benchmark driver: fills A and B with random values, times matmul_naive with
// CUDA events, prints the average time and GFLOPS, then spot-checks C against
// a CPU reference. Returns 0 on success, 1 on allocation or verification failure.
int main() {
    int M = 1024, K = 1024, N = 1024;
    const int iterations = 10;

    // Widen to size_t before multiplying so large shapes cannot overflow int
    size_t count_A = (size_t)M * K;
    size_t count_B = (size_t)K * N;
    size_t size_A = count_A * sizeof(float);
    size_t size_B = count_B * sizeof(float);
    size_t size_C = (size_t)M * N * sizeof(float);

    // Allocate host memory
    float *h_A = (float*)malloc(size_A);
    float *h_B = (float*)malloc(size_B);
    float *h_C = (float*)malloc(size_C);
    if (!h_A || !h_B || !h_C) {
        fprintf(stderr, "Host allocation failed\n");
        free(h_A); free(h_B); free(h_C);
        return 1;
    }

    // Initialize matrices with values in [0, 1]
    for (size_t i = 0; i < count_A; i++) h_A[i] = rand() / (float)RAND_MAX;
    for (size_t i = 0; i < count_B; i++) h_B[i] = rand() / (float)RAND_MAX;

    // Allocate device memory
    float *d_A, *d_B, *d_C;
    CUDA_CHECK(cudaMalloc(&d_A, size_A));
    CUDA_CHECK(cudaMalloc(&d_B, size_B));
    CUDA_CHECK(cudaMalloc(&d_C, size_C));

    // Copy to device
    CUDA_CHECK(cudaMemcpy(d_A, h_A, size_A, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, size_B, cudaMemcpyHostToDevice));

    // Launch configuration: x covers the N columns of C, y covers the M rows
    dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid((N + BLOCK_SIZE - 1) / BLOCK_SIZE,
              (M + BLOCK_SIZE - 1) / BLOCK_SIZE);

    // Warmup
    matmul_naive<<<grid, block>>>(d_A, d_B, d_C, M, N, K);
    CUDA_CHECK(cudaGetLastError());       // launch failures (bad config, wrong -arch)
    CUDA_CHECK(cudaDeviceSynchronize());  // failures raised while the kernel ran

    // Benchmark
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    CUDA_CHECK(cudaEventRecord(start));
    for (int i = 0; i < iterations; i++) {
        matmul_naive<<<grid, block>>>(d_A, d_B, d_C, M, N, K);
    }
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));

    float ms = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));
    ms /= iterations;  // Average

    // Calculate GFLOPS
    double flops = 2.0 * M * N * K;  // multiply + add
    double gflops = flops / (ms * 1e6);

    // Spot-check the corners and the centre of C against a double-precision
    // CPU dot product; the tolerance allows for float32 accumulation error.
    CUDA_CHECK(cudaMemcpy(h_C, d_C, size_C, cudaMemcpyDeviceToHost));
    const int samples[5][2] = {
        {0, 0}, {0, N - 1}, {M - 1, 0}, {M - 1, N - 1}, {M / 2, N / 2}
    };
    int mismatches = 0;
    for (int s = 0; s < 5; s++) {
        int r = samples[s][0];
        int c = samples[s][1];
        double ref = 0.0;
        for (int k = 0; k < K; k++) {
            ref += (double)h_A[(size_t)r * K + k] * h_B[(size_t)k * N + c];
        }
        double diff = h_C[(size_t)r * N + c] - ref;
        if (diff < 0) diff = -diff;
        if (diff > 1e-3 * ref + 1e-5) mismatches++;
    }

    printf("Matrix size: %d x %d x %d\n", M, K, N);
    printf("Time: %.3f ms\n", ms);
    printf("Performance: %.2f GFLOPS\n", gflops);
    printf("Verification: %s\n", mismatches == 0 ? "PASSED" : "FAILED");

    // Cleanup
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    free(h_A); free(h_B); free(h_C);

    return mismatches == 0 ? 0 : 1;
}

In [None]:
!nvcc -arch=sm_75 -o naive_matmul naive_matmul.cu
!./naive_matmul

In [None]:
@cuda.jit
def matmul_naive(A, B, C, M, N, K):
    """Naive matrix multiplication kernel: C = A @ B.

    Each thread computes one output element C[row, col] as the dot product
    of row `row` of A and column `col` of B.

    Args:
        A: 2D device array indexed as A[row, k], with row < M and k < K.
        B: 2D device array indexed as B[k, col], with k < K and col < N.
        C: 2D device array receiving the result at C[row, col].
        M: number of rows of C that are computed.
        N: number of columns of C that are computed.
        K: length of the dot product.
    """
    row = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y
    col = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x

    # Threads in partial edge blocks may fall outside C
    if row < M and col < N:
        # A bare 0.0 literal is typed float64 by Numba, which would promote
        # every float32 product to double precision. Seeding with float32
        # keeps the accumulation in the inputs' precision, matching the
        # `float sum` of the CUDA C++ kernel; float64 inputs still widen the
        # accumulator to float64 through type unification.
        total = np.float32(0.0)
        for k in range(K):
            total += A[row, k] * B[k, col]
        C[row, col] = total

In [None]:
# Validate the Numba kernel against NumPy's matmul
M, K, N = 512, 512, 512

# Host inputs (float32) plus a zero-filled output buffer
A = np.random.rand(M, K).astype(np.float32)
B = np.random.rand(K, N).astype(np.float32)
C = np.zeros((M, N), dtype=np.float32)

# Upload all three arrays to the GPU
d_A, d_B, d_C = (cuda.to_device(host_array) for host_array in (A, B, C))

# 16×16 threads per block; enough blocks to cover N columns (x) and M rows (y)
BLOCK_SIZE = 16
block = (BLOCK_SIZE, BLOCK_SIZE)
grid = (math.ceil(N / BLOCK_SIZE), math.ceil(M / BLOCK_SIZE))

matmul_naive[grid, block](d_A, d_B, d_C, M, N, K)
result = d_C.copy_to_host()

# Verify
expected = np.matmul(A, B)
print(f"Matrix sizes: A({M}×{K}) @ B({K}×{N}) = C({M}×{N})")
print(f"Correct: {'✓' if np.allclose(result, expected, rtol=1e-4) else '✗'}")
print(f"Max error: {np.max(np.abs(result - expected)):.6f}")

---

## Part 2: Memory Access Analysis

### Access Pattern Visualization

```
For C[row, col], thread reads:

From A:                    From B:
Row 'row' of A             Column 'col' of B
┌─────────────┐            ┌──┬──┬──┐
│             │            │  │  │  │
├─────────────┤ ← row      │  │col│  │
│ * * * * * * │            │  │ ↓│  │
├─────────────┤            │  │ * │  │
│             │            │  │ * │  │
└─────────────┘            │  │ * │  │
                           └──┴──┴──┘
```

### Memory Coalescing

```
Adjacent threads in a warp (same row, consecutive cols):

Thread 0: C[row, 0]  → reads A[row, 0:K], B[0:K, 0]
Thread 1: C[row, 1]  → reads A[row, 0:K], B[0:K, 1]
Thread 2: C[row, 2]  → reads A[row, 0:K], B[0:K, 2]
...

A access: All threads read SAME row → broadcast (efficient)
B access: Threads read consecutive columns → COALESCED (efficient)

But HUGE redundancy:
• Each row of A read N times (once per column of C)
• Each column of B read M times (once per row of C)
```

In [None]:
def analyze_memory_traffic(M, K, N):
    """Print a global-memory read estimate for the naive matrix multiply.

    The model charges every output element with K reads from A and K reads
    from B (no reuse), and compares that with reading each element of A and
    B exactly once. Elements are assumed to be 4-byte float32 values.

    Args:
        M: rows of A and of C.
        K: columns of A / rows of B.
        N: columns of B and of C.

    Raises:
        ValueError: if any dimension is not positive; the overhead and
            intensity ratios would otherwise divide by zero.
    """
    if min(M, K, N) < 1:
        raise ValueError(
            f"Matrix dimensions must be positive, got M={M}, K={K}, N={N}"
        )

    bytes_per_element = 4  # float32
    flops = 2 * M * N * K  # one multiply and one add per inner-loop step

    # Each C element reads an entire row of A and an entire column of B
    reads_per_output = K + K
    total_reads = M * N * reads_per_output

    # Theoretical minimum (each A, B element read once)
    min_reads = M * K + K * N

    bytes_read = total_reads * bytes_per_element
    bytes_min = min_reads * bytes_per_element

    print(f"Matrix multiply: ({M}×{K}) @ ({K}×{N}) = ({M}×{N})")
    print(f"\nMemory reads:")
    print(f"  Naive:    {bytes_read / 1e9:.2f} GB")
    print(f"  Minimum:  {bytes_min / 1e6:.2f} MB")
    print(f"  Overhead: {bytes_read / bytes_min:.0f}x")
    print(f"\nOperations: {flops / 1e9:.2f} GFLOP")
    print(f"Arithmetic intensity (naive): {flops / bytes_read:.2f} FLOP/byte")

analyze_memory_traffic(1024, 1024, 1024)

---

## Part 3: Performance Benchmark

In [None]:
def benchmark_matmul(M, K, N, iterations=20, block_size=16):
    """Benchmark naive matrix multiplication on random float32 inputs.

    The timed region covers `iterations` back-to-back kernel launches plus
    the final device synchronize. Host-to-device transfers and one warmup
    launch happen before the timer starts.

    Args:
        M: rows of A and of C.
        K: columns of A / rows of B.
        N: columns of B and of C.
        iterations: number of timed launches to average over (>= 1).
        block_size: edge length of the square thread block (>= 1); the
            block is launched as (block_size, block_size).

    Returns:
        Tuple (elapsed_ms, gflops): average time per launch in milliseconds
        and the corresponding throughput, counting 2*M*N*K operations.

    Raises:
        ValueError: if `iterations` or `block_size` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError(f"iterations must be >= 1, got {iterations}")
    if block_size < 1:
        raise ValueError(f"block_size must be >= 1, got {block_size}")

    A = np.random.rand(M, K).astype(np.float32)
    B = np.random.rand(K, N).astype(np.float32)
    C = np.zeros((M, N), dtype=np.float32)

    d_A = cuda.to_device(A)
    d_B = cuda.to_device(B)
    d_C = cuda.to_device(C)

    # Ceil-divide so partial edge blocks still cover every element of C
    grid = ((N + block_size - 1) // block_size,
            (M + block_size - 1) // block_size)
    block = (block_size, block_size)

    # Warmup: the first launch is kept out of the measurement
    matmul_naive[grid, block](d_A, d_B, d_C, M, N, K)
    cuda.synchronize()

    # Benchmark: synchronize before stopping the clock so the measurement
    # covers kernel execution and not only the launch calls
    start = time.perf_counter()
    for _ in range(iterations):
        matmul_naive[grid, block](d_A, d_B, d_C, M, N, K)
    cuda.synchronize()
    elapsed = (time.perf_counter() - start) / iterations * 1000  # ms

    # Calculate GFLOPS
    flops = 2 * M * N * K
    gflops = flops / (elapsed * 1e6)

    return elapsed, gflops

# Sweep square problem sizes and tabulate average time and throughput
print(f"{'Size':<15} {'Time (ms)':<12} {'GFLOPS':<12}")
print("=" * 40)

for size in [256, 512, 1024, 2048]:
    # Build the whole label first and pad it as one field: padding only the
    # last component made the column start drift with the digit count and
    # never line up with the 15-character header.
    label = f"{size}×{size}×{size}"
    try:
        ms, gflops = benchmark_matmul(size, size, size)
        print(f"{label:<15} {ms:<12.3f} {gflops:<12.2f}")
    except Exception as e:
        # Report the failure for this size and carry on with the next one
        print(f"{label:<15} Error: {e}")

---

## Part 4: Why Naive is Slow

### The Problem

```
For 1024×1024 matrices:

Operations:     2 × 1024³ ≈ 2.1 billion FLOPs
Naive reads:    1024² × 2K ≈ 2.1 billion reads (× 4 bytes ≈ 8.6 GB)
Arithmetic intensity: 1 FLOP per read = 0.25 FLOP/byte = TERRIBLE

GPU peak: ~10 TFLOPS, Memory BW: ~500 GB/s
Required BW for 10 TFLOPS: 40 TB/s (80x more than available!)

Result: Memory-bound at ~10% of peak
```

### The Solution Preview

```
TILING with shared memory:

1. Load tiles of A and B into shared memory
2. Compute partial products using fast shared memory
3. Repeat for all tiles

With 32×32 tiles:
  Each element loaded once per tile, reused 32 times
  32x reduction in global memory traffic!
```

In [None]:
# Compare with NumPy (uses optimized BLAS)
def compare_with_numpy(sizes):
    """Print a timing table of the naive CUDA kernel versus NumPy's matmul.

    For every entry of `sizes`, square float32 matrices of that size are
    multiplied with `A @ B` on the host and with `benchmark_matmul` on the
    GPU. Both paths run one untimed warmup and average over the same number
    of timed runs.

    Args:
        sizes: iterable of matrix edge lengths to test.
    """
    timed_runs = 5

    print(f"{'Size':<12} {'Naive (ms)':<12} {'NumPy (ms)':<12} {'Ratio':<10}")
    print("=" * 50)

    for size in sizes:
        A = np.random.rand(size, size).astype(np.float32)
        B = np.random.rand(size, size).astype(np.float32)

        # NumPy: untimed warmup first, mirroring the warmup launch inside
        # benchmark_matmul, so first-call costs are not charged to one side only
        _ = A @ B
        start = time.perf_counter()
        for _ in range(timed_runs):
            _ = A @ B
        numpy_ms = (time.perf_counter() - start) / timed_runs * 1000

        # CUDA naive
        cuda_ms, _ = benchmark_matmul(size, size, size, iterations=timed_runs)

        # Pad the complete label so 4-digit sizes stay aligned with the header
        label = f"{size}×{size}"
        print(f"{label:<12} {cuda_ms:<12.3f} {numpy_ms:<12.3f} {cuda_ms/numpy_ms:.1f}x")

compare_with_numpy([256, 512, 1024])

---

## Exercises

### Exercise 1: Rectangular Matrices

In [None]:
# TODO: Test with non-square matrices
# Verify that the kernel handles M != K != N correctly.
# Suggested approach for each case: build random float32 A (M×K) and B (K×N),
# launch matmul_naive with ceil(N/16) × ceil(M/16) blocks of 16×16 threads,
# and compare the copied-back result with A @ B using np.allclose.

test_cases = [
    (100, 200, 150),  # (M, K, N); no dimension is a multiple of 16, so edge blocks hit the bounds check
    (500, 100, 300),
    (64, 1024, 64),
]

for M, K, N in test_cases:
    # Your code here
    pass

### Exercise 2: Block Size Tuning

In [None]:
# TODO: Try different block sizes and measure performance
# Try: 8×8, 16×16, 32×32 (that is 64, 256 and 1024 threads per block)
# For each size, launch matmul_naive with block = (bs, bs) and a grid of
# ceil(N/bs) × ceil(M/bs) blocks, call cuda.synchronize(), and time the runs.

block_sizes = [8, 16, 32]

# Your benchmarking code here

---

## Summary

### Naive Matrix Multiplication

| Aspect | Value |
|--------|-------|
| Parallelization | One thread per output element |
| Memory access | Huge redundancy (each element read N or M times) |
| Arithmetic intensity | ~0.25 FLOP/byte, i.e. 1 FLOP per 4-byte read (very low) |
| Performance | ~10% of peak (memory-bound) |

### CUDA C++ Key Pattern

```cpp
__global__ void matmul_naive(const float* A, const float* B, float* C, int M, int N, int K) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    
    if (row < M && col < N) {
        float sum = 0.0f;
        for (int k = 0; k < K; k++) {
            sum += A[row * K + k] * B[k * N + col];
        }
        C[row * N + col] = sum;
    }
}
```

### Next: Tiled Matrix Multiplication
Tomorrow we'll use shared memory tiling to dramatically improve performance!