In [None]:
# ⚙️ Setup
import subprocess, sys
try:
    # Only importable inside Google Colab; elsewhere the ImportError below
    # skips the install and the existing environment is used as-is.
    import google.colab
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "numba"])
except ImportError:
    pass

import numpy as np
from numba import cuda
import math
import time

print("⚠️  CUDA C++ is the PRIMARY learning material!")

# Fail early and readably: every Numba cell below launches a GPU kernel.
if not cuda.is_available():
    print("⚠️  No CUDA-capable GPU detected - the Numba cells below will not run.")

---

## Part 1: Matrix Multiplication Basics

### The Algorithm

```
C = A × B

Where:
  A is M × K
  B is K × N
  C is M × N

Each element:
  C[i,j] = Σ A[i,k] * B[k,j]  for k = 0 to K-1

Example (2×3 @ 3×2 = 2×2):

A = [1 2 3]    B = [1 4]    C = [1*1+2*2+3*3  1*4+2*5+3*6] = [14 32]
    [4 5 6]        [2 5]        [4*1+5*2+6*3  4*4+5*5+6*6]   [32 77]
                   [3 6]
```

### Parallelization Strategy

```
Key insight: Each C[i,j] is INDEPENDENT!

GPU mapping:
• One thread per output element
• Thread (i, j) computes C[i,j]
• 2D grid/block structure is a natural fit

Grid dimensions:
  gridDim.x = ceil(N / blockDim.x)
  gridDim.y = ceil(M / blockDim.y)
```

### 🔷 CUDA C++ Implementation (Primary)

### 🔶 Python/Numba (Optional - Quick Testing)

In [None]:
%%writefile naive_matmul.cu
// naive_matmul.cu - Basic matrix multiplication
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 16

// Naive matrix multiplication: C = A × B
// A is M×K, B is K×N, C is M×N; all three are indexed as row-major flat arrays.
// Each thread produces at most one element of C.
__global__ void matmul_naive(const float* A, const float* B, float* C,
                              int M, int N, int K) {
    // Global 2D coordinates of this thread: x maps to columns, y to rows
    const int col = threadIdx.x + blockIdx.x * blockDim.x;
    const int row = threadIdx.y + blockIdx.y * blockDim.y;

    // Threads that fall past the matrix edge (partial blocks) have nothing to do
    if (row >= M || col >= N) return;

    const float* a_row = A + row * K;   // first element of row `row` in A
    const float* b_col = B + col;       // first element of column `col` in B

    // Dot product: a_row is walked contiguously, b_col with a stride of N
    float acc = 0.0f;
    for (int k = 0; k < K; ++k) {
        acc += a_row[k] * b_col[k * N];
    }

    C[row * N + col] = acc;
}

// Abort with a readable message when a CUDA runtime call reports an error.
static void check_cuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Benchmarks matmul_naive on 1024 x 1024 x 1024 random matrices, prints the
// average kernel time and GFLOPS, then spot-checks a few output elements
// against a host-side dot product. Returns non-zero if the check fails.
int main() {
    const int M = 1024, K = 1024, N = 1024;
    const int iterations = 10;

    // Widen before multiplying so the byte counts are computed in size_t
    const size_t size_A = (size_t)M * K * sizeof(float);
    const size_t size_B = (size_t)K * N * sizeof(float);
    const size_t size_C = (size_t)M * N * sizeof(float);

    // Allocate host memory
    float *h_A = (float*)malloc(size_A);
    float *h_B = (float*)malloc(size_B);
    float *h_C = (float*)malloc(size_C);
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(h_A); free(h_B); free(h_C);
        return EXIT_FAILURE;
    }

    // Initialize matrices with values in [0, 1]
    for (int i = 0; i < M * K; i++) h_A[i] = rand() / (float)RAND_MAX;
    for (int i = 0; i < K * N; i++) h_B[i] = rand() / (float)RAND_MAX;

    // Allocate device memory
    float *d_A = NULL, *d_B = NULL, *d_C = NULL;
    check_cuda(cudaMalloc(&d_A, size_A), "cudaMalloc(d_A)");
    check_cuda(cudaMalloc(&d_B, size_B), "cudaMalloc(d_B)");
    check_cuda(cudaMalloc(&d_C, size_C), "cudaMalloc(d_C)");

    // Copy inputs to device
    check_cuda(cudaMemcpy(d_A, h_A, size_A, cudaMemcpyHostToDevice), "copy of A to device");
    check_cuda(cudaMemcpy(d_B, h_B, size_B, cudaMemcpyHostToDevice), "copy of B to device");

    // One thread per output element; round the grid up to cover partial tiles
    dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid((N + BLOCK_SIZE - 1) / BLOCK_SIZE,
              (M + BLOCK_SIZE - 1) / BLOCK_SIZE);

    // Warmup (also surfaces launch-configuration errors before timing starts)
    matmul_naive<<<grid, block>>>(d_A, d_B, d_C, M, N, K);
    check_cuda(cudaGetLastError(), "warmup launch");
    check_cuda(cudaDeviceSynchronize(), "warmup execution");

    // Benchmark
    cudaEvent_t start, stop;
    check_cuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    check_cuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    check_cuda(cudaEventRecord(start), "cudaEventRecord(start)");
    for (int i = 0; i < iterations; i++) {
        matmul_naive<<<grid, block>>>(d_A, d_B, d_C, M, N, K);
    }
    check_cuda(cudaGetLastError(), "benchmark launch");
    check_cuda(cudaEventRecord(stop), "cudaEventRecord(stop)");
    check_cuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");

    float ms = 0.0f;
    check_cuda(cudaEventElapsedTime(&ms, start, stop), "cudaEventElapsedTime");
    ms /= iterations;  // Average per launch

    // Calculate GFLOPS: FLOPs / (ms * 1e-3 s) / 1e9
    double flops = 2.0 * M * N * K;  // multiply + add
    double gflops = flops / (ms * 1e6);

    // Spot-check a handful of elements so a wrong kernel is not reported as fast
    check_cuda(cudaMemcpy(h_C, d_C, size_C, cudaMemcpyDeviceToHost), "copy of C to host");
    const int samples[][2] = {
        {0, 0}, {0, N - 1}, {M / 2, N / 2}, {M - 1, 0}, {M - 1, N - 1}
    };
    const int num_samples = (int)(sizeof(samples) / sizeof(samples[0]));
    bool ok = true;
    for (int s = 0; s < num_samples; s++) {
        const int r = samples[s][0];
        const int c = samples[s][1];
        double ref = 0.0;
        for (int k = 0; k < K; k++) {
            ref += (double)h_A[r * K + k] * (double)h_B[k * N + c];
        }
        double diff = ref - (double)h_C[r * N + c];
        if (diff < 0.0) diff = -diff;
        // Relative tolerance; written so that a NaN difference also fails
        if (!(diff <= 1e-3 * (ref > 1.0 ? ref : 1.0))) ok = false;
    }

    printf("Matrix size: %d x %d x %d\n", M, K, N);
    printf("Time: %.3f ms\n", ms);
    printf("Performance: %.2f GFLOPS\n", gflops);
    printf("Spot check: %s\n", ok ? "PASSED" : "FAILED");

    // Cleanup
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
    free(h_A); free(h_B); free(h_C);

    return ok ? 0 : EXIT_FAILURE;
}

In [None]:
!nvcc -arch=sm_75 -o naive_matmul naive_matmul.cu
!./naive_matmul

In [None]:
@cuda.jit
def matmul_naive(A, B, C, M, N, K):
    """Naive matrix multiplication kernel: C = A @ B.

    Each thread computes at most one element C[row, col] as the dot product
    of row `row` of A (length K) with column `col` of B.
    """
    # cuda.grid(2) yields the absolute (x, y) thread position; x is the column.
    col, row = cuda.grid(2)

    # Threads outside the M x N output (partial blocks) do nothing
    if row >= M or col >= N:
        return

    acc = 0.0
    for k in range(K):
        acc += A[row, k] * B[k, col]
    C[row, col] = acc

In [None]:
# Test naive matrix multiplication against NumPy
M, K, N = 512, 512, 512

A = np.random.rand(M, K).astype(np.float32)
B = np.random.rand(K, N).astype(np.float32)

d_A = cuda.to_device(A)
d_B = cuda.to_device(B)
# The kernel writes every element of C, so allocate it on the device directly
# instead of uploading a host array of zeros.
d_C = cuda.device_array((M, N), dtype=np.float32)

# One thread per output element; round the grid up to cover partial tiles
BLOCK_SIZE = 16
grid = ((N + BLOCK_SIZE - 1) // BLOCK_SIZE,
        (M + BLOCK_SIZE - 1) // BLOCK_SIZE)
block = (BLOCK_SIZE, BLOCK_SIZE)

matmul_naive[grid, block](d_A, d_B, d_C, M, N, K)
result = d_C.copy_to_host()

# Verify
expected = A @ B
print(f"Matrix sizes: A({M}×{K}) @ B({K}×{N}) = C({M}×{N})")
print(f"Correct: {'✓' if np.allclose(result, expected, rtol=1e-4) else '✗'}")
print(f"Max error: {np.max(np.abs(result - expected)):.6f}")

---

## Part 2: Memory Access Analysis

### Access Pattern Visualization

```
For C[row, col], thread reads:

From A:                    From B:
Row 'row' of A             Column 'col' of B
┌─────────────┐            ┌──┬───┬──┐
│             │            │  │   │  │
├─────────────┤ ← row      │  │col│  │
│ * * * * * * │            │  │ ↓ │  │
├─────────────┤            │  │ * │  │
│             │            │  │ * │  │
└─────────────┘            │  │ * │  │
                           └──┴───┴──┘
```

### Memory Coalescing

```
Adjacent threads in a warp (same row, consecutive cols):

Thread 0: C[row, 0]  → reads A[row, 0:K], B[0:K, 0]
Thread 1: C[row, 1]  → reads A[row, 0:K], B[0:K, 1]
Thread 2: C[row, 2]  → reads A[row, 0:K], B[0:K, 2]
...

A access: All threads read SAME row → broadcast (efficient)
B access: Threads read consecutive columns → COALESCED (efficient)

But HUGE redundancy:
• Each row of A is read N times (once per column of C)
• Each column of B is read M times (once per row of C)
```

In [None]:
def analyze_memory_traffic(M, K, N):
    """Print a global-memory traffic estimate for the naive (M×K) @ (K×N) multiply.

    The model assumes every output element re-reads its full row of A and
    column of B (2*K float32 reads) and compares that with the minimum of
    reading each input element exactly once.

    Args:
        M: Rows of A and of C.
        K: Columns of A / rows of B (the reduction length).
        N: Columns of B and of C.

    Raises:
        ValueError: If any dimension is not positive (the ratios below would
            otherwise divide by zero).
    """
    if min(M, K, N) <= 0:
        raise ValueError(f"M, K and N must all be positive, got {M}, {K}, {N}")

    bytes_per_element = 4  # float32

    # Each C element reads an entire row of A and an entire column of B
    reads_per_output = K + K
    total_reads = M * N * reads_per_output

    # Theoretical minimum: each element of A and B is read once
    min_reads = M * K + K * N

    # One write per output element
    writes = M * N

    bytes_read = total_reads * bytes_per_element
    bytes_min = min_reads * bytes_per_element
    bytes_write = writes * bytes_per_element

    flops = 2 * M * N * K  # one multiply + one add per inner-loop step

    print(f"Matrix multiply: ({M}×{K}) @ ({K}×{N}) = ({M}×{N})")
    print(f"\nMemory reads:")
    print(f"  Naive:    {bytes_read / 1e9:.2f} GB")
    print(f"  Minimum:  {bytes_min / 1e6:.2f} MB")
    print(f"  Overhead: {bytes_read / bytes_min:.0f}x")
    print(f"\nMemory writes:")
    print(f"  Output:   {bytes_write / 1e6:.2f} MB")
    print(f"\nOperations: {flops / 1e9:.2f} GFLOP")
    print(f"Arithmetic intensity (naive): {flops / bytes_read:.2f} FLOP/byte")

analyze_memory_traffic(1024, 1024, 1024)

---

## Part 3: Performance Benchmark

In [None]:
def benchmark_matmul(M, K, N, iterations=20):
    """Benchmark the naive CUDA kernel on random float32 matrices.

    Only kernel execution is timed: inputs are uploaded and one warmup launch
    (which also covers JIT compilation) is completed before the clock starts.

    Args:
        M, K, N: Problem size for (M×K) @ (K×N).
        iterations: Number of timed launches to average over; must be >= 1.

    Returns:
        (elapsed_ms, gflops): average time per launch in milliseconds and the
        corresponding throughput, counting 2*M*N*K floating-point operations.

    Raises:
        ValueError: If iterations is less than 1.
    """
    if iterations < 1:
        raise ValueError(f"iterations must be >= 1, got {iterations}")

    A = np.random.rand(M, K).astype(np.float32)
    B = np.random.rand(K, N).astype(np.float32)

    d_A = cuda.to_device(A)
    d_B = cuda.to_device(B)
    # The kernel overwrites all of C, so there is no need to upload zeros
    d_C = cuda.device_array((M, N), dtype=np.float32)

    BLOCK_SIZE = 16
    grid = ((N + BLOCK_SIZE - 1) // BLOCK_SIZE,
            (M + BLOCK_SIZE - 1) // BLOCK_SIZE)
    block = (BLOCK_SIZE, BLOCK_SIZE)

    # Warmup
    matmul_naive[grid, block](d_A, d_B, d_C, M, N, K)
    cuda.synchronize()

    # Benchmark: launches are asynchronous, so synchronize before stopping the clock
    start = time.perf_counter()
    for _ in range(iterations):
        matmul_naive[grid, block](d_A, d_B, d_C, M, N, K)
    cuda.synchronize()
    elapsed = (time.perf_counter() - start) / iterations * 1000  # ms

    # GFLOPS = FLOPs / (ms * 1e-3 s) / 1e9
    flops = 2 * M * N * K
    gflops = flops / (elapsed * 1e6)

    return elapsed, gflops

print(f"{'Size':<15} {'Time (ms)':<12} {'GFLOPS':<12}")
print("=" * 40)

for size in [256, 512, 1024, 2048]:
    # Build the whole label first so it can be padded to the header's column
    # width; padding only the last number misaligns rows for 4-digit sizes.
    label = f"{size}×{size}×{size}"
    try:
        ms, gflops = benchmark_matmul(size, size, size)
    except Exception as e:
        # Keep the table going (e.g. the largest size may not fit on the device)
        print(f"{label:<15} Error: {e}")
    else:
        print(f"{label:<15} {ms:<12.3f} {gflops:<12.2f}")

---

## Part 4: Why Naive is Slow

### The Problem

```
For 1024×1024 matrices:

Operations:     2 × 1024³ = 2.1 billion FLOPs
Naive reads:    1024² × 2K = 2.1 billion reads (K = 1024)
Arithmetic intensity: 1 FLOP per 4-byte read = 0.25 FLOP/byte = TERRIBLE

GPU peak: ~10 TFLOPS, Memory BW: ~500 GB/s
Required BW for 10 TFLOPS: 10 T reads/s × 4 B = 40 TB/s (80x more than available!)

Result: Memory-bound at ~10% of peak (and only that much because caches absorb part of the redundant reads)
```

### The Solution Preview

```
TILING with shared memory:

1. Load tiles of A and B into shared memory
2. Compute partial products using fast shared memory
3. Repeat for all tiles

With 32×32 tiles:
  Each element loaded once per tile, reused 32 times
  32x reduction in global memory traffic!
```

In [None]:
# Compare with NumPy (uses optimized BLAS)
def compare_with_numpy(sizes):
    """Print a table comparing the naive CUDA kernel with NumPy's `A @ B`.

    For each square size, NumPy is timed on the host over a few runs and the
    CUDA kernel is timed via benchmark_matmul (kernel time only, transfers
    excluded). The "Ratio" column is CUDA time divided by NumPy time, so
    values above 1 mean the naive kernel is slower.

    Args:
        sizes: Iterable of square matrix dimensions to test.
    """
    numpy_runs = 5

    print(f"{'Size':<12} {'Naive (ms)':<12} {'NumPy (ms)':<12} {'Ratio':<10}")
    print("=" * 50)

    for size in sizes:
        A = np.random.rand(size, size).astype(np.float32)
        B = np.random.rand(size, size).astype(np.float32)

        # NumPy
        start = time.perf_counter()
        for _ in range(numpy_runs):
            _product = A @ B
        numpy_ms = (time.perf_counter() - start) / numpy_runs * 1000

        # CUDA naive
        cuda_ms, _ = benchmark_matmul(size, size, size, iterations=5)

        # Pad the complete label so rows line up with the header for any size
        label = f"{size}×{size}"
        print(f"{label:<12} {cuda_ms:<12.3f} {numpy_ms:<12.3f} {cuda_ms/numpy_ms:.1f}x")

compare_with_numpy([256, 512, 1024])

---

## 🎯 Exercises

### 🔷 CUDA C++ Exercises (Primary)

In [None]:
%%writefile matmul_exercises.cu
/*
 * CUDA C++ Matrix Multiplication Exercises
 * Exercise 1: Rectangular Matrices - Handle (M×K) × (K×N) with M, K and N all different
 * Exercise 2: Block Size Tuning - Compare 8×8, 16×16, 32×32 block sizes
 */

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <chrono>

// Evaluate a CUDA runtime call once; on failure print the file, line and
// error string to stderr and terminate the program.
// Wrapped in do { } while (0) so that `CHECK_CUDA(x);` behaves as a single
// statement (safe inside an unbraced if/else). The local is named err_ so it
// cannot capture a variable called `err` used inside `call`.
#define CHECK_CUDA(call) \
    do { \
        cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_)); \
            exit(1); \
        } \
    } while (0)

// Exercise 1: Naive matmul that handles rectangular matrices.
// A is M×K, B is K×N, C is M×N, all indexed as row-major flat arrays.
// One thread computes one element of C; M, N and K need not be equal or
// multiples of the block size.
__global__ void matmul_rectangular(const float* A, const float* B, float* C, 
                                   int M, int N, int K) {
    const int col = threadIdx.x + blockDim.x * blockIdx.x;
    const int row = threadIdx.y + blockDim.y * blockIdx.y;

    // Partial blocks at the right/bottom edge contain threads with no output
    if (row >= M || col >= N) return;

    float acc = 0.0f;
    for (int k = 0; k < K; ++k) {
        acc += A[row * K + k] * B[k * N + col];
    }
    C[row * N + col] = acc;
}

// Helper function to run matmul with a specific block size (for Exercise 2).
// Launches matmul_rectangular with BLOCK_SIZE x BLOCK_SIZE thread blocks and
// returns the average wall-clock time per launch in milliseconds.
// The warmup launch is checked for errors so that an invalid configuration
// aborts with a message instead of being timed as if the kernel had run.
template<int BLOCK_SIZE>
float benchmark_block_size(const float* d_A, const float* d_B, float* d_C,
                           int M, int N, int K, int iterations) {
    dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid((N + BLOCK_SIZE - 1) / BLOCK_SIZE, (M + BLOCK_SIZE - 1) / BLOCK_SIZE);
    
    // Warmup
    matmul_rectangular<<<grid, block>>>(d_A, d_B, d_C, M, N, K);
    CHECK_CUDA(cudaGetLastError());        // launch/configuration errors
    CHECK_CUDA(cudaDeviceSynchronize());   // errors raised while the kernel ran
    
    // Benchmark: launches are asynchronous, so the device is synchronized
    // before the end timestamp is taken
    auto start = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < iterations; i++) {
        matmul_rectangular<<<grid, block>>>(d_A, d_B, d_C, M, N, K);
    }
    CHECK_CUDA(cudaDeviceSynchronize());
    auto end = std::chrono::high_resolution_clock::now();
    
    float ms = std::chrono::duration<float, std::milli>(end - start).count() / iterations;
    return ms;
}

// Compare two float arrays element-wise and print the largest absolute
// difference together with a pass/fail mark.
//   C, C_ref   arrays of `size` floats (result and reference)
//   test_name  label printed in front of the report
// Returns true when no element is NaN and the maximum absolute error is
// below 0.01. Callers that only want the printed report may ignore the result.
bool verify_result(float* C, float* C_ref, int size, const char* test_name) {
    float max_error = 0.0f;
    bool saw_nan = false;
    for (int i = 0; i < size; i++) {
        float error = fabsf(C[i] - C_ref[i]);
        if (error != error) {
            // NaN compares false against everything, so `error > max_error`
            // alone would let a NaN result pass; track it explicitly.
            saw_nan = true;
        } else if (error > max_error) {
            max_error = error;
        }
    }

    const bool ok = !saw_nan && max_error < 0.01f;
    if (saw_nan) {
        printf("  %s: result contains NaN %s\n", test_name, "✗");
    } else {
        printf("  %s: Max error = %.6f %s\n", test_name, max_error,
               ok ? "✓" : "✗");
    }
    return ok;
}

// Host reference implementation of C = A × B for row-major flat arrays.
// A is M×K, B is K×N, C is M×N. Each output element is accumulated in a
// single float, summing over k in increasing order.
void cpu_matmul(const float* A, const float* B, float* C, int M, int N, int K) {
    for (int row = 0; row < M; ++row) {
        const float* a_row = A + row * K;   // row `row` of A
        float* c_row = C + row * N;         // row `row` of C
        for (int col = 0; col < N; ++col) {
            float acc = 0.0f;
            for (int k = 0; k < K; ++k) {
                acc += a_row[k] * B[k * N + col];
            }
            c_row[col] = acc;
        }
    }
}

// Exercise 1 helper: multiply one random (M×K) by (K×N) pair on the GPU with
// 16×16 blocks and compare the result with the CPU reference.
static void run_rectangular_test(int test_number, int M, int K, int N) {
    printf("Test %d: A(%d×%d) × B(%d×%d) = C(%d×%d)\n", test_number, M, K, K, N, M, N);

    const size_t bytes_A = (size_t)M * K * sizeof(float);
    const size_t bytes_B = (size_t)K * N * sizeof(float);
    const size_t bytes_C = (size_t)M * N * sizeof(float);

    // Allocate host memory
    float *h_A = (float*)malloc(bytes_A);
    float *h_B = (float*)malloc(bytes_B);
    float *h_C = (float*)malloc(bytes_C);
    float *h_C_ref = (float*)malloc(bytes_C);
    if (h_A == NULL || h_B == NULL || h_C == NULL || h_C_ref == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        exit(1);
    }

    // Initialize with random data
    for (int i = 0; i < M * K; i++) h_A[i] = (float)rand() / RAND_MAX;
    for (int i = 0; i < K * N; i++) h_B[i] = (float)rand() / RAND_MAX;

    // CPU reference
    cpu_matmul(h_A, h_B, h_C_ref, M, N, K);

    // GPU computation
    float *d_A, *d_B, *d_C;
    CHECK_CUDA(cudaMalloc(&d_A, bytes_A));
    CHECK_CUDA(cudaMalloc(&d_B, bytes_B));
    CHECK_CUDA(cudaMalloc(&d_C, bytes_C));

    CHECK_CUDA(cudaMemcpy(d_A, h_A, bytes_A, cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(d_B, h_B, bytes_B, cudaMemcpyHostToDevice));

    // Round the grid up so dimensions that are not multiples of 16 are covered
    dim3 block(16, 16);
    dim3 grid((N + 15) / 16, (M + 15) / 16);
    matmul_rectangular<<<grid, block>>>(d_A, d_B, d_C, M, N, K);
    CHECK_CUDA(cudaGetLastError());  // a failed launch would otherwise go unnoticed

    CHECK_CUDA(cudaMemcpy(h_C, d_C, bytes_C, cudaMemcpyDeviceToHost));

    verify_result(h_C, h_C_ref, M * N, "GPU vs CPU");

    // Cleanup
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
    free(h_A); free(h_B); free(h_C); free(h_C_ref);
}

// Prints one row of the Exercise 2 table. `flops` is the operation count of
// one multiply; GFLOPS = flops / (ms * 1e-3 s) / 1e9.
static void print_block_size_row(const char* label, float ms, double flops) {
    printf("%-15s %-15.3f %-15.2f\n", label, ms, (flops / ms) / 1e6);
}

// Exercise 2 helper: time the same 1024×1024×1024 multiply with 8×8, 16×16
// and 32×32 thread blocks and print a comparison table.
static void run_block_size_tuning() {
    const int M = 1024, K = 1024, N = 1024;
    const int iterations = 10;

    const size_t bytes_A = (size_t)M * K * sizeof(float);
    const size_t bytes_B = (size_t)K * N * sizeof(float);
    const size_t bytes_C = (size_t)M * N * sizeof(float);

    // Allocate
    float *h_A = (float*)malloc(bytes_A);
    float *h_B = (float*)malloc(bytes_B);
    if (h_A == NULL || h_B == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        exit(1);
    }
    float *d_A, *d_B, *d_C;

    for (int i = 0; i < M * K; i++) h_A[i] = (float)rand() / RAND_MAX;
    for (int i = 0; i < K * N; i++) h_B[i] = (float)rand() / RAND_MAX;

    CHECK_CUDA(cudaMalloc(&d_A, bytes_A));
    CHECK_CUDA(cudaMalloc(&d_B, bytes_B));
    CHECK_CUDA(cudaMalloc(&d_C, bytes_C));

    CHECK_CUDA(cudaMemcpy(d_A, h_A, bytes_A, cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(d_B, h_B, bytes_B, cudaMemcpyHostToDevice));

    printf("Matrix size: %d × %d × %d\n\n", M, K, N);
    printf("%-15s %-15s %-15s\n", "Block Size", "Time (ms)", "GFLOPS");
    printf("%-15s %-15s %-15s\n", "-----------", "-----------", "-----------");

    const double flops = 2.0 * M * N * K;  // 2 ops per multiply-add

    // Row labels use an ASCII 'x': printf pads by bytes, so a multi-byte
    // '×' would shift the columns by one character.
    float time_8 = benchmark_block_size<8>(d_A, d_B, d_C, M, N, K, iterations);
    print_block_size_row("8 x 8", time_8, flops);

    float time_16 = benchmark_block_size<16>(d_A, d_B, d_C, M, N, K, iterations);
    print_block_size_row("16 x 16", time_16, flops);

    float time_32 = benchmark_block_size<32>(d_A, d_B, d_C, M, N, K, iterations);
    print_block_size_row("32 x 32", time_32, flops);

    printf("\nObservations:\n");
    printf("- Larger blocks often perform better due to better cache utilization\n");
    printf("- 32×32 uses 1024 threads/block (maximum for many GPUs)\n");
    printf("- Optimal block size depends on GPU architecture and matrix size\n");

    // Cleanup
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
    free(h_A); free(h_B);
}

// Runs both exercises: correctness on rectangular shapes, then block-size timing.
int main() {
    printf("=== Exercise 1: Rectangular Matrices ===\n\n");

    // Each row is {M, K, N}: A is M × K, B is K × N → C is M × N
    const int test_cases[][3] = {
        {128, 256, 192},
        {256, 128, 512},
        {512, 64, 256},
        {100, 200, 150}    // Non-power-of-2
    };
    // Derive the count from the table so adding a case needs no other change
    const int num_tests = (int)(sizeof(test_cases) / sizeof(test_cases[0]));

    for (int t = 0; t < num_tests; t++) {
        run_rectangular_test(t + 1, test_cases[t][0], test_cases[t][1], test_cases[t][2]);
    }

    printf("\n=== Exercise 2: Block Size Tuning ===\n\n");
    run_block_size_tuning();

    return 0;
}

In [None]:
!nvcc -arch=sm_75 -o matmul_exercises matmul_exercises.cu && ./matmul_exercises

### 🔶 Python/Numba Exercises (Optional)

### Exercise 1: Rectangular Matrices

In [None]:
# TODO: Test with non-square matrices
# Verify that the kernel handles M, K and N that are all different
# (including sizes that are not multiples of the block size).

# Each tuple is unpacked below as (M, K, N): A is M × K, B is K × N.
test_cases = [
    (100, 200, 150),  # M × K × N
    (500, 100, 300),
    (64, 1024, 64),
]

for M, K, N in test_cases:
    # Your code here
    pass

### Exercise 2: Block Size Tuning

In [None]:
# TODO: Try different block sizes and measure performance
# Try: 8×8, 16×16, 32×32

# Side length of the square thread block for each configuration to try
block_sizes = [8, 16, 32]

# Your benchmarking code here

---

## Summary

### Naive Matrix Multiplication

| Aspect | Value |
|--------|-------|
| Parallelization | One thread per output element |
| Memory access | Huge redundancy (each element read N or M times) |
| Arithmetic intensity | ~0.25 FLOP/byte, i.e. 1 FLOP per 4-byte read (very low) |
| Performance | ~10% of peak (memory-bound) |

### CUDA C++ Key Pattern

```cpp
__global__ void matmul_naive(const float* A, const float* B, float* C, int M, int N, int K) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    
    if (row < M && col < N) {
        float sum = 0.0f;
        for (int k = 0; k < K; k++) {
            sum += A[row * K + k] * B[k * N + col];
        }
        C[row * N + col] = sum;
    }
}
```

### Next: Tiled Matrix Multiplication
Tomorrow we'll use shared memory tiling to dramatically improve performance!