In [None]:
# Setup: when google.colab is importable, install Numba with pip before the
# imports below; otherwise rely on the packages already in the environment.
import subprocess, sys
try:
    # A successful import of google.colab is used as the signal to run pip.
    import google.colab
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "numba"])
except ImportError:
    # google.colab is not importable here: skip the install step.
    pass

import numpy as np
from numba import cuda
import math
import time

print("‚ö†Ô∏è  CUDA C++ is the PRIMARY learning material!")

---

## Part 1: The Tiling Strategy

### Why Tiling Works

```
Key insight: Data REUSE

In naive approach:
  Thread (i, j) reads A[i, :] and B[:, j]
  Thread (i, j+1) also reads A[i, :] ← SAME DATA!
  Thread (i+1, j) also reads B[:, j] ← SAME DATA!

Solution: Load tiles into SHARED MEMORY
  All threads in block share the data
  Each element loaded once, used TILE_SIZE times
```

### Tiling Visualization

```
Matrix A (M×K)              Matrix B (K×N)
┌───┬───┬───┬───┐          ┌───┬───┬───┐
│ T │   │   │   │          │ T │   │   │
│ i │   │   │   │          │ i │   │   │
│ l │   │   │   │          │ l │   │   │
│ e │   │   │   │          │ e │   │   │
├───┼───┼───┼───┤          ├───┼───┼───┤
│   │   │   │   │          │   │   │   │
└───┴───┴───┴───┘          └───┴───┴───┘

Block (bx, by) computes C[by*T:(by+1)*T, bx*T:(bx+1)*T]

For each tile t along K dimension:
  1. Load A[by*T:(by+1)*T, t*T:(t+1)*T] → shared As
  2. Load B[t*T:(t+1)*T, bx*T:(bx+1)*T] → shared Bs
  3. Compute partial products: As × Bs
  4. Accumulate to result
```

### 🔷 CUDA C++ Implementation (Primary)

### 🔶 Python/Numba (Optional - Quick Testing)

In [None]:
%%writefile tiled_matmul.cu
// tiled_matmul.cu - Shared memory tiled matrix multiplication
#include <stdio.h>
#include <cuda_runtime.h>

#define TILE_SIZE 32

// Tiled matrix multiplication: C = A x B for row-major A (M x K), B (K x N)
// and C (M x N).
//
// Each thread produces one element of C. The K dimension is walked in steps of
// TILE_SIZE; at every step the block stages one tile of A and one tile of B in
// shared memory, then each thread accumulates a TILE_SIZE-long partial dot
// product from those tiles. Out-of-range tile entries are stored as 0.0f so
// edge tiles contribute nothing to the sum.
//
// Expects a TILE_SIZE x TILE_SIZE thread block, as launched by main().
__global__ void matmul_tiled(const float* A, const float* B, float* C,
                              int M, int N, int K) {
    __shared__ float As[TILE_SIZE][TILE_SIZE];
    __shared__ float Bs[TILE_SIZE][TILE_SIZE];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Output element owned by this thread
    const int row = blockIdx.y * TILE_SIZE + ty;
    const int col = blockIdx.x * TILE_SIZE + tx;

    // Number of K-steps, rounded up so a partial last tile is included
    const int numTiles = (K + TILE_SIZE - 1) / TILE_SIZE;

    float acc = 0.0f;

    for (int tile = 0; tile < numTiles; ++tile) {
        const int aCol = tile * TILE_SIZE + tx;  // column of A this thread stages
        const int bRow = tile * TILE_SIZE + ty;  // row of B this thread stages

        // Stage one element of each tile, zero-filling anything out of range
        As[ty][tx] = (row < M && aCol < K) ? A[row * K + aCol] : 0.0f;
        Bs[ty][tx] = (bRow < K && col < N) ? B[bRow * N + col] : 0.0f;

        // Both tiles must be fully written before any thread reads them
        __syncthreads();

        for (int k = 0; k < TILE_SIZE; ++k) {
            acc += As[ty][k] * Bs[k][tx];
        }

        // Nobody may overwrite the tiles while another thread is still reading
        __syncthreads();
    }

    // Threads mapped outside C took part in loading but must not store
    if (row < M && col < N) {
        C[row * N + col] = acc;
    }
}

int main() {
    int M = 1024, K = 1024, N = 1024;
    size_t size_A = M * K * sizeof(float);
    size_t size_B = K * N * sizeof(float);
    size_t size_C = M * N * sizeof(float);
    
    float *h_A = (float*)malloc(size_A);
    float *h_B = (float*)malloc(size_B);
    float *h_C = (float*)malloc(size_C);
    
    for (int i = 0; i < M * K; i++) h_A[i] = rand() / (float)RAND_MAX;
    for (int i = 0; i < K * N; i++) h_B[i] = rand() / (float)RAND_MAX;
    
    float *d_A, *d_B, *d_C;
    cudaMalloc(&d_A, size_A);
    cudaMalloc(&d_B, size_B);
    cudaMalloc(&d_C, size_C);
    
    cudaMemcpy(d_A, h_A, size_A, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size_B, cudaMemcpyHostToDevice);
    
    dim3 block(TILE_SIZE, TILE_SIZE);
    dim3 grid((N + TILE_SIZE - 1) / TILE_SIZE,
              (M + TILE_SIZE - 1) / TILE_SIZE);
    
    // Warmup
    matmul_tiled<<<grid, block>>>(d_A, d_B, d_C, M, N, K);
    cudaDeviceSynchronize();
    
    // Benchmark
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    
    cudaEventRecord(start);
    for (int i = 0; i < 10; i++) {
        matmul_tiled<<<grid, block>>>(d_A, d_B, d_C, M, N, K);
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    
    float ms;
    cudaEventElapsedTime(&ms, start, stop);
    ms /= 10;
    
    double flops = 2.0 * M * N * K;
    double gflops = flops / (ms * 1e6);
    
    printf("Tiled MatMul (%dx%d): %.3f ms, %.2f GFLOPS\n", TILE_SIZE, TILE_SIZE, ms, gflops);
    
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
    free(h_A); free(h_B); free(h_C);
    return 0;
}

In [None]:
# Compile the tiled_matmul.cu file written by the %%writefile cell and run it.
# -arch=sm_75 is hard-coded; adjust it if your GPU has a different compute
# capability.
!nvcc -arch=sm_75 -o tiled_matmul tiled_matmul.cu
!./tiled_matmul

In [None]:
TILE_SIZE = 16  # Must match block size

@cuda.jit
def matmul_tiled(A, B, C, M, N, K):
    """Write the product A @ B into C, staging tiles in shared memory.

    A is indexed as (M, K), B as (K, N) and C as (M, N). Each thread owns one
    element of C; the block walks the K dimension TILE_SIZE columns at a time,
    staging one tile of A and one tile of B per step. Tile entries that fall
    outside the matrices are stored as zero so edge tiles add nothing.

    Must be launched with a (TILE_SIZE, TILE_SIZE) thread block.
    """
    tile_a = cuda.shared.array((TILE_SIZE, TILE_SIZE), dtype=np.float32)
    tile_b = cuda.shared.array((TILE_SIZE, TILE_SIZE), dtype=np.float32)

    local_x = cuda.threadIdx.x
    local_y = cuda.threadIdx.y

    # Element of C owned by this thread
    out_row = cuda.blockIdx.y * TILE_SIZE + local_y
    out_col = cuda.blockIdx.x * TILE_SIZE + local_x

    acc = 0.0

    # Ceiling division so a partial final tile is still visited
    for tile_idx in range((K + TILE_SIZE - 1) // TILE_SIZE):
        k_base = tile_idx * TILE_SIZE

        # Stage this thread's element of the A tile
        src_col = k_base + local_x
        if out_row < M and src_col < K:
            tile_a[local_y, local_x] = A[out_row, src_col]
        else:
            tile_a[local_y, local_x] = 0.0

        # Stage this thread's element of the B tile
        src_row = k_base + local_y
        if src_row < K and out_col < N:
            tile_b[local_y, local_x] = B[src_row, out_col]
        else:
            tile_b[local_y, local_x] = 0.0

        # Tiles must be complete before anyone reads them
        cuda.syncthreads()

        for k in range(TILE_SIZE):
            acc += tile_a[local_y, k] * tile_b[k, local_x]

        # Tiles must not be overwritten while still being read
        cuda.syncthreads()

    # Threads mapped outside C helped with loading but must not store
    if out_row < M and out_col < N:
        C[out_row, out_col] = acc

In [None]:
# Check the tiled kernel against NumPy on a 512 x 512 x 512 problem
M, K, N = 512, 512, 512

A = np.random.rand(M, K).astype(np.float32)
B = np.random.rand(K, N).astype(np.float32)
C = np.zeros((M, N), dtype=np.float32)

# Copy both inputs and the zeroed output buffer to the device
d_A, d_B, d_C = (cuda.to_device(host_array) for host_array in (A, B, C))

# One block per output tile: x spans the columns (N), y spans the rows (M)
block = (TILE_SIZE, TILE_SIZE)
grid = tuple((extent + TILE_SIZE - 1) // TILE_SIZE for extent in (N, M))

matmul_tiled[grid, block](d_A, d_B, d_C, M, N, K)
result = d_C.copy_to_host()

expected = A @ B
status = '‚úì' if np.allclose(result, expected, rtol=1e-4) else '‚úó'
max_error = np.max(np.abs(result - expected))

print(f"Tiled MatMul: {M}√ó{K} @ {K}√ó{N}")
print(f"Correct: {status}")
print(f"Max error: {max_error:.6f}")

---

## Part 2: Memory Traffic Reduction

In [None]:
def analyze_tiled_traffic(M, K, N, tile_size):
    """Print estimated global-memory load traffic for naive vs. tiled matmul.

    The estimate counts float32 (4-byte) loads of A and B only, for the
    product (M x K) @ (K x N):

    * Naive: each of the M*N outputs reads K elements of A and K of B.
    * Tiled: each of the ceil(M/T) * ceil(N/T) blocks reads ceil(K/T) tiles
      of A and as many of B, each tile holding T*T elements (T = tile_size).
      Tile counts are rounded up, so zero-padded edge tiles are counted in full.
    * Minimum: every element of A and B read exactly once.

    Args:
        M, K, N: Matrix dimensions.
        tile_size: Side length T of the square tile; must be non-zero.

    Returns:
        None. The comparison is written to stdout.
    """
    bytes_per_element = 4  # float32

    # Tiles along each dimension (ceiling division)
    blocks_m = (M + tile_size - 1) // tile_size
    blocks_n = (N + tile_size - 1) // tile_size
    blocks_k = (K + tile_size - 1) // tile_size

    # Per block: one A tile and one B tile for every step along K
    loads_per_block = 2 * blocks_k * tile_size * tile_size
    actual_tiled = blocks_m * blocks_n * loads_per_block * bytes_per_element

    # Naive loads (for comparison)
    naive_total = M * N * (K + K) * bytes_per_element

    # Minimum possible
    min_loads = (M * K + K * N) * bytes_per_element

    print(f"Matrix multiply: ({M}√ó{K}) @ ({K}√ó{N}), Tile size: {tile_size}")
    print("\nMemory traffic comparison:")
    print(f"  Naive:    {naive_total / 1e9:.2f} GB")
    print(f"  Tiled:    {actual_tiled / 1e9:.3f} GB")
    print(f"  Minimum:  {min_loads / 1e6:.2f} MB")
    print(f"\nReduction factor: {naive_total / actual_tiled:.1f}x")
    print(f"Theoretical max:  {tile_size:.0f}x")

analyze_tiled_traffic(1024, 1024, 1024, 32)

---

## Part 3: Performance Comparison

In [None]:
# Naive kernel for comparison
@cuda.jit
def matmul_naive(A, B, C, M, N, K):
    """Write A @ B into C with one thread per output element and no tiling.

    A is indexed as (M, K), B as (K, N) and C as (M, N). Every thread reads
    its full row of A and full column of B directly from the input arrays.
    """
    out_row = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y
    out_col = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x

    # Threads that fall outside C have nothing to compute
    if out_row >= M or out_col >= N:
        return

    acc = 0.0
    for k in range(K):
        acc += A[out_row, k] * B[k, out_col]
    C[out_row, out_col] = acc

In [None]:
def _time_kernel_ms(kernel, grid, block, args, iterations):
    """Return the mean wall-clock milliseconds per launch of ``kernel``.

    One untimed warm-up launch is performed first. The timed launches are
    issued back to back and followed by a single ``cuda.synchronize()`` so the
    measurement covers completion of all of them.
    """
    kernel[grid, block](*args)  # warm-up, excluded from the timing
    cuda.synchronize()

    start = time.perf_counter()
    for _ in range(iterations):
        kernel[grid, block](*args)
    cuda.synchronize()
    return (time.perf_counter() - start) / iterations * 1000


def benchmark_comparison(M, K, N, iterations=20):
    """Compare naive vs tiled performance.

    Multiplies random float32 matrices of shape (M, K) and (K, N) with
    ``matmul_naive`` (16 x 16 blocks) and ``matmul_tiled``
    (TILE_SIZE x TILE_SIZE blocks), timing each with ``_time_kernel_ms``.

    Args:
        M, K, N: Matrix dimensions.
        iterations: Number of timed launches per kernel; must be >= 1.

    Returns:
        Tuple ``(naive_ms, naive_gflops, tiled_ms, tiled_gflops)`` where the
        times are mean milliseconds per launch and GFLOPS is derived from
        2*M*N*K floating-point operations.

    Raises:
        ValueError: If ``iterations`` is less than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be >= 1")

    A = np.random.rand(M, K).astype(np.float32)
    B = np.random.rand(K, N).astype(np.float32)
    C = np.zeros((M, N), dtype=np.float32)

    d_A = cuda.to_device(A)
    d_B = cuda.to_device(B)
    d_C = cuda.to_device(C)
    kernel_args = (d_A, d_B, d_C, M, N, K)

    flops = 2 * M * N * K

    # Naive
    block_naive = (16, 16)
    grid_naive = ((N + 15) // 16, (M + 15) // 16)
    naive_ms = _time_kernel_ms(matmul_naive, grid_naive, block_naive,
                               kernel_args, iterations)
    naive_gflops = flops / (naive_ms * 1e6)

    # Tiled
    block_tiled = (TILE_SIZE, TILE_SIZE)
    grid_tiled = ((N + TILE_SIZE - 1) // TILE_SIZE,
                  (M + TILE_SIZE - 1) // TILE_SIZE)
    tiled_ms = _time_kernel_ms(matmul_tiled, grid_tiled, block_tiled,
                               kernel_args, iterations)
    tiled_gflops = flops / (tiled_ms * 1e6)

    return naive_ms, naive_gflops, tiled_ms, tiled_gflops

# Print a fixed-width results table for a few square problem sizes
table_header = f"{'Size':<15} {'Naive (ms)':<12} {'Naive GFLOPS':<14} {'Tiled (ms)':<12} {'Tiled GFLOPS':<14} {'Speedup':<10}"
print(table_header)
print("=" * 85)

for size in (256, 512, 1024):
    # Row label shared by the success and the error line
    label = f"{size}√ó{size}√ó{size:<8}"
    try:
        n_ms, n_gf, t_ms, t_gf = benchmark_comparison(size, size, size)
        speedup = n_ms / t_ms
        print(f"{label} {n_ms:<12.3f} {n_gf:<14.2f} {t_ms:<12.3f} {t_gf:<14.2f} {speedup:.2f}x")
    except Exception as e:
        # Report the failure for this size and carry on with the next one
        print(f"{label} Error: {e}")

---

## Part 4: Tile Size Selection

In [None]:
def analyze_tile_sizes(tile_sizes=(8, 16, 32), shared_mem_limit=48 * 1024,
                       thread_limit=1024):
    """Analyze trade-offs of different tile sizes.

    For every square tile size, prints the threads per block (tile * tile),
    the shared memory needed for two float32 tiles, the data reuse factor,
    and how many such blocks fit within each of the two budgets.

    Args:
        tile_sizes: Iterable of tile side lengths to report on.
        shared_mem_limit: Shared-memory budget in bytes that is divided by the
            per-block requirement. Defaults to 48 KB.
        thread_limit: Thread budget that is divided by the threads per block.
            Defaults to 1024.
            NOTE(review): concurrent-block limits depend on the target GPU's
            per-multiprocessor capacities -- confirm both defaults for your
            device.

    Returns:
        None. The report is written to stdout.
    """
    print("Tile Size Analysis:")
    print("=" * 60)

    for tile in tile_sizes:
        threads = tile * tile
        shared_mem = 2 * tile * tile * 4  # Two tiles, float32
        reuse = tile  # Each element reused tile times

        # Occupancy considerations: blocks that fit within each budget
        max_blocks_shared = shared_mem_limit // shared_mem
        max_blocks_threads = thread_limit // threads

        print(f"\nTile {tile}√ó{tile}:")
        print(f"  Threads per block: {threads}")
        print(f"  Shared memory: {shared_mem} bytes ({shared_mem/1024:.1f} KB)")
        print(f"  Data reuse factor: {reuse}x")
        print(f"  Max blocks (shared mem limit): {max_blocks_shared}")
        print(f"  Max blocks (thread limit): {max_blocks_threads}")

analyze_tile_sizes()

---

## 🎯 Exercises

### 🔷 CUDA C++ Exercises (Primary)

In [None]:
%%writefile tiled_matmul_exercises.cu
/*
 * CUDA C++ Tiled Matrix Multiplication Exercises
 * Exercise 1: Variable Tile Size - Compare 8×8, 16×16, 32×32 tiles
 * Exercise 2: Rectangular Tiles - Implement non-square tiles
 */

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <chrono>

// Evaluates a CUDA runtime call exactly once. If it does not return
// cudaSuccess, prints the file, line and the runtime's error string to stderr
// and terminates the process with exit code 1.
//
// The body is wrapped in do { ... } while (0) so that `CHECK_CUDA(x);` is a
// single statement and stays correct inside an unbraced if/else. The local
// has a macro-specific name so it cannot shadow a variable used in `call`.
#define CHECK_CUDA(call) do { \
    cudaError_t check_cuda_err_ = (call); \
    if (check_cuda_err_ != cudaSuccess) { \
        fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                cudaGetErrorString(check_cuda_err_)); \
        exit(1); \
    } \
} while (0)

// Exercise 1: Tiled matmul with template tile size
//
// Computes C = A x B for row-major A (M x K), B (K x N) and C (M x N), with
// the tile side fixed at compile time by TILE_SIZE. Each thread owns one
// element of C; per K-step the block stages one tile of A and one of B in
// shared memory, zero-filling entries that fall outside the matrices.
//
// Expects a TILE_SIZE x TILE_SIZE thread block, as launched by benchmark_tiled.
template<int TILE_SIZE>
__global__ void matmul_tiled(const float* A, const float* B, float* C, 
                              int M, int N, int K) {
    __shared__ float As[TILE_SIZE][TILE_SIZE];
    __shared__ float Bs[TILE_SIZE][TILE_SIZE];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Output element owned by this thread
    const int row = blockIdx.y * TILE_SIZE + ty;
    const int col = blockIdx.x * TILE_SIZE + tx;

    // K-steps, rounded up to include a partial last tile
    const int numTiles = (K + TILE_SIZE - 1) / TILE_SIZE;

    float acc = 0.0f;

    for (int tile = 0; tile < numTiles; ++tile) {
        const int aCol = tile * TILE_SIZE + tx;
        const int bRow = tile * TILE_SIZE + ty;

        // Stage one element per tile; out-of-range entries become zero
        As[ty][tx] = (row < M && aCol < K) ? A[row * K + aCol] : 0.0f;
        Bs[ty][tx] = (bRow < K && col < N) ? B[bRow * N + col] : 0.0f;

        // Tiles must be complete before they are read
        __syncthreads();

        #pragma unroll
        for (int k = 0; k < TILE_SIZE; ++k) {
            acc += As[ty][k] * Bs[k][tx];
        }

        // Tiles must not be overwritten while still in use
        __syncthreads();
    }

    if (row < M && col < N) {
        C[row * N + col] = acc;
    }
}

// Exercise 2: Rectangular tiles (TILE_M x TILE_K for A, TILE_K x TILE_N for B)
//
// Computes C = A x B for row-major A (M x K), B (K x N) and C (M x N). Each
// block produces a TILE_M x TILE_N patch of C and walks K in steps of TILE_K.
// threadIdx.y selects the row inside the patch and threadIdx.x the column;
// main() launches this kernel with a (TILE_N, TILE_M) thread block.
//
// Every thread fills the entries of its own As row and its own Bs column,
// striding by blockDim.x / blockDim.y so the whole TILE_K extent is covered
// even when TILE_K exceeds the block dimension.
template<int TILE_M, int TILE_N, int TILE_K>
__global__ void matmul_tiled_rect(const float* A, const float* B, float* C,
                                   int M, int N, int K) {
    __shared__ float As[TILE_M][TILE_K];
    __shared__ float Bs[TILE_K][TILE_N];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Output element owned by this thread
    const int row = blockIdx.y * TILE_M + ty;
    const int col = blockIdx.x * TILE_N + tx;
    const bool rowInRange = row < M;
    const bool colInRange = col < N;

    const int numTiles = (K + TILE_K - 1) / TILE_K;

    float acc = 0.0f;

    for (int tile = 0; tile < numTiles; ++tile) {
        const int kBase = tile * TILE_K;

        // A tile: this thread's row, columns tx, tx + blockDim.x, ...
        for (int kk = tx; kk < TILE_K; kk += blockDim.x) {
            const int aCol = kBase + kk;
            As[ty][kk] = (rowInRange && aCol < K) ? A[row * K + aCol] : 0.0f;
        }

        // B tile: this thread's column, rows ty, ty + blockDim.y, ...
        for (int kk = ty; kk < TILE_K; kk += blockDim.y) {
            const int bRow = kBase + kk;
            Bs[kk][tx] = (bRow < K && colInRange) ? B[bRow * N + col] : 0.0f;
        }

        // Tiles must be complete before they are read
        __syncthreads();

        #pragma unroll
        for (int k = 0; k < TILE_K; ++k) {
            acc += As[ty][k] * Bs[k][tx];
        }

        // Tiles must not be overwritten while still in use
        __syncthreads();
    }

    if (rowInRange && colInRange) {
        C[row * N + col] = acc;
    }
}

template<int TILE_SIZE>
float benchmark_tiled(const float* d_A, const float* d_B, float* d_C,
                      int M, int N, int K, int iterations) {
    dim3 block(TILE_SIZE, TILE_SIZE);
    dim3 grid((N + TILE_SIZE - 1) / TILE_SIZE, (M + TILE_SIZE - 1) / TILE_SIZE);
    
    // Warmup
    matmul_tiled<TILE_SIZE><<<grid, block>>>(d_A, d_B, d_C, M, N, K);
    cudaDeviceSynchronize();
    
    auto start = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < iterations; i++) {
        matmul_tiled<TILE_SIZE><<<grid, block>>>(d_A, d_B, d_C, M, N, K);
    }
    cudaDeviceSynchronize();
    auto end = std::chrono::high_resolution_clock::now();
    
    return std::chrono::duration<float, std::milli>(end - start).count() / iterations;
}

// CPU reference: C (M x N) = A (M x K) * B (K x N), all row-major.
// Each dot product is accumulated in a float in increasing-k order.
void cpu_matmul(const float* A, const float* B, float* C, int M, int N, int K) {
    for (int r = 0; r < M; ++r) {
        const float* a_row = A + r * K;  // start of row r in A
        float* c_row = C + r * N;        // start of row r in C
        for (int c = 0; c < N; ++c) {
            float acc = 0.0f;
            for (int k = 0; k < K; ++k) {
                acc += a_row[k] * B[k * N + c];
            }
            c_row[c] = acc;
        }
    }
}

int main() {
    int M = 1024, K = 1024, N = 1024;
    int iterations = 10;
    
    printf("=== Exercise 1: Variable Tile Size Comparison ===\n\n");
    printf("Matrix size: %d √ó %d √ó %d\n\n", M, K, N);
    
    // Allocate host memory
    size_t size_A = M * K * sizeof(float);
    size_t size_B = K * N * sizeof(float);
    size_t size_C = M * N * sizeof(float);
    
    float *h_A = (float*)malloc(size_A);
    float *h_B = (float*)malloc(size_B);
    float *h_C = (float*)malloc(size_C);
    float *h_C_ref = (float*)malloc(size_C);
    
    srand(42);
    for (int i = 0; i < M * K; i++) h_A[i] = (float)rand() / RAND_MAX;
    for (int i = 0; i < K * N; i++) h_B[i] = (float)rand() / RAND_MAX;
    
    // Allocate device memory
    float *d_A, *d_B, *d_C;
    CHECK_CUDA(cudaMalloc(&d_A, size_A));
    CHECK_CUDA(cudaMalloc(&d_B, size_B));
    CHECK_CUDA(cudaMalloc(&d_C, size_C));
    
    CHECK_CUDA(cudaMemcpy(d_A, h_A, size_A, cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(d_B, h_B, size_B, cudaMemcpyHostToDevice));
    
    float flops = 2.0f * M * N * K;
    
    printf("%-15s %-15s %-15s %-20s\n", "Tile Size", "Time (ms)", "GFLOPS", "Shared Mem (KB)");
    printf("%-15s %-15s %-15s %-20s\n", "----------", "----------", "----------", "--------------");
    
    // 8√ó8 tiles
    float time_8 = benchmark_tiled<8>(d_A, d_B, d_C, M, N, K, iterations);
    printf("%-15s %-15.3f %-15.2f %-20.2f\n", "8 √ó 8", time_8, 
           (flops / time_8) / 1e6, 2.0f * 8 * 8 * sizeof(float) / 1024.0f);
    
    // 16√ó16 tiles
    float time_16 = benchmark_tiled<16>(d_A, d_B, d_C, M, N, K, iterations);
    printf("%-15s %-15.3f %-15.2f %-20.2f\n", "16 √ó 16", time_16,
           (flops / time_16) / 1e6, 2.0f * 16 * 16 * sizeof(float) / 1024.0f);
    
    // 32√ó32 tiles
    float time_32 = benchmark_tiled<32>(d_A, d_B, d_C, M, N, K, iterations);
    printf("%-15s %-15.3f %-15.2f %-20.2f\n", "32 √ó 32", time_32,
           (flops / time_32) / 1e6, 2.0f * 32 * 32 * sizeof(float) / 1024.0f);
    
    // Verify 16√ó16 result
    matmul_tiled<16><<<dim3((N+15)/16, (M+15)/16), dim3(16, 16)>>>(d_A, d_B, d_C, M, N, K);
    CHECK_CUDA(cudaMemcpy(h_C, d_C, size_C, cudaMemcpyDeviceToHost));
    
    // CPU reference (only for small verification)
    printf("\nVerifying correctness (first 4x4 corner)...\n");
    cpu_matmul(h_A, h_B, h_C_ref, M, N, K);
    float max_error = 0.0f;
    for (int i = 0; i < M * N; i++) {
        float err = fabs(h_C[i] - h_C_ref[i]);
        if (err > max_error) max_error = err;
    }
    printf("Max error: %.6f %s\n", max_error, max_error < 0.01f ? "‚úì" : "‚úó");
    
    printf("\n=== Exercise 2: Rectangular Tiles ===\n\n");
    
    // Test rectangular tile configuration: 32√ó16 output tile, 16 K-dimension
    const int TILE_M = 32, TILE_N = 16, TILE_K = 16;
    
    dim3 block_rect(TILE_N, TILE_M);  // tx handles N, ty handles M
    dim3 grid_rect((N + TILE_N - 1) / TILE_N, (M + TILE_M - 1) / TILE_M);
    
    printf("Rectangular tile configuration: %d √ó %d output, %d K-stride\n", TILE_M, TILE_N, TILE_K);
    printf("Block dimensions: %d √ó %d = %d threads\n", TILE_N, TILE_M, TILE_N * TILE_M);
    printf("Shared memory: %.2f KB\n", (TILE_M * TILE_K + TILE_K * TILE_N) * sizeof(float) / 1024.0f);
    
    // Warmup and benchmark
    matmul_tiled_rect<TILE_M, TILE_N, TILE_K><<<grid_rect, block_rect>>>(d_A, d_B, d_C, M, N, K);
    cudaDeviceSynchronize();
    
    auto start = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < iterations; i++) {
        matmul_tiled_rect<TILE_M, TILE_N, TILE_K><<<grid_rect, block_rect>>>(d_A, d_B, d_C, M, N, K);
    }
    cudaDeviceSynchronize();
    auto end = std::chrono::high_resolution_clock::now();
    
    float time_rect = std::chrono::duration<float, std::milli>(end - start).count() / iterations;
    printf("\nPerformance: %.3f ms, %.2f GFLOPS\n", time_rect, (flops / time_rect) / 1e6);
    
    printf("\nKey observations:\n");
    printf("- Larger tiles = more data reuse, but more shared memory\n");
    printf("- Rectangular tiles can optimize for specific matrix shapes\n");
    printf("- 32√ó32 often hits thread limit (1024), may reduce occupancy\n");
    
    // Cleanup
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
    free(h_A); free(h_B); free(h_C); free(h_C_ref);
    
    return 0;
}

In [None]:
# TODO: Create kernels with different tile sizes and compare
# Numba requires compile-time constants, so you'll need separate kernels

# Tile size 8
@cuda.jit
def matmul_tiled_8(A, B, C, M, N, K):
    """Exercise stub for a tiled matmul kernel with tile size 8 (not yet implemented)."""
    pass  # Your implementation

# Tile size 32
@cuda.jit
def matmul_tiled_32(A, B, C, M, N, K):
    """Exercise stub for a tiled matmul kernel with tile size 32 (not yet implemented)."""
    pass  # Your implementation

### Exercise 2: Rectangular Tiles

In [None]:
# TODO: Implement with non-square tiles (e.g., 32x16)
# This can sometimes improve register usage

@cuda.jit
def matmul_tiled_rect(A, B, C, M, N, K):
    """Exercise stub: tiled matmul with 32x16 tiles (not yet implemented)."""
    pass  # Your implementation

---

## Summary

### Tiling Strategy

| Aspect | Improvement |
|--------|-------------|
| Memory traffic | Reduced by TILE_SIZE factor |
| Arithmetic intensity | Increased ~TILE_SIZE× (≈0.25 → ≈TILE_SIZE/4 FLOP/byte for float32, counting global loads of A and B) |
| Performance | 2-10x speedup over naive |

### CUDA C++ Key Pattern

```cpp
__shared__ float As[TILE][TILE], Bs[TILE][TILE];

for (int t = 0; t < numTiles; t++) {
    // Load tiles with bounds checking
    As[ty][tx] = (in_bounds_A) ? A[...] : 0;
    Bs[ty][tx] = (in_bounds_B) ? B[...] : 0;
    __syncthreads();
    
    // Compute partial product
    for (int k = 0; k < TILE; k++)
        sum += As[ty][k] * Bs[k][tx];
    __syncthreads();
}
```

### Critical Points

1. **Two syncthreads per tile** - One after load, one after compute
2. **Bounds checking** - Pad with zeros for edge tiles
3. **Tile size** - Balance shared memory, threads, and reuse

### Next: Matrix Transpose
Tomorrow we'll optimize matrix transpose with coalesced access patterns.