In [None]:
# ⚙️ Setup
import subprocess, sys
try:
    # If the google.colab import succeeds, install numba with pip for the
    # interpreter that is running this kernel (sys.executable).
    import google.colab
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "numba"])
except ImportError:
    # google.colab is not importable here: skip the install on purpose.
    pass

import numpy as np
from numba import cuda
import time

# The warning sign was stored mis-encoded; emit the intended character.
print("⚠️  CUDA C++ is the PRIMARY learning material!")

---

## Part 1: WMMA API Deep Dive

### Fragment Types and Layouts

```
WMMA operates on matrix "fragments" distributed across warp:

fragment<matrix_a, M, N, K, type, layout>  // Left operand
fragment<matrix_b, M, N, K, type, layout>  // Right operand  
fragment<accumulator, M, N, K, acc_type>   // Result/accumulator

Layouts:
  row_major: Elements in row-contiguous order
  col_major: Elements in column-contiguous order

Shapes supported (M×N×K):
┌────────────┬─────────┬──────────┐
│   Shape    │  Input  │  Accum   │
├────────────┼─────────┼──────────┤
│  16×16×16  │  FP16   │  FP32    │
│  32×8×16   │  FP16   │  FP32    │
│  8×32×16   │  FP16   │  FP32    │
└────────────┴─────────┴──────────┘
```

### WMMA Operations

```cpp
// Load from memory into fragment
wmma::load_matrix_sync(frag, ptr, stride);  

// Initialize accumulator
wmma::fill_fragment(frag, value);

// Matrix multiply-accumulate: D = A × B + C
wmma::mma_sync(d_frag, a_frag, b_frag, c_frag);

// Store fragment to memory
wmma::store_matrix_sync(ptr, frag, stride, layout);
```

### 🔷 CUDA C++ Implementation (Primary)

In [None]:
%%writefile wmma_tiled.cu
// wmma_tiled.cu - Optimized WMMA with shared memory tiling
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <mma.h>
#include <cuda_fp16.h>

using namespace nvcuda;

// WMMA tile dimensions: the M x N x K shape given to every wmma::fragment
// declared in this file.
const int WMMA_M = 16;
const int WMMA_N = 16;
const int WMMA_K = 16;

// Block tile dimensions (multiple WMMA tiles per block).
// BLOCK_ROW_TILES * BLOCK_COL_TILES is also the number of warps the kernel
// maps onto tiles, so main() launches that many warps per block.
const int BLOCK_ROW_TILES = 4;  // 4 WMMA tiles in M direction
const int BLOCK_COL_TILES = 4;  // 4 WMMA tiles in N direction
const int BLOCK_K_TILES = 2;    // 2 WMMA tiles in K direction per step

// Element extents of the per-block tiles; they size the shared-memory arrays.
const int BLOCK_M = BLOCK_ROW_TILES * WMMA_M;  // 64
const int BLOCK_N = BLOCK_COL_TILES * WMMA_N;  // 64
const int BLOCK_K = BLOCK_K_TILES * WMMA_K;    // 32

// Tiled Tensor Core GEMM: C = A x B with
//   A: M x K, FP16, row-major
//   B: K x N, FP16, row-major
//   C: M x N, FP32, row-major
//
// Launch contract: 1-D blocks of BLOCK_ROW_TILES * BLOCK_COL_TILES warps and a
// grid of (ceil(N / BLOCK_N), ceil(M / BLOCK_M)) blocks. Every block produces
// one BLOCK_M x BLOCK_N tile of C and every warp one WMMA_M x WMMA_N sub-tile.
//
// M, N and K do not have to be multiples of the tile sizes: out-of-range
// inputs are loaded as zeros, and output tiles that cross the edge of C are
// written element by element with bounds checks.
__global__ void wmma_matmul_tiled(
    half* __restrict__ A,
    half* __restrict__ B, 
    float* __restrict__ C,
    int M, int N, int K
) {
    // Shared memory for A and B tiles. load_matrix_sync/store_matrix_sync need
    // 256-bit aligned pointers, so the alignment is requested explicitly.
    __shared__ __align__(32) half sA[BLOCK_M][BLOCK_K];
    __shared__ __align__(32) half sB[BLOCK_K][BLOCK_N];
    // Scratch tile, used only by warps whose output cannot be stored directly
    __shared__ __align__(32) float sC[BLOCK_M][BLOCK_N];
    
    // Warp and lane indices
    int warpId = threadIdx.x / 32;
    int laneId = threadIdx.x % 32;
    
    // Map warp to its WMMA tile within the block
    int warpRow = warpId / BLOCK_COL_TILES;
    int warpCol = warpId % BLOCK_COL_TILES;
    // Warps beyond the tile grid (oversized launch) only help with the loads;
    // letting them compute would index past the end of sA.
    bool ownsTile = warpRow < BLOCK_ROW_TILES;
    
    // Block position in output matrix
    int blockRow = blockIdx.y * BLOCK_M;
    int blockCol = blockIdx.x * BLOCK_N;
    
    // Accumulator fragment (one per warp), kept across the whole K loop
    wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, float> c_frag;
    wmma::fill_fragment(c_frag, 0.0f);
    
    // Loop over K dimension, BLOCK_K columns of A / rows of B at a time
    for (int k = 0; k < K; k += BLOCK_K) {
        // Collaborative loading of the A tile; out-of-range elements become 0
        for (int i = threadIdx.x; i < BLOCK_M * BLOCK_K; i += blockDim.x) {
            int row = i / BLOCK_K;
            int col = i % BLOCK_K;
            int gRow = blockRow + row;
            int gCol = k + col;
            sA[row][col] = (gRow < M && gCol < K) ? A[gRow * K + gCol] : __float2half(0.0f);
        }
        
        // Collaborative loading of the B tile; out-of-range elements become 0
        for (int i = threadIdx.x; i < BLOCK_K * BLOCK_N; i += blockDim.x) {
            int row = i / BLOCK_N;
            int col = i % BLOCK_N;
            int gRow = k + row;
            int gCol = blockCol + col;
            sB[row][col] = (gRow < K && gCol < N) ? B[gRow * N + gCol] : __float2half(0.0f);
        }
        
        // Tiles must be complete before any warp reads them
        __syncthreads();
        
        // Each warp multiplies its strip of sA by its strip of sB.
        // The condition is warp-uniform, as the *_sync calls require.
        if (ownsTile) {
            #pragma unroll
            for (int kTile = 0; kTile < BLOCK_K_TILES; kTile++) {
                wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> a_frag;
                wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> b_frag;
                
                // Load from shared memory (leading dimension = row length of the tile)
                wmma::load_matrix_sync(a_frag, &sA[warpRow * WMMA_M][kTile * WMMA_K], BLOCK_K);
                wmma::load_matrix_sync(b_frag, &sB[kTile * WMMA_K][warpCol * WMMA_N], BLOCK_N);
                
                // Multiply-accumulate
                wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
            }
        }
        
        // Nobody may overwrite sA/sB for the next step while a warp still reads them
        __syncthreads();
    }
    
    // Store result. No block-wide barrier follows, so whole warps may leave early.
    int cRow = blockRow + warpRow * WMMA_M;
    int cCol = blockCol + warpCol * WMMA_N;
    if (!ownsTile || cRow >= M || cCol >= N) {
        return;
    }
    
    // A direct store writes all WMMA_M x WMMA_N elements and needs a leading
    // dimension that is a multiple of 4 floats, so it is only legal when the
    // tile lies completely inside C and N satisfies that rule.
    bool directStore = (cRow + WMMA_M <= M) && (cCol + WMMA_N <= N) && (N % 4 == 0);
    if (directStore) {
        wmma::store_matrix_sync(&C[cRow * N + cCol], c_frag, N, wmma::mem_row_major);
    } else {
        // Edge tile: spill the fragment to this warp's private region of sC,
        // then copy only the elements that exist in C.
        wmma::store_matrix_sync(&sC[warpRow * WMMA_M][warpCol * WMMA_N], c_frag,
                                BLOCK_N, wmma::mem_row_major);
        __syncwarp();  // make the spilled tile visible to every lane of the warp
        for (int e = laneId; e < WMMA_M * WMMA_N; e += 32) {
            int r = e / WMMA_N;
            int c = e % WMMA_N;
            if (cRow + r < M && cCol + c < N) {
                C[(cRow + r) * N + (cCol + c)] = sC[warpRow * WMMA_M + r][warpCol * WMMA_N + c];
            }
        }
    }
}

// Reference FP32 matmul for comparison (no Tensor Cores).
// One thread computes one element of C = A x B, indexing A as M x K,
// B as K x N and C as M x N in row-major order.
__global__ void cuda_matmul(float* A, float* B, float* C, int M, int N, int K) {
    const int outRow = blockIdx.y * blockDim.y + threadIdx.y;
    const int outCol = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads that fall outside C have nothing to compute
    if (outRow >= M || outCol >= N) {
        return;
    }

    // Dot product of row outRow of A with column outCol of B
    float acc = 0.0f;
    for (int inner = 0; inner < K; ++inner) {
        const float lhs = A[outRow * K + inner];
        const float rhs = B[inner * N + outCol];
        acc += lhs * rhs;
    }
    C[outRow * N + outCol] = acc;
}

// Element-wise FP32 -> FP16 conversion: out[i] = __float2half(in[i]) for i < n.
// One thread per element; surplus threads of the last block do nothing.
__global__ void float2half_kernel(float* in, half* out, int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) {
        return;
    }
    out[i] = __float2half(in[i]);
}

// Abort with a readable message when a CUDA runtime call or kernel launch failed.
// `what` names the step so the failing call can be identified from the output.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Benchmark driver: times the WMMA kernel against the naive FP32 kernel on a
// 1024^3 problem and compares their outputs.
// Returns 0 when the results agree within tolerance, 1 otherwise.
int main() {
    const int M = 1024, N = 1024, K = 1024;
    const int ITERS = 100;  // timed launches per kernel
    
    printf("=== WMMA Tiled Matrix Multiply ===\n");
    printf("Matrix size: %d x %d x %d\n\n", M, N, K);
    
    // Allocate host memory
    float *h_A = new float[M * K];
    float *h_B = new float[K * N];
    float *h_C_wmma = new float[M * N];
    float *h_C_cuda = new float[M * N];
    
    // Initialize with values in {0.0, 0.1, ..., 0.9}
    for (int i = 0; i < M * K; i++) h_A[i] = (rand() % 10) / 10.0f;
    for (int i = 0; i < K * N; i++) h_B[i] = (rand() % 10) / 10.0f;
    
    // Allocate device memory
    float *d_A, *d_B, *d_C_wmma, *d_C_cuda;
    half *d_A_half, *d_B_half;
    
    checkCuda(cudaMalloc(&d_A, M * K * sizeof(float)), "cudaMalloc d_A");
    checkCuda(cudaMalloc(&d_B, K * N * sizeof(float)), "cudaMalloc d_B");
    checkCuda(cudaMalloc(&d_A_half, M * K * sizeof(half)), "cudaMalloc d_A_half");
    checkCuda(cudaMalloc(&d_B_half, K * N * sizeof(half)), "cudaMalloc d_B_half");
    checkCuda(cudaMalloc(&d_C_wmma, M * N * sizeof(float)), "cudaMalloc d_C_wmma");
    checkCuda(cudaMalloc(&d_C_cuda, M * N * sizeof(float)), "cudaMalloc d_C_cuda");
    
    checkCuda(cudaMemcpy(d_A, h_A, M * K * sizeof(float), cudaMemcpyHostToDevice), "copy A to device");
    checkCuda(cudaMemcpy(d_B, h_B, K * N * sizeof(float), cudaMemcpyHostToDevice), "copy B to device");
    
    // Convert to half
    float2half_kernel<<<(M*K+255)/256, 256>>>(d_A, d_A_half, M*K);
    float2half_kernel<<<(K*N+255)/256, 256>>>(d_B, d_B_half, K*N);
    checkCuda(cudaGetLastError(), "float2half_kernel launch");
    
    // WMMA kernel config
    // Each block: BLOCK_ROW_TILES * BLOCK_COL_TILES warps = 16 warps = 512 threads
    dim3 wmma_block(BLOCK_ROW_TILES * BLOCK_COL_TILES * 32);  // 512 threads
    dim3 wmma_grid((N + BLOCK_N - 1) / BLOCK_N, (M + BLOCK_M - 1) / BLOCK_M);
    
    // CUDA kernel config
    dim3 cuda_block(16, 16);
    dim3 cuda_grid((N + 15) / 16, (M + 15) / 16);
    
    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate start");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate stop");
    float wmma_time, cuda_time;
    
    // Benchmark WMMA: one untimed warm-up launch, then ITERS timed launches.
    // Checking the warm-up catches launch failures (e.g. an unsupported GPU)
    // before any timing is reported.
    wmma_matmul_tiled<<<wmma_grid, wmma_block>>>(d_A_half, d_B_half, d_C_wmma, M, N, K);
    checkCuda(cudaGetLastError(), "wmma_matmul_tiled launch");
    checkCuda(cudaDeviceSynchronize(), "wmma_matmul_tiled warm-up");
    
    checkCuda(cudaEventRecord(start), "cudaEventRecord start");
    for (int i = 0; i < ITERS; i++) {
        wmma_matmul_tiled<<<wmma_grid, wmma_block>>>(d_A_half, d_B_half, d_C_wmma, M, N, K);
    }
    checkCuda(cudaEventRecord(stop), "cudaEventRecord stop");
    checkCuda(cudaEventSynchronize(stop), "wmma_matmul_tiled timing");
    checkCuda(cudaGetLastError(), "wmma_matmul_tiled timed launches");
    checkCuda(cudaEventElapsedTime(&wmma_time, start, stop), "cudaEventElapsedTime wmma");
    
    // Benchmark CUDA: same warm-up + timed pattern
    cuda_matmul<<<cuda_grid, cuda_block>>>(d_A, d_B, d_C_cuda, M, N, K);
    checkCuda(cudaGetLastError(), "cuda_matmul launch");
    checkCuda(cudaDeviceSynchronize(), "cuda_matmul warm-up");
    
    checkCuda(cudaEventRecord(start), "cudaEventRecord start");
    for (int i = 0; i < ITERS; i++) {
        cuda_matmul<<<cuda_grid, cuda_block>>>(d_A, d_B, d_C_cuda, M, N, K);
    }
    checkCuda(cudaEventRecord(stop), "cudaEventRecord stop");
    checkCuda(cudaEventSynchronize(stop), "cuda_matmul timing");
    checkCuda(cudaGetLastError(), "cuda_matmul timed launches");
    checkCuda(cudaEventElapsedTime(&cuda_time, start, stop), "cudaEventElapsedTime cuda");
    
    // Results. Times are in ms for ITERS launches, so
    // TFLOPS = (2*M*N*K*ITERS) / (ms * 1e-3) / 1e12 = flops / (ms * 1e9).
    printf("Performance Results (%d iterations):\n", ITERS);
    printf("  WMMA Tensor Core: %.2f ms (%.2f TFLOPS)\n", 
           wmma_time, (2.0 * M * N * K * ITERS) / (wmma_time * 1e9));
    printf("  CUDA Naive:       %.2f ms (%.2f TFLOPS)\n",
           cuda_time, (2.0 * M * N * K * ITERS) / (cuda_time * 1e9));
    printf("  Speedup:          %.2fx\n", cuda_time / wmma_time);
    
    // Verify the Tensor Core result against the FP32 kernel. The WMMA path
    // rounds its inputs to FP16, so a relative tolerance is used instead of
    // exact equality.
    checkCuda(cudaMemcpy(h_C_wmma, d_C_wmma, M * N * sizeof(float), cudaMemcpyDeviceToHost), "copy C (wmma) to host");
    checkCuda(cudaMemcpy(h_C_cuda, d_C_cuda, M * N * sizeof(float), cudaMemcpyDeviceToHost), "copy C (cuda) to host");
    
    const double tolerance = 1e-2;
    double maxRelErr = 0.0;
    for (int i = 0; i < M * N; i++) {
        double ref = h_C_cuda[i];
        double diff = h_C_wmma[i] - ref;
        if (diff < 0.0) diff = -diff;
        double scale = (ref < 0.0) ? -ref : ref;
        if (scale < 1.0) scale = 1.0;  // avoid dividing by values near zero
        double rel = diff / scale;
        if (rel > maxRelErr) maxRelErr = rel;
    }
    bool resultsMatch = maxRelErr < tolerance;
    printf("\nVerification (WMMA vs. CUDA naive):\n");
    printf("  Max relative error: %.3e (%s)\n", maxRelErr, resultsMatch ? "PASS" : "FAIL");
    
    // Cleanup
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy start");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy stop");
    delete[] h_A; delete[] h_B; delete[] h_C_wmma; delete[] h_C_cuda;
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_A_half); cudaFree(d_B_half);
    cudaFree(d_C_wmma); cudaFree(d_C_cuda);
    
    return resultsMatch ? 0 : 1;
}

In [None]:
# Compile wmma_tiled.cu with nvcc for the sm_75 target, then run the binary.
!nvcc -arch=sm_75 -o wmma_tiled wmma_tiled.cu
!./wmma_tiled

---

## Part 2: Memory Layout Optimization

### Layout Considerations

```
Tensor Cores support both row-major and column-major:

Row-Major (C-style):
  A[i][j] stored at A + i*cols + j
  Memory: [row0][row1][row2]...

Column-Major (Fortran-style):
  A[i][j] stored at A + j*rows + i  
  Memory: [col0][col1][col2]...

For C = A × B:
  - A: row_major efficient (access rows)
  - B: col_major efficient (access cols)
  - OR: B transposed in row_major
```

### 🔷 CUDA C++ Implementation (Primary)

In [None]:
%%writefile wmma_layouts.cu
// wmma_layouts.cu - Different memory layouts
#include <stdio.h>
#include <cuda_runtime.h>
#include <mma.h>
#include <cuda_fp16.h>

using namespace nvcuda;

// WMMA tile shape (M x N x K) used by every fragment in this file; it also
// sets how far each warp's tile origin and the K loop advance.
const int WMMA_M = 16;
const int WMMA_N = 16;
const int WMMA_K = 16;

// A: row-major, B: row-major
//
// One warp computes one WMMA_M x WMMA_N tile of C = A x B
// (A is M x K, B is K x N, C is M x N, all row-major).
//   warpM (tile row)    = global x thread index / 32
//   warpN (tile column) = blockIdx.y
// blockDim.x must be a multiple of 32 so that warpM is uniform within a warp.
// M, N and K are expected to be multiples of 16: warps whose tile does not lie
// completely inside C exit without touching memory, and the K loop reads
// whole WMMA_K-wide slices.
__global__ void wmma_row_row(
    half* A, half* B, float* C, int M, int N, int K
) {
    int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
    int warpN = blockIdx.y;
    
    // Guard against grids that cover more tiles than the matrix has; without
    // it the loads and the final store would run past the ends of A, B and C.
    if ((warpM + 1) * WMMA_M > M || (warpN + 1) * WMMA_N > N) {
        return;
    }
    
    wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> a;
    wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> b;
    wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, float> c;
    wmma::fill_fragment(c, 0.0f);
    
    for (int k = 0; k < K; k += WMMA_K) {
        // A tile starts at row warpM*16, column k; rows are K elements apart
        wmma::load_matrix_sync(a, A + warpM*WMMA_M*K + k, K);
        // B tile starts at row k, column warpN*16; rows are N elements apart
        wmma::load_matrix_sync(b, B + k*N + warpN*WMMA_N, N);
        wmma::mma_sync(c, a, b, c);
    }
    
    wmma::store_matrix_sync(C + warpM*WMMA_M*N + warpN*WMMA_N, c, N, wmma::mem_row_major);
}

// A: row-major, B: col-major (transposed storage)
//
// One warp computes one WMMA_M x WMMA_N tile of C = A x B, where A is M x K
// row-major, C is M x N row-major and B (K x N) is passed as B_col in
// column-major order, i.e. element (k, n) lives at B_col[n*K + k].
//   warpM (tile row)    = global x thread index / 32
//   warpN (tile column) = blockIdx.y
// blockDim.x must be a multiple of 32 so that warpM is uniform within a warp.
// M, N and K are expected to be multiples of 16: warps whose tile does not lie
// completely inside C exit without touching memory, and the K loop reads
// whole WMMA_K-wide slices.
__global__ void wmma_row_col(
    half* A, half* B_col, float* C, int M, int N, int K
) {
    int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
    int warpN = blockIdx.y;
    
    // Guard against grids that cover more tiles than the matrix has; without
    // it the loads and the final store would run past the ends of A, B_col and C.
    if ((warpM + 1) * WMMA_M > M || (warpN + 1) * WMMA_N > N) {
        return;
    }
    
    wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> a;
    wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, wmma::col_major> b;
    wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, float> c;
    wmma::fill_fragment(c, 0.0f);
    
    for (int k = 0; k < K; k += WMMA_K) {
        // A tile starts at row warpM*16, column k; rows are K elements apart
        wmma::load_matrix_sync(a, A + warpM*WMMA_M*K + k, K);
        // B_col is stored column-major: B_col[k][n] at B_col + n*K + k,
        // so the tile starts at column warpN*16 and columns are K elements apart
        wmma::load_matrix_sync(b, B_col + warpN*WMMA_N*K + k, K);
        wmma::mma_sync(c, a, b, c);
    }
    
    wmma::store_matrix_sync(C + warpM*WMMA_M*N + warpN*WMMA_N, c, N, wmma::mem_row_major);
}

// Prints the layout guidance as text; this program launches no kernels.
int main() {
    // Adjacent string literals are concatenated by the compiler, so the
    // whole message goes out in a single call.
    printf(
        "=== WMMA Memory Layouts ===\n"
        "Both row-row and row-col layouts are valid.\n"
        "Choose based on your input data format.\n"
        "\nFor best performance:\n"
        "  - A in row-major (sequential row access)\n"
        "  - B in col-major (sequential col access)\n"
        "  - Or: transpose B before kernel\n");
    return 0;
}

In [None]:
# Compile wmma_layouts.cu with nvcc for the sm_75 target, then run the binary.
!nvcc -arch=sm_75 -o wmma_layouts wmma_layouts.cu
!./wmma_layouts

---

## Part 3: Handling Non-Aligned Sizes

### Padding Strategy

```
When M, N, K are not multiples of 16:

Option 1: Pad matrices
  M_padded = ((M + 15) / 16) * 16
  Allocate larger, zero-pad edges

Option 2: Handle edge tiles specially
  Full WMMA for interior tiles
  Scalar code for edge elements

Option 3: Clamp in load/store
  Load zeros for out-of-bounds
  Skip storing out-of-bounds
```

### 🔷 CUDA C++ Implementation (Primary)

In [None]:
%%writefile wmma_padding.cu
// wmma_padding.cu - Handle non-aligned matrix sizes
#include <stdio.h>
#include <cuda_runtime.h>
#include <mma.h>
#include <cuda_fp16.h>

using namespace nvcuda;

// WMMA tile shape (M x N x K). Nothing in this file reads these constants;
// pad16() below hard-codes the same value 16.
const int WMMA_M = 16;
const int WMMA_N = 16;
const int WMMA_K = 16;

// Round val up to the next multiple of 16 (values already on a multiple are
// returned unchanged). Callable from both host and device code.
__host__ __device__ int pad16(int val) {
    const int bumped = val + 15;
    // Dropping the remainder equals the truncating divide-then-multiply form.
    return bumped - bumped % 16;
}

// Copy a srcRows x srcCols row-major matrix into the top-left corner of a
// dstRows x dstCols row-major matrix and write zeros everywhere else in dst.
// Grid-stride loop: any 1-D launch configuration covers all of dst.
__global__ void pad_matrix(
    half* src, half* dst,
    int srcRows, int srcCols,
    int dstRows, int dstCols
) {
    const int count = dstRows * dstCols;
    const int first = blockIdx.x * blockDim.x + threadIdx.x;
    
    for (int pos = first; pos < count; pos += blockDim.x * gridDim.x) {
        // Destination coordinates of this element
        const int r = pos / dstCols;
        const int c = pos % dstCols;
        
        // Inside the source extent: copy; in the padding region: zero
        const bool insideSrc = (r < srcRows) && (c < srcCols);
        dst[pos] = insideSrc ? src[r * srcCols + c] : __float2half(0.0f);
    }
}

// Copy the top-left dstRows x dstCols corner of a row-major matrix whose rows
// are srcCols elements long into a densely packed dstRows x dstCols matrix.
// srcRows is accepted for symmetry with pad_matrix but is not read.
// Grid-stride loop: any 1-D launch configuration covers all of dst.
__global__ void unpad_matrix(
    float* src, float* dst,
    int srcRows, int srcCols,
    int dstRows, int dstCols
) {
    const int count = dstRows * dstCols;
    const int first = blockIdx.x * blockDim.x + threadIdx.x;
    
    for (int pos = first; pos < count; pos += blockDim.x * gridDim.x) {
        // Same (row, column), but rows in src are srcCols elements apart
        const int r = pos / dstCols;
        const int c = pos % dstCols;
        const int srcPos = r * srcCols + c;
        dst[pos] = src[srcPos];
    }
}

// Extra elements introduced by padding, as a percentage of the original count.
static double extraPercent(int paddedCount, int originalCount) {
    return 100.0 * (paddedCount - originalCount) / originalCount;
}

// Reports the padded sizes for a 100 x 200 x 150 problem and the relative
// growth of A (M x K), B (K x N) and C (M x N). No kernels are launched.
int main() {
    // Non-aligned sizes
    int M = 100, N = 200, K = 150;
    
    // Each dimension rounded up to a multiple of 16
    int M_pad = pad16(M);  // 112
    int N_pad = pad16(N);  // 208
    int K_pad = pad16(K);  // 160
    
    printf("Original sizes: %d x %d x %d\n", M, N, K);
    printf("Padded sizes:   %d x %d x %d\n", M_pad, N_pad, K_pad);
    printf("\nPadding overhead:\n");
    
    const double overheadA = extraPercent(M_pad * K_pad, M * K);
    const double overheadB = extraPercent(K_pad * N_pad, K * N);
    const double overheadC = extraPercent(M_pad * N_pad, M * N);
    printf("  A: %.1f%% extra elements\n", overheadA);
    printf("  B: %.1f%% extra elements\n", overheadB);
    printf("  C: %.1f%% extra elements\n", overheadC);
    
    return 0;
}

In [None]:
# Compile wmma_padding.cu with nvcc for the sm_75 target, then run the binary.
!nvcc -arch=sm_75 -o wmma_padding wmma_padding.cu
!./wmma_padding

---

## Exercises

### Exercise 1: Implement Batched WMMA

Extend the WMMA kernel to handle batched matrix multiplication:
`C[b] = A[b] × B[b]` for each index `b` in the batch.

In [None]:
# Exercise: Plan your batched WMMA implementation

def plan_batched_wmma():
    """Print three candidate designs for batched WMMA with their trade-offs.

    Writes a title, a 50-character rule, and then one section per option to
    stdout; each section is preceded by a blank line. Returns None.
    """
    header = ("Batched WMMA Implementation Plan", "=" * 50)
    sections = (
        (
            "Option 1: Loop in host",
            (
                "  for b in range(batch):",
                "    wmma_kernel(A[b], B[b], C[b])",
                "  Pros: Simple, reuse existing kernel",
                "  Cons: Kernel launch overhead",
            ),
        ),
        (
            "Option 2: Batch dimension in grid",
            (
                "  grid = (tiles_n, tiles_m, batch)",
                "  batch_idx = blockIdx.z",
                "  Pros: Single launch",
                "  Cons: Limited by max grid dimensions",
            ),
        ),
        (
            "Option 3: Strided batching",
            (
                "  Multiple matrices per block",
                "  Pros: High occupancy",
                "  Cons: Complex implementation",
            ),
        ),
    )

    for text in header:
        print(text)
    for title, details in sections:
        print()  # blank separator before every option
        print(title)
        for detail in details:
            print(detail)

plan_batched_wmma()

---

## Summary

### WMMA API Reference

```cpp
// Fragment types
fragment<matrix_a, M, N, K, half, layout> a_frag;
fragment<matrix_b, M, N, K, half, layout> b_frag;
fragment<accumulator, M, N, K, float> c_frag;

// Operations
load_matrix_sync(frag, ptr, stride);
fill_fragment(frag, value);
mma_sync(d, a, b, c);  // d = a × b + c
store_matrix_sync(ptr, frag, stride, layout);
```

### Optimization Tips

1. Use shared memory for tile loading
2. Ensure coalesced global memory access
3. Pad matrices to multiples of 16
4. Match layout to your data format
5. Unroll K-dimension loop

### Tomorrow: Mixed Precision Training
We'll apply Tensor Cores to neural network training.