In [None]:
# ⚙️ Setup
import subprocess, sys
try:
    # Probe for Google Colab; the module only exists inside a Colab runtime.
    import google.colab
except ImportError:
    # Not on Colab: leave the environment untouched.
    pass
else:
    # On Colab: install numba with the interpreter that runs this kernel.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "numba"])

import numpy as np
from numba import cuda
import time

print("⚠️  CUDA C++ is the PRIMARY learning material!")

---

## Part 1: cuBLAS Math Modes

### Tensor Core Configuration

```
cuBLAS Math Modes:

CUBLAS_DEFAULT_MATH
├── Uses CUDA cores only (FP32)
├── Maximum precision
└── Baseline performance

CUBLAS_TENSOR_OP_MATH (deprecated)
├── Allow Tensor Cores
├── FP16 compute internally
└── Higher performance

CUBLAS_TF32_TENSOR_OP_MATH (Ampere+)
├── TF32 for FP32 inputs
├── Up to 8x FP32 performance
└── Slight precision loss

cublasSetMathMode(handle, mode);
```

### Compute Types

```
cublasGemmEx parameters:

  computeType = CUBLAS_COMPUTE_16F   // FP16 Tensor Cores
  computeType = CUBLAS_COMPUTE_32F   // FP32 (default)
  computeType = CUBLAS_COMPUTE_32F_FAST_16F  // FP32 in/out, FP16 Tensor Cores
  computeType = CUBLAS_COMPUTE_32F_FAST_TF32 // Ampere TF32
```

### 🔷 CUDA C++ Implementation (Primary)

In [None]:
%%writefile cublas_tensor.cu
// cublas_tensor.cu - cuBLAS with Tensor Cores
#include <stdio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>

// Evaluates a cuBLAS call exactly once; on any status other than
// CUBLAS_STATUS_SUCCESS prints the source line and numeric status, then exits
// with code 1. Wrapped in do { } while (0) so that `CHECK_CUBLAS(x);` behaves
// as a single statement (safe inside an unbraced if/else). The trailing
// underscore on the local keeps it from shadowing a caller variable.
#define CHECK_CUBLAS(call) do { \
    cublasStatus_t cublas_status_ = (call); \
    if (cublas_status_ != CUBLAS_STATUS_SUCCESS) { \
        printf("cuBLAS error at line %d: %d\n", __LINE__, (int)cublas_status_); \
        exit(1); \
    } \
} while (0)

// Aborts the program with a readable message when a CUDA runtime call fails.
static void checkCuda(cudaError_t err, const char* what) {
    if (err != cudaSuccess) {
        printf("CUDA error (%s): %s\n", what, cudaGetErrorString(err));
        exit(1);
    }
}

// Benchmarks cublasGemmEx on FP32 matrices for one compute type.
//
// Allocates A (M x K), B (K x N) and C (M x N) on the device, fills A and B
// with rand()-derived values in [0, 0.99], runs one warmup GEMM, then times
// 100 back-to-back GEMMs with CUDA events and prints the total time and the
// achieved TFLOPS under the given label.
//
//   handle       cuBLAS handle to issue the GEMMs on
//   M, N, K      GEMM dimensions (C is M x N, reduction length K)
//   computeType  cuBLAS compute type passed to cublasGemmEx
//   name         label printed above the results
//
// Exits the process on any CUDA or cuBLAS failure.
void benchmarkGemm(
    cublasHandle_t handle,
    int M, int N, int K,
    cublasComputeType_t computeType,
    const char* name
) {
    // Widen before multiplying so large dimensions cannot overflow int.
    const size_t elemsA = (size_t)M * K;
    const size_t elemsB = (size_t)K * N;
    const size_t elemsC = (size_t)M * N;
    const size_t bytesA = elemsA * sizeof(float);
    const size_t bytesB = elemsB * sizeof(float);
    const size_t bytesC = elemsC * sizeof(float);

    // Allocate FP32 matrices
    float *d_A = NULL, *d_B = NULL, *d_C = NULL;
    checkCuda(cudaMalloc(&d_A, bytesA), "cudaMalloc d_A");
    checkCuda(cudaMalloc(&d_B, bytesB), "cudaMalloc d_B");
    checkCuda(cudaMalloc(&d_C, bytesC), "cudaMalloc d_C");

    // Initialize with random values
    float *h_A = new float[elemsA];
    float *h_B = new float[elemsB];
    for (size_t i = 0; i < elemsA; i++) h_A[i] = (rand() % 100) / 100.0f;
    for (size_t i = 0; i < elemsB; i++) h_B[i] = (rand() % 100) / 100.0f;
    checkCuda(cudaMemcpy(d_A, h_A, bytesA, cudaMemcpyHostToDevice), "copy A to device");
    checkCuda(cudaMemcpy(d_B, h_B, bytesB, cudaMemcpyHostToDevice), "copy B to device");

    float alpha = 1.0f, beta = 0.0f;

    // Warmup. cuBLAS is column-major, so the row-major product C = A * B is
    // issued as C^T = B^T * A^T: operands swapped and dimensions passed N, M, K.
    CHECK_CUBLAS(cublasGemmEx(
        handle, CUBLAS_OP_N, CUBLAS_OP_N,
        N, M, K,
        &alpha,
        d_B, CUDA_R_32F, N,
        d_A, CUDA_R_32F, K,
        &beta,
        d_C, CUDA_R_32F, N,
        computeType, CUBLAS_GEMM_DEFAULT_TENSOR_OP
    ));
    checkCuda(cudaDeviceSynchronize(), "warmup synchronize");

    // Benchmark
    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate start");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate stop");

    const int iterations = 100;
    // Remember a failing status and report it after timing, so the timed
    // region contains nothing but the GEMM calls and one comparison each.
    cublasStatus_t loopStatus = CUBLAS_STATUS_SUCCESS;
    checkCuda(cudaEventRecord(start), "cudaEventRecord start");
    for (int i = 0; i < iterations; i++) {
        cublasStatus_t iterStatus = cublasGemmEx(
            handle, CUBLAS_OP_N, CUBLAS_OP_N,
            N, M, K,
            &alpha,
            d_B, CUDA_R_32F, N,
            d_A, CUDA_R_32F, K,
            &beta,
            d_C, CUDA_R_32F, N,
            computeType, CUBLAS_GEMM_DEFAULT_TENSOR_OP
        );
        if (iterStatus != CUBLAS_STATUS_SUCCESS) loopStatus = iterStatus;
    }
    checkCuda(cudaEventRecord(stop), "cudaEventRecord stop");
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize stop");
    CHECK_CUBLAS(loopStatus);

    float ms = 0.0f;
    checkCuda(cudaEventElapsedTime(&ms, start, stop), "cudaEventElapsedTime");

    // 2*M*N*K floating-point operations per GEMM; ms * 1e9 converts
    // (operations per millisecond) into TFLOPS.
    double flops = 2.0 * M * N * K * iterations;
    double tflops = flops / (ms * 1e9);

    printf("%s:\n", name);
    printf("  Time: %.2f ms (%d iterations)\n", ms, iterations);
    printf("  Performance: %.2f TFLOPS\n\n", tflops);

    // Cleanup (events included; the original never destroyed them)
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    delete[] h_A;
    delete[] h_B;
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
}

// Entry point: creates a cuBLAS handle, runs benchmarkGemm once per compute
// type on a 4096 x 4096 x 4096 problem, then destroys the handle.
int main() {
    printf("=== cuBLAS Tensor Core Benchmark ===\n\n");

    cublasHandle_t handle;
    CHECK_CUBLAS(cublasCreate(&handle));

    // Every GEMM dimension shares the same extent.
    const int dim = 4096;
    const int M = dim, N = dim, K = dim;
    printf("Matrix size: %d x %d x %d\n\n", M, N, K);

    // Compute types to compare, each with the label printed above its results.
    struct BenchCase {
        cublasComputeType_t computeType;
        const char* label;
    };
    const BenchCase cases[] = {
        { CUBLAS_COMPUTE_32F,          "FP32 (CUDA Cores)" },
        { CUBLAS_COMPUTE_32F_FAST_16F, "FP32 with FP16 Tensor Cores" },
    };
    const int caseCount = (int)(sizeof(cases) / sizeof(cases[0]));

    for (int c = 0; c < caseCount; ++c) {
        benchmarkGemm(handle, M, N, K, cases[c].computeType, cases[c].label);
    }

    cublasDestroy(handle);
    return 0;
}

In [None]:
# Build, then run only if compilation succeeded (a failed build must not run a stale binary).
!nvcc -arch=sm_75 -o cublas_tensor cublas_tensor.cu -lcublas && ./cublas_tensor

---

## Part 2: Half Precision cuBLAS

### cublasHgemm

```cpp
// Native FP16 GEMM
cublasHgemm(
    handle,
    CUBLAS_OP_N, CUBLAS_OP_N,
    N, M, K,
    &alpha_h,  // half precision scalars
    B_h, N,
    A_h, K,
    &beta_h,
    C_h, N
);
```

### 🔷 CUDA C++ Implementation (Primary)

In [None]:
%%writefile cublas_hgemm.cu
// cublas_hgemm.cu - Half precision GEMM
#include <stdio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>

// Evaluates a cuBLAS call exactly once; on any status other than
// CUBLAS_STATUS_SUCCESS prints the numeric status and exits with code 1.
// Wrapped in do { } while (0) so that `CHECK_CUBLAS(x);` behaves as a single
// statement (safe inside an unbraced if/else). The trailing underscore on the
// local keeps it from shadowing a caller variable.
#define CHECK_CUBLAS(call) do { \
    cublasStatus_t cublas_status_ = (call); \
    if (cublas_status_ != CUBLAS_STATUS_SUCCESS) { \
        printf("cuBLAS error: %d\n", (int)cublas_status_); exit(1); \
    } \
} while (0)

// Converts n floats to half precision, one element per thread.
//   in   source array of at least n floats
//   out  destination array of at least n halves
//   n    number of elements to convert
// Threads whose global index is >= n do nothing.
__global__ void float2half_kernel(float* in, half* out, int n) {
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid >= n) {
        return;
    }
    out[tid] = __float2half(in[tid]);
}

// Converts n half-precision values to float, one element per thread.
//   in   source array of at least n halves
//   out  destination array of at least n floats
//   n    number of elements to convert
// Threads whose global index is >= n do nothing.
__global__ void half2float_kernel(half* in, float* out, int n) {
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid >= n) {
        return;
    }
    out[tid] = __half2float(in[tid]);
}

// Aborts the program with a readable message when a CUDA runtime call fails.
static void checkCuda(cudaError_t err, const char* what) {
    if (err != cudaSuccess) {
        printf("CUDA error (%s): %s\n", what, cudaGetErrorString(err));
        exit(1);
    }
}

// Entry point: benchmarks cublasHgemm on 4096 x 4096 x 4096 matrices.
//
// Generates FP32 inputs on the host, converts them to FP16 on the device,
// runs one warmup GEMM, times 100 GEMMs with CUDA events, and prints the time,
// TFLOPS and an FP32-vs-FP16 memory comparison. Finally converts the FP16
// result back to FP32 and prints C[0][0] next to a CPU reference, so the
// timed computation is also sanity-checked.
// Exits the process on any CUDA or cuBLAS failure.
int main() {
    printf("=== cuBLAS Half Precision GEMM ===\n\n");

    cublasHandle_t handle;
    CHECK_CUBLAS(cublasCreate(&handle));

    const int M = 4096, N = 4096, K = 4096;
    printf("Matrix size: %d x %d x %d\n\n", M, N, K);

    // Widen before multiplying so large dimensions cannot overflow int.
    const size_t elemsA = (size_t)M * K;
    const size_t elemsB = (size_t)K * N;
    const size_t elemsC = (size_t)M * N;

    // Allocate
    float *h_A = new float[elemsA];
    float *h_B = new float[elemsB];
    float *h_C = new float[elemsC];

    for (size_t i = 0; i < elemsA; i++) h_A[i] = (rand() % 100) / 100.0f;
    for (size_t i = 0; i < elemsB; i++) h_B[i] = (rand() % 100) / 100.0f;

    float *d_A_f = NULL, *d_B_f = NULL, *d_C_f = NULL;
    half *d_A_h = NULL, *d_B_h = NULL, *d_C_h = NULL;

    checkCuda(cudaMalloc(&d_A_f, elemsA * sizeof(float)), "cudaMalloc d_A_f");
    checkCuda(cudaMalloc(&d_B_f, elemsB * sizeof(float)), "cudaMalloc d_B_f");
    checkCuda(cudaMalloc(&d_C_f, elemsC * sizeof(float)), "cudaMalloc d_C_f");
    checkCuda(cudaMalloc(&d_A_h, elemsA * sizeof(half)), "cudaMalloc d_A_h");
    checkCuda(cudaMalloc(&d_B_h, elemsB * sizeof(half)), "cudaMalloc d_B_h");
    checkCuda(cudaMalloc(&d_C_h, elemsC * sizeof(half)), "cudaMalloc d_C_h");

    checkCuda(cudaMemcpy(d_A_f, h_A, elemsA * sizeof(float), cudaMemcpyHostToDevice),
              "copy A to device");
    checkCuda(cudaMemcpy(d_B_f, h_B, elemsB * sizeof(float), cudaMemcpyHostToDevice),
              "copy B to device");

    // Convert to half (one thread per element, 256 threads per block)
    const int threads = 256;
    float2half_kernel<<<(M*K + threads - 1) / threads, threads>>>(d_A_f, d_A_h, M*K);
    checkCuda(cudaGetLastError(), "float2half_kernel launch (A)");
    float2half_kernel<<<(K*N + threads - 1) / threads, threads>>>(d_B_f, d_B_h, K*N);
    checkCuda(cudaGetLastError(), "float2half_kernel launch (B)");

    half alpha_h = __float2half(1.0f);
    half beta_h = __float2half(0.0f);

    // Warmup. cuBLAS is column-major, so the row-major product C = A * B is
    // issued as C^T = B^T * A^T: operands swapped and dimensions passed N, M, K.
    CHECK_CUBLAS(cublasHgemm(
        handle, CUBLAS_OP_N, CUBLAS_OP_N,
        N, M, K,
        &alpha_h,
        d_B_h, N,
        d_A_h, K,
        &beta_h,
        d_C_h, N
    ));
    checkCuda(cudaDeviceSynchronize(), "warmup synchronize");

    // Benchmark
    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate start");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate stop");

    const int iterations = 100;
    // Remember a failing status and report it after timing, so the timed
    // region contains nothing but the GEMM calls and one comparison each.
    cublasStatus_t loopStatus = CUBLAS_STATUS_SUCCESS;
    checkCuda(cudaEventRecord(start), "cudaEventRecord start");
    for (int i = 0; i < iterations; i++) {
        cublasStatus_t iterStatus = cublasHgemm(
            handle, CUBLAS_OP_N, CUBLAS_OP_N,
            N, M, K,
            &alpha_h,
            d_B_h, N,
            d_A_h, K,
            &beta_h,
            d_C_h, N
        );
        if (iterStatus != CUBLAS_STATUS_SUCCESS) loopStatus = iterStatus;
    }
    checkCuda(cudaEventRecord(stop), "cudaEventRecord stop");
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize stop");
    CHECK_CUBLAS(loopStatus);

    float ms = 0.0f;
    checkCuda(cudaEventElapsedTime(&ms, start, stop), "cudaEventElapsedTime");

    // 2*M*N*K floating-point operations per GEMM; ms * 1e9 converts
    // (operations per millisecond) into TFLOPS.
    double flops = 2.0 * M * N * K * iterations;
    double tflops = flops / (ms * 1e9);

    printf("cublasHgemm (FP16 Tensor Cores):\n");
    printf("  Time: %.2f ms (%d iterations)\n", ms, iterations);
    printf("  Performance: %.2f TFLOPS\n\n", tflops);

    // Memory comparison
    printf("Memory Usage:\n");
    printf("  FP32: %.1f MB per matrix\n", M * K * 4.0 / 1e6);
    printf("  FP16: %.1f MB per matrix\n", M * K * 2.0 / 1e6);
    printf("  Savings: 50%%\n");

    // Spot check: bring the FP16 result back as FP32 and compare one element
    // with a CPU dot product of row 0 of A and column 0 of B. The two values
    // are printed rather than asserted because FP16 rounding makes them differ.
    half2float_kernel<<<(M*N + threads - 1) / threads, threads>>>(d_C_h, d_C_f, M*N);
    checkCuda(cudaGetLastError(), "half2float_kernel launch (C)");
    checkCuda(cudaMemcpy(h_C, d_C_f, elemsC * sizeof(float), cudaMemcpyDeviceToHost),
              "copy C to host");
    double reference = 0.0;
    for (int k = 0; k < K; k++) {
        reference += (double)h_A[k] * (double)h_B[(size_t)k * N];
    }
    printf("\nSpot check C[0][0]: GPU FP16 = %.2f, CPU reference = %.2f\n",
           h_C[0], reference);

    // Cleanup (events included; the original never destroyed them)
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    delete[] h_A; delete[] h_B; delete[] h_C;
    cudaFree(d_A_f); cudaFree(d_B_f); cudaFree(d_C_f);
    cudaFree(d_A_h); cudaFree(d_B_h); cudaFree(d_C_h);
    cublasDestroy(handle);

    return 0;
}

In [None]:
# Build, then run only if compilation succeeded (a failed build must not run a stale binary).
!nvcc -arch=sm_75 -o cublas_hgemm cublas_hgemm.cu -lcublas && ./cublas_hgemm

---

## Part 3: Batched Operations

### cublasGemmStridedBatchedEx

```
Batched GEMM: Multiple matrix multiplies in one call

C[i] = alpha * A[i] * B[i] + beta * C[i]

Use cases:
  - Transformer attention heads
  - Batch normalization
  - Multi-sample inference
```

### 🔷 CUDA C++ Implementation (Primary)

In [None]:
%%writefile cublas_batched.cu
// cublas_batched.cu - Batched GEMM for attention
#include <stdio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>

// Evaluates a cuBLAS call exactly once; on any status other than
// CUBLAS_STATUS_SUCCESS prints the numeric status and exits with code 1.
// Wrapped in do { } while (0) so that `CHECK_CUBLAS(x);` behaves as a single
// statement (safe inside an unbraced if/else). The trailing underscore on the
// local keeps it from shadowing a caller variable.
#define CHECK_CUBLAS(call) do { \
    cublasStatus_t cublas_status_ = (call); \
    if (cublas_status_ != CUBLAS_STATUS_SUCCESS) { \
        printf("cuBLAS error: %d\n", (int)cublas_status_); exit(1); \
    } \
} while (0)

// Aborts the program with a readable message when a CUDA runtime call fails.
static void checkCuda(cudaError_t err, const char* what) {
    if (err != cudaSuccess) {
        printf("CUDA error (%s): %s\n", what, cudaGetErrorString(err));
        exit(1);
    }
}

// Entry point: benchmarks the attention-score GEMM (Q * K^T, scaled by
// 1/sqrt(head_dim)) as one strided batched FP16 call over batch_size *
// num_heads independent matrices. Inputs are zero-filled: only timing is
// measured, not numerical results.
// Exits the process on any CUDA or cuBLAS failure.
int main() {
    printf("=== Batched GEMM for Attention ===\n\n");

    cublasHandle_t handle;
    CHECK_CUBLAS(cublasCreate(&handle));

    // Typical attention dimensions
    int batch_size = 32;
    int num_heads = 8;
    int seq_len = 512;
    int head_dim = 64;

    // Q * K^T: (batch*heads, seq, head_dim) x (batch*heads, head_dim, seq)
    int batch_count = batch_size * num_heads;
    int M = seq_len, N = seq_len, K = head_dim;

    printf("Attention configuration:\n");
    printf("  Batch size: %d\n", batch_size);
    printf("  Num heads: %d\n", num_heads);
    printf("  Sequence length: %d\n", seq_len);
    printf("  Head dimension: %d\n", head_dim);
    printf("  Total batched GEMMs: %d\n\n", batch_count);

    // Strides for batched GEMM: element distance between consecutive
    // matrices. Widened before multiplying so larger shapes cannot overflow int.
    const long long int strideQ = (long long int)M * K;
    const long long int strideK = (long long int)K * N;
    const long long int strideScores = (long long int)M * N;

    const size_t bytesQ = (size_t)batch_count * (size_t)strideQ * sizeof(half);
    const size_t bytesK = (size_t)batch_count * (size_t)strideK * sizeof(half);
    const size_t bytesScores = (size_t)batch_count * (size_t)strideScores * sizeof(half);

    // Allocate (Q, K, attention_scores)
    half *d_Q = NULL, *d_K = NULL, *d_scores = NULL;
    checkCuda(cudaMalloc(&d_Q, bytesQ), "cudaMalloc d_Q");
    checkCuda(cudaMalloc(&d_K, bytesK), "cudaMalloc d_K");
    checkCuda(cudaMalloc(&d_scores, bytesScores), "cudaMalloc d_scores");

    // Initialize with dummy data (d_scores needs no init because beta is 0)
    checkCuda(cudaMemset(d_Q, 0, bytesQ), "cudaMemset d_Q");
    checkCuda(cudaMemset(d_K, 0, bytesK), "cudaMemset d_K");

    half alpha = __float2half(1.0f / sqrtf((float)head_dim));  // Scale factor
    half beta = __float2half(0.0f);

    // Warmup. cuBLAS is column-major, so row-major scores = Q * K^T is issued
    // as scores^T = K * Q^T: d_K is passed first with CUBLAS_OP_T, d_Q second
    // with CUBLAS_OP_N, both with leading dimension K.
    CHECK_CUBLAS(cublasGemmStridedBatchedEx(
        handle,
        CUBLAS_OP_T, CUBLAS_OP_N,  // K^T * Q
        N, M, K,
        &alpha,
        d_K, CUDA_R_16F, K, strideK,
        d_Q, CUDA_R_16F, K, strideQ,
        &beta,
        d_scores, CUDA_R_16F, N, strideScores,
        batch_count,
        CUBLAS_COMPUTE_16F,
        CUBLAS_GEMM_DEFAULT_TENSOR_OP
    ));
    checkCuda(cudaDeviceSynchronize(), "warmup synchronize");

    // Benchmark
    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate start");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate stop");

    const int iterations = 100;
    // Remember a failing status and report it after timing, so the timed
    // region contains nothing but the GEMM calls and one comparison each.
    cublasStatus_t loopStatus = CUBLAS_STATUS_SUCCESS;
    checkCuda(cudaEventRecord(start), "cudaEventRecord start");
    for (int i = 0; i < iterations; i++) {
        cublasStatus_t iterStatus = cublasGemmStridedBatchedEx(
            handle,
            CUBLAS_OP_T, CUBLAS_OP_N,
            N, M, K,
            &alpha,
            d_K, CUDA_R_16F, K, strideK,
            d_Q, CUDA_R_16F, K, strideQ,
            &beta,
            d_scores, CUDA_R_16F, N, strideScores,
            batch_count,
            CUBLAS_COMPUTE_16F,
            CUBLAS_GEMM_DEFAULT_TENSOR_OP
        );
        if (iterStatus != CUBLAS_STATUS_SUCCESS) loopStatus = iterStatus;
    }
    checkCuda(cudaEventRecord(stop), "cudaEventRecord stop");
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize stop");
    CHECK_CUBLAS(loopStatus);

    float ms = 0.0f;
    checkCuda(cudaEventElapsedTime(&ms, start, stop), "cudaEventElapsedTime");

    // 2*M*N*K floating-point operations per matrix, times batch and
    // iterations; ms * 1e9 converts (operations per millisecond) into TFLOPS.
    double flops = 2.0 * M * N * K * batch_count * iterations;
    double tflops = flops / (ms * 1e9);

    printf("Q*K^T Performance:\n");
    printf("  Time: %.2f ms (%d iterations)\n", ms, iterations);
    printf("  Performance: %.2f TFLOPS\n", tflops);
    printf("  Per-iteration: %.3f ms\n", ms / iterations);

    // Cleanup (events included; the original never destroyed them)
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_Q);
    cudaFree(d_K);
    cudaFree(d_scores);
    cublasDestroy(handle);

    return 0;
}

In [None]:
# Build, then run only if compilation succeeded (a failed build must not run a stale binary).
!nvcc -arch=sm_75 -o cublas_batched cublas_batched.cu -lcublas && ./cublas_batched

### 🔶 Python/Numba (Optional - Quick Testing)

In [None]:
# Quick textual reference of the cuBLAS entry points used in this notebook
# (runs no GPU code).
def explain_cublas_functions():
    """Print a cheat sheet of cuBLAS GEMM, batched-GEMM and compute-type names."""
    reference_lines = (
        "cuBLAS Function Reference",
        "=" * 50,
        "",
        "GEMM Functions:",
        "  cublasSgemm    - Single precision (FP32)",
        "  cublasDgemm    - Double precision (FP64)",
        "  cublasHgemm    - Half precision (FP16)",
        "  cublasGemmEx   - Extended (any type)",
        "",
        "Batched GEMM:",
        "  cublasSgemmBatched         - Array of pointers",
        "  cublasSgemmStridedBatched  - Strided memory",
        "  cublasGemmStridedBatchedEx - Extended batched",
        "",
        "Tensor Core Compute Types:",
        "  CUBLAS_COMPUTE_16F         - FP16",
        "  CUBLAS_COMPUTE_32F         - FP32",
        "  CUBLAS_COMPUTE_32F_FAST_16F - FP16 Tensor Cores",
        "  CUBLAS_COMPUTE_32F_FAST_TF32 - TF32 (Ampere+)",
    )
    # A single print of the joined text emits the same bytes as one print per line.
    print("\n".join(reference_lines))

explain_cublas_functions()

---

## 🎯 Exercises

### 🔷 CUDA C++ Exercises (Primary)

In [None]:
%%writefile cublas_tensor_exercises.cu
// CUDA C++ cuBLAS Tensor Core Exercises
// Week 13, Day 4: cuBLAS with Tensor Cores

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#include <stdio.h>

// Exercise 1: Basic cublasGemmEx with Tensor Cores
// TODO: Set up cuBLAS handle and enable Tensor Core math mode

// Exercise 2: Batched GEMM for transformer attention
// TODO: Use cublasGemmBatchedEx for Q*K^T and attention*V

// Exercise 3: Compare Tensor Core vs standard GEMM performance
// TODO: Benchmark CUBLAS_TENSOR_OP_MATH vs CUBLAS_DEFAULT_MATH

// Placeholder entry point: prints a reminder to implement the exercises and
// the command line used to compile this file.
int main() {
    static const char* const kMessages[] = {
        "cuBLAS Tensor Core Exercises - Implement your solutions above\n",
        "Compile with: nvcc -arch=sm_70 cublas_tensor_exercises.cu -lcublas -o cublas_tensor_exercises\n",
    };
    const int messageCount = (int)(sizeof(kMessages) / sizeof(kMessages[0]));

    // The messages contain no format specifiers, so writing them verbatim
    // produces the same output as passing them to printf.
    for (int i = 0; i < messageCount; ++i) {
        fputs(kMessages[i], stdout);
    }
    return 0;
}

In [None]:
# Compile the exercise stub and run it only if the build succeeded (`&&`).
!nvcc -arch=sm_70 cublas_tensor_exercises.cu -lcublas -o cublas_tensor_exercises && ./cublas_tensor_exercises

### 🔶 Python/CuPy Exercises (Alternative)

---

## Summary

### cuBLAS Tensor Core Usage

```cpp
// Enable Tensor Cores via the math mode (CUBLAS_TENSOR_OP_MATH is deprecated in cuBLAS 11+)
cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);

// Or use GemmEx with appropriate compute type
cublasGemmEx(..., CUBLAS_COMPUTE_32F_FAST_16F, ...);
```

### Best Practices

1. Use `cublasGemmEx` for flexibility
2. Choose compute type based on accuracy needs
3. Use batched GEMM for attention/transformers
4. Ensure matrix dimensions are Tensor Core friendly

### Week 13 Complete!
You've learned Tensor Cores, WMMA, mixed precision, and cuBLAS integration.