In [None]:
# ⚙️ Setup
import subprocess, sys
# Best-effort dependency install: numba is pip-installed only when
# `google.colab` can be imported (presumably meaning we are running on Colab).
# Anywhere else the ImportError is swallowed and no install is attempted.
try:
    import google.colab
    # Invoke pip through the running interpreter so the package lands in the
    # same environment this kernel is using.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "numba"])
except ImportError:
    pass

import numpy as np
from numba import cuda
import time

# Reminder that the Python cells are secondary to the CUDA C++ listings.
print("⚠️  CUDA C++ is the PRIMARY learning material!")

---

## Part 1: Unified Memory Basics

### What is Unified Memory?

```
Traditional CUDA:             Unified Memory:

┌──────────────┐              ┌──────────────┐
│   CPU        │              │   CPU        │
│   Memory     │              │              │
│   h_data     │              │              │
└──────┬───────┘              │              │
       │ cudaMemcpy()         │   Unified    │
       ↓                      │   Address    │
┌──────────────┐              │   Space      │
│   GPU        │              │              │
│   Memory     │              │   data       │ ← One pointer!
│   d_data     │              │              │
└──────────────┘              └──────────────┘

Two pointers,                 One pointer,
explicit copies               automatic migration
```

### CUDA C++ Unified Memory (Primary)

```cpp
// unified_memory.cu - Unified memory basics
#include <stdio.h>
#include <cuda_runtime.h>

// Adds 1.0f in place to each of the first n elements of data,
// one element per thread.
__global__ void addKernel(float* data, int n) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads that land past the end of the array have nothing to do.
    if (tid >= n) return;

    data[tid] += 1.0f;
}

// Runs addKernel over the same 1M-element workload twice: once with separate
// host/device buffers and explicit copies, once with a single managed buffer.
int main() {
    const int n = 1 << 20;  // 1M elements
    const size_t size = n * sizeof(float);

    // Launch configuration shared by both approaches: 256 threads per block
    // and enough blocks to cover all n elements.
    const int threadsPerBlock = 256;
    const int numBlocks = (n + threadsPerBlock - 1) / threadsPerBlock;

    // ========== TRADITIONAL APPROACH ==========
    // Two pointers (h_data on the host, d_data on the device) and an explicit
    // cudaMemcpy in each direction.
    {
        float *h_data = (float*)malloc(size);
        float *d_data;
        cudaMalloc(&d_data, size);

        // Fill the host buffer with 0, 1, 2, ...
        for (int i = 0; i < n; ++i) {
            h_data[i] = i;
        }

        // Host -> device
        cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice);

        addKernel<<<numBlocks, threadsPerBlock>>>(d_data, n);

        // Device -> host
        cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost);

        printf("Traditional: h_data[0] = %f\n", h_data[0]);

        free(h_data);
        cudaFree(d_data);
    }

    // ========== UNIFIED MEMORY APPROACH ==========
    // One managed pointer is written by the host, passed to the kernel and
    // read back by the host, with no cudaMemcpy calls.
    {
        float *data;
        cudaMallocManaged(&data, size);

        // Fill the managed buffer directly from host code.
        for (int i = 0; i < n; ++i) {
            data[i] = i;
        }

        addKernel<<<numBlocks, threadsPerBlock>>>(data, n);

        // Wait for the kernel before the host touches the buffer again.
        cudaDeviceSynchronize();

        printf("Unified: data[0] = %f\n", data[0]);

        cudaFree(data);
    }

    return 0;
}
```

In [None]:
# Python/Numba - Managed memory example

@cuda.jit
def add_one(data):
    """CUDA kernel: add 1.0 in place to every element of ``data``."""
    i = cuda.grid(1)
    # Threads beyond the end of the array do nothing.
    if i >= data.size:
        return
    data[i] += 1.0

# Managed (unified) memory demo, with the explicit-copy path kept for comparison
def unified_memory_demo():
    """Run ``add_one`` over 1M floats with explicit copies, then with managed memory.

    The explicit path copies a host array to the device, launches the kernel
    and copies the result back. The managed path allocates a single buffer
    with ``cuda.managed_array`` that the host fills directly, the kernel
    updates in place, and the host reads again after synchronizing - no
    ``to_device`` / ``copy_to_host`` calls.

    If this Numba build does not expose ``cuda.managed_array`` the managed run
    is skipped with a message instead of raising.

    Returns:
        None. Results are printed.
    """
    n = 1_000_000

    block = 256
    grid = (n + block - 1) // block  # enough blocks to cover all n elements

    # ----- Standard approach (for comparison): explicit transfers -----
    host_data = np.arange(n, dtype=np.float32)
    device_data = cuda.to_device(host_data)

    add_one[grid, block](device_data)
    cuda.synchronize()

    result = device_data.copy_to_host()
    print(f"First elements: {result[:5]}")
    print("Expected: [1. 2. 3. 4. 5.]")

    # ----- Managed approach: one buffer shared by host and device -----
    if not hasattr(cuda, "managed_array"):
        print("cuda.managed_array is not available in this Numba version; "
              "skipping the managed-memory run")
        return

    managed_data = cuda.managed_array(n, dtype=np.float32)
    managed_data[:] = host_data  # written from host code, no explicit copy call

    add_one[grid, block](managed_data)
    # The kernel launch is asynchronous; wait before the host reads the buffer.
    cuda.synchronize()

    # View the buffer as a plain ndarray for printing; no copy_to_host needed.
    managed_view = np.asarray(managed_data)
    print(f"Managed first elements: {managed_view[:5]}")
    print("Expected: [1. 2. 3. 4. 5.]")

unified_memory_demo()

---

## Part 2: Page Migration

In [None]:
def explain_page_migration():
    """Print a short primer on how unified-memory pages move between CPU and GPU."""
    rule = "=" * 60
    # The whole report, one entry per output line ("" is a blank line).
    report = (
        "Unified Memory Page Migration",
        rule,
        "",
        "How it works:",
        "  1. Memory allocated as 'managed' pages",
        "  2. Pages migrate on demand (page fault)",
        "  3. OS/driver handles migration transparently",
        "",
        "Page fault flow:",
        "  GPU kernel accesses page → Page not on GPU",
        "  → Page fault triggered → Migration from CPU to GPU",
        "  → Kernel resumes with page now on GPU",
        "",
        "Page sizes:",
        "  CPU: 4 KB (standard) or 2 MB (huge pages)",
        "  GPU: 64 KB (Pascal+) or 2 MB (large page mode)",
        "",
        "Migration overhead:",
        "  - Page fault handling: ~20-50 µs",
        "  - Data transfer: depends on page size and PCIe/NVLink",
        "  - Can be significant for random access patterns!",
    )
    for line in report:
        print(line)

explain_page_migration()

### Prefetching to Avoid Page Faults

```cpp
// prefetch.cu - Prefetching for better performance
#include <stdio.h>
#include <cuda_runtime.h>

// Replaces each of the first n elements of data with its square root,
// one element per thread.
__global__ void processKernel(float* data, int n) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Skip threads that fall outside the array.
    if (tid >= n) return;

    const float value = data[tid];
    data[tid] = sqrtf(value);
}

// Times processKernel over a 16M-element managed buffer twice: once letting
// the kernel touch host-resident data directly, and once after asking the
// runtime to prefetch the buffer to the GPU. Prints both timings and their
// ratio. Returns 1 if the managed allocation fails, 0 otherwise.
int main() {
    int n = 1 << 24;  // 16M elements
    size_t size = n * sizeof(float);

    float *data;
    // Check the allocation, as oversubscription.cu does: continuing with an
    // unset pointer would crash in the host initialization loop below.
    cudaError_t err = cudaMallocManaged(&data, size);
    if (err != cudaSuccess) {
        printf("Allocation failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Initialize on CPU
    for (int i = 0; i < n; i++) data[i] = (float)i;

    int device;
    cudaGetDevice(&device);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // ===== WITHOUT PREFETCH =====
    // The buffer was last written by the host, so the kernel is launched
    // without any prior request to move it to the GPU.
    cudaEventRecord(start);
    processKernel<<<(n+255)/256, 256>>>(data, n);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms_no_prefetch;
    cudaEventElapsedTime(&ms_no_prefetch, start, stop);
    printf("Without prefetch: %.2f ms\n", ms_no_prefetch);

    // Reset data to CPU: bring the buffer back and rewrite the inputs so the
    // second run starts from the same state as the first.
    cudaMemPrefetchAsync(data, size, cudaCpuDeviceId);
    cudaDeviceSynchronize();
    for (int i = 0; i < n; i++) data[i] = (float)i;

    // ===== WITH PREFETCH =====
    // The prefetch is issued inside the timed region, so the reported time
    // covers the bulk transfer as well as the kernel.
    cudaEventRecord(start);
    cudaMemPrefetchAsync(data, size, device);  // Prefetch to GPU
    processKernel<<<(n+255)/256, 256>>>(data, n);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms_with_prefetch;
    cudaEventElapsedTime(&ms_with_prefetch, start, stop);
    printf("With prefetch: %.2f ms\n", ms_with_prefetch);

    printf("Speedup: %.2fx\n", ms_no_prefetch / ms_with_prefetch);

    // Release the timing events; the original created them but never
    // destroyed them.
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    cudaFree(data);
    return 0;
}
```

---

## Part 3: Memory Hints

In [None]:
def memory_hints():
    """Print a reference card for the three cudaMemAdvise hints covered here."""
    # One row per hint: (advice name, hint, effect, typical use).
    advice_table = [
        (
            "cudaMemAdviseSetReadMostly",
            "Data will be read, rarely written",
            "May duplicate to avoid migration",
            "Lookup tables, constant data",
        ),
        (
            "cudaMemAdviseSetPreferredLocation",
            "Preferred location for data",
            "Tries to keep data at specified location",
            "Data primarily used by one processor",
        ),
        (
            "cudaMemAdviseSetAccessedBy",
            "Which devices will access data",
            "Creates direct mapping if possible",
            "Multi-GPU scenarios",
        ),
    ]
    example_calls = (
        "  cudaMemAdvise(ptr, size, cudaMemAdviseSetReadMostly, 0);",
        "  cudaMemAdvise(ptr, size, cudaMemAdviseSetPreferredLocation, device);",
        "  cudaMemAdvise(ptr, size, cudaMemAdviseSetAccessedBy, device);",
    )

    print("cudaMemAdvise Hints")
    print("=" * 60)
    print()
    for name, hint, effect, use in advice_table:
        print(name)
        print(f"  - Hint: {hint}")
        print(f"  - Effect: {effect}")
        print(f"  - Use: {use}")
        print()
    print("Example usage:")
    for call in example_calls:
        print(call)

memory_hints()

### CUDA C++ Memory Advise (Primary)

```cpp
// advise.cu - Using memory hints
#include <stdio.h>
#include <cuda_runtime.h>

// Gather: for each i in [0, n), output[i] = table[indices[i]].
// One output element per thread.
__global__ void lookupKernel(const float* table, const int* indices,
                              float* output, int n) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the last lookup do nothing.
    if (tid >= n) return;

    const int slot = indices[tid];
    output[tid] = table[slot];
}

// Builds a 1M-entry lookup table and 16M random indices in managed memory,
// applies cudaMemAdvise hints and prefetches, runs lookupKernel, and prints
// the first result next to the value computed on the host.
int main() {
    const int tableSize = 1 << 20;   // 1M lookup table
    const int n = 1 << 24;           // 16M lookups

    // Byte sizes of the three managed buffers.
    const size_t tableBytes  = tableSize * sizeof(float);
    const size_t indexBytes  = n * sizeof(int);
    const size_t outputBytes = n * sizeof(float);

    // Launch configuration: 256 threads per block, enough blocks to cover n.
    const int threadsPerBlock = 256;
    const int numBlocks = (n + threadsPerBlock - 1) / threadsPerBlock;

    float *table;
    int *indices;
    float *output;

    cudaMallocManaged(&table, tableBytes);
    cudaMallocManaged(&indices, indexBytes);
    cudaMallocManaged(&output, outputBytes);

    // Host-side initialization: table[i] = sqrt(i), indices drawn with rand().
    for (int i = 0; i < tableSize; ++i) {
        table[i] = sqrtf(i);
    }
    for (int i = 0; i < n; ++i) {
        indices[i] = rand() % tableSize;
    }

    int device;
    cudaGetDevice(&device);

    // ===== APPLY HINTS =====

    // The table is only read after this point, so mark it read-mostly.
    cudaMemAdvise(table, tableBytes, cudaMemAdviseSetReadMostly, 0);

    // Ask for indices and output to live on the current GPU.
    cudaMemAdvise(indices, indexBytes,
                  cudaMemAdviseSetPreferredLocation, device);
    cudaMemAdvise(output, outputBytes,
                  cudaMemAdviseSetPreferredLocation, device);

    // Move the kernel's inputs to the GPU ahead of the launch.
    cudaMemPrefetchAsync(table, tableBytes, device);
    cudaMemPrefetchAsync(indices, indexBytes, device);

    lookupKernel<<<numBlocks, threadsPerBlock>>>(table, indices, output, n);
    cudaDeviceSynchronize();

    // Bring the results back to the CPU before the host reads them.
    cudaMemPrefetchAsync(output, outputBytes, cudaCpuDeviceId);
    cudaDeviceSynchronize();

    printf("output[0] = %f (expected: %f)\n", 
           output[0], table[indices[0]]);

    cudaFree(table);
    cudaFree(indices);
    cudaFree(output);

    return 0;
}
```

---

## Part 4: When to Use Unified Memory

In [None]:
def unified_vs_explicit():
    """Print guidance on choosing unified or explicit memory management."""
    # Each section: (heading, bullet marker, bullet texts).
    sections = (
        (
            "USE UNIFIED MEMORY WHEN:",
            "✓",
            (
                "Prototyping and development",
                "Complex data structures (linked lists, trees)",
                "Oversubscription (data larger than GPU memory)",
                "Unclear access patterns",
                "Porting CPU code quickly",
            ),
        ),
        (
            "USE EXPLICIT MEMORY WHEN:",
            "✓",
            (
                "Maximum performance critical",
                "Predictable access patterns",
                "Frequent CPU-GPU ping-pong",
                "Fine-grained control needed",
                "Overlapping compute and transfer",
            ),
        ),
        (
            "PERFORMANCE CONSIDERATIONS:",
            "-",
            (
                "Page faults have ~20-50 µs overhead each",
                "First access triggers migration",
                "Random access patterns = many page faults",
                "Prefetching mitigates most overhead",
                "With proper hints, ~95%+ of explicit performance",
            ),
        ),
    )

    print("Unified vs Explicit Memory")
    print("=" * 60)
    for heading, marker, points in sections:
        print()  # blank line before every section heading
        print(heading)
        for point in points:
            print(f"  {marker} {point}")

unified_vs_explicit()

---

## Part 5: Oversubscription

In [None]:
def explain_oversubscription():
    """Print an overview of allocating more managed memory than the GPU holds."""
    header = ["Memory Oversubscription", "=" * 60, ""]
    traditional = [
        "Traditional CUDA:",
        "  GPU memory = hard limit",
        "  cudaMalloc fails if not enough memory",
        "",
    ]
    unified = [
        "Unified Memory (Pascal+):",
        "  Can allocate more than GPU memory!",
        "  Pages migrate as needed",
        "  Works like virtual memory",
        "",
    ]
    example = [
        "Example:",
        "  GPU has 8 GB memory",
        "  Allocate 32 GB with cudaMallocManaged",
        "  Process 8 GB at a time on GPU",
        "  Pages swap automatically",
        "",
    ]
    caveats = [
        "Caveats:",
        "  - Performance degrades with thrashing",
        "  - Need good access locality",
        "  - Consider prefetch hints",
    ]
    # Emit every section in order, one entry per output line.
    print(*header, *traditional, *unified, *example, *caveats, sep="\n")

explain_oversubscription()

### CUDA C++ Oversubscription Example

```cpp
// oversubscription.cu - Using more memory than GPU has
#include <stdio.h>
#include <cuda_runtime.h>

// Applies sqrtf in place to data[start .. start + chunk_size), one element
// per thread.
//
// start and chunk_size are size_t rather than int: the caller in this file
// passes size_t offsets into a buffer sized at twice the GPU's memory, and
// those offsets can exceed INT_MAX (for example start reaches ~3e9 elements
// on an 8 GB device). With int parameters they would be narrowed at the call
// and index out of bounds. Callers that pass non-negative ints still work.
__global__ void processChunk(float* data, size_t start, size_t chunk_size) {
    // Widen before multiplying so the thread's offset within the chunk is
    // computed in 64 bits instead of wrapping at 2^32.
    size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < chunk_size) {
        data[start + idx] = sqrtf(data[start + idx]);
    }
}

// Allocates a managed buffer twice the size of the GPU's total memory, fills
// it on the host, processes it on the GPU in four chunks with a prefetch
// before each launch, then brings it back and prints two results.
// Returns 1 if the managed allocation fails, 0 otherwise.
int main() {
    // Report how much device memory there is to oversubscribe.
    size_t free_mem, total_mem;
    cudaMemGetInfo(&free_mem, &total_mem);
    printf("GPU Memory: %.1f GB free, %.1f GB total\n",
           free_mem / 1e9, total_mem / 1e9);

    // Element count chosen so the buffer is 2x the GPU's total memory.
    size_t n = total_mem / sizeof(float) * 2;
    size_t size = n * sizeof(float);
    printf("Allocating %.1f GB (2x GPU memory)\n", size / 1e9);

    float *data;
    cudaError_t err = cudaMallocManaged(&data, size);
    if (err != cudaSuccess) {
        printf("Allocation failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Fill the whole buffer from host code with the repeating pattern 0..999.
    for (size_t i = 0; i < n; ++i) {
        data[i] = (float)(i % 1000);
    }

    int device;
    cudaGetDevice(&device);

    // Work through the buffer a quarter at a time.
    const int num_chunks = 4;
    size_t chunk_size = n / num_chunks;
    size_t chunk_bytes = chunk_size * sizeof(float);
    size_t num_blocks = (chunk_size + 255) / 256;

    for (int chunk = 0; chunk < num_chunks; chunk++) {
        size_t start = chunk * chunk_size;

        // Request this chunk on the GPU, then launch the kernel over it.
        cudaMemPrefetchAsync(data + start, chunk_bytes, device);
        processChunk<<<num_blocks, 256>>>(data, start, chunk_size);

        printf("Processed chunk %d\n", chunk);
    }

    // Wait for all four launches to finish.
    cudaDeviceSynchronize();

    // Move the results back to the CPU before the host reads them.
    cudaMemPrefetchAsync(data, size, cudaCpuDeviceId);
    cudaDeviceSynchronize();

    printf("data[0] = %f (expected sqrt(0) = 0)\n", data[0]);
    printf("data[1] = %f (expected sqrt(1) = 1)\n", data[1]);

    cudaFree(data);
    return 0;
}
```

---

## Exercises

### Exercise 1: Compare Performance

In [None]:
# TODO: Compare unified memory (with/without prefetch) vs explicit memory

@cuda.jit
def compute_kernel(data, result):
    """CUDA kernel: store ``x * x + x`` for each input element ``x`` in ``result``."""
    i = cuda.grid(1)
    # Threads beyond the end of the output array do nothing.
    if i >= result.size:
        return
    value = data[i]
    result[i] = value * value + value

def benchmark_memory_approaches(n=10_000_000):
    """Exercise stub: compare different memory management approaches.

    Args:
        n: Problem size to benchmark with. Unused until the exercise is
            implemented.

    Returns:
        None. Nothing is implemented yet.
    """
    # TODO: Implement benchmarks for:
    #   1. Explicit memory with cudaMemcpy
    #   2. Unified memory without prefetch
    #   3. Unified memory with prefetch
    return None

---

## Summary

### Unified Memory API

```cpp
// Allocation: a single pointer used from both host and device code
cudaMallocManaged(&ptr, size);

// Prefetching: request migration ahead of use, in either direction
cudaMemPrefetchAsync(ptr, size, device);     // To GPU
cudaMemPrefetchAsync(ptr, size, cudaCpuDeviceId); // To CPU

// Hints
// NOTE: the earlier examples in this notebook pass 0 as the last argument
// for cudaMemAdviseSetReadMostly; per the CUDA runtime documentation the
// device argument is ignored for that advice, so both spellings behave alike.
cudaMemAdvise(ptr, size, cudaMemAdviseSetReadMostly, device);
cudaMemAdvise(ptr, size, cudaMemAdviseSetPreferredLocation, device);
cudaMemAdvise(ptr, size, cudaMemAdviseSetAccessedBy, device);
```

### Decision Guide

| Scenario | Recommendation |
|----------|----------------|
| Prototyping | Unified memory |
| Complex data structures | Unified memory |
| Maximum performance | Explicit + overlapping |
| Data > GPU memory | Unified + prefetch |
| Production code | Unified + hints (or explicit) |

### Week 7 Complete!
Next week: Profiling & Analysis with Nsight tools.