In [None]:
# ⚙️ Setup
import subprocess, sys
try:
    # If google.colab can be imported we are presumably running on Colab:
    # install numba quietly into the interpreter that runs this kernel.
    import google.colab
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "numba"])
except ImportError:
    # Deliberate best-effort: outside Colab nothing is installed and numba is
    # expected to be available already (the import below fails otherwise).
    pass

import numpy as np
from numba import cuda
import time

print("⚠️  CUDA C++ is the PRIMARY learning material!")

---

## 1. Traditional vs Unified Memory

Let's compare explicit memory management with unified memory to understand the tradeoffs.

### Traditional Memory (Explicit Transfers)

In [None]:
%%writefile traditional_memory.cu
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

// Error-checking wrapper for CUDA runtime calls.
// Evaluates `call` once; if the result is not cudaSuccess, prints the CUDA
// error string and the current source line to stderr and terminates with exit(1).
// The do { ... } while(0) wrapper makes the macro behave as a single statement.
#define CUDA_CHECK(call) \
    do { \
        cudaError_t err = call; \
        if (err != cudaSuccess) { \
            fprintf(stderr, "CUDA Error: %s at line %d\n", \
                    cudaGetErrorString(err), __LINE__); \
            exit(1); \
        } \
    } while(0)

// SAXPY kernel: y[k] = a * x[k] + y[k] for k in [0, n).
// One thread per element, addressed by its global 1D index; threads whose
// index is n or larger return without touching memory.
__global__ void saxpy(float *y, const float *x, float a, int n) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= n) return;
    y[gid] = a * x[gid] + y[gid];
}

int main() {
    const int N = 1 << 20;  // 1M elements
    const float a = 2.0f;
    size_t bytes = N * sizeof(float);
    
    // Host memory
    float *h_x = (float*)malloc(bytes);
    float *h_y = (float*)malloc(bytes);
    
    // Initialize on host
    for (int i = 0; i < N; i++) {
        h_x[i] = 1.0f;
        h_y[i] = 2.0f;
    }
    
    // Device memory
    float *d_x, *d_y;
    CUDA_CHECK(cudaMalloc(&d_x, bytes));
    CUDA_CHECK(cudaMalloc(&d_y, bytes));
    
    // Timing events
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    
    CUDA_CHECK(cudaEventRecord(start));
    
    // Step 1: Copy H2D
    CUDA_CHECK(cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_y, h_y, bytes, cudaMemcpyHostToDevice));
    
    // Step 2: Kernel
    int threads = 256;
    int blocks = (N + threads - 1) / threads;
    saxpy<<<blocks, threads>>>(d_y, d_x, a, N);
    
    // Step 3: Copy D2H
    CUDA_CHECK(cudaMemcpy(h_y, d_y, bytes, cudaMemcpyDeviceToHost));
    
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));
    
    float ms;
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));
    
    // Verify
    float expected = a * 1.0f + 2.0f;  // 4.0
    int errors = 0;
    for (int i = 0; i < N && errors < 5; i++) {
        if (fabsf(h_y[i] - expected) > 1e-5) errors++;
    }
    
    printf("=== Traditional Memory (Explicit Transfers) ===\n");
    printf("Total time: %.3f ms\n", ms);
    printf("Errors: %d\n", errors);
    
    // Cleanup
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(d_x));
    CUDA_CHECK(cudaFree(d_y));
    free(h_x);
    free(h_y);
    
    return 0;
}

In [None]:
!nvcc -o traditional_memory traditional_memory.cu && ./traditional_memory

### Unified Memory (Automatic Migration)

In [None]:
%%writefile unified_memory.cu
#include <cuda_runtime.h>
#include <stdio.h>

// Error-checking wrapper for CUDA runtime calls.
// Evaluates `call` once; if the result is not cudaSuccess, prints the CUDA
// error string and the current source line to stderr and terminates with exit(1).
// The do { ... } while(0) wrapper makes the macro behave as a single statement.
#define CUDA_CHECK(call) \
    do { \
        cudaError_t err = call; \
        if (err != cudaSuccess) { \
            fprintf(stderr, "CUDA Error: %s at line %d\n", \
                    cudaGetErrorString(err), __LINE__); \
            exit(1); \
        } \
    } while(0)

// Element-wise SAXPY: every in-range thread updates one slot of y with
// a * x + y, reading x and y at the thread's global 1D index.
// Threads indexed at n or beyond exit immediately.
__global__ void saxpy(float *y, const float *x, float a, int n) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= n) {
        return;
    }
    y[tid] = a * x[tid] + y[tid];
}

int main() {
    const int N = 1 << 20;  // 1M elements
    const float a = 2.0f;
    size_t bytes = N * sizeof(float);
    
    // Unified memory - accessible from both CPU and GPU
    float *x, *y;
    CUDA_CHECK(cudaMallocManaged(&x, bytes));
    CUDA_CHECK(cudaMallocManaged(&y, bytes));
    
    // Initialize on host - data will migrate to GPU on first kernel access
    for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }
    
    // Timing events
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    
    CUDA_CHECK(cudaEventRecord(start));
    
    // Kernel - data migrates automatically
    int threads = 256;
    int blocks = (N + threads - 1) / threads;
    saxpy<<<blocks, threads>>>(y, x, a, N);
    
    // Synchronize before host access
    CUDA_CHECK(cudaDeviceSynchronize());
    
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));
    
    float ms;
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));
    
    // Verify - data migrates back to host
    float expected = a * 1.0f + 2.0f;  // 4.0
    int errors = 0;
    for (int i = 0; i < N && errors < 5; i++) {
        if (fabsf(y[i] - expected) > 1e-5) errors++;
    }
    
    printf("=== Unified Memory (Automatic Migration) ===\n");
    printf("Total time: %.3f ms\n", ms);
    printf("Errors: %d\n", errors);
    printf("\nNOTE: First run may be slower due to page faults.\n");
    printf("      Use prefetching for better performance!\n");
    
    // Cleanup
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(x));
    CUDA_CHECK(cudaFree(y));
    
    return 0;
}

In [None]:
!nvcc -o unified_memory unified_memory.cu && ./unified_memory

### Key Differences

| Aspect | Traditional | Unified |
|--------|-------------|----------|
| Allocation | `cudaMalloc` + `malloc` | `cudaMallocManaged` |
| Transfers | Explicit `cudaMemcpy` | Automatic on-demand |
| Code simplicity | More complex | Simpler |
| Performance | Predictable | May have page faults |
| Control | Full control | Driver-managed |

---

## 2. Prefetching for Performance

Unified memory can suffer from page faults on first access. **Prefetching** moves data proactively to avoid this overhead.

In [None]:
%%writefile prefetch_demo.cu
#include <cuda_runtime.h>
#include <stdio.h>

// Error-checking wrapper for CUDA runtime calls.
// Evaluates `call` once; if the result is not cudaSuccess, prints the CUDA
// error string and the current source line to stderr and terminates with exit(1).
// The do { ... } while(0) wrapper makes the macro behave as a single statement.
#define CUDA_CHECK(call) \
    do { \
        cudaError_t err = call; \
        if (err != cudaSuccess) { \
            fprintf(stderr, "CUDA Error: %s at line %d\n", \
                    cudaGetErrorString(err), __LINE__); \
            exit(1); \
        } \
    } while(0)

// Squares the first n elements of data in place, one element per thread.
// Threads whose global 1D index is n or larger do nothing.
__global__ void square(float *data, int n) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= n) return;
    const float v = data[gid];
    data[gid] = v * v;
}

float benchmark_no_prefetch(float *data, int N, int device) {
    // Reset data on CPU
    for (int i = 0; i < N; i++) data[i] = 2.0f;
    
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    
    cudaEventRecord(start);
    
    // No prefetch - page faults on GPU access
    square<<<(N + 255) / 256, 256>>>(data, N);
    cudaDeviceSynchronize();
    
    // Access on CPU - more page faults
    volatile float sum = 0;
    for (int i = 0; i < 1000; i++) sum += data[i];
    
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    
    float ms;
    cudaEventElapsedTime(&ms, start, stop);
    
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    
    return ms;
}

float benchmark_with_prefetch(float *data, int N, int device) {
    // Reset data on CPU
    for (int i = 0; i < N; i++) data[i] = 2.0f;
    
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    
    cudaEventRecord(start);
    
    // Prefetch to GPU BEFORE kernel
    cudaMemPrefetchAsync(data, N * sizeof(float), device);
    
    square<<<(N + 255) / 256, 256>>>(data, N);
    cudaDeviceSynchronize();
    
    // Prefetch back to CPU BEFORE host access
    cudaMemPrefetchAsync(data, N * sizeof(float), cudaCpuDeviceId);
    cudaDeviceSynchronize();
    
    volatile float sum = 0;
    for (int i = 0; i < 1000; i++) sum += data[i];
    
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    
    float ms;
    cudaEventElapsedTime(&ms, start, stop);
    
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    
    return ms;
}

int main() {
    const int N = 1 << 22;  // 4M elements
    
    int device;
    CUDA_CHECK(cudaGetDevice(&device));
    
    float *data;
    CUDA_CHECK(cudaMallocManaged(&data, N * sizeof(float)));
    
    printf("=== Prefetching Benchmark ===\n");
    printf("Array size: %d elements (%.1f MB)\n\n", N, N * sizeof(float) / 1e6);
    
    // Warm up
    benchmark_no_prefetch(data, N, device);
    
    // Run benchmarks
    float time_no_prefetch = benchmark_no_prefetch(data, N, device);
    float time_with_prefetch = benchmark_with_prefetch(data, N, device);
    
    printf("Without prefetch: %.3f ms\n", time_no_prefetch);
    printf("With prefetch:    %.3f ms\n", time_with_prefetch);
    printf("Speedup:          %.2fx\n", time_no_prefetch / time_with_prefetch);
    
    CUDA_CHECK(cudaFree(data));
    return 0;
}

In [None]:
!nvcc -o prefetch_demo prefetch_demo.cu && ./prefetch_demo

### Key Prefetch APIs

```cpp
// Prefetch to GPU
cudaMemPrefetchAsync(ptr, size, deviceId, stream);

// Prefetch to CPU
cudaMemPrefetchAsync(ptr, size, cudaCpuDeviceId, stream);
```

**Best Practice**: Always prefetch data to where it will be accessed next!

---

## 3. Memory Hints with cudaMemAdvise

Beyond prefetching, you can provide **hints** to the driver about expected access patterns.

In [None]:
%%writefile memory_advise.cu
#include <cuda_runtime.h>
#include <stdio.h>

// Error-checking wrapper for CUDA runtime calls.
// Evaluates `call` once; if the result is not cudaSuccess, prints the CUDA
// error string and the current source line to stderr and terminates with exit(1).
// The do { ... } while(0) wrapper makes the macro behave as a single statement.
#define CUDA_CHECK(call) \
    do { \
        cudaError_t err = call; \
        if (err != cudaSuccess) { \
            fprintf(stderr, "CUDA Error: %s at line %d\n", \
                    cudaGetErrorString(err), __LINE__); \
            exit(1); \
        } \
    } while(0)

// For each of the first n elements writes
//   output[k] = input[k] * 2 + input[k] * 3 + input[k]
// The input buffer is only ever read; one thread handles one element and
// threads whose global 1D index is n or larger do nothing.
__global__ void read_only_kernel(const float* __restrict__ input,
                                  float* output, int n) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= n) return;
    // The same input element is used three times - benefits from read-mostly hint
    output[gid] = input[gid] * 2.0f + input[gid] * 3.0f + input[gid];
}

int main() {
    const int N = 1 << 20;
    size_t bytes = N * sizeof(float);
    
    int device;
    CUDA_CHECK(cudaGetDevice(&device));
    
    float *input, *output;
    CUDA_CHECK(cudaMallocManaged(&input, bytes));
    CUDA_CHECK(cudaMallocManaged(&output, bytes));
    
    // Initialize input (read-only data)
    for (int i = 0; i < N; i++) {
        input[i] = (float)i;
    }
    
    printf("=== Memory Advise Demo ===\n\n");
    
    // Hint 1: ReadMostly - data will be primarily read, not written
    // Creates read-only copies on GPU, original stays on CPU
    CUDA_CHECK(cudaMemAdvise(input, bytes, cudaMemAdviseSetReadMostly, device));
    printf("Applied cudaMemAdviseSetReadMostly to input array\n");
    printf("  -> Driver may create read-only replicas on GPU\n\n");
    
    // Hint 2: PreferredLocation - suggest where data should reside
    CUDA_CHECK(cudaMemAdvise(output, bytes, cudaMemAdviseSetPreferredLocation, device));
    printf("Applied cudaMemAdviseSetPreferredLocation(GPU) to output array\n");
    printf("  -> Output will preferentially reside on GPU\n\n");
    
    // Prefetch for best performance
    CUDA_CHECK(cudaMemPrefetchAsync(input, bytes, device));
    CUDA_CHECK(cudaMemPrefetchAsync(output, bytes, device));
    
    // Timing
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    
    CUDA_CHECK(cudaEventRecord(start));
    
    read_only_kernel<<<(N + 255) / 256, 256>>>(input, output, N);
    
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));
    
    float ms;
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));
    
    // Verify first few results
    CUDA_CHECK(cudaMemPrefetchAsync(output, bytes, cudaCpuDeviceId));
    CUDA_CHECK(cudaDeviceSynchronize());
    
    printf("Kernel time: %.3f ms\n\n", ms);
    printf("Verification (first 5 elements):\n");
    for (int i = 0; i < 5; i++) {
        float expected = input[i] * 2.0f + input[i] * 3.0f + input[i];  // 6 * input[i]
        printf("  output[%d] = %.1f (expected: %.1f)\n", i, output[i], expected);
    }
    
    // Cleanup - clear hints before freeing
    CUDA_CHECK(cudaMemAdvise(input, bytes, cudaMemAdviseUnsetReadMostly, device));
    CUDA_CHECK(cudaMemAdvise(output, bytes, cudaMemAdviseUnsetPreferredLocation, device));
    
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(input));
    CUDA_CHECK(cudaFree(output));
    
    return 0;
}

In [None]:
!nvcc -o memory_advise memory_advise.cu && ./memory_advise

### Available Memory Hints

| Hint | Description | Use Case |
|------|-------------|----------|
| `cudaMemAdviseSetReadMostly` | Data is read frequently, rarely written | Lookup tables, constants |
| `cudaMemAdviseSetPreferredLocation` | Prefer specific device/CPU | Control where data resides |
| `cudaMemAdviseSetAccessedBy` | Device will access this memory | Enable direct access mapping |

---

## 4. Oversubscription - Using More Memory Than GPU Has

Unified memory enables **oversubscription**: allocating more memory than physically available on GPU. The driver pages data in and out as needed.

In [None]:
%%writefile oversubscription.cu
#include <cuda_runtime.h>
#include <stdio.h>

// Error-checking wrapper for CUDA runtime calls.
// Evaluates `call` once; if the result is not cudaSuccess, prints the CUDA
// error string and the current source line to stderr and terminates with exit(1).
// The do { ... } while(0) wrapper makes the macro behave as a single statement.
#define CUDA_CHECK(call) \
    do { \
        cudaError_t err = call; \
        if (err != cudaSuccess) { \
            fprintf(stderr, "CUDA Error: %s at line %d\n", \
                    cudaGetErrorString(err), __LINE__); \
            exit(1); \
        } \
    } while(0)

// Applies v -> v * 2 + 1 in place to `count` elements of `data`, starting at
// element `offset`. One thread per element; threads whose index within the
// chunk is `count` or larger do nothing.
__global__ void process_chunk(float* data, size_t offset, size_t count) {
    // Widen BEFORE multiplying: blockIdx.x * blockDim.x is otherwise evaluated
    // in 32-bit unsigned arithmetic and would wrap for very large grids.
    size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (i < count) {
        const size_t pos = offset + i;
        data[pos] = data[pos] * 2.0f + 1.0f;
    }
}

int main() {
    // Query GPU memory
    int device;
    CUDA_CHECK(cudaGetDevice(&device));
    
    size_t free_mem, total_mem;
    CUDA_CHECK(cudaMemGetInfo(&free_mem, &total_mem));
    
    printf("=== Oversubscription Demo ===\n\n");
    printf("GPU Memory: %.2f GB total, %.2f GB free\n\n",
           total_mem / 1e9, free_mem / 1e9);
    
    // Allocate 25% of GPU memory (safe for demo)
    // In real oversubscription, you'd allocate MORE than total_mem
    size_t alloc_size = total_mem / 4;
    size_t n_elements = alloc_size / sizeof(float);
    
    printf("Allocating %.2f GB (%.0f million floats)\n",
           alloc_size / 1e9, n_elements / 1e6);
    
    float* data;
    CUDA_CHECK(cudaMallocManaged(&data, alloc_size));
    
    // Initialize on CPU (data starts on CPU)
    printf("Initializing data on CPU...\n");
    for (size_t i = 0; i < n_elements; i++) {
        data[i] = 1.0f;
    }
    
    // Process in chunks - simulate streaming access pattern
    // This is how oversubscription works: process chunks that fit in GPU memory
    size_t chunk_size = n_elements / 4;
    int threads = 256;
    int blocks = (chunk_size + threads - 1) / threads;
    
    printf("\nProcessing in 4 chunks...\n");
    
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    
    CUDA_CHECK(cudaEventRecord(start));
    
    for (int chunk = 0; chunk < 4; chunk++) {
        size_t offset = chunk * chunk_size;
        
        // Prefetch this chunk to GPU
        CUDA_CHECK(cudaMemPrefetchAsync(data + offset, 
                                         chunk_size * sizeof(float), 
                                         device));
        
        // Process chunk
        process_chunk<<<blocks, threads>>>(data, offset, chunk_size);
        
        // Prefetch back to CPU (for next iteration or final access)
        CUDA_CHECK(cudaMemPrefetchAsync(data + offset,
                                         chunk_size * sizeof(float),
                                         cudaCpuDeviceId));
    }
    
    CUDA_CHECK(cudaDeviceSynchronize());
    
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));
    
    float ms;
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));
    
    // Verify
    float expected = 1.0f * 2.0f + 1.0f;  // 3.0
    int errors = 0;
    for (size_t i = 0; i < n_elements && errors < 5; i++) {
        if (fabsf(data[i] - expected) > 1e-5) errors++;
    }
    
    printf("\nTotal time: %.1f ms\n", ms);
    printf("Throughput: %.2f GB/s\n", (alloc_size * 2) / (ms / 1000) / 1e9);
    printf("Errors: %d\n", errors);
    
    printf("\n--- Oversubscription Key Points ---\n");
    printf("1. cudaMallocManaged can exceed GPU memory\n");
    printf("2. Driver pages data in/out automatically\n");
    printf("3. Streaming access patterns work best\n");
    printf("4. Use prefetching to control paging\n");
    
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(data));
    
    return 0;
}

In [None]:
!nvcc -o oversubscription oversubscription.cu && ./oversubscription

---

## 5. Comprehensive Example: Optimized Unified Memory Pipeline

In [None]:
%%writefile um_pipeline.cu
#include <cuda_runtime.h>
#include <stdio.h>

// Error-checking wrapper for CUDA runtime calls.
// Evaluates `call` once; if the result is not cudaSuccess, prints the CUDA
// error string and the current source line to stderr and terminates with exit(1).
// The do { ... } while(0) wrapper makes the macro behave as a single statement.
#define CUDA_CHECK(call) \
    do { \
        cudaError_t err = call; \
        if (err != cudaSuccess) { \
            fprintf(stderr, "CUDA Error: %s at line %d\n", \
                    cudaGetErrorString(err), __LINE__); \
            exit(1); \
        } \
    } while(0)

// Kernel 1: Normalize data
// Pipeline stage 1: divides each of the first n elements of data by max_val,
// in place. One thread per element; out-of-range threads return immediately.
__global__ void normalize(float* data, float max_val, int n) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= n) return;
    data[gid] = data[gid] / max_val;
}

// Kernel 2: Apply ReLU
// Pipeline stage 2: ReLU in place on the first n elements - values that are
// strictly greater than zero are kept, everything else becomes 0.0f.
// One thread per element; out-of-range threads return immediately.
__global__ void relu(float* data, int n) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= n) return;
    const float v = data[gid];
    data[gid] = (v > 0.0f) ? v : 0.0f;
}

// Kernel 3: Scale and offset
// Pipeline stage 3: affine transform in place on the first n elements,
// data[k] = data[k] * scale + offset. One thread per element; out-of-range
// threads return immediately.
__global__ void scale_offset(float* data, float scale, float offset, int n) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= n) return;
    data[gid] = data[gid] * scale + offset;
}

int main() {
    const int N = 1 << 22;  // 4M elements
    size_t bytes = N * sizeof(float);
    
    int device;
    CUDA_CHECK(cudaGetDevice(&device));
    
    float* data;
    CUDA_CHECK(cudaMallocManaged(&data, bytes));
    
    printf("=== Optimized Unified Memory Pipeline ===\n\n");
    
    // Initialize with random-ish data (some negative)
    for (int i = 0; i < N; i++) {
        data[i] = (float)(i % 1000) - 500.0f;  // Range: -500 to 499
    }
    
    // Step 1: Set memory hints
    // Data will be accessed primarily by GPU
    CUDA_CHECK(cudaMemAdvise(data, bytes, 
                              cudaMemAdviseSetPreferredLocation, device));
    printf("1. Set preferred location to GPU\n");
    
    // Step 2: Prefetch before kernel launch
    CUDA_CHECK(cudaMemPrefetchAsync(data, bytes, device));
    printf("2. Prefetched data to GPU\n");
    
    int threads = 256;
    int blocks = (N + threads - 1) / threads;
    
    // Timing
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    
    CUDA_CHECK(cudaEventRecord(start));
    
    // Step 3: Execute kernel pipeline
    // All kernels run on GPU - no data migration between kernels!
    normalize<<<blocks, threads>>>(data, 500.0f, N);  // -> [-1, 1)
    relu<<<blocks, threads>>>(data, N);              // -> [0, 1)
    scale_offset<<<blocks, threads>>>(data, 2.0f, -1.0f, N);  // -> [-1, 1)
    
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));
    
    float kernel_ms;
    CUDA_CHECK(cudaEventElapsedTime(&kernel_ms, start, stop));
    
    printf("3. Executed 3-kernel pipeline\n");
    printf("   Kernel time: %.3f ms\n\n", kernel_ms);
    
    // Step 4: Prefetch back to CPU for verification
    CUDA_CHECK(cudaMemPrefetchAsync(data, bytes, cudaCpuDeviceId));
    CUDA_CHECK(cudaDeviceSynchronize());
    printf("4. Prefetched results back to CPU\n\n");
    
    // Verify sample values
    printf("Sample results:\n");
    int samples[] = {0, 100, 250, 500, 750};
    for (int j = 0; j < 5; j++) {
        int i = samples[j];
        float original = (float)(i % 1000) - 500.0f;
        float after_norm = original / 500.0f;
        float after_relu = after_norm > 0 ? after_norm : 0;
        float expected = after_relu * 2.0f - 1.0f;
        printf("  data[%d]: original=%.1f -> result=%.4f (expected=%.4f)\n",
               i, original, data[i], expected);
    }
    
    // Cleanup
    CUDA_CHECK(cudaMemAdvise(data, bytes,
                              cudaMemAdviseUnsetPreferredLocation, device));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(data));
    
    printf("\n=== Pipeline Complete ===\n");
    
    return 0;
}

In [None]:
!nvcc -o um_pipeline um_pipeline.cu && ./um_pipeline

---

## Summary: Unified Memory Best Practices

### 1. Use Prefetching
```cpp
cudaMemPrefetchAsync(ptr, size, device);      // Before GPU kernel
cudaMemPrefetchAsync(ptr, size, cudaCpuDeviceId);  // Before CPU access
```

### 2. Provide Memory Hints
```cpp
cudaMemAdvise(ptr, size, cudaMemAdviseSetReadMostly, device);  // Read-only data
cudaMemAdvise(ptr, size, cudaMemAdviseSetPreferredLocation, device);  // GPU-resident
```

### 3. Choose Wisely

| Scenario | Recommendation |
|----------|----------------|
| Simple prototyping | Basic unified memory |
| Performance critical | Prefetching + hints |
| Maximum control | Traditional explicit transfers |
| Large datasets | Oversubscription with chunked access |

---

## Exercises

### Exercise 1: Prefetch Optimization
Modify the unified memory example to add prefetching and measure the improvement.

### Exercise 2: Read-Mostly Pattern
Create an example with lookup table data using `cudaMemAdviseSetReadMostly`.

### Exercise 3: Multi-Kernel Pipeline
Build a 5-stage processing pipeline and optimize with proper prefetching.

In [None]:
# Cleanup generated files
# Only the binaries and .cu sources written by this notebook are removed; a bare
# *.cu glob would also delete unrelated CUDA sources in the working directory.
!rm -f traditional_memory unified_memory prefetch_demo memory_advise oversubscription um_pipeline traditional_memory.cu unified_memory.cu prefetch_demo.cu memory_advise.cu oversubscription.cu um_pipeline.cu

---

## Part 1: Unified Memory Basics

### What is Unified Memory?

```
Traditional CUDA:             Unified Memory:

┌──────────────┐              ┌──────────────┐
│   CPU        │              │   CPU        │
│   Memory     │              │              │
│   h_data     │              │              │
└──────┬───────┘              │              │
       │ cudaMemcpy()         │   Unified    │
       ↓                      │   Address    │
┌──────────────┐              │   Space      │
│   GPU        │              │              │
│   Memory     │              │   data       │ ← One pointer!
│   d_data     │              │              │
└──────────────┘              └──────────────┘

Two pointers,                 One pointer,
explicit copies               automatic migration
```

### 🔷 CUDA C++ Implementation (Primary)

```cpp
// unified_memory.cu - Unified memory basics
#include <stdio.h>
#include <cuda_runtime.h>

// Adds 1.0f in place to each of the first n elements of data.
// One thread per element; threads whose global 1D index is n or larger return.
__global__ void addKernel(float* data, int n) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= n) return;
    data[gid] += 1.0f;
}

// Runs the same add-one kernel twice to contrast the two memory models:
// first with separate host/device buffers and explicit cudaMemcpy calls, then
// with a single cudaMallocManaged buffer used by both CPU and GPU.
// Each variant lives in its own scope and prints element 0 of its result.
// Note: return codes of the CUDA calls are not checked in this listing.
int main() {
    int n = 1 << 20;  // 1M elements
    size_t size = n * sizeof(float);
    
    // ========== TRADITIONAL APPROACH ==========
    {
        // Two buffers: h_data on the host, d_data on the device
        float *h_data = (float*)malloc(size);
        float *d_data;
        cudaMalloc(&d_data, size);
        
        // Initialize on host
        for (int i = 0; i < n; i++) h_data[i] = i;
        
        // Copy to device
        cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice);
        
        // Launch kernel ((n+255)/256 blocks of 256 threads covers all n elements)
        addKernel<<<(n+255)/256, 256>>>(d_data, n);
        
        // Copy back
        cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost);
        
        printf("Traditional: h_data[0] = %f\n", h_data[0]);
        
        free(h_data);
        cudaFree(d_data);
    }
    
    // ========== UNIFIED MEMORY APPROACH ==========
    {
        float *data;
        cudaMallocManaged(&data, size);  // One allocation!
        
        // Initialize on host (no copy needed!)
        for (int i = 0; i < n; i++) data[i] = i;
        
        // Launch kernel (no copy needed!)
        addKernel<<<(n+255)/256, 256>>>(data, n);
        // Wait for the kernel before the host reads data below
        cudaDeviceSynchronize();
        
        // Use on host (no copy needed!)
        printf("Unified: data[0] = %f\n", data[0]);
        
        cudaFree(data);
    }
    
    return 0;
}
```

### 🔶 Python/Numba (Optional - Quick Testing)

In [None]:
# Python/Numba - Managed memory example

@cuda.jit
def add_one(data):
    """CUDA kernel: add 1.0 in place to every element of ``data``.

    Each thread handles the element at its absolute 1D grid position; threads
    positioned at or beyond ``data.size`` do nothing.
    """
    pos = cuda.grid(1)
    if pos >= data.size:
        return
    data[pos] += 1.0

# Using managed memory (simplified API)
def unified_memory_demo():
    """Run ``add_one`` over one million float32 values and print the first five.

    Despite the section title, this demo follows the standard explicit path:
    the host array is copied to the device with ``cuda.to_device``, the kernel
    is launched over it, and the result is copied back with ``copy_to_host``.
    Nothing is returned; the results and the expected values are printed.
    """
    num_elements = 1_000_000

    # Launch geometry: enough 256-thread blocks to cover every element.
    threads_per_block = 256
    num_blocks = (num_elements + threads_per_block - 1) // threads_per_block

    # Note: a managed allocation would go through cuda.managed_array
    # (if available); this is the standard approach, kept for comparison.
    values_host = np.arange(num_elements, dtype=np.float32)
    values_device = cuda.to_device(values_host)

    add_one[num_blocks, threads_per_block](values_device)
    cuda.synchronize()

    values_back = values_device.copy_to_host()
    print(f"First elements: {values_back[:5]}")
    print("Expected: [1. 2. 3. 4. 5.]")

unified_memory_demo()

---

## Part 2: Page Migration

In [None]:
def explain_page_migration():
    """Print a plain-text overview of unified-memory page migration.

    Covers how on-demand migration works, the page-fault flow, typical page
    sizes and the sources of migration overhead. Output goes to stdout;
    nothing is returned.
    """
    print("Unified Memory Page Migration")
    print("=" * 60)
    print()
    print("How it works:")
    print("  1. Memory allocated as 'managed' pages")
    print("  2. Pages migrate on demand (page fault)")
    print("  3. OS/driver handles migration transparently")
    print()
    print("Page fault flow:")
    # Arrows and the micro sign below are real Unicode characters; the previous
    # text contained their mis-decoded byte sequences.
    print("  GPU kernel accesses page → Page not on GPU")
    print("  → Page fault triggered → Migration from CPU to GPU")
    print("  → Kernel resumes with page now on GPU")
    print()
    print("Page sizes:")
    print("  CPU: 4 KB (standard) or 2 MB (huge pages)")
    print("  GPU: 64 KB (Pascal+) or 2 MB (large page mode)")
    print()
    print("Migration overhead:")
    print("  - Page fault handling: ~20-50 µs")
    print("  - Data transfer: depends on page size and PCIe/NVLink")
    print("  - Can be significant for random access patterns!")

explain_page_migration()

### 🔷 Prefetching to Avoid Page Faults

```cpp
// prefetch.cu - Prefetching for better performance
#include <stdio.h>
#include <cuda_runtime.h>

// Replaces each of the first n elements of data with its square root, in place.
// One thread per element; threads whose global 1D index is n or larger return.
__global__ void processKernel(float* data, int n) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= n) return;
    data[gid] = sqrtf(data[gid]);
}

int main() {
    int n = 1 << 24;  // 16M elements
    size_t size = n * sizeof(float);
    
    float *data;
    cudaMallocManaged(&data, size);
    
    // Initialize on CPU
    for (int i = 0; i < n; i++) data[i] = (float)i;
    
    int device;
    cudaGetDevice(&device);
    
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    
    // ===== WITHOUT PREFETCH =====
    cudaEventRecord(start);
    processKernel<<<(n+255)/256, 256>>>(data, n);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    
    float ms_no_prefetch;
    cudaEventElapsedTime(&ms_no_prefetch, start, stop);
    printf("Without prefetch: %.2f ms\n", ms_no_prefetch);
    
    // Reset data to CPU
    cudaMemPrefetchAsync(data, size, cudaCpuDeviceId);
    cudaDeviceSynchronize();
    for (int i = 0; i < n; i++) data[i] = (float)i;
    
    // ===== WITH PREFETCH =====
    cudaEventRecord(start);
    cudaMemPrefetchAsync(data, size, device);  // Prefetch to GPU
    processKernel<<<(n+255)/256, 256>>>(data, n);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    
    float ms_with_prefetch;
    cudaEventElapsedTime(&ms_with_prefetch, start, stop);
    printf("With prefetch: %.2f ms\n", ms_with_prefetch);
    
    printf("Speedup: %.2fx\n", ms_no_prefetch / ms_with_prefetch);
    
    cudaFree(data);
    return 0;
}
```

---

## Part 3: Memory Hints

In [None]:
def memory_hints():
    """Print a reference sheet for the three cudaMemAdvise hints.

    For every hint the sheet gives what it tells the driver, the effect it may
    have and a typical use, followed by one example call per hint. Output goes
    to stdout; nothing is returned.
    """
    # (hint name, what it says, effect, typical use)
    hint_table = (
        ("cudaMemAdviseSetReadMostly",
         "Data will be read, rarely written",
         "May duplicate to avoid migration",
         "Lookup tables, constant data"),
        ("cudaMemAdviseSetPreferredLocation",
         "Preferred location for data",
         "Tries to keep data at specified location",
         "Data primarily used by one processor"),
        ("cudaMemAdviseSetAccessedBy",
         "Which devices will access data",
         "Creates direct mapping if possible",
         "Multi-GPU scenarios"),
    )
    example_calls = (
        "cudaMemAdvise(ptr, size, cudaMemAdviseSetReadMostly, 0);",
        "cudaMemAdvise(ptr, size, cudaMemAdviseSetPreferredLocation, device);",
        "cudaMemAdvise(ptr, size, cudaMemAdviseSetAccessedBy, device);",
    )

    report = ["cudaMemAdvise Hints", "=" * 60, ""]
    for name, hint, effect, use in hint_table:
        report += [
            name,
            f"  - Hint: {hint}",
            f"  - Effect: {effect}",
            f"  - Use: {use}",
            "",
        ]
    report.append("Example usage:")
    report.extend(f"  {call}" for call in example_calls)

    # One print of the joined lines emits exactly one newline per line.
    print("\n".join(report))

memory_hints()

### 🔷 CUDA C++ Memory Advise

```cpp
// advise.cu - Using memory hints
#include <stdio.h>
#include <cuda_runtime.h>

// Gather kernel: for each of the first n positions, writes
// output[k] = table[indices[k]]. table and indices are only read.
// One thread per position; threads whose global 1D index is n or larger return.
__global__ void lookupKernel(const float* table, const int* indices,
                              float* output, int n) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= n) return;
    const int slot = indices[gid];
    output[gid] = table[slot];
}

// Lookup-table example that combines cudaMemAdvise hints with prefetching.
// Builds a 1M-entry table of square roots and 16M random indices in managed
// memory, marks the table read-mostly, sets the GPU as preferred location for
// indices and output, prefetches table and indices to the GPU, runs the gather
// kernel, prefetches the output back to the CPU and prints element 0 next to
// the value looked up on the host.
// Note: return codes of the CUDA calls are not checked in this listing.
int main() {
    int tableSize = 1 << 20;   // 1M lookup table
    int n = 1 << 24;           // 16M lookups
    
    float *table;
    int *indices;
    float *output;
    
    cudaMallocManaged(&table, tableSize * sizeof(float));
    cudaMallocManaged(&indices, n * sizeof(int));
    cudaMallocManaged(&output, n * sizeof(float));
    
    // Initialize
    for (int i = 0; i < tableSize; i++) table[i] = sqrtf(i);
    // rand() % tableSize keeps every index inside the table
    for (int i = 0; i < n; i++) indices[i] = rand() % tableSize;
    
    int device;
    cudaGetDevice(&device);
    
    // ===== APPLY HINTS =====
    
    // Table is read-only - can be duplicated
    // NOTE(review): the device argument is a literal 0 here while the other
    // advise calls pass `device` - presumably it is ignored for ReadMostly;
    // confirm against the cudaMemAdvise documentation.
    cudaMemAdvise(table, tableSize * sizeof(float),
                  cudaMemAdviseSetReadMostly, 0);
    
    // Indices and output should prefer GPU
    cudaMemAdvise(indices, n * sizeof(int),
                  cudaMemAdviseSetPreferredLocation, device);
    cudaMemAdvise(output, n * sizeof(float),
                  cudaMemAdviseSetPreferredLocation, device);
    
    // Prefetch to GPU (only the two inputs; output is not prefetched here)
    cudaMemPrefetchAsync(table, tableSize * sizeof(float), device);
    cudaMemPrefetchAsync(indices, n * sizeof(int), device);
    
    // Launch kernel
    lookupKernel<<<(n+255)/256, 256>>>(table, indices, output, n);
    cudaDeviceSynchronize();
    
    // Prefetch output back to CPU for verification
    cudaMemPrefetchAsync(output, n * sizeof(float), cudaCpuDeviceId);
    cudaDeviceSynchronize();
    
    // Expected value is recomputed on the host from table and indices
    printf("output[0] = %f (expected: %f)\n", 
           output[0], table[indices[0]]);
    
    cudaFree(table);
    cudaFree(indices);
    cudaFree(output);
    
    return 0;
}
```

---

## Part 4: When to Use Unified Memory

In [None]:
def unified_vs_explicit():
    """Compare unified vs explicit memory management."""
    print("Unified vs Explicit Memory")
    print("=" * 60)
    print()
    print("USE UNIFIED MEMORY WHEN:")
    print("  ‚úì Prototyping and development")
    print("  ‚úì Complex data structures (linked lists, trees)")
    print("  ‚úì Oversubscription (data larger than GPU memory)")
    print("  ‚úì Unclear access patterns")
    print("  ‚úì Porting CPU code quickly")
    print()
    print("USE EXPLICIT MEMORY WHEN:")
    print("  ‚úì Maximum performance critical")
    print("  ‚úì Predictable access patterns")
    print("  ‚úì Frequent CPU-GPU ping-pong")
    print("  ‚úì Fine-grained control needed")
    print("  ‚úì Overlapping compute and transfer")
    print()
    print("PERFORMANCE CONSIDERATIONS:")
    print("  - Page faults have ~20-50 ¬µs overhead each")
    print("  - First access triggers migration")
    print("  - Random access patterns = many page faults")
    print("  - Prefetching mitigates most overhead")
    print("  - With proper hints, ~95%+ of explicit performance")

unified_vs_explicit()

---

## Part 5: Oversubscription

In [None]:
def explain_oversubscription():
    """Print a short explainer on GPU memory oversubscription.

    The text contrasts traditional CUDA allocation with unified memory,
    walks through an 8 GB / 32 GB example, and lists caveats. Output goes
    to stdout; nothing is returned.
    """
    sections = (
        ("Traditional CUDA:", (
            "  GPU memory = hard limit",
            "  cudaMalloc fails if not enough memory",
        )),
        ("Unified Memory (Pascal+):", (
            "  Can allocate more than GPU memory!",
            "  Pages migrate as needed",
            "  Works like virtual memory",
        )),
        ("Example:", (
            "  GPU has 8 GB memory",
            "  Allocate 32 GB with cudaMallocManaged",
            "  Process 8 GB at a time on GPU",
            "  Pages swap automatically",
        )),
        ("Caveats:", (
            "  - Performance degrades with thrashing",
            "  - Need good access locality",
            "  - Consider prefetch hints",
        )),
    )

    report = ["Memory Oversubscription", "=" * 60]
    for heading, body in sections:
        # A blank line separates each section from whatever precedes it.
        report.append("")
        report.append(heading)
        report.extend(body)

    print("\n".join(report))

explain_oversubscription()

### 🔷 CUDA C++ Oversubscription Example

```cpp
// oversubscription.cu - Using more memory than GPU has
#include <stdio.h>
#include <cuda_runtime.h>

// Replace every element of one chunk of `data` with its square root, in place.
//   data       - base pointer of the whole array
//   start      - index of the chunk's first element within `data`
//   chunk_size - number of elements in the chunk
// Each thread handles at most one element; threads whose index is at or past
// chunk_size do nothing.
//
// start and chunk_size are size_t rather than int because the array is sized
// relative to GPU memory: twice an 8 GiB card is 2^32 floats, so element
// offsets beyond INT_MAX are expected and an int parameter would wrap.
__global__ void processChunk(float* data, size_t start, size_t chunk_size) {
    // Widen before multiplying; blockIdx.x * blockDim.x alone is evaluated
    // in 32-bit unsigned arithmetic.
    size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < chunk_size) {
        data[start + idx] = sqrtf(data[start + idx]);
    }
}

int main() {
    // Query GPU memory
    size_t free_mem, total_mem;
    cudaMemGetInfo(&free_mem, &total_mem);
    printf("GPU Memory: %.1f GB free, %.1f GB total\n",
           free_mem / 1e9, total_mem / 1e9);
    
    // Allocate MORE than GPU memory
    size_t n = total_mem / sizeof(float) * 2;  // 2x GPU memory
    size_t size = n * sizeof(float);
    printf("Allocating %.1f GB (2x GPU memory)\n", size / 1e9);
    
    float *data;
    cudaError_t err = cudaMallocManaged(&data, size);
    if (err != cudaSuccess) {
        printf("Allocation failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    
    // Initialize on CPU (pages stay on CPU)
    for (size_t i = 0; i < n; i++) {
        data[i] = (float)(i % 1000);
    }
    
    // Process in chunks to demonstrate oversubscription
    int device;
    cudaGetDevice(&device);
    
    size_t chunk_size = n / 4;  // Process 1/4 at a time
    
    for (int chunk = 0; chunk < 4; chunk++) {
        size_t start = chunk * chunk_size;
        
        // Prefetch this chunk to GPU
        cudaMemPrefetchAsync(&data[start], chunk_size * sizeof(float), device);
        
        // Process chunk
        processChunk<<<(chunk_size+255)/256, 256>>>(data, start, chunk_size);
        
        printf("Processed chunk %d\n", chunk);
    }
    
    cudaDeviceSynchronize();
    
    // Prefetch result back to CPU
    cudaMemPrefetchAsync(data, size, cudaCpuDeviceId);
    cudaDeviceSynchronize();
    
    printf("data[0] = %f (expected sqrt(0) = 0)\n", data[0]);
    printf("data[1] = %f (expected sqrt(1) = 1)\n", data[1]);
    
    cudaFree(data);
    return 0;
}
```

---

## Exercises

### Exercise 1: Compare Performance

In [None]:
# TODO: Compare unified memory (with/without prefetch) vs explicit memory

@cuda.jit
def compute_kernel(data, result):
    """Element-wise kernel: result[i] = data[i] * data[i] + data[i].

    Each thread handles the element at its 1-D grid index; threads whose
    index is at or past result.size do nothing. Only result.size is checked,
    so data is assumed to hold at least that many elements.
    """
    tid = cuda.grid(1)
    if tid >= result.size:
        return
    value = data[tid]
    result[tid] = value * value + value

def benchmark_memory_approaches(n=10_000_000):
    """Exercise stub for timing several memory-management strategies.

    Args:
        n: Problem size (presumably the number of array elements to
            process); defaults to 10,000,000. Currently unused.

    Returns:
        None. The body is intentionally left for the reader to implement.
    """
    # TODO: Implement benchmarks for:
    #   1. Explicit memory with cudaMemcpy
    #   2. Unified memory without prefetch
    #   3. Unified memory with prefetch
    return None

---

## Summary

### Unified Memory API

```cpp
// Allocation
cudaMallocManaged(&ptr, size);

// Prefetching
cudaMemPrefetchAsync(ptr, size, device);     // To GPU
cudaMemPrefetchAsync(ptr, size, cudaCpuDeviceId); // To CPU

// Hints
cudaMemAdvise(ptr, size, cudaMemAdviseSetReadMostly, device);
cudaMemAdvise(ptr, size, cudaMemAdviseSetPreferredLocation, device);
cudaMemAdvise(ptr, size, cudaMemAdviseSetAccessedBy, device);
```

### Decision Guide

| Scenario | Recommendation |
|----------|----------------|
| Prototyping | Unified memory |
| Complex data structures | Unified memory |
| Maximum performance | Explicit + overlapping |
| Data > GPU memory | Unified + prefetch |
| Production code | Unified + hints (or explicit) |

### Week 7 Complete!
Next week: Profiling & Analysis with Nsight tools.