In [None]:
# ⚙️ Setup
import subprocess, sys
# Install Numba only when running inside Google Colab; elsewhere this is a no-op.
try:
    # The import succeeds only where the google.colab package is importable.
    import google.colab
    # Quiet install using the pip that belongs to the running interpreter.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "numba"])
except ImportError:
    # google.colab is not importable: skip the install step entirely.
    pass

import numpy as np
from numba import cuda
import time

# Remind the reader that the CUDA C++ listings are the primary material.
print("⚠️  CUDA C++ is PRIMARY. Python/Numba for quick testing only.")

# Report the active GPU, or say explicitly that none was found instead of
# staying silent (the Numba cells further down need a CUDA device).
if cuda.is_available():
    gpu_name = cuda.get_current_device().name
    # Some Numba versions expose the device name as bytes; decode it so the
    # output reads "GPU: Tesla T4" rather than "GPU: b'Tesla T4'".
    if isinstance(gpu_name, bytes):
        gpu_name = gpu_name.decode()
    print(f"GPU: {gpu_name}")
else:
    print("No CUDA-capable GPU detected; the Numba cells below will not run.")

---

## Part 1: Pinned Memory

### Why Pinned Memory?

```
Regular (Pageable) Memory:
━━━━━━━━━━━━━━━━━━━━━━━━━━━
┌──────────┐     ┌──────────┐     ┌──────────┐
│   Host   │ --> │  Pinned  │ --> │  Device  │
│ (paged)  │     │ (staging)│     │ (GPU)    │
└──────────┘     └──────────┘     └──────────┘
                  Hidden copy!

Pinned (Page-Locked) Memory:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
┌──────────┐ ────────────────> ┌──────────┐
│  Pinned  │   Direct DMA!     │  Device  │
│  (host)  │                   │  (GPU)   │
└──────────┘                   └──────────┘

Benefits:
• Faster transfers (no staging buffer)
• Required for truly asynchronous copies (cudaMemcpyAsync on pageable memory is staged and may block the host)
• Enables overlap with compute

Drawbacks:
• Limited resource (can't swap to disk)
• Reduces memory for other apps
• Slower to allocate
```

### CUDA C++ Pinned Memory (Primary)

```cpp
// pinned_memory.cu - Pinned memory allocation and transfers
//
// Copies the same amount of data host-to-device twice, once from pageable
// (malloc) memory and once from pinned (cudaMallocHost) memory, and prints
// the elapsed time and effective bandwidth of each copy.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Abort with file/line context when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Time one blocking host-to-device copy of `bytes` bytes using the given
// (already created) events. Returns the elapsed time in milliseconds.
static float timeH2D(float* d_dst, const float* h_src, size_t bytes,
                     cudaEvent_t start, cudaEvent_t stop) {
    float ms = 0.0f;
    CUDA_CHECK(cudaEventRecord(start));
    CUDA_CHECK(cudaMemcpy(d_dst, h_src, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));
    return ms;
}

int main() {
    const size_t SIZE = 1 << 26;                // 64M floats
    const size_t BYTES = SIZE * sizeof(float);  // 256 MB

    float *h_pageable, *h_pinned;
    float *d_data;

    // ============================================
    // Allocate Host Memory
    // ============================================

    // Regular pageable memory
    h_pageable = (float*)malloc(BYTES);
    if (h_pageable == NULL) {
        fprintf(stderr, "malloc of %zu bytes failed\n", BYTES);
        return EXIT_FAILURE;
    }

    // Pinned (page-locked) memory
    CUDA_CHECK(cudaMallocHost(&h_pinned, BYTES));  // Or cudaHostAlloc

    // Alternative with flags:
    // cudaHostAlloc(&h_pinned, BYTES, cudaHostAllocDefault);
    // Flags: cudaHostAllocPortable, cudaHostAllocMapped, cudaHostAllocWriteCombined

    CUDA_CHECK(cudaMalloc(&d_data, BYTES));

    // Initialize both host buffers with the same contents
    for (size_t i = 0; i < SIZE; i++) {
        h_pageable[i] = 1.0f;
        h_pinned[i] = 1.0f;
    }

    // ============================================
    // Compare Transfer Speeds
    // ============================================
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    float ms;

    // Bandwidth: bytes / ms / 1e6 == (bytes / 1e9) / (ms / 1e3) == GB/s

    // Pageable transfer (synchronous only!)
    ms = timeH2D(d_data, h_pageable, BYTES, start, stop);
    printf("Pageable H2D: %.2f ms (%.2f GB/s)\n",
           ms, BYTES / ms / 1e6);

    // Pinned transfer (can be async!)
    ms = timeH2D(d_data, h_pinned, BYTES, start, stop);
    printf("Pinned H2D:   %.2f ms (%.2f GB/s)\n",
           ms, BYTES / ms / 1e6);

    // Cleanup
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    free(h_pageable);
    CUDA_CHECK(cudaFreeHost(h_pinned));  // Must use cudaFreeHost!
    CUDA_CHECK(cudaFree(d_data));

    return 0;
}
```

---

## Part 2: Async Memory Copies

### CUDA C++ cudaMemcpyAsync (Primary)

```cpp
// async_memcpy.cu - Asynchronous memory transfers
//
// Runs the same H2D copy -> kernel -> D2H copy pipeline twice: once with
// blocking cudaMemcpy on the default stream and once with cudaMemcpyAsync on
// a user-created stream, and prints the GPU time of each.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Abort with file/line context when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Applies 100 iterations of val = sqrtf(val * val + 1) to each of the n
// elements of `data`, in place. Uses a grid-stride loop over a 1D launch,
// so any grid/block size covers all n elements. No shared memory is used.
__global__ void processKernel(float* data, int n) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (int i = tid; i < n; i += stride) {
        // Simulate heavy computation
        float val = data[i];
        for (int j = 0; j < 100; j++) {
            val = sqrtf(val * val + 1.0f);
        }
        data[i] = val;
    }
}

int main() {
    const int N = 1 << 22;  // 4M elements
    const size_t BYTES = N * sizeof(float);
    const int BLOCKS = 256;
    const int THREADS = 256;

    // Pinned host memory: needed for cudaMemcpyAsync to be truly asynchronous
    float *h_data;
    CUDA_CHECK(cudaMallocHost(&h_data, BYTES));

    float *d_data;
    CUDA_CHECK(cudaMalloc(&d_data, BYTES));

    // Initialize
    for (int i = 0; i < N; i++) h_data[i] = 1.0f;

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // ============================================
    // Warm-up (not timed)
    // ============================================
    // Run the pipeline's copy + kernel once so one-time start-up cost of the
    // first launch is not charged to whichever version is timed first.
    CUDA_CHECK(cudaMemcpy(d_data, h_data, BYTES, cudaMemcpyHostToDevice));
    processKernel<<<BLOCKS, THREADS>>>(d_data, N);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // ============================================
    // Synchronous (Blocking)
    // ============================================
    CUDA_CHECK(cudaEventRecord(start));

    // The two cudaMemcpy calls block the host until the copy is done; the
    // kernel launch itself returns immediately, but the following cudaMemcpy
    // waits for the kernel to finish.
    CUDA_CHECK(cudaMemcpy(d_data, h_data, BYTES, cudaMemcpyHostToDevice));
    processKernel<<<BLOCKS, THREADS>>>(d_data, N);
    CUDA_CHECK(cudaGetLastError());  // catches bad launch configurations
    CUDA_CHECK(cudaMemcpy(h_data, d_data, BYTES, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));
    float syncTime;
    CUDA_CHECK(cudaEventElapsedTime(&syncTime, start, stop));

    // ============================================
    // Asynchronous (Non-Blocking)
    // ============================================
    // Record the timing events on the same stream as the work, so the
    // measurement does not depend on default-stream synchronization rules.
    CUDA_CHECK(cudaEventRecord(start, stream));

    // These return immediately to CPU:
    CUDA_CHECK(cudaMemcpyAsync(d_data, h_data, BYTES,
                               cudaMemcpyHostToDevice, stream));
    processKernel<<<BLOCKS, THREADS, 0, stream>>>(d_data, N);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpyAsync(h_data, d_data, BYTES,
                               cudaMemcpyDeviceToHost, stream));

    CUDA_CHECK(cudaEventRecord(stop, stream));

    // Can do CPU work here while GPU is busy!

    CUDA_CHECK(cudaStreamSynchronize(stream));  // Wait when needed

    float asyncTime;
    CUDA_CHECK(cudaEventElapsedTime(&asyncTime, start, stop));

    printf("Sync time:  %.2f ms\n", syncTime);
    printf("Async time: %.2f ms\n", asyncTime);

    // Cleanup
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaStreamDestroy(stream));
    CUDA_CHECK(cudaFreeHost(h_data));
    CUDA_CHECK(cudaFree(d_data));

    return 0;
}
```

---

## Part 3: The Overlap Pattern

### Chunked Processing for Overlap

```
Without Chunking (No Overlap):
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
H2D:    ████████████████████████████████████████
Compute:                                        ████████████████████
D2H:                                                                ████████████

With Chunking (Overlapped):
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Stream0: [H2D][Compute][D2H]
Stream1:      [H2D][Compute][D2H]
Stream2:           [H2D][Compute][D2H]
Stream3:                [H2D][Compute][D2H]

Timeline:  ████████████████████████████████████
           (H2D, Compute, and D2H overlap!)
```

### CUDA C++ Overlap Implementation (Primary)

```cpp
// overlap_pattern.cu - Overlapping transfers with computation
//
// Splits one array into NUM_STREAMS chunks and processes them twice: first
// with blocking copies on the default stream (baseline), then with async
// copies and launches on one stream per chunk. Prints both times and the
// resulting speedup.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Abort with file/line context when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Applies 500 iterations of val = sinf(val) * cosf(val) + 1 to each of the
// n elements of `data`, in place. Expects a 1D launch with at least n
// threads in total (one element per thread); extra threads do nothing.
__global__ void process(float* data, int n) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n) {
        float val = data[tid];
        for (int i = 0; i < 500; i++) {
            val = sinf(val) * cosf(val) + 1.0f;
        }
        data[tid] = val;
    }
}

int main() {
    const int N = 1 << 24;  // 16M elements
    const int NUM_STREAMS = 4;
    const int CHUNK_SIZE = N / NUM_STREAMS;
    const size_t BYTES = N * sizeof(float);
    const size_t CHUNK_BYTES = CHUNK_SIZE * sizeof(float);
    const int THREADS = 256;
    const int BLOCKS = (CHUNK_SIZE + THREADS - 1) / THREADS;  // ceil-div

    // The chunking below would silently drop the tail elements otherwise.
    static_assert(N % NUM_STREAMS == 0, "N must be divisible by NUM_STREAMS");

    // Pinned host memory
    float *h_input, *h_output;
    CUDA_CHECK(cudaMallocHost(&h_input, BYTES));
    CUDA_CHECK(cudaMallocHost(&h_output, BYTES));

    // Device memory (separate for each stream for true overlap)
    float* d_data[NUM_STREAMS];
    for (int i = 0; i < NUM_STREAMS; i++) {
        CUDA_CHECK(cudaMalloc(&d_data[i], CHUNK_BYTES));
    }

    // Create streams
    cudaStream_t streams[NUM_STREAMS];
    for (int i = 0; i < NUM_STREAMS; i++) {
        CUDA_CHECK(cudaStreamCreate(&streams[i]));
    }

    // Initialize input
    for (int i = 0; i < N; i++) h_input[i] = 1.0f;

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // ============================================
    // Warm-up (not timed)
    // ============================================
    // Process one chunk once so one-time start-up cost of the first launch
    // is not charged to the baseline, which would inflate the speedup.
    CUDA_CHECK(cudaMemcpy(d_data[0], h_input, CHUNK_BYTES,
                          cudaMemcpyHostToDevice));
    process<<<BLOCKS, THREADS>>>(d_data[0], CHUNK_SIZE);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // ============================================
    // VERSION 1: No Overlap (Baseline)
    // ============================================
    CUDA_CHECK(cudaEventRecord(start));

    for (int i = 0; i < NUM_STREAMS; i++) {
        int offset = i * CHUNK_SIZE;

        // All in default stream - sequential!
        CUDA_CHECK(cudaMemcpy(d_data[i], h_input + offset, CHUNK_BYTES,
                              cudaMemcpyHostToDevice));
        process<<<BLOCKS, THREADS>>>(d_data[i], CHUNK_SIZE);
        CUDA_CHECK(cudaGetLastError());  // catches bad launch configurations
        CUDA_CHECK(cudaMemcpy(h_output + offset, d_data[i], CHUNK_BYTES,
                              cudaMemcpyDeviceToHost));
    }

    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));
    float noOverlapTime;
    CUDA_CHECK(cudaEventElapsedTime(&noOverlapTime, start, stop));

    // ============================================
    // VERSION 2: With Overlap
    // ============================================
    // NOTE: start/stop are recorded on the default stream. This brackets the
    // work in streams[] because cudaStreamCreate makes streams that
    // synchronize with the legacy default stream; building with
    // --default-stream per-thread would change that.
    CUDA_CHECK(cudaEventRecord(start));

    // Issue all operations for all streams
    for (int i = 0; i < NUM_STREAMS; i++) {
        int offset = i * CHUNK_SIZE;

        CUDA_CHECK(cudaMemcpyAsync(d_data[i], h_input + offset, CHUNK_BYTES,
                                   cudaMemcpyHostToDevice, streams[i]));

        process<<<BLOCKS, THREADS, 0, streams[i]>>>(
            d_data[i], CHUNK_SIZE);
        CUDA_CHECK(cudaGetLastError());

        CUDA_CHECK(cudaMemcpyAsync(h_output + offset, d_data[i], CHUNK_BYTES,
                                   cudaMemcpyDeviceToHost, streams[i]));
    }

    // Synchronize all streams
    for (int i = 0; i < NUM_STREAMS; i++) {
        CUDA_CHECK(cudaStreamSynchronize(streams[i]));
    }

    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));
    float overlapTime;
    CUDA_CHECK(cudaEventElapsedTime(&overlapTime, start, stop));

    printf("Without overlap: %.2f ms\n", noOverlapTime);
    printf("With overlap:    %.2f ms\n", overlapTime);
    printf("Speedup:         %.2fx\n", noOverlapTime / overlapTime);

    // Cleanup
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    for (int i = 0; i < NUM_STREAMS; i++) {
        CUDA_CHECK(cudaStreamDestroy(streams[i]));
        CUDA_CHECK(cudaFree(d_data[i]));
    }
    CUDA_CHECK(cudaFreeHost(h_input));
    CUDA_CHECK(cudaFreeHost(h_output));

    return 0;
}
```

In [None]:
# Python/Numba Async Transfer Demo (OPTIONAL)

@cuda.jit
def process(data):
    """Update each element of the 1D array ``data`` in place.

    One thread handles one element: it applies ``x = x * 1.001 + 0.001``
    one hundred times and writes the result back. Threads whose global
    index falls past the end of ``data`` exit without touching memory.
    """
    idx = cuda.grid(1)
    # Guard: the launch may contain more threads than there are elements.
    if idx >= data.shape[0]:
        return

    acc = data[idx]
    for _step in range(100):
        acc = acc * 1.001 + 0.001
    data[idx] = acc

# Problem size and chunking: one chunk per stream.
n = 1 << 22
num_streams = 4
chunk = n // num_streams

# 1D launch geometry covering one chunk (ceil-div so every element is hit).
threads_per_block = 256
blocks_per_grid = (chunk + threads_per_block - 1) // threads_per_block

# Create streams
streams = [cuda.stream() for _ in range(num_streams)]

# Pinned host arrays
h_input = cuda.pinned_array(n, dtype=np.float32)
h_output = cuda.pinned_array(n, dtype=np.float32)
h_input[:] = 1.0

# Device arrays (one per stream)
d_chunks = [cuda.device_array(chunk, dtype=np.float32) for _ in range(num_streams)]

# Warm-up (not timed): Numba compiles `process` on its first launch, which
# would otherwise dominate the measurement below. The first chunk is copied
# in beforehand so the kernel reads initialized memory; the timed loop
# overwrites d_chunks[0] again before using it.
d_chunks[0].copy_to_device(h_input[:chunk], stream=streams[0])
process[blocks_per_grid, threads_per_block, streams[0]](d_chunks[0])
streams[0].synchronize()

# Launch overlapped work
start = time.perf_counter()  # monotonic, high-resolution wall clock

for i in range(num_streams):
    offset = i * chunk

    # Async copy H2D
    d_chunks[i].copy_to_device(h_input[offset:offset+chunk], stream=streams[i])

    # Kernel
    process[blocks_per_grid, threads_per_block, streams[i]](d_chunks[i])

    # Async copy D2H
    d_chunks[i].copy_to_host(h_output[offset:offset+chunk], stream=streams[i])

# Sync all: the host timer is only meaningful once every stream has drained.
for s in streams:
    s.synchronize()

elapsed = time.perf_counter() - start
print(f"Overlapped time: {elapsed*1000:.2f} ms")

---

## Part 4: Profiling Overlap with Nsight Systems

```bash
# Profile to see overlap
nsys profile -o overlap_timeline ./overlap_pattern

# View in Nsight Systems GUI
nsys-ui overlap_timeline.nsys-rep
```

### What to Look For

```
Nsight Systems Timeline:
━━━━━━━━━━━━━━━━━━━━━━━━
                    Without Overlap          With Overlap
                    ─────────────────        ─────────────────
Copy Engine H2D:    ████░░░░░░░░░░░░░        ████░░░░
Compute:            ░░░░████░░░░░░░░░        ░░██████████
Copy Engine D2H:    ░░░░░░░░████░░░░░        ░░░░░░░░████
                    ↑ Sequential!            ↑ Overlapped!

Look for:
• Parallel bars in different rows = overlap achieved
• Gaps between operations = potential improvement
• Copy/compute at same time = good utilization
```

---

## Exercises

### Exercise 1: Pinned Memory Benchmark
Compare transfer speeds for various sizes with pageable vs pinned memory.

### Exercise 2: Find Optimal Chunk Count
Test with 2, 4, 8, 16 streams and find the sweet spot for your GPU.

### Exercise 3: Double Buffering
```cpp
// Implement double buffering:
// - While processing chunk N, transfer chunk N+1
// - Use only 2 device buffers, alternate between them
```

---

## Key Takeaways

```
┌─────────────────────────────────────────────────────────┐
│              OVERLAPPING TRANSFERS                      │
├─────────────────────────────────────────────────────────┤
│                                                         │
│  Pinned Memory:                                         │
│  • cudaMallocHost() / cudaFreeHost()                    │
│  • Needed for truly async transfers                     │
│  • Faster than pageable (no staging)                    │
│                                                         │
│  Async Copies:                                          │
│  • cudaMemcpyAsync(dst, src, size, kind, stream)        │
│  • Returns immediately to CPU                           │
│  • Operations in same stream execute in order           │
│                                                         │
│  Overlap Pattern:                                       │
│  • Split data into chunks                               │
│  • Each chunk in different stream                       │
│  • H2D, compute, D2H can all overlap                    │
│                                                         │
└─────────────────────────────────────────────────────────┘
```

## Next: Day 3 - Multi-Stream Execution