In [None]:
# ⚙️ Setup
import subprocess, sys
try:
    # Importing google.colab only succeeds inside the Colab runtime, so it
    # doubles as an environment check.
    import google.colab
    # On Colab, install Numba into the interpreter that runs this kernel.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "numba"])
except ImportError:
    # Not on Colab: deliberately skip the install and rely on the local environment.
    pass

import numpy as np
from numba import cuda
import time

print("⚠️  CUDA C++ is PRIMARY. Python/Numba for quick testing only.")
if cuda.is_available():
    gpu_name = cuda.get_current_device().name
    # Depending on the driver binding in use, Numba may hand the name back as
    # bytes; decode it so the output reads "Tesla T4" rather than b'Tesla T4'.
    if isinstance(gpu_name, bytes):
        gpu_name = gpu_name.decode("utf-8", errors="replace")
    print(f"GPU: {gpu_name}")
else:
    # Say so explicitly instead of staying silent: the Numba cells further down
    # cannot run without a CUDA device.
    print("No CUDA-capable GPU detected - the Python/Numba cells below will not run.")

---

## Part 1: Pinned Memory

### Why Pinned Memory?

```
Regular (Pageable) Memory:
━━━━━━━━━━━━━━━━━━━━━━━━━━━
┌──────────┐     ┌──────────┐     ┌──────────┐
│   Host   │ --> │  Pinned  │ --> │  Device  │
│ (paged)  │     │ (staging)│     │ (GPU)    │
└──────────┘     └──────────┘     └──────────┘
                  Hidden copy!

Pinned (Page-Locked) Memory:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
┌──────────┐ ────────────────> ┌──────────┐
│  Pinned  │   Direct DMA!     │  Device  │
│  (host)  │                   │  (GPU)   │
└──────────┘                   └──────────┘

Benefits:
• Faster transfers (no staging buffer)
• Required for truly async copies (copies from pageable memory are staged first)
• Enables overlap with compute

Drawbacks:
• Limited resource (can't swap to disk)
• Reduces memory for other apps
• Slower to allocate
```

### 🔷 CUDA C++ Implementation (Primary)

In [None]:
%%writefile pinned_memory.cu
// pinned_memory.cu - Pinned memory allocation and transfers
#include <stdio.h>
#include <cuda_runtime.h>

// Report a failed CUDA runtime call on stderr and leave main() with status 1.
// Only usable inside main(), because it expands to `return 1;`.
#define CUDA_TRY(call)                                                  \
    do {                                                                \
        cudaError_t status_ = (call);                                   \
        if (status_ != cudaSuccess) {                                   \
            fprintf(stderr, "CUDA error: %s (%s:%d)\n",                 \
                    cudaGetErrorString(status_), __FILE__, __LINE__);   \
            return 1;                                                   \
        }                                                               \
    } while (0)

// Times one host-to-device copy of the same payload, first from pageable and
// then from pinned host memory, and prints elapsed time and bandwidth for each.
// Returns 0 on success and 1 if an allocation or CUDA call fails (on that path
// the process exits immediately, so outstanding buffers are not released).
int main() {
    const size_t SIZE = 1 << 26;                // 64M float elements
    const size_t BYTES = SIZE * sizeof(float);  // 256 MB per buffer

    float *h_pageable = NULL, *h_pinned = NULL;
    float *d_data = NULL;

    // ============================================
    // Allocate Host Memory
    // ============================================

    // Regular pageable memory
    h_pageable = (float*)malloc(BYTES);
    if (h_pageable == NULL) {
        fprintf(stderr, "malloc of %zu bytes failed\n", BYTES);
        return 1;
    }

    // Pinned (page-locked) memory
    CUDA_TRY(cudaMallocHost(&h_pinned, BYTES));  // Or cudaHostAlloc

    // Alternative with flags:
    // cudaHostAlloc(&h_pinned, BYTES, cudaHostAllocDefault);
    // Flags: cudaHostAllocPortable, cudaHostAllocMapped, cudaHostAllocWriteCombined

    CUDA_TRY(cudaMalloc(&d_data, BYTES));

    // Initialize both host buffers with the same contents
    for (size_t i = 0; i < SIZE; i++) {
        h_pageable[i] = 1.0f;
        h_pinned[i] = 1.0f;
    }

    // ============================================
    // Compare Transfer Speeds
    // ============================================
    cudaEvent_t start, stop;
    CUDA_TRY(cudaEventCreate(&start));
    CUDA_TRY(cudaEventCreate(&stop));
    float ms = 0.0f;

    // Pageable transfer (synchronous only!)
    CUDA_TRY(cudaEventRecord(start));
    CUDA_TRY(cudaMemcpy(d_data, h_pageable, BYTES, cudaMemcpyHostToDevice));
    CUDA_TRY(cudaEventRecord(stop));
    CUDA_TRY(cudaEventSynchronize(stop));
    CUDA_TRY(cudaEventElapsedTime(&ms, start, stop));
    // bytes / ms / 1e6 == (bytes / s) / 1e9 == GB/s
    printf("Pageable H2D: %.2f ms (%.2f GB/s)\n",
           ms, BYTES / ms / 1e6);

    // Pinned transfer (can be async!)
    CUDA_TRY(cudaEventRecord(start));
    CUDA_TRY(cudaMemcpy(d_data, h_pinned, BYTES, cudaMemcpyHostToDevice));
    CUDA_TRY(cudaEventRecord(stop));
    CUDA_TRY(cudaEventSynchronize(stop));
    CUDA_TRY(cudaEventElapsedTime(&ms, start, stop));
    printf("Pinned H2D:   %.2f ms (%.2f GB/s)\n",
           ms, BYTES / ms / 1e6);

    // Cleanup
    CUDA_TRY(cudaEventDestroy(start));
    CUDA_TRY(cudaEventDestroy(stop));
    free(h_pageable);
    CUDA_TRY(cudaFreeHost(h_pinned));  // Must use cudaFreeHost!
    CUDA_TRY(cudaFree(d_data));

    return 0;
}

In [None]:
# Compile pinned_memory.cu with nvcc targeting sm_75 (compute capability 7.5),
# then run the resulting binary. Adjust -arch if your GPU differs.
!nvcc -arch=sm_75 -o pinned_memory pinned_memory.cu
!./pinned_memory

---

## Part 2: Async Memory Copies

### 🔷 CUDA C++ Implementation (Primary)

In [None]:
%%writefile async_memcpy.cu
// async_memcpy.cu - Asynchronous memory transfers
#include <stdio.h>
#include <cuda_runtime.h>

// Applies 100 rounds of val = sqrtf(val * val + 1.0f) to each element of
// data[0..n). Every thread starts at its global index and advances by the
// total number of launched threads (grid-stride loop), so all n elements are
// covered regardless of the launch size.
__global__ void processKernel(float* data, int n) {
    const int firstIdx = blockIdx.x * blockDim.x + threadIdx.x;
    const int totalThreads = blockDim.x * gridDim.x;

    int idx = firstIdx;
    while (idx < n) {
        // Deliberately heavy arithmetic so the kernel takes measurable time
        float acc = data[idx];
        int round = 0;
        while (round < 100) {
            acc = sqrtf(acc * acc + 1.0f);
            ++round;
        }
        data[idx] = acc;
        idx += totalThreads;
    }
}

// Report a failed CUDA runtime call on stderr and leave main() with status 1.
// Only usable inside main(), because it expands to `return 1;`.
#define CUDA_TRY(call)                                                  \
    do {                                                                \
        cudaError_t status_ = (call);                                   \
        if (status_ != cudaSuccess) {                                   \
            fprintf(stderr, "CUDA error: %s (%s:%d)\n",                 \
                    cudaGetErrorString(status_), __FILE__, __LINE__);   \
            return 1;                                                   \
        }                                                               \
    } while (0)

// Runs the same H2D copy -> kernel -> D2H copy pipeline twice, once with
// blocking cudaMemcpy on the default stream and once with cudaMemcpyAsync on a
// user-created stream, and prints the elapsed time of each variant.
// Returns 0 on success and 1 if any CUDA call fails.
int main() {
    const int N = 1 << 22;  // 4M elements
    const size_t BYTES = N * sizeof(float);

    // MUST use pinned memory for async!
    float *h_data = NULL;
    CUDA_TRY(cudaMallocHost(&h_data, BYTES));

    float *d_data = NULL;
    CUDA_TRY(cudaMalloc(&d_data, BYTES));

    // Initialize
    for (int i = 0; i < N; i++) h_data[i] = 1.0f;

    cudaStream_t stream;
    CUDA_TRY(cudaStreamCreate(&stream));

    // ============================================
    // Synchronous (Blocking)
    // ============================================
    cudaEvent_t start, stop;
    CUDA_TRY(cudaEventCreate(&start));
    CUDA_TRY(cudaEventCreate(&stop));

    CUDA_TRY(cudaEventRecord(start));

    // The two cudaMemcpy calls block the host until the copy is done. The
    // kernel launch itself returns immediately, but the following D2H copy
    // waits for the kernel to finish.
    CUDA_TRY(cudaMemcpy(d_data, h_data, BYTES, cudaMemcpyHostToDevice));
    processKernel<<<256, 256>>>(d_data, N);
    CUDA_TRY(cudaGetLastError());  // catches an invalid launch configuration
    CUDA_TRY(cudaMemcpy(h_data, d_data, BYTES, cudaMemcpyDeviceToHost));

    CUDA_TRY(cudaEventRecord(stop));
    CUDA_TRY(cudaEventSynchronize(stop));
    float syncTime = 0.0f;
    CUDA_TRY(cudaEventElapsedTime(&syncTime, start, stop));

    // The blocking run overwrote h_data with its results; restore the input so
    // the async run processes exactly the same data.
    for (int i = 0; i < N; i++) h_data[i] = 1.0f;

    // ============================================
    // Asynchronous (Non-Blocking)
    // ============================================
    CUDA_TRY(cudaEventRecord(start));

    // These return immediately to CPU:
    CUDA_TRY(cudaMemcpyAsync(d_data, h_data, BYTES,
                             cudaMemcpyHostToDevice, stream));
    processKernel<<<256, 256, 0, stream>>>(d_data, N);
    CUDA_TRY(cudaGetLastError());
    CUDA_TRY(cudaMemcpyAsync(h_data, d_data, BYTES,
                             cudaMemcpyDeviceToHost, stream));

    // Can do CPU work here while GPU is busy!

    CUDA_TRY(cudaStreamSynchronize(stream));  // Wait when needed

    CUDA_TRY(cudaEventRecord(stop));
    CUDA_TRY(cudaEventSynchronize(stop));
    float asyncTime = 0.0f;
    CUDA_TRY(cudaEventElapsedTime(&asyncTime, start, stop));

    printf("Sync time:  %.2f ms\n", syncTime);
    printf("Async time: %.2f ms\n", asyncTime);

    // Cleanup (events were previously leaked)
    CUDA_TRY(cudaEventDestroy(start));
    CUDA_TRY(cudaEventDestroy(stop));
    CUDA_TRY(cudaStreamDestroy(stream));
    CUDA_TRY(cudaFreeHost(h_data));
    CUDA_TRY(cudaFree(d_data));

    return 0;
}

In [None]:
# Compile async_memcpy.cu with nvcc targeting sm_75 (compute capability 7.5),
# then run the resulting binary. Adjust -arch if your GPU differs.
!nvcc -arch=sm_75 -o async_memcpy async_memcpy.cu
!./async_memcpy

---

## Part 3: The Overlap Pattern

### Chunked Processing for Overlap

```
Without Chunking (No Overlap):
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
H2D:    ████████████████████████████████████████
Compute:                                        ████████████████████
D2H:                                                                ████████████

With Chunking (Overlapped):
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Stream0: [H2D][Compute][D2H]
Stream1:      [H2D][Compute][D2H]
Stream2:           [H2D][Compute][D2H]
Stream3:                [H2D][Compute][D2H]

Timeline:  ████████████████████████████████████
           (H2D, Compute, and D2H overlap!)
```

### 🔷 CUDA C++ Implementation (Primary)

In [None]:
%%writefile overlap_pattern.cu
// overlap_pattern.cu - Overlapping transfers with computation
#include <stdio.h>
#include <cuda_runtime.h>

// One thread per element: applies 500 rounds of
// val = sinf(val) * cosf(val) + 1.0f to data[idx] in place.
// Threads whose global index falls outside [0, n) do nothing.
__global__ void process(float* data, int n) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= n) {
        return;
    }

    float acc = data[idx];
    for (int round = 500; round > 0; --round) {
        acc = sinf(acc) * cosf(acc) + 1.0f;
    }
    data[idx] = acc;
}

// Report a failed CUDA runtime call on stderr and leave main() with status 1.
// Only usable inside main(), because it expands to `return 1;`.
#define CUDA_TRY(call)                                                  \
    do {                                                                \
        cudaError_t status_ = (call);                                   \
        if (status_ != cudaSuccess) {                                   \
            fprintf(stderr, "CUDA error: %s (%s:%d)\n",                 \
                    cudaGetErrorString(status_), __FILE__, __LINE__);   \
            return 1;                                                   \
        }                                                               \
    } while (0)

// Processes N floats in NUM_STREAMS chunks twice: first sequentially on the
// default stream (baseline), then with each chunk's H2D copy, kernel and D2H
// copy issued asynchronously on its own stream. Prints both timings and the
// resulting speedup. Returns 0 on success and 1 if any CUDA call fails.
int main() {
    const int N = 1 << 24;  // 16M elements
    const int NUM_STREAMS = 4;
    const int CHUNK_SIZE = N / NUM_STREAMS;
    const size_t BYTES = N * sizeof(float);
    const size_t CHUNK_BYTES = CHUNK_SIZE * sizeof(float);
    const int THREADS = 256;
    const int BLOCKS = (CHUNK_SIZE + THREADS - 1) / THREADS;

    // Pinned host memory
    float *h_input = NULL, *h_output = NULL;
    CUDA_TRY(cudaMallocHost(&h_input, BYTES));
    CUDA_TRY(cudaMallocHost(&h_output, BYTES));

    // Device memory (separate for each stream for true overlap)
    float* d_data[NUM_STREAMS];
    for (int i = 0; i < NUM_STREAMS; i++) {
        CUDA_TRY(cudaMalloc(&d_data[i], CHUNK_BYTES));
    }

    // Create streams
    cudaStream_t streams[NUM_STREAMS];
    for (int i = 0; i < NUM_STREAMS; i++) {
        CUDA_TRY(cudaStreamCreate(&streams[i]));
    }

    // Initialize input
    for (int i = 0; i < N; i++) h_input[i] = 1.0f;

    cudaEvent_t start, stop;
    CUDA_TRY(cudaEventCreate(&start));
    CUDA_TRY(cudaEventCreate(&stop));

    // Warm-up: run the kernel once on a tiny zeroed region so one-time launch
    // costs are not charged to the baseline, which would inflate the speedup.
    // The buffer is overwritten by the H2D copies below.
    CUDA_TRY(cudaMemset(d_data[0], 0, THREADS * sizeof(float)));
    process<<<1, THREADS>>>(d_data[0], THREADS);
    CUDA_TRY(cudaGetLastError());
    CUDA_TRY(cudaDeviceSynchronize());

    // ============================================
    // VERSION 1: No Overlap (Baseline)
    // ============================================
    CUDA_TRY(cudaEventRecord(start));

    for (int i = 0; i < NUM_STREAMS; i++) {
        int offset = i * CHUNK_SIZE;

        // All in default stream - sequential!
        CUDA_TRY(cudaMemcpy(d_data[i], h_input + offset, CHUNK_BYTES,
                            cudaMemcpyHostToDevice));
        process<<<BLOCKS, THREADS>>>(d_data[i], CHUNK_SIZE);
        CUDA_TRY(cudaGetLastError());
        CUDA_TRY(cudaMemcpy(h_output + offset, d_data[i], CHUNK_BYTES,
                            cudaMemcpyDeviceToHost));
    }

    CUDA_TRY(cudaEventRecord(stop));
    CUDA_TRY(cudaEventSynchronize(stop));
    float noOverlapTime = 0.0f;
    CUDA_TRY(cudaEventElapsedTime(&noOverlapTime, start, stop));

    // ============================================
    // VERSION 2: With Overlap
    // ============================================
    CUDA_TRY(cudaEventRecord(start));

    // Issue all operations for all streams
    for (int i = 0; i < NUM_STREAMS; i++) {
        int offset = i * CHUNK_SIZE;

        CUDA_TRY(cudaMemcpyAsync(d_data[i], h_input + offset, CHUNK_BYTES,
                                 cudaMemcpyHostToDevice, streams[i]));

        process<<<BLOCKS, THREADS, 0, streams[i]>>>(
            d_data[i], CHUNK_SIZE);
        CUDA_TRY(cudaGetLastError());

        CUDA_TRY(cudaMemcpyAsync(h_output + offset, d_data[i], CHUNK_BYTES,
                                 cudaMemcpyDeviceToHost, streams[i]));
    }

    // Synchronize all streams
    for (int i = 0; i < NUM_STREAMS; i++) {
        CUDA_TRY(cudaStreamSynchronize(streams[i]));
    }

    CUDA_TRY(cudaEventRecord(stop));
    CUDA_TRY(cudaEventSynchronize(stop));
    float overlapTime = 0.0f;
    CUDA_TRY(cudaEventElapsedTime(&overlapTime, start, stop));

    printf("Without overlap: %.2f ms\n", noOverlapTime);
    printf("With overlap:    %.2f ms\n", overlapTime);
    printf("Speedup:         %.2fx\n", noOverlapTime / overlapTime);

    // Cleanup (events were previously leaked)
    CUDA_TRY(cudaEventDestroy(start));
    CUDA_TRY(cudaEventDestroy(stop));
    for (int i = 0; i < NUM_STREAMS; i++) {
        CUDA_TRY(cudaStreamDestroy(streams[i]));
        CUDA_TRY(cudaFree(d_data[i]));
    }
    CUDA_TRY(cudaFreeHost(h_input));
    CUDA_TRY(cudaFreeHost(h_output));

    return 0;
}

In [None]:
# Compile overlap_pattern.cu with nvcc targeting sm_75 (compute capability 7.5),
# then run the resulting binary. Adjust -arch if your GPU differs.
!nvcc -arch=sm_75 -o overlap_pattern overlap_pattern.cu
!./overlap_pattern

### 🔶 Python/Numba (Optional - Quick Testing)

In [None]:
# Python/Numba Async Transfer Demo (OPTIONAL)

@cuda.jit
def process(data):
    """Update one element of ``data`` in place per thread.

    Each thread takes its 1-D global index from ``cuda.grid(1)`` and applies
    ``val = val * 1.001 + 0.001`` one hundred times to ``data[index]``.
    Threads whose index is past the end of ``data`` do nothing.
    """
    idx = cuda.grid(1)
    if idx >= data.shape[0]:
        return

    acc = data[idx]
    for _ in range(100):
        acc = acc * 1.001 + 0.001
    data[idx] = acc

n = 1 << 22
num_streams = 4
chunk = n // num_streams
threads_per_block = 256
blocks_per_chunk = (chunk + threads_per_block - 1) // threads_per_block

# Create streams
streams = [cuda.stream() for _ in range(num_streams)]

# Pinned host arrays
h_input = cuda.pinned_array(n, dtype=np.float32)
h_output = cuda.pinned_array(n, dtype=np.float32)
h_input[:] = 1.0

# Device arrays (one per stream)
d_chunks = [cuda.device_array(chunk, dtype=np.float32) for _ in range(num_streams)]

# Warm-up: the first launch of a @cuda.jit kernel triggers JIT compilation.
# Do it once on a throwaway 1-element float32 array (same argument type as the
# real chunks) so compile time is not counted as transfer/compute time below.
process[1, 1](cuda.to_device(np.zeros(1, dtype=np.float32)))
cuda.synchronize()

# Launch overlapped work; perf_counter is the monotonic, high-resolution clock
# intended for measuring short intervals.
start = time.perf_counter()

for i in range(num_streams):
    offset = i * chunk

    # Async copy H2D
    d_chunks[i].copy_to_device(h_input[offset:offset+chunk], stream=streams[i])

    # Kernel
    process[blocks_per_chunk, threads_per_block, streams[i]](d_chunks[i])

    # Async copy D2H
    d_chunks[i].copy_to_host(h_output[offset:offset+chunk], stream=streams[i])

# Sync all
for s in streams:
    s.synchronize()

elapsed = time.perf_counter() - start
print(f"Overlapped time: {elapsed*1000:.2f} ms")

---

## Part 4: Profiling Overlap with Nsight Systems

```bash
# Profile to see overlap
nsys profile -o overlap_timeline ./overlap_pattern

# View in Nsight Systems GUI
nsys-ui overlap_timeline.nsys-rep
```

### What to Look For

```
Nsight Systems Timeline:
━━━━━━━━━━━━━━━━━━━━━━━━
                    Without Overlap          With Overlap
                    ─────────────────        ─────────────────
Copy Engine H2D:    ████░░░░░░░░░░░░░        ████░░░░
Compute:            ░░░░████░░░░░░░░░        ░░██████████
Copy Engine D2H:    ░░░░░░░░████░░░░░        ░░░░░░░░████
                    ↑ Sequential!            ↑ Overlapped!

Look for:
• Parallel bars in different rows = overlap achieved
• Gaps between operations = potential improvement
• Copy/compute at same time = good utilization
```

---

## 🎯 Exercises

### 🔷 CUDA C++ Exercises (Primary)

In [None]:
%%writefile overlap_exercises.cu
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

// Evaluates a CUDA runtime call exactly once. If it does not return
// cudaSuccess, prints the error string and the source line to stderr and
// terminates the program with exit status 1.
// The argument is parenthesized so any expression can be passed safely, and
// the diagnostic goes to stderr so it is not mixed into the result tables.
#define CHECK_CUDA(call) \
    do { \
        cudaError_t err = (call); \
        if (err != cudaSuccess) { \
            fprintf(stderr, "CUDA Error: %s at line %d\n", cudaGetErrorString(err), __LINE__); \
            exit(1); \
        } \
    } while(0)

// One thread per element: applies 100 rounds of
// val = sinf(val) * cosf(val) + 0.1f to data[gid] in place.
// Threads whose global index falls outside [0, n) exit immediately.
__global__ void processKernel(float* data, int n) {
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if (gid >= n) {
        return;
    }

    float acc = data[gid];
    for (int round = 100; round > 0; --round) {
        acc = sinf(acc) * cosf(acc) + 0.1f;
    }
    data[gid] = acc;
}

// ============================================================
// Exercise 1: Pinned Memory Benchmark
// ============================================================

// Exercise 1: for several buffer sizes, times 10 host->device->host round
// trips with blocking cudaMemcpy, once from pageable (malloc) memory and once
// from pinned (cudaMallocHost) memory, and prints one table row per size.
// The reported times are totals over all round trips. Any CUDA failure
// terminates the program through CHECK_CUDA.
void exercise1_pinnedVsPageable() {
    printf("=== Exercise 1: Pinned vs Pageable Memory ===\n");
    printf("%-12s %-15s %-15s %-10s\n", "Size", "Pageable(ms)", "Pinned(ms)", "Speedup");
    printf("--------------------------------------------------\n");

    // Element counts (floats), i.e. 4 MB to 256 MB per buffer
    int sizes[] = {1<<20, 1<<22, 1<<24, 1<<26};
    int numSizes = sizeof(sizes) / sizeof(sizes[0]);
    const int roundTrips = 10;

    for (int s = 0; s < numSizes; s++) {
        int n = sizes[s];
        size_t bytes = n * sizeof(float);

        float *h_pageable = (float*)malloc(bytes);
        if (h_pageable == NULL) {
            fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
            exit(1);
        }
        float *h_pinned;
        float *d_data;

        CHECK_CUDA(cudaMallocHost(&h_pinned, bytes));
        CHECK_CUDA(cudaMalloc(&d_data, bytes));

        // Write every element before timing: the copies then read defined
        // data, and the cost of first touching freshly allocated pages is not
        // charged to the measured transfers.
        for (int i = 0; i < n; i++) {
            h_pageable[i] = 1.0f;
            h_pinned[i] = 1.0f;
        }

        cudaEvent_t start, stop;
        CHECK_CUDA(cudaEventCreate(&start));
        CHECK_CUDA(cudaEventCreate(&stop));

        // Pageable transfer
        CHECK_CUDA(cudaEventRecord(start));
        for (int i = 0; i < roundTrips; i++) {
            CHECK_CUDA(cudaMemcpy(d_data, h_pageable, bytes, cudaMemcpyHostToDevice));
            CHECK_CUDA(cudaMemcpy(h_pageable, d_data, bytes, cudaMemcpyDeviceToHost));
        }
        CHECK_CUDA(cudaEventRecord(stop));
        CHECK_CUDA(cudaEventSynchronize(stop));
        float pageableMs = 0.0f;
        CHECK_CUDA(cudaEventElapsedTime(&pageableMs, start, stop));

        // Pinned transfer
        CHECK_CUDA(cudaEventRecord(start));
        for (int i = 0; i < roundTrips; i++) {
            CHECK_CUDA(cudaMemcpy(d_data, h_pinned, bytes, cudaMemcpyHostToDevice));
            CHECK_CUDA(cudaMemcpy(h_pinned, d_data, bytes, cudaMemcpyDeviceToHost));
        }
        CHECK_CUDA(cudaEventRecord(stop));
        CHECK_CUDA(cudaEventSynchronize(stop));
        float pinnedMs = 0.0f;
        CHECK_CUDA(cudaEventElapsedTime(&pinnedMs, start, stop));

        // Format "<size> MB" into one string first so the row uses the same
        // 12-character first column as the header and stays aligned.
        char sizeLabel[32];
        snprintf(sizeLabel, sizeof(sizeLabel), "%.1f MB", bytes / (1024.0f * 1024.0f));
        printf("%-12s %-15.2f %-15.2f %.2fx\n",
               sizeLabel, pageableMs, pinnedMs, pageableMs / pinnedMs);

        free(h_pageable);
        CHECK_CUDA(cudaFreeHost(h_pinned));
        CHECK_CUDA(cudaFree(d_data));
        CHECK_CUDA(cudaEventDestroy(start));
        CHECK_CUDA(cudaEventDestroy(stop));
    }
    printf("\n");
}

// ============================================================
// Exercise 2: Find Optimal Chunk Count
// ============================================================

// Exercise 2: processes the same 16M-element array split into 1, 2, 4, 8, 16
// and 32 chunks (one stream per chunk, async H2D -> kernel -> async D2H per
// chunk), times each configuration, and prints a table with the fastest
// configuration marked by '*'. Any CUDA failure terminates the program
// through CHECK_CUDA.
void exercise2_optimalChunks() {
    printf("=== Exercise 2: Optimal Chunk Count ===\n");

    const int n = 1 << 24;  // 16M elements
    const size_t bytes = n * sizeof(float);
    const int threads = 256;

    float *h_data;
    float *d_data;
    CHECK_CUDA(cudaMallocHost(&h_data, bytes));
    CHECK_CUDA(cudaMalloc(&d_data, bytes));

    cudaEvent_t start, stop;
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    // Warm-up launch on a small zeroed region so one-time launch costs are not
    // charged to the first (1-chunk) measurement. The region is overwritten by
    // the H2D copies below.
    CHECK_CUDA(cudaMemset(d_data, 0, threads * sizeof(float)));
    processKernel<<<1, threads>>>(d_data, threads);
    CHECK_CUDA(cudaGetLastError());
    CHECK_CUDA(cudaDeviceSynchronize());

    const int chunkCounts[] = {1, 2, 4, 8, 16, 32};
    const int numTests = sizeof(chunkCounts) / sizeof(chunkCounts[0]);
    float timesMs[sizeof(chunkCounts) / sizeof(chunkCounts[0])];

    float bestTime = 1e9;
    int bestChunks = 1;

    for (int t = 0; t < numTests; t++) {
        int numChunks = chunkCounts[t];
        int chunkSize = n / numChunks;
        size_t chunkBytes = chunkSize * sizeof(float);

        // Each run writes its results back into h_data, so restore the input
        // to give every configuration identical data.
        for (int i = 0; i < n; i++) h_data[i] = 1.0f;

        cudaStream_t* streams = (cudaStream_t*)malloc(numChunks * sizeof(cudaStream_t));
        if (streams == NULL) {
            fprintf(stderr, "Failed to allocate %d stream handles\n", numChunks);
            exit(1);
        }
        for (int i = 0; i < numChunks; i++) {
            CHECK_CUDA(cudaStreamCreate(&streams[i]));
        }

        CHECK_CUDA(cudaEventRecord(start));
        for (int i = 0; i < numChunks; i++) {
            int offset = i * chunkSize;
            CHECK_CUDA(cudaMemcpyAsync(d_data + offset, h_data + offset, chunkBytes,
                                       cudaMemcpyHostToDevice, streams[i]));
            processKernel<<<(chunkSize + threads - 1) / threads, threads, 0, streams[i]>>>(
                d_data + offset, chunkSize);
            CHECK_CUDA(cudaGetLastError());
            CHECK_CUDA(cudaMemcpyAsync(h_data + offset, d_data + offset, chunkBytes,
                                       cudaMemcpyDeviceToHost, streams[i]));
        }
        CHECK_CUDA(cudaDeviceSynchronize());
        CHECK_CUDA(cudaEventRecord(stop));
        CHECK_CUDA(cudaEventSynchronize(stop));

        CHECK_CUDA(cudaEventElapsedTime(&timesMs[t], start, stop));

        if (timesMs[t] < bestTime) {
            bestTime = timesMs[t];
            bestChunks = numChunks;
        }

        for (int i = 0; i < numChunks; i++) {
            CHECK_CUDA(cudaStreamDestroy(streams[i]));
        }
        free(streams);
    }

    // Print after all measurements so that '*' marks only the overall best
    // row, not every row that happened to be the best so far.
    printf("%-10s %-15s\n", "Chunks", "Time (ms)");
    printf("---------------------------\n");
    for (int t = 0; t < numTests; t++) {
        printf("%-10d %-15.2f", chunkCounts[t], timesMs[t]);
        if (chunkCounts[t] == bestChunks) {
            printf(" *");
        }
        printf("\n");
    }

    printf("\nOptimal: %d chunks (%.2f ms)\n\n", bestChunks, bestTime);

    CHECK_CUDA(cudaFreeHost(h_data));
    CHECK_CUDA(cudaFree(d_data));
    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));
}

// ============================================================
// Exercise 3: Double Buffering
// ============================================================

// Exercise 3: processes 16M elements in 8 chunks using only two device buffers
// and two streams. Chunk c always uses buffer c % 2 on stream c % 2, so all
// operations touching one buffer are ordered within a single stream, while the
// two streams let the upload of chunk c+1 run alongside the work on chunk c.
// Prints the elapsed time, the buffer size, and a result check. Any CUDA
// failure terminates the program through CHECK_CUDA.
void exercise3_doubleBuffering() {
    printf("=== Exercise 3: Double Buffering ===\n");

    const int n = 1 << 24;  // 16M elements
    const int numChunks = 8;
    const int chunkSize = n / numChunks;
    const int threads = 256;
    const int blocks = (chunkSize + threads - 1) / threads;
    size_t chunkBytes = chunkSize * sizeof(float);

    // Host memory
    float *h_input, *h_output;
    CHECK_CUDA(cudaMallocHost(&h_input, n * sizeof(float)));
    CHECK_CUDA(cudaMallocHost(&h_output, n * sizeof(float)));

    for (int i = 0; i < n; i++) h_input[i] = 1.0f;

    // Device memory - only 2 buffers for double buffering
    float *d_buffer[2];
    CHECK_CUDA(cudaMalloc(&d_buffer[0], chunkBytes));
    CHECK_CUDA(cudaMalloc(&d_buffer[1], chunkBytes));

    cudaStream_t streams[2];
    CHECK_CUDA(cudaStreamCreate(&streams[0]));
    CHECK_CUDA(cudaStreamCreate(&streams[1]));

    cudaEvent_t start, stop;
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    CHECK_CUDA(cudaEventRecord(start));

    // Upload the first chunk before entering the pipeline loop
    CHECK_CUDA(cudaMemcpyAsync(d_buffer[0], h_input, chunkBytes,
                               cudaMemcpyHostToDevice, streams[0]));

    for (int chunk = 0; chunk < numChunks; chunk++) {
        int curr = chunk % 2;
        int next = (chunk + 1) % 2;

        // Launch kernel on current chunk
        processKernel<<<blocks, threads, 0, streams[curr]>>>(d_buffer[curr], chunkSize);
        CHECK_CUDA(cudaGetLastError());

        // Start transfer of next chunk (if not last). It is queued on
        // streams[next], behind the earlier D2H copy out of the same buffer,
        // so the buffer is not overwritten before its result was read back.
        if (chunk < numChunks - 1) {
            int nextOffset = (chunk + 1) * chunkSize;
            CHECK_CUDA(cudaMemcpyAsync(d_buffer[next], h_input + nextOffset, chunkBytes,
                                       cudaMemcpyHostToDevice, streams[next]));
        }

        // Transfer current result back
        int currOffset = chunk * chunkSize;
        CHECK_CUDA(cudaMemcpyAsync(h_output + currOffset, d_buffer[curr], chunkBytes,
                                   cudaMemcpyDeviceToHost, streams[curr]));
    }

    CHECK_CUDA(cudaDeviceSynchronize());
    CHECK_CUDA(cudaEventRecord(stop));
    CHECK_CUDA(cudaEventSynchronize(stop));

    float ms = 0.0f;
    CHECK_CUDA(cudaEventElapsedTime(&ms, start, stop));

    // Every input element is 1.0f and the kernel applies the same operations
    // to each element, so all outputs must be identical. A difference means a
    // chunk was skipped or its buffer was overwritten too early.
    int mismatches = 0;
    for (int i = 1; i < n; i++) {
        if (h_output[i] != h_output[0]) mismatches++;
    }

    printf("Double buffering with %d chunks: %.2f ms\n", numChunks, ms);
    printf("Device memory used: 2 buffers (%.1f MB each)\n", chunkBytes / (1024.0f * 1024.0f));
    printf("Total data processed: %.1f MB\n", n * sizeof(float) / (1024.0f * 1024.0f));
    if (mismatches == 0) {
        printf("Result check: PASSED\n\n");
    } else {
        printf("Result check: FAILED (%d mismatching elements)\n\n", mismatches);
    }

    CHECK_CUDA(cudaStreamDestroy(streams[0]));
    CHECK_CUDA(cudaStreamDestroy(streams[1]));
    CHECK_CUDA(cudaFree(d_buffer[0]));
    CHECK_CUDA(cudaFree(d_buffer[1]));
    CHECK_CUDA(cudaFreeHost(h_input));
    CHECK_CUDA(cudaFreeHost(h_output));
    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));
}

// Prints a banner and basic properties of device 0, then runs the three
// exercises in order. Returns 0 when all exercises complete; a failing CUDA
// call terminates the program through CHECK_CUDA.
int main() {
    printf("╔══════════════════════════════════════════════════════════════╗\n");
    printf("║           Overlap Transfers Exercises                        ║\n");
    printf("╚══════════════════════════════════════════════════════════════╝\n\n");

    // Checked so that a missing or unusable device is reported here instead of
    // printing an uninitialized cudaDeviceProp.
    cudaDeviceProp prop;
    CHECK_CUDA(cudaGetDeviceProperties(&prop, 0));
    printf("Device: %s\n", prop.name);
    printf("Async Engines: %d\n\n", prop.asyncEngineCount);
    if (prop.asyncEngineCount < 1) {
        // Without a copy engine, copies cannot run concurrently with kernels,
        // so the timings below will not show any overlap benefit.
        printf("Note: this device reports no async copy engine; expect no overlap speedup.\n\n");
    }

    exercise1_pinnedVsPageable();
    exercise2_optimalChunks();
    exercise3_doubleBuffering();

    printf("════════════════════════════════════════════════════════════════\n");
    printf("                    All exercises completed!\n");
    printf("════════════════════════════════════════════════════════════════\n");

    return 0;
}

In [None]:
# Compile overlap_exercises.cu with nvcc targeting sm_75 (compute capability
# 7.5); the binary is run only if compilation succeeds (&&).
!nvcc -arch=sm_75 -o overlap_exercises overlap_exercises.cu && ./overlap_exercises

### 🔶 Python/Numba Exercises (Optional)

### Exercise 1: Pinned Memory Benchmark
Compare transfer speeds for various sizes with pageable vs pinned memory.

### Exercise 2: Find Optimal Chunk Count
Test with 2, 4, 8, 16 streams and find the sweet spot for your GPU.

### Exercise 3: Double Buffering
```cpp
// Implement double buffering:
// - While processing chunk N, transfer chunk N+1
// - Use only 2 device buffers, alternate between them
```

---

## Key Takeaways

```
┌─────────────────────────────────────────────────────────┐
│              OVERLAPPING TRANSFERS                      │
├─────────────────────────────────────────────────────────┤
│                                                         │
│  Pinned Memory:                                         │
│  • cudaMallocHost() / cudaFreeHost()                    │
│  • Required for async transfers                         │
│  • Faster than pageable (no staging)                    │
│                                                         │
│  Async Copies:                                          │
│  • cudaMemcpyAsync(dst, src, size, kind, stream)        │
│  • Returns immediately to CPU                           │
│  • Operations in same stream execute in order           │
│                                                         │
│  Overlap Pattern:                                       │
│  • Split data into chunks                               │
│  • Each chunk in different stream                       │
│  • H2D, compute, D2H can all overlap                    │
│                                                         │
└─────────────────────────────────────────────────────────┘
```

## Next: Day 3 - Multi-Stream Execution