## Why IPC?

GPU memory is normally **process-private**. But many scenarios need memory sharing:

```
┌─────────────────────────────────────────────────────────┐
│                    Use Cases for IPC                     │
├─────────────────────────────────────────────────────────┤
│ 1. Inference Server: Model in one process, requests     │
│    handled by worker processes                          │
│                                                         │
│ 2. Pipeline Processing: Each stage in separate process  │
│    (fault isolation, independent scaling)               │
│                                                         │
│ 3. MPI Applications: Processes on same node share data  │
│    via GPU memory instead of copying through host       │
│                                                         │
│ 4. Multi-Tenant: Different users share GPU resources    │
└─────────────────────────────────────────────────────────┘
```

## IPC API Overview

```cpp
// Producer process: allocate device memory and export a handle for it
cudaIpcMemHandle_t handle;
cudaMalloc(&d_ptr, size);
cudaIpcGetMemHandle(&handle, d_ptr);  // Fill 'handle' so another process can open d_ptr
// Send 'handle' to consumer (via file, socket, shared memory, etc.)

// Consumer process: import the memory handle (separate process, so 'handle'
// and 'd_ptr' below are the consumer's own variables)
cudaIpcMemHandle_t handle;  // Received from producer
void* d_ptr;
cudaIpcOpenMemHandle(&d_ptr, handle, cudaIpcMemLazyEnablePeerAccess);
// Now d_ptr points to SAME GPU memory as producer!

// When done: the consumer closes its mapping, then the producer frees the allocation
cudaIpcCloseMemHandle(d_ptr);  // Consumer closes handle
cudaFree(d_ptr);               // Producer frees memory
```

In [None]:
# Check GPU and CUDA version:
#  - nvidia-smi reports each GPU's name, total memory and compute capability as CSV
#  - only the last line of nvcc's version output is shown
!nvidia-smi --query-gpu=name,memory.total,compute_cap --format=csv
!nvcc --version | tail -1

## Example 1: Basic IPC with File-Based Handle Transfer

We'll create two programs:
1. **Producer**: Allocates memory, writes data, exports handle to file
2. **Consumer**: Reads handle from file, opens memory, reads data

In [None]:
%%writefile ipc_producer.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include <unistd.h>

// Evaluate a CUDA runtime call exactly once. If it returns anything other than
// cudaSuccess, print the call site and the error string to stderr and
// terminate the process with exit status 1.
// The argument is parenthesized so any expression can be passed, and the local
// is named err_ so it cannot capture a caller variable called `err`.
#define CHECK_CUDA(call) \
    do { \
        cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_)); \
            exit(1); \
        } \
    } while(0)

// Writes data[i] = value + i * 0.001f for every i in [0, n).
// One element per thread, indexed from the x dimension of the launch;
// threads whose global index is >= n leave memory untouched.
__global__ void initData(float* data, int n, float value) {
    const int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i >= n) {
        return;
    }
    data[i] = value + i * 0.001f;
}

// Producer side of the file-based IPC example.
//
// Allocates N floats on the GPU, fills them with 42.0f + i * 0.001f, writes
// the buffer's cudaIpcMemHandle_t to /tmp/cuda_ipc_handle.bin, then creates
// /tmp/cuda_ipc_ready to tell the consumer the handle file is complete.
// It polls once per second until /tmp/cuda_ipc_done exists, then frees the
// buffer and deletes the three files.
//
// Returns 0 on success; exits with status 1 on any CUDA or file error.
int main() {
    const int N = 1024;
    const size_t size = N * sizeof(float);
    const char* handlePath = "/tmp/cuda_ipc_handle.bin";
    const char* readyPath = "/tmp/cuda_ipc_ready";
    const char* donePath = "/tmp/cuda_ipc_done";

    printf("=== IPC Producer ===\n");

    // Clear markers left behind by an earlier (e.g. interrupted) run. A stale
    // "done" file would end the wait loop below immediately, and a stale
    // "ready" file would let a consumer read a handle to memory that no
    // longer exists. Failure is ignored: the files normally do not exist.
    remove(readyPath);
    remove(donePath);
    remove(handlePath);

    // Allocate GPU memory
    float* d_data;
    CHECK_CUDA(cudaMalloc(&d_data, size));
    printf("Allocated %zu bytes at %p\n", size, d_data);

    // Initialize data
    initData<<<(N+255)/256, 256>>>(d_data, N, 42.0f);
    CHECK_CUDA(cudaGetLastError());       // launch-configuration errors
    CHECK_CUDA(cudaDeviceSynchronize());  // errors raised while the kernel ran
    printf("Initialized data with pattern\n");

    // Get IPC handle
    cudaIpcMemHandle_t handle;
    CHECK_CUDA(cudaIpcGetMemHandle(&handle, d_data));
    printf("Got IPC handle (%zu bytes)\n", sizeof(handle));

    // Write handle to file; it is fully written and closed before "ready"
    // is created, so the consumer never sees a partial handle.
    FILE* f = fopen(handlePath, "wb");
    if (!f) { perror("fopen"); exit(1); }
    if (fwrite(&handle, sizeof(handle), 1, f) != 1) {
        perror("fwrite");
        fclose(f);
        exit(1);
    }
    if (fclose(f) != 0) { perror("fclose"); exit(1); }
    printf("Wrote handle to /tmp/cuda_ipc_handle.bin\n");

    // Signal ready and wait
    FILE* ready = fopen(readyPath, "w");
    if (!ready) { perror("fopen"); exit(1); }
    fclose(ready);
    printf("Signaled ready. Waiting for consumer...\n");
    printf("(Run ipc_consumer in another terminal, or press Ctrl+C to exit)\n");
    // Push the messages out now: when stdout is a pipe they would otherwise
    // stay buffered for the whole duration of the wait.
    fflush(stdout);

    // Wait for consumer to finish
    while (access(donePath, F_OK) != 0) {
        sleep(1);
    }

    printf("Consumer finished. Cleaning up...\n");

    // Cleanup
    CHECK_CUDA(cudaFree(d_data));
    remove(handlePath);
    remove(readyPath);
    remove(donePath);

    printf("Done!\n");
    return 0;
}

In [None]:
%%writefile ipc_consumer.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include <unistd.h>

// Evaluate a CUDA runtime call exactly once. If it returns anything other than
// cudaSuccess, print the call site and the error string to stderr and
// terminate the process with exit status 1.
// The argument is parenthesized so any expression can be passed, and the local
// is named err_ so it cannot capture a caller variable called `err`.
#define CHECK_CUDA(call) \
    do { \
        cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_)); \
            exit(1); \
        } \
    } while(0)

// Compares data[i] against expected_base + i * 0.001f for every i in [0, n)
// and atomically increments *errors once for each element whose absolute
// difference exceeds 0.0001f.
// One element per thread, indexed from the x dimension of the launch.
__global__ void verifyData(float* data, int n, float expected_base, int* errors) {
    const int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i >= n) {
        return;
    }
    const float want = expected_base + i * 0.001f;
    const float delta = fabsf(data[i] - want);
    if (delta > 0.0001f) {
        atomicAdd(errors, 1);
    }
}

// Consumer side of the file-based IPC example.
//
// Polls once per second until /tmp/cuda_ipc_ready exists, reads a
// cudaIpcMemHandle_t from /tmp/cuda_ipc_handle.bin, opens it, and checks on
// the GPU that the N floats match 42.0f + i * 0.001f. It prints the first
// five values, closes the mapping and creates /tmp/cuda_ipc_done.
//
// Returns 0 if verification passed and 1 if any element mismatched; exits
// with status 1 on any CUDA or file error.
int main() {
    const int N = 1024;

    printf("=== IPC Consumer ===\n");

    // Wait for producer
    printf("Waiting for producer...\n");
    fflush(stdout);  // make the message visible while polling, even on a pipe
    while (access("/tmp/cuda_ipc_ready", F_OK) != 0) {
        sleep(1);
    }
    printf("Producer ready!\n");

    // Read handle from file
    cudaIpcMemHandle_t handle;
    FILE* f = fopen("/tmp/cuda_ipc_handle.bin", "rb");
    if (!f) { perror("fopen"); exit(1); }
    // A short read would leave `handle` partly uninitialized; refuse to pass
    // such a handle to the CUDA runtime.
    if (fread(&handle, sizeof(handle), 1, f) != 1) {
        fprintf(stderr, "Failed to read a complete IPC handle from file\n");
        fclose(f);
        exit(1);
    }
    fclose(f);
    printf("Read IPC handle from file\n");

    // Open the shared memory
    float* d_data;
    CHECK_CUDA(cudaIpcOpenMemHandle((void**)&d_data, handle,
                                     cudaIpcMemLazyEnablePeerAccess));
    printf("Opened shared memory at %p\n", d_data);

    // Verify data
    int* d_errors;
    CHECK_CUDA(cudaMalloc(&d_errors, sizeof(int)));
    CHECK_CUDA(cudaMemset(d_errors, 0, sizeof(int)));

    verifyData<<<(N+255)/256, 256>>>(d_data, N, 42.0f, d_errors);
    CHECK_CUDA(cudaGetLastError());       // launch-configuration errors
    CHECK_CUDA(cudaDeviceSynchronize());  // errors raised while the kernel ran

    int h_errors;
    CHECK_CUDA(cudaMemcpy(&h_errors, d_errors, sizeof(int), cudaMemcpyDeviceToHost));

    if (h_errors == 0) {
        printf("✓ Data verification PASSED! Shared memory works!\n");
    } else {
        printf("✗ Data verification FAILED: %d errors\n", h_errors);
    }

    // Read first few values
    float h_data[5];
    CHECK_CUDA(cudaMemcpy(h_data, d_data, 5*sizeof(float), cudaMemcpyDeviceToHost));
    printf("First 5 values: %.3f, %.3f, %.3f, %.3f, %.3f\n",
           h_data[0], h_data[1], h_data[2], h_data[3], h_data[4]);

    // Cleanup: the mapping is closed before "done" is signaled, so the
    // producer only frees the buffer after this process has released it.
    CHECK_CUDA(cudaFree(d_errors));
    CHECK_CUDA(cudaIpcCloseMemHandle(d_data));

    // Signal done
    FILE* done = fopen("/tmp/cuda_ipc_done", "w");
    if (!done) { perror("fopen"); exit(1); }
    fclose(done);

    printf("Done!\n");
    return h_errors == 0 ? 0 : 1;
}

In [None]:
# Compile both programs
!nvcc -O3 -arch=sm_80 ipc_producer.cu -o ipc_producer
!nvcc -O3 -arch=sm_80 ipc_consumer.cu -o ipc_consumer
print("Compiled successfully!")

## Running the IPC Example

On an HPC cluster, open **two terminals** on the same GPU node:

**Terminal 1 (same GPU node):**
```bash
srun --partition=h100flex --gres=gpu:1 --time=00:10:00 --pty bash
cd cuda-lab/learning-path/week-17
./ipc_producer
```

**Terminal 2 (same GPU node — attach to Terminal 1's job with `srun --jobid`; look up the job ID with `squeue -u $USER`):**
```bash
srun --jobid=<JOBID> --pty bash
cd cuda-lab/learning-path/week-17
./ipc_consumer
```

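## Example 2: Single-Executable IPC Demo (fork)

For easier testing, here's a fork-based version: one executable that calls `fork()`, so the producer (parent) and consumer (child) still run as two separate processes: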

In [None]:
%%writefile ipc_fork_demo.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include <sys/wait.h>
#include <unistd.h>
#include <sys/mman.h>
#include <fcntl.h>

// Evaluate a CUDA runtime call exactly once. If it returns anything other than
// cudaSuccess, print the calling process's PID, the call site and the error
// string to stderr, then terminate that process with exit status 1.
// The argument is parenthesized so any expression can be passed, and the local
// is named err_ so it cannot capture a caller variable called `err`.
#define CHECK_CUDA(call) \
    do { \
        cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "[PID %d] CUDA error at %s:%d: %s\n", getpid(), \
                    __FILE__, __LINE__, cudaGetErrorString(err_)); \
            exit(1); \
        } \
    } while(0)

// Writes data[i] = 100.0f + i for every i in [0, n).
// One element per thread, indexed from the x dimension of the launch;
// threads whose global index is >= n do nothing.
__global__ void producer_init(float* data, int n) {
    const int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i >= n) {
        return;
    }
    data[i] = 100.0f + i;
}

// Doubles every element of data[0..n) in place.
// One element per thread, indexed from the x dimension of the launch;
// threads whose global index is >= n do nothing.
__global__ void consumer_transform(float* data, int n) {
    const int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i >= n) {
        return;
    }
    // In-place update: this writes into the buffer the producer allocated.
    data[i] = data[i] * 2.0f;
}

// Control block that main() places in an anonymous MAP_SHARED mapping created
// before fork(), so the parent and the child operate on the same instance.
struct SharedData {
    cudaIpcMemHandle_t handle;  // IPC handle for the parent's device buffer
    volatile int ready;         // 0 initially; the parent sets it to 1 after writing `handle`
    volatile int done;          // 0 initially; the child sets it to 1 after closing its mapping
};

// Fork-based IPC demo: one executable, two processes.
//
// A small control block is placed in an anonymous shared mapping, then the
// process forks. No CUDA call is made before fork(); parent and child each
// make their first CUDA call afterwards.
//   Parent (producer): allocates N floats, fills them with 100 + i, publishes
//     the IPC handle and raises `ready`, waits for `done`, then reads back
//     the first five values and checks that they were doubled.
//   Child (consumer): waits for `ready`, opens the handle, doubles every
//     element in place, closes the mapping and raises `done`.
//
// Returns 0 when the read-back values match, 1 on mismatch or on mmap/fork
// failure or if the child exits early. CUDA errors terminate the failing
// process with status 1.
int main() {
    const int N = 1024;
    const size_t size = N * sizeof(float);

    printf("=== IPC Fork Demo ===\n\n");

    // Create shared memory for communication
    SharedData* shared = (SharedData*)mmap(NULL, sizeof(SharedData),
        PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
    if (shared == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    shared->ready = 0;
    shared->done = 0;

    // Flush before forking: the child inherits a copy of any unflushed stdio
    // buffer and would print the banner a second time when it exits.
    fflush(stdout);

    pid_t pid = fork();
    if (pid < 0) {
        // Without this check a failed fork would fall into the parent branch
        // and wait forever for a child that does not exist.
        perror("fork");
        munmap(shared, sizeof(SharedData));
        return 1;
    }

    if (pid == 0) {
        // ========== CHILD (Consumer) ==========
        printf("[Consumer PID %d] Started\n", getpid());

        // Wait for producer
        while (!shared->ready) { usleep(1000); }
        __sync_synchronize();  // do not read `handle` ahead of observing `ready`
        printf("[Consumer] Producer signaled ready\n");

        // Open shared GPU memory
        float* d_data;
        CHECK_CUDA(cudaIpcOpenMemHandle((void**)&d_data, shared->handle,
                                         cudaIpcMemLazyEnablePeerAccess));
        printf("[Consumer] Opened shared memory\n");

        // Transform the data (modifies producer's memory!)
        consumer_transform<<<(N+255)/256, 256>>>(d_data, N);
        CHECK_CUDA(cudaGetLastError());       // launch-configuration errors
        CHECK_CUDA(cudaDeviceSynchronize());  // errors raised while the kernel ran
        printf("[Consumer] Transformed data (multiplied by 2)\n");

        // Close handle before signaling, so the parent may free the buffer
        CHECK_CUDA(cudaIpcCloseMemHandle(d_data));

        shared->done = 1;
        printf("[Consumer] Done!\n");
        exit(0);

    } else {
        // ========== PARENT (Producer) ==========
        printf("[Producer PID %d] Started\n", getpid());

        // Allocate and initialize
        float* d_data;
        CHECK_CUDA(cudaMalloc(&d_data, size));
        producer_init<<<(N+255)/256, 256>>>(d_data, N);
        CHECK_CUDA(cudaGetLastError());
        CHECK_CUDA(cudaDeviceSynchronize());
        printf("[Producer] Initialized data: [100, 101, 102, ...]\n");

        // Get IPC handle
        CHECK_CUDA(cudaIpcGetMemHandle(&shared->handle, d_data));
        printf("[Producer] Created IPC handle\n");

        // Signal consumer; the barrier keeps the handle bytes visible to the
        // child before it can observe ready == 1.
        __sync_synchronize();
        shared->ready = 1;
        printf("[Producer] Signaled ready, waiting for consumer...\n");

        // Wait for consumer, but also notice if it exits without raising
        // `done` (e.g. a CUDA error in the child) instead of spinning forever.
        int childStatus = 0;
        bool childReaped = false;
        while (!shared->done) {
            if (waitpid(pid, &childStatus, WNOHANG) == pid) {
                childReaped = true;
                break;
            }
            usleep(1000);
        }
        if (!shared->done) {
            fprintf(stderr, "[Producer] Consumer exited before finishing\n");
            CHECK_CUDA(cudaFree(d_data));
            munmap(shared, sizeof(SharedData));
            return 1;
        }

        // Verify consumer's transformation
        float h_data[5];
        CHECK_CUDA(cudaMemcpy(h_data, d_data, 5*sizeof(float), cudaMemcpyDeviceToHost));
        printf("[Producer] After consumer transform: [%.0f, %.0f, %.0f, %.0f, %.0f]\n",
               h_data[0], h_data[1], h_data[2], h_data[3], h_data[4]);

        // 100 + i and its double are exactly representable, so an exact
        // comparison is appropriate here.
        bool ok = true;
        for (int i = 0; i < 5; ++i) {
            if (h_data[i] != 2.0f * (100.0f + i)) {
                ok = false;
            }
        }
        printf("[Producer] Expected: [200, 202, 204, 206, 208] %s\n",
               ok ? "✓" : "✗ MISMATCH");

        // Cleanup
        if (!childReaped) {
            waitpid(pid, &childStatus, 0);  // Wait for child
        }
        CHECK_CUDA(cudaFree(d_data));
        munmap(shared, sizeof(SharedData));

        printf("\n=== IPC Demo Complete! ===\n");
        return ok ? 0 : 1;
    }

    return 0;
}

In [None]:
# Build the fork demo; the binary runs only if nvcc succeeds (&&).
!nvcc -O3 -arch=sm_80 ipc_fork_demo.cu -o ipc_fork_demo && ./ipc_fork_demo

## IPC Requirements and Limitations

```
┌─────────────────────────────────────────────────────────┐
│                    IPC Requirements                      │
├─────────────────────────────────────────────────────────┤
│ ✓ Same GPU (or peer-accessible GPUs)                    │
│ ✓ Same machine (no network IPC)                         │
│ ✓ Memory allocated with cudaMalloc (not cudaMallocHost) │
│ ✓ Producer must keep memory allocated until done        │
│ ✓ 64-bit OS required                                    │
├─────────────────────────────────────────────────────────┤
│                      Limitations                         │
├─────────────────────────────────────────────────────────┤
│ ✗ Cannot share cudaMallocManaged() memory               │
│ ✗ Cannot share across different GPU architectures       │
│ ✗ Handle is only valid on same machine                  │
│ ✗ Maximum open handles per process (OS dependent)       │
└─────────────────────────────────────────────────────────┘
```

## Event IPC: Synchronization Across Processes

CUDA also supports sharing **events** for cross-process synchronization:

In [None]:
%%writefile ipc_events_demo.cu
#include <stdio.h>
#include <stdlib.h>
#include <chrono>
#include <cuda_runtime.h>
#include <sys/wait.h>
#include <unistd.h>
#include <sys/mman.h>

// Evaluate a CUDA runtime call exactly once. If it returns anything other than
// cudaSuccess, print the call site and the error string to stderr and
// terminate the process with exit status 1.
// The argument is parenthesized so any expression can be passed, and the local
// is named err_ so it cannot capture a caller variable called `err`.
#define CHECK_CUDA(call) \
    do { \
        cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_)); \
            exit(1); \
        } \
    } while(0)

// Deliberately slow kernel used to keep the GPU busy: each thread loads one
// element, applies v = sinf(v) + cosf(v) 1000 times, and stores the result
// back to the same slot.
// One element per thread, indexed from the x dimension of the launch;
// threads whose global index is >= n do nothing.
__global__ void slowKernel(float* data, int n) {
    const int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i >= n) {
        return;
    }
    float acc = data[i];
    // Simulate work
    for (int iter = 0; iter != 1000; ++iter) {
        acc = sinf(acc) + cosf(acc);
    }
    data[i] = acc;
}

// Control block that main() places in an anonymous MAP_SHARED mapping created
// before fork(), so the parent and the child operate on the same instance.
struct SharedData {
    cudaIpcMemHandle_t memHandle;      // IPC handle for the parent's device buffer
    cudaIpcEventHandle_t eventHandle;  // IPC handle for the parent's interprocess event
    volatile int ready;                // 0 initially; the parent sets it to 1 once both handles are written
};

// Event IPC demo: one executable, two processes.
//
// A control block is placed in an anonymous shared mapping, then the process
// forks. No CUDA call is made before fork().
//   Parent: allocates and zeroes N floats, creates an interprocess event,
//     exports the memory and event handles, launches slowKernel, records the
//     event behind it and only then raises `ready`.
//   Child: waits for `ready`, opens the event handle, blocks in
//     cudaEventSynchronize and reports how long it waited.
//
// Returns 0 on success, 1 on mmap/fork failure. CUDA errors terminate the
// failing process with status 1.
int main() {
    const int N = 1024 * 1024;

    printf("=== IPC Events Demo ===\n\n");

    SharedData* shared = (SharedData*)mmap(NULL, sizeof(SharedData),
        PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
    if (shared == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    shared->ready = 0;

    // Flush before forking: the child inherits a copy of any unflushed stdio
    // buffer and would print the banner a second time when it exits.
    fflush(stdout);

    pid_t pid = fork();
    if (pid < 0) {
        perror("fork");
        munmap(shared, sizeof(SharedData));
        return 1;
    }

    if (pid == 0) {
        // ========== CHILD (Waiter) ==========
        while (!shared->ready) usleep(1000);
        __sync_synchronize();  // do not read the handles ahead of observing `ready`

        // Open event from handle
        cudaEvent_t event;
        CHECK_CUDA(cudaIpcOpenEventHandle(&event, shared->eventHandle));

        printf("[Child] Waiting for producer's kernel to complete...\n");
        auto start = std::chrono::high_resolution_clock::now();

        // Wait for event (blocks until producer's kernel is done)
        CHECK_CUDA(cudaEventSynchronize(event));

        auto end = std::chrono::high_resolution_clock::now();
        float ms = std::chrono::duration<float, std::milli>(end - start).count();
        printf("[Child] Event signaled! Waited %.2f ms\n", ms);

        // Release this process's reference to the imported event
        CHECK_CUDA(cudaEventDestroy(event));

        exit(0);

    } else {
        // ========== PARENT (Producer) ==========
        float* d_data;
        CHECK_CUDA(cudaMalloc(&d_data, N * sizeof(float)));
        // cudaMalloc does not initialize memory; give the kernel defined input
        CHECK_CUDA(cudaMemset(d_data, 0, N * sizeof(float)));

        // Create IPC-capable event
        cudaEvent_t event;
        CHECK_CUDA(cudaEventCreate(&event, cudaEventInterprocess | cudaEventDisableTiming));

        // Get handles
        CHECK_CUDA(cudaIpcGetMemHandle(&shared->memHandle, d_data));
        CHECK_CUDA(cudaIpcGetEventHandle(&shared->eventHandle, event));

        printf("[Parent] Starting slow kernel...\n");

        // Launch kernel and record event when done
        slowKernel<<<(N+255)/256, 256>>>(d_data, N);
        CHECK_CUDA(cudaGetLastError());
        CHECK_CUDA(cudaEventRecord(event));

        // Signal the child only after the event has been recorded. An event
        // that was never recorded counts as already complete, so signaling
        // earlier would let the child's cudaEventSynchronize return before
        // the kernel had even been queued.
        __sync_synchronize();
        shared->ready = 1;

        printf("[Parent] Kernel launched, doing other work...\n");
        usleep(100000);  // Simulate other work

        CHECK_CUDA(cudaEventSynchronize(event));
        printf("[Parent] Kernel complete\n");

        // Reap the child before destroying the event it may still be using
        waitpid(pid, NULL, 0);
        CHECK_CUDA(cudaEventDestroy(event));
        CHECK_CUDA(cudaFree(d_data));
        munmap(shared, sizeof(SharedData));

        printf("\n=== Event IPC Complete! ===\n");
    }

    return 0;
}

In [None]:
# Build the events demo with -std=c++14; the binary runs only if nvcc succeeds (&&).
!nvcc -O3 -arch=sm_80 -std=c++14 ipc_events_demo.cu -o ipc_events_demo && ./ipc_events_demo

## Summary

| Function | Purpose |
|----------|--------|
| `cudaIpcGetMemHandle()` | Export GPU memory to shareable handle |
| `cudaIpcOpenMemHandle()` | Import handle to access shared memory |
| `cudaIpcCloseMemHandle()` | Release imported handle |
| `cudaIpcGetEventHandle()` | Export event for cross-process sync |
| `cudaIpcOpenEventHandle()` | Import event handle |

**Key Points:**
1. IPC enables zero-copy memory sharing between processes
2. Both memory and events can be shared
3. Processes must be on the same machine and use the same GPU (or peer-accessible GPUs)
4. Handle transfer mechanism is user's responsibility (file, socket, shared memory)

## Exercises

1. Modify the producer to write a recognizable pattern (e.g., Fibonacci sequence)
2. Implement a ring buffer using IPC for streaming data between processes
3. Add error handling for the case where the consumer tries to open the handle before the producer has allocated the memory and published it