In [None]:
# ⚙️ Setup
# Installs Numba only when running on Google Colab; any other environment is
# expected to provide it already.
import subprocess, sys
try:
    import google.colab
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "numba"])
except ImportError:
    # Not on Colab: nothing to install.
    pass

import numpy as np
from numba import cuda
import time

print("⚠️  CUDA C++ is PRIMARY. Python/Numba for quick testing only.")
if cuda.is_available():
    # Numba may report the device name as bytes; decode it so the line reads
    # "GPU: Tesla T4" instead of "GPU: b'Tesla T4'".
    gpu_name = cuda.get_current_device().name
    if isinstance(gpu_name, bytes):
        gpu_name = gpu_name.decode()
    print(f"GPU: {gpu_name}")
else:
    # Say so explicitly instead of staying silent when there is no GPU.
    print("No CUDA GPU detected - the Numba cells in this notebook need one.")

---

## Part 1: Concurrent Kernel Execution

### When Can Kernels Run Concurrently?

```
Requirements for Concurrent Kernels:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
1. Different non-default streams (the legacy default stream serializes with them)
2. Enough GPU resources (SMs, registers, shared memory)
3. No dependencies between kernels
4. Device supports concurrent kernels (check capability)

┌─────────────────────────────────────────────────────────┐
│                      GPU SMs                            │
├─────────────┬─────────────┬─────────────┬──────────────┤
│    SM 0     │    SM 1     │    SM 2     │    SM 3      │
├─────────────┼─────────────┼─────────────┼──────────────┤
│ Kernel A    │ Kernel A    │ Kernel B    │ Kernel B     │
│ blocks      │ blocks      │ blocks      │ blocks       │
└─────────────┴─────────────┴─────────────┴──────────────┘
         ↑ Multiple kernels share the GPU!
```

### CUDA C++ Concurrent Kernels (Primary)

```cpp
// concurrent_kernels.cu - Running multiple kernels at once
#include <stdio.h>
#include <cuda_runtime.h>

// Small kernel - uses few resources.
// Each thread whose global index is below n replaces its own element with the
// result of applying sinf() to it 1000 times; other threads do nothing.
__global__ void smallKernel(float* data, int n) {
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx >= n) {
        return;
    }

    // Iterate on a local copy and store the final value once.
    float value = data[idx];
    for (int iter = 0; iter < 1000; ++iter) {
        value = sinf(value);
    }
    data[idx] = value;
}

// Report a failed CUDA runtime call and leave main() with a non-zero status.
// Expands to a `return`, so it may only be used directly inside main().
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,          \
                    __LINE__, cudaGetErrorString(err_));                    \
            return 1;                                                       \
        }                                                                   \
    } while (0)

// Launches smallKernel NUM_STREAMS times twice over - first back-to-back in
// the default stream, then one launch per user-created stream - and prints
// the elapsed time of each variant plus their ratio.
// Returns 0 on success, 1 if any CUDA call fails.
int main() {
    const int N = 1 << 16;  // Small size so kernels fit together
    const int NUM_STREAMS = 4;
    // 64 x 256 = 16384 threads per launch, i.e. fewer threads than N, so each
    // launch only touches the first quarter of its buffer. Presumably this is
    // intentional to keep every grid small enough to share the GPU.
    const int BLOCKS = 64;
    const int THREADS = 256;

    // Check concurrent kernel support
    cudaDeviceProp prop;
    CUDA_CHECK(cudaGetDeviceProperties(&prop, 0));
    printf("Concurrent kernels: %s\n",
           prop.concurrentKernels ? "Yes" : "No");
    // The SM count is not the concurrent-kernel limit (that limit is fixed
    // per compute capability), so report it as what it actually is.
    printf("Multiprocessors (SMs): %d\n", prop.multiProcessorCount);

    // Allocate and zero the buffers so the kernels read defined values.
    float* d_data[NUM_STREAMS];
    for (int i = 0; i < NUM_STREAMS; i++) {
        CUDA_CHECK(cudaMalloc(&d_data[i], N * sizeof(float)));
        CUDA_CHECK(cudaMemset(d_data[i], 0, N * sizeof(float)));
    }

    // Create streams
    cudaStream_t streams[NUM_STREAMS];
    for (int i = 0; i < NUM_STREAMS; i++) {
        CUDA_CHECK(cudaStreamCreate(&streams[i]));
    }

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // Warm-up launch: keeps one-time initialization cost out of the first
    // timed section, which would otherwise bias the comparison.
    smallKernel<<<BLOCKS, THREADS>>>(d_data[0], N);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // ============================================
    // Sequential Execution (single stream)
    // ============================================
    CUDA_CHECK(cudaEventRecord(start));

    for (int i = 0; i < NUM_STREAMS; i++) {
        // All in default stream
        smallKernel<<<BLOCKS, THREADS>>>(d_data[i], N);
        CUDA_CHECK(cudaGetLastError());
    }

    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));
    float seqTime;
    CUDA_CHECK(cudaEventElapsedTime(&seqTime, start, stop));

    // ============================================
    // Concurrent Execution (multiple streams)
    // ============================================
    // start/stop are recorded in the default stream. With the legacy default
    // stream (nvcc's default mode) it synchronizes with the blocking streams
    // made by cudaStreamCreate, so `stop` is only reached after every
    // per-stream kernel below has finished.
    CUDA_CHECK(cudaEventRecord(start));

    for (int i = 0; i < NUM_STREAMS; i++) {
        // Each in its own stream - can run concurrently!
        smallKernel<<<BLOCKS, THREADS, 0, streams[i]>>>(d_data[i], N);
        CUDA_CHECK(cudaGetLastError());
    }

    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));
    float concTime;
    CUDA_CHECK(cudaEventElapsedTime(&concTime, start, stop));

    printf("\nResults:\n");
    printf("Sequential: %.2f ms\n", seqTime);
    printf("Concurrent: %.2f ms\n", concTime);
    printf("Speedup:    %.2fx\n", seqTime / concTime);

    // Cleanup (the timing events were previously leaked)
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    for (int i = 0; i < NUM_STREAMS; i++) {
        CUDA_CHECK(cudaStreamDestroy(streams[i]));
        CUDA_CHECK(cudaFree(d_data[i]));
    }

    return 0;
}
```

---

## Part 2: Stream Priorities

### CUDA C++ Stream Priorities (Primary)

```cpp
// stream_priorities.cu - Prioritizing work
#include <stdio.h>
#include <cuda_runtime.h>

// Busy-work kernel: each thread whose global index is below n repeats
// x = sqrtf(x + 1.0f) 1000 times on its own element.
// `name` is never read in the body; it only labels the launch at the call site.
__global__ void work(float* data, int n, const char* name) {
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx >= n) {
        return;
    }

    // Iterate on a local copy and store the final value once.
    float value = data[idx];
    for (int iter = 0; iter < 1000; ++iter) {
        value = sqrtf(value + 1.0f);
    }
    data[idx] = value;
}

// Report a failed CUDA runtime call and leave main() with a non-zero status.
// Expands to a `return`, so it may only be used directly inside main().
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,          \
                    __LINE__, cudaGetErrorString(err_));                    \
            return 1;                                                       \
        }                                                                   \
    } while (0)

// Queries the device's stream-priority range, creates one stream at each end
// of it, queues ten kernels on the low-priority stream and one on the
// high-priority stream, then waits for the work to finish.
// Returns 0 on success, 1 if any CUDA call fails.
int main() {
    // ============================================
    // Query Priority Range
    // ============================================
    int lowPriority, highPriority;
    CUDA_CHECK(cudaDeviceGetStreamPriorityRange(&lowPriority, &highPriority));

    printf("Priority range: %d (low/default) to %d (high)\n",
           lowPriority, highPriority);
    // Note: LOWER number = HIGHER priority!

    // ============================================
    // Create Prioritized Streams
    // ============================================
    cudaStream_t highPriorityStream, lowPriorityStream;

    CUDA_CHECK(cudaStreamCreateWithPriority(&highPriorityStream,
                                            cudaStreamNonBlocking,
                                            highPriority));

    CUDA_CHECK(cudaStreamCreateWithPriority(&lowPriorityStream,
                                            cudaStreamNonBlocking,
                                            lowPriority));

    const int N = 1 << 16;
    const int BLOCKS = 64;
    const int THREADS = 256;

    float *d_high, *d_low;
    CUDA_CHECK(cudaMalloc(&d_high, N * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&d_low, N * sizeof(float)));
    // Zero the buffers so the kernels read defined values. cudaMemset runs in
    // the default stream, and non-blocking streams do not wait for it, so
    // synchronize before launching.
    CUDA_CHECK(cudaMemset(d_high, 0, N * sizeof(float)));
    CUDA_CHECK(cudaMemset(d_low, 0, N * sizeof(float)));
    CUDA_CHECK(cudaDeviceSynchronize());

    // The kernel never reads its label argument, so pass a null pointer
    // rather than the address of a host string literal.
    const char* noLabel = nullptr;

    // Launch many low-priority kernels
    for (int i = 0; i < 10; i++) {
        work<<<BLOCKS, THREADS, 0, lowPriorityStream>>>(d_low, N, noLabel);
        CUDA_CHECK(cudaGetLastError());
    }

    // Launch high-priority kernel (should get resources first)
    work<<<BLOCKS, THREADS, 0, highPriorityStream>>>(d_high, N, noLabel);
    CUDA_CHECK(cudaGetLastError());

    // Wait for the high-priority stream alone, so the message is printed as
    // soon as that work is done rather than after all low-priority work too.
    CUDA_CHECK(cudaStreamSynchronize(highPriorityStream));
    printf("High priority work completed!\n");

    // Drain the remaining low-priority work before releasing resources.
    CUDA_CHECK(cudaDeviceSynchronize());

    CUDA_CHECK(cudaStreamDestroy(highPriorityStream));
    CUDA_CHECK(cudaStreamDestroy(lowPriorityStream));
    CUDA_CHECK(cudaFree(d_high));
    CUDA_CHECK(cudaFree(d_low));

    return 0;
}
```

### Priority Use Cases

```
When to Use Stream Priorities:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

1. Interactive Applications
   └─ High: UI rendering, user input processing
   └─ Low: Background computation, prefetching

2. Real-time Systems  
   └─ High: Critical path operations
   └─ Low: Optional/deferrable work

3. Multi-tenant GPU
   └─ High: Latency-sensitive workloads
   └─ Low: Throughput-oriented batch jobs

Limitations:
• Priority is a scheduling hint: it only affects which pending thread blocks get SMs first
• Doesn't preempt thread blocks that are already running
• Effect depends on resource availability
```

---

## Part 3: Designing Multi-Stream Applications

### Pattern: Producer-Consumer Streams

```cpp
// producer_consumer.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Producer stage: each thread whose global index is below n writes that index,
// converted to float, into its own slot of `data`.
__global__ void produce(float* data, int n) {
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx >= n) {
        return;
    }
    data[idx] = static_cast<float>(idx);  // Generate data
}

// Consumer stage: each thread whose global index is below n reads its element
// of `data`, doubles it, and stores the product in the same slot of `result`.
__global__ void consume(float* data, float* result, int n) {
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx >= n) {
        return;
    }
    const float input = data[idx];
    result[idx] = input * 2.0f;  // Process data
}

// Report a failed CUDA runtime call and leave main() with a non-zero status.
// Expands to a `return`, so it may only be used directly inside main().
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,          \
                    __LINE__, cudaGetErrorString(err_));                    \
            return 1;                                                       \
        }                                                                   \
    } while (0)

// Two-stream pipeline over NUM_CHUNKS equal chunks: the producer stream fills
// chunk i and records an event; the consumer stream waits on that event before
// processing the same chunk.
// Returns 0 on success, 1 if any CUDA call fails.
int main() {
    const int N = 1 << 20;
    const int NUM_CHUNKS = 4;
    const int CHUNK = N / NUM_CHUNKS;
    const int THREADS = 256;
    const int BLOCKS = (CHUNK + THREADS - 1) / THREADS;  // ceil-div

    float *d_data, *d_result;
    CUDA_CHECK(cudaMalloc(&d_data, N * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&d_result, N * sizeof(float)));

    cudaStream_t producerStream, consumerStream;
    CUDA_CHECK(cudaStreamCreate(&producerStream));
    CUDA_CHECK(cudaStreamCreate(&consumerStream));

    // These events only express dependencies and are never timed, so timing
    // is disabled on them.
    cudaEvent_t chunkReady[NUM_CHUNKS];
    for (int i = 0; i < NUM_CHUNKS; i++) {
        CUDA_CHECK(cudaEventCreateWithFlags(&chunkReady[i],
                                            cudaEventDisableTiming));
    }

    // Pipeline: Producer creates data, consumer processes it
    for (int i = 0; i < NUM_CHUNKS; i++) {
        float* chunkIn = d_data + i * CHUNK;
        float* chunkOut = d_result + i * CHUNK;

        // Producer generates chunk i
        produce<<<BLOCKS, THREADS, 0, producerStream>>>(chunkIn, CHUNK);
        CUDA_CHECK(cudaGetLastError());

        // Mark chunk as ready
        CUDA_CHECK(cudaEventRecord(chunkReady[i], producerStream));

        // Consumer waits for chunk, then processes. The flags argument is
        // passed explicitly because older toolkits give it no default.
        CUDA_CHECK(cudaStreamWaitEvent(consumerStream, chunkReady[i], 0));
        consume<<<BLOCKS, THREADS, 0, consumerStream>>>(chunkIn, chunkOut,
                                                        CHUNK);
        CUDA_CHECK(cudaGetLastError());
    }

    // Also surfaces any error raised while the kernels were executing.
    CUDA_CHECK(cudaDeviceSynchronize());
    printf("Producer-consumer pipeline complete!\n");

    // Cleanup
    for (int i = 0; i < NUM_CHUNKS; i++) {
        CUDA_CHECK(cudaEventDestroy(chunkReady[i]));
    }
    CUDA_CHECK(cudaStreamDestroy(producerStream));
    CUDA_CHECK(cudaStreamDestroy(consumerStream));
    CUDA_CHECK(cudaFree(d_data));
    CUDA_CHECK(cudaFree(d_result));

    return 0;
}
```

In [None]:
# Python/Numba Multi-Stream Demo (OPTIONAL)

@cuda.jit
def kernel_a(data):
    """Multiply this thread's element of `data` by 1.0001, 500 times over."""
    idx = cuda.grid(1)
    if idx >= data.shape[0]:
        return  # threads past the end of the array do nothing
    for _step in range(500):
        data[idx] *= 1.0001

@cuda.jit
def kernel_b(data):
    """Add 0.0001 to this thread's element of `data`, 500 times over."""
    idx = cuda.grid(1)
    if idx >= data.shape[0]:
        return  # threads past the end of the array do nothing
    for _step in range(500):
        data[idx] += 0.0001

n = 1 << 18
num_streams = 4
threads_per_block = 256
blocks_per_grid = (n + threads_per_block - 1) // threads_per_block  # ceil-div

# Create streams and data
streams = [cuda.stream() for _ in range(num_streams)]
# Copy known values to the device: cuda.device_array() would leave the
# buffers uninitialized.
host_init = np.ones(n, dtype=np.float32)
d_arrays = [cuda.to_device(host_init) for _ in range(num_streams)]

# Warm-up: the first launch of a @cuda.jit kernel compiles it. Do that before
# timing so compilation is not charged to the sequential baseline.
kernel_a[blocks_per_grid, threads_per_block](d_arrays[0])
cuda.synchronize()

# Sequential (baseline): every launch goes to the default stream.
start = time.perf_counter()
for i in range(num_streams):
    kernel_a[blocks_per_grid, threads_per_block](d_arrays[i])
cuda.synchronize()
seq_time = time.perf_counter() - start

# Concurrent (multi-stream): one launch per stream, then wait for each stream.
start = time.perf_counter()
for i in range(num_streams):
    kernel_a[blocks_per_grid, threads_per_block, streams[i]](d_arrays[i])
for s in streams:
    s.synchronize()
conc_time = time.perf_counter() - start

print(f"Sequential: {seq_time*1000:.2f} ms")
print(f"Concurrent: {conc_time*1000:.2f} ms")
print(f"Speedup: {seq_time/conc_time:.2f}x")

---

## Exercises

### Exercise 1: Find Concurrency Limit
Experimentally determine how many small kernels can run concurrently on your GPU.

### Exercise 2: Priority Scheduling
Create a scenario where high-priority work completes noticeably faster.

### Exercise 3: Multi-Stage Pipeline
```cpp
// Implement a 3-stage pipeline:
// Stage 1: Load and preprocess
// Stage 2: Main computation
// Stage 3: Post-process and store
```

---

## Key Takeaways

```
┌─────────────────────────────────────────────────────────┐
│            MULTI-STREAM EXECUTION                       │
├─────────────────────────────────────────────────────────┤
│                                                         │
│  Concurrent Kernels:                                    │
│  • Different streams can run simultaneously             │
│  • Limited by GPU resources (SMs, memory)               │
│  • Small kernels benefit most                           │
│                                                         │
│  Stream Priorities:                                     │
│  • cudaDeviceGetStreamPriorityRange()                   │
│  • cudaStreamCreateWithPriority()                       │
│  • Lower number = higher priority                       │
│                                                         │
│  Design Patterns:                                       │
│  • Producer-consumer with events                        │
│  • Pipeline stages in different streams                 │
│  • Workload splitting for concurrency                   │
│                                                         │
└─────────────────────────────────────────────────────────┘
```

## Next: Day 4 - Events & Synchronization