## Error Handling Patterns

In [None]:
%%writefile error_handling.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <stdarg.h>
#include <time.h>

// =============================================================
// Production Error Handling Framework
// =============================================================

// Log severities, ordered from least to most severe. Enumerators take the
// implicit values 0..4, and each value doubles as the index of the matching
// display name in logLevelStr below.
enum LogLevel {
    LOG_DEBUG,  // 0
    LOG_INFO,   // 1
    LOG_WARN,   // 2
    LOG_ERROR,  // 3
    LOG_FATAL   // 4
};

// Display names, indexed by LogLevel value (keep in the same order as the enum).
const char* logLevelStr[] = {
    "DEBUG",
    "INFO",
    "WARN",
    "ERROR",
    "FATAL"
};

// Thread-safe logging.
//
// Writes one line of the form
//     [YYYY-MM-DD HH:MM:SS] [LEVEL] file:line - message
// to stderr.
//
// Parameters:
//   level - severity; values outside the logLevelStr table print "UNKNOWN"
//   file  - source file name (normally __FILE__); NULL prints "?"
//   line  - source line number (normally __LINE__)
//   fmt   - printf-style format string, followed by its arguments
//
// The timestamp uses a re-entrant time conversion and the whole line is
// composed in a private buffer and emitted with a single stdio call, so
// lines from concurrent threads do not interleave mid-line.
void logMessage(LogLevel level, const char* file, int line, const char* fmt, ...) {
    // localtime() hands back shared static storage; use the re-entrant form.
    time_t now = time(NULL);
    struct tm tmBuf;
#if defined(_WIN32)
    const bool haveTime = (localtime_s(&tmBuf, &now) == 0);
#else
    const bool haveTime = (localtime_r(&now, &tmBuf) != NULL);
#endif

    char timestamp[26];
    if (!haveTime ||
        strftime(timestamp, sizeof(timestamp), "%Y-%m-%d %H:%M:%S", &tmBuf) == 0) {
        snprintf(timestamp, sizeof(timestamp), "%s", "unknown-time");
    }

    // Guard the name-table lookup against out-of-range level values.
    const int levelCount = (int)(sizeof(logLevelStr) / sizeof(logLevelStr[0]));
    const int levelIdx = (int)level;
    const char* levelName =
        (levelIdx >= 0 && levelIdx < levelCount) ? logLevelStr[levelIdx] : "UNKNOWN";
    const char* fileName = (file != NULL) ? file : "?";

    va_list args;
    va_start(args, fmt);

    // Measure both parts first so the buffer is exactly large enough.
    va_list measureArgs;
    va_copy(measureArgs, args);
    const int msgLen = vsnprintf(NULL, 0, fmt, measureArgs);
    va_end(measureArgs);
    const int prefixLen = snprintf(NULL, 0, "[%s] [%s] %s:%d - ",
                                   timestamp, levelName, fileName, line);

    char* lineBuf = NULL;
    if (msgLen >= 0 && prefixLen >= 0) {
        // +2: trailing newline and NUL terminator.
        lineBuf = (char*)malloc((size_t)prefixLen + (size_t)msgLen + 2);
    }

    if (lineBuf != NULL) {
        const size_t prefixSize = (size_t)prefixLen;
        const size_t msgSize = (size_t)msgLen;
        snprintf(lineBuf, prefixSize + 1, "[%s] [%s] %s:%d - ",
                 timestamp, levelName, fileName, line);
        vsnprintf(lineBuf + prefixSize, msgSize + 1, fmt, args);
        lineBuf[prefixSize + msgSize] = '\n';
        lineBuf[prefixSize + msgSize + 1] = '\0';
        // One stdio call per log line keeps concurrent output line-atomic.
        fputs(lineBuf, stderr);
        free(lineBuf);
    } else {
        // Formatting or allocation failed: fall back to piecewise output so
        // the message is not lost (lines may interleave in this rare path).
        fprintf(stderr, "[%s] [%s] %s:%d - ", timestamp, levelName, fileName, line);
        vfprintf(stderr, fmt, args);
        fprintf(stderr, "\n");
    }

    va_end(args);
}

// Convenience wrappers around logMessage() that capture the call site via
// __FILE__ and __LINE__. The arguments are a printf-style format string
// followed by its values; at least the format string must be supplied.
//
// Each macro deliberately shares its name with a LogLevel enumerator. These
// are function-like macros, so the bare name inside the expansion (not
// followed by '(') is left alone and resolves to the enumerator.
#define LOG_DEBUG(...) logMessage(LOG_DEBUG, __FILE__, __LINE__, __VA_ARGS__)
#define LOG_INFO(...)  logMessage(LOG_INFO,  __FILE__, __LINE__, __VA_ARGS__)
#define LOG_WARN(...)  logMessage(LOG_WARN,  __FILE__, __LINE__, __VA_ARGS__)
#define LOG_ERROR(...) logMessage(LOG_ERROR, __FILE__, __LINE__, __VA_ARGS__)
#define LOG_FATAL(...) logMessage(LOG_FATAL, __FILE__, __LINE__, __VA_ARGS__)

// Enhanced CUDA error check with context.
//
// Evaluates `call` exactly once. On failure it logs the failing expression,
// the error string and the numeric code via LOG_ERROR, then returns the
// cudaError_t from the ENCLOSING function, which must therefore itself
// return cudaError_t.
//
// The temporary uses a trailing-underscore name so that a `call` expression
// mentioning a caller variable called `err` is not silently shadowed.
#define CUDA_CHECK(call) \
    do { \
        cudaError_t cudaCheckErr_ = (call); \
        if (cudaCheckErr_ != cudaSuccess) { \
            LOG_ERROR("CUDA error in %s: %s (code %d)", \
                      #call, cudaGetErrorString(cudaCheckErr_), \
                      (int)cudaCheckErr_); \
            return cudaCheckErr_; \
        } \
    } while(0)

// Fatal variant of the CUDA error check: evaluates `call` exactly once and,
// on failure, logs the failing expression and error string via LOG_FATAL and
// terminates the process with EXIT_FAILURE. Usable in functions of any
// return type.
//
// The temporary uses a trailing-underscore name so that a `call` expression
// mentioning a caller variable called `err` is not silently shadowed.
#define CUDA_CHECK_FATAL(call) \
    do { \
        cudaError_t cudaCheckFatalErr_ = (call); \
        if (cudaCheckFatalErr_ != cudaSuccess) { \
            LOG_FATAL("CUDA error in %s: %s", #call, \
                      cudaGetErrorString(cudaCheckFatalErr_)); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)

// Post-launch kernel check, to be placed right after a <<<...>>> launch.
//
// Step 1 reads (and clears) the last error to catch launch failures.
// Step 2 calls cudaDeviceSynchronize(), so this macro BLOCKS until the device
// is idle, and reports errors raised while the kernel was executing.
// Either failure is logged via LOG_ERROR and returned from the enclosing
// function, which must return cudaError_t.
#define CUDA_KERNEL_CHECK() \
    do { \
        const cudaError_t launchStatus_ = cudaGetLastError(); \
        if (launchStatus_ != cudaSuccess) { \
            LOG_ERROR("Kernel launch error: %s", \
                      cudaGetErrorString(launchStatus_)); \
            return launchStatus_; \
        } \
        const cudaError_t execStatus_ = cudaDeviceSynchronize(); \
        if (execStatus_ != cudaSuccess) { \
            LOG_ERROR("Kernel execution error: %s", \
                      cudaGetErrorString(execStatus_)); \
            return execStatus_; \
        } \
    } while(0)

// =============================================================
// Example: Robust Memory Allocation
// =============================================================

// Allocates `size` bytes of device memory with logging and a free-memory
// pre-check.
//
// Parameters:
//   ptr  - receives the device pointer; must not be null. It is set to
//          nullptr on every failure path so the caller never sees garbage.
//   size - number of bytes requested.
//   name - label used only in log messages; null is logged as "<unnamed>".
//
// Returns cudaSuccess on success, cudaErrorInvalidValue for a null `ptr`,
// cudaErrorMemoryAllocation when the request exceeds the currently reported
// free memory, or the error returned by cudaMemGetInfo / cudaMalloc.
cudaError_t safeDeviceMalloc(void** ptr, size_t size, const char* name) {
    if (ptr == nullptr) {
        LOG_ERROR("safeDeviceMalloc called with a null output pointer");
        return cudaErrorInvalidValue;
    }
    *ptr = nullptr;

    // Passing a null char* to "%s" is undefined behavior; substitute a label.
    const char* label = (name != nullptr) ? name : "<unnamed>";

    LOG_DEBUG("Allocating %zu bytes for '%s'", size, label);

    // Check available memory first. This is only a snapshot: the allocation
    // below can still fail (fragmentation, concurrent allocations).
    size_t freeMemory = 0;
    size_t totalMemory = 0;
    CUDA_CHECK(cudaMemGetInfo(&freeMemory, &totalMemory));

    if (size > freeMemory) {
        LOG_ERROR("Insufficient GPU memory for '%s': need %zu, have %zu",
                  label, size, freeMemory);
        return cudaErrorMemoryAllocation;
    }

    // Attempt allocation
    const cudaError_t allocErr = cudaMalloc(ptr, size);
    if (allocErr != cudaSuccess) {
        LOG_ERROR("CUDA error in %s: %s (code %d)",
                  "cudaMalloc(ptr, size)", cudaGetErrorString(allocErr),
                  (int)allocErr);
        *ptr = nullptr;
        // Clear the recorded (non-sticky) error so a later cudaGetLastError()
        // after a kernel launch does not report this stale failure.
        (void)cudaGetLastError();
        return allocErr;
    }

    // freeMemory can only be 0 here when size is 0 too; avoid computing 0/0.
    const double percentOfFree =
        (freeMemory > 0) ? 100.0 * (double)size / (double)freeMemory : 0.0;
    LOG_INFO("Allocated %zu bytes for '%s' (%.1f%% of free memory)",
             size, label, percentOfFree);

    return cudaSuccess;
}

// =============================================================
// Example: Error Recovery
// =============================================================

// Demo kernel that can deliberately fault.
//
// Uses only the x dimension of the launch configuration. Every thread whose
// global index is below `n` stores twice its index into `data`. When
// `causeFault` is set, the thread with global index 0 first writes through a
// bogus hard-coded address to provoke a device-side memory error.
__global__ void faultyKernel(int* data, int n, bool causeFault) {
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;

    const bool injectFault = causeFault && (tid == 0);
    if (injectFault) {
        // Intentional invalid write: the address is not a valid allocation.
        int* const invalidAddr = (int*)0xDEADBEEF;
        invalidAddr[0] = 42;
    }

    // Threads past the end of the buffer have nothing to write.
    if (!(tid < n)) {
        return;
    }
    data[tid] = 2 * tid;
}

// Launches faultyKernel over `n` elements and attempts recovery if the
// kernel fails while executing.
//
// Parameters:
//   d_data     - device buffer of at least `n` ints; must not be null when n > 0
//   n          - element count; 0 is a no-op, negative values are rejected
//   causeFault - forwarded to the kernel to provoke a deliberate fault
//
// Returns cudaSuccess on success, cudaErrorInvalidValue for bad arguments,
// the launch error if the launch itself fails, or the execution error if the
// kernel faults.
//
// WARNING: when an execution error occurs this function calls
// cudaDeviceReset(), which destroys the current context's allocations.
// After such a failure `d_data` (and every other device pointer the caller
// holds) is no longer valid and must be re-allocated.
cudaError_t runWithRecovery(int* d_data, int n, bool causeFault) {
    if (n < 0) {
        LOG_ERROR("Invalid element count: %d", n);
        return cudaErrorInvalidValue;
    }
    if (n == 0) {
        // A zero-block launch is an invalid configuration; nothing to do.
        LOG_INFO("Nothing to launch (n=0)");
        return cudaSuccess;
    }
    if (d_data == nullptr) {
        // Launching would fault on the device and force a needless reset.
        LOG_ERROR("Null device buffer passed to runWithRecovery");
        return cudaErrorInvalidValue;
    }

    LOG_INFO("Launching kernel (causeFault=%d)", causeFault);

    const int blockSize = 256;
    // Ceil-div written so it cannot overflow for n close to INT_MAX.
    const int numBlocks = n / blockSize + (n % blockSize != 0 ? 1 : 0);

    faultyKernel<<<numBlocks, blockSize>>>(d_data, n, causeFault);

    // Launch-configuration errors are reported immediately.
    cudaError_t launchErr = cudaGetLastError();
    if (launchErr != cudaSuccess) {
        LOG_ERROR("Kernel launch failed: %s", cudaGetErrorString(launchErr));
        return launchErr;
    }

    // Errors raised while the kernel runs only surface at a synchronizing call.
    cudaError_t syncErr = cudaDeviceSynchronize();
    if (syncErr != cudaSuccess) {
        LOG_ERROR("Kernel execution failed: %s", cudaGetErrorString(syncErr));

        // Attempt recovery
        LOG_WARN("Attempting device reset...");
        cudaError_t resetErr = cudaDeviceReset();
        if (resetErr != cudaSuccess) {
            LOG_FATAL("Device reset failed: %s", cudaGetErrorString(resetErr));
            return resetErr;
        }

        LOG_INFO("Device reset successful");
        // Still report the original failure so the caller knows the work
        // did not complete and its device pointers are gone.
        return syncErr;
    }

    LOG_INFO("Kernel completed successfully");
    return cudaSuccess;
}

// =============================================================
// Main Demo
// =============================================================

// Demo driver: allocates a device buffer, runs the kernel once without a
// fault, and releases the buffer. Returns 0 only when the allocation, the
// test and the cleanup all succeed.
int main() {
    LOG_INFO("=== Error Handling Demo ===");

    const int N = 1024;
    int* d_data = nullptr;
    int exitCode = 0;

    // Safe allocation
    cudaError_t err = safeDeviceMalloc((void**)&d_data, N * sizeof(int), "test_data");
    if (err != cudaSuccess) {
        LOG_FATAL("Allocation failed");
        return 1;
    }

    // Test successful execution
    LOG_INFO("--- Test 1: Normal execution ---");
    err = runWithRecovery(d_data, N, false);
    if (err == cudaSuccess) {
        LOG_INFO("Test 1 PASSED");
    } else {
        LOG_ERROR("Test 1 FAILED: %s", cudaGetErrorString(err));
        exitCode = 1;
    }

    // Clean up
    if (exitCode == 0) {
        CUDA_CHECK_FATAL(cudaFree(d_data));
    } else {
        // runWithRecovery may have reset the device, which already released
        // d_data; treat the free as best-effort instead of aborting.
        cudaError_t freeErr = cudaFree(d_data);
        if (freeErr != cudaSuccess) {
            LOG_WARN("cudaFree after failed test returned: %s",
                     cudaGetErrorString(freeErr));
            (void)cudaGetLastError();
        }
    }
    d_data = nullptr;

    LOG_INFO("=== Demo Complete ===");
    return exitCode;
}

In [None]:
!nvcc -O3 -arch=sm_80 error_handling.cu -o error_handling && ./error_handling

## Sticky Error Handling

In [None]:
%%writefile sticky_errors.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Demonstrate sticky vs non-sticky errors
// Sticky errors require device reset to clear

// Demonstrates the difference between non-sticky errors (cleared by
// cudaGetLastError) and sticky errors (which need a device reset).
// Returns 0 on completion, 1 when no usable CUDA device is available.
int main() {
    printf("=== Sticky Error Demonstration ===\n\n");

    int deviceCount = 0;
    cudaError_t err = cudaGetDeviceCount(&deviceCount);
    printf("Initial state: %s\n\n", cudaGetErrorString(err));
    if (err != cudaSuccess || deviceCount == 0) {
        printf("No usable CUDA device found; skipping demonstration.\n");
        return 1;
    }

    // Non-sticky error: failed memory allocation
    printf("--- Non-Sticky Error Test ---\n");
    int* dummy = nullptr;
    err = cudaMalloc(&dummy, (size_t)-1);  // Impossibly large allocation
    printf("After bad malloc: %s\n", cudaGetErrorString(err));
    if (err == cudaSuccess) {
        // Not expected, but do not leak if the allocation somehow succeeded.
        cudaFree(dummy);
    }

    // Peek FIRST: reports the recorded error without clearing it
    err = cudaPeekAtLastError();
    printf("cudaPeekAtLastError(): %s\n", cudaGetErrorString(err));

    // cudaGetLastError reports the same error and resets it to cudaSuccess
    err = cudaGetLastError();
    printf("cudaGetLastError(): %s\n", cudaGetErrorString(err));

    // After cudaGetLastError, error should be cleared
    err = cudaGetLastError();
    printf("After clearing: %s\n", cudaGetErrorString(err));

    // Try normal operation (should work)
    int* d_ptr = nullptr;
    err = cudaMalloc(&d_ptr, sizeof(int));
    printf("Normal allocation after non-sticky: %s\n", cudaGetErrorString(err));

    if (err == cudaSuccess) {
        cudaFree(d_ptr);
        printf("Device still functional!\n\n");
    }

    // Sticky errors (like illegal memory access) require device reset
    printf("--- Sticky Error Categories ---\n");
    printf("Non-sticky (recoverable):\n");
    printf("  - cudaErrorInvalidValue\n");
    printf("  - cudaErrorMemoryAllocation\n");
    printf("  - cudaErrorInvalidDevicePointer\n");
    printf("\nSticky (require device reset):\n");
    printf("  - cudaErrorIllegalAddress (illegal memory access)\n");
    printf("  - cudaErrorHardwareStackError\n");
    printf("  - cudaErrorAssert (device-side assert)\n");
    printf("  - cudaErrorLaunchTimeout\n");
    printf("  - ECC memory errors\n");

    printf("\n--- Error Type Checking ---\n");

    // Example: Check if error is recoverable.
    // Covers every code listed as non-sticky above, plus invalid launch
    // configuration; anything else is conservatively treated as needing a reset.
    auto isRecoverable = [](cudaError_t e) {
        switch(e) {
            case cudaErrorInvalidValue:
            case cudaErrorMemoryAllocation:
            case cudaErrorInvalidDevicePointer:
            case cudaErrorInvalidConfiguration:
                return true;
            default:
                return false;
        }
    };

    cudaError_t testErrors[] = {
        cudaErrorInvalidValue,
        cudaErrorMemoryAllocation,
        cudaErrorIllegalAddress,
        cudaErrorAssert
    };

    for (auto e : testErrors) {
        printf("%s: %s\n",
               cudaGetErrorName(e),
               isRecoverable(e) ? "Recoverable" : "Requires reset");
    }

    return 0;
}

In [None]:
!nvcc -O3 -arch=sm_80 sticky_errors.cu -o sticky_errors && ./sticky_errors

## Stream-Based Error Checking

In [None]:
%%writefile stream_errors.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Fills data[i] with sqrtf(i) for every global index i below n.
//
// Uses only the x dimension of the launch configuration and processes one
// element per thread, so the launch must supply at least n threads in total;
// surplus threads exit without touching memory.
__global__ void simpleKernel(float* data, int n) {
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if (gid >= n) {
        return;
    }
    data[gid] = sqrtf(static_cast<float>(gid));
}

// Runs simpleKernel on several streams and tracks errors per stream:
// setup (stream creation / allocation), launch, and execution.
// Returns the number of streams that reported an error (0 on full success).
int main() {
    printf("=== Stream-Based Error Checking ===\n\n");

    const int N = 1024 * 1024;
    const int numStreams = 4;

    // Zero-initialize so cleanup never touches an indeterminate handle.
    cudaStream_t streams[numStreams] = {};
    float* d_data[numStreams] = {};
    bool streamCreated[numStreams] = {};
    cudaError_t streamErrors[numStreams];

    // Create streams and allocate per-stream memory
    for (int i = 0; i < numStreams; i++) {
        streamErrors[i] = cudaStreamCreate(&streams[i]);
        if (streamErrors[i] != cudaSuccess) {
            printf("Stream %d creation error: %s\n", i,
                   cudaGetErrorString(streamErrors[i]));
            // Clear the recorded error so later launch checks start clean.
            (void)cudaGetLastError();
            continue;
        }
        streamCreated[i] = true;

        streamErrors[i] = cudaMalloc(&d_data[i], N * sizeof(float));
        if (streamErrors[i] != cudaSuccess) {
            printf("Stream %d allocation error: %s\n", i,
                   cudaGetErrorString(streamErrors[i]));
            d_data[i] = nullptr;
            (void)cudaGetLastError();
        }
    }

    // Launch kernels on different streams
    printf("Launching kernels on %d streams...\n", numStreams);

    for (int i = 0; i < numStreams; i++) {
        if (streamErrors[i] != cudaSuccess) {
            // Never launch on a missing stream or an unallocated buffer.
            printf("Stream %d: skipping launch (setup failed)\n", i);
            continue;
        }

        int blockSize = 256;
        int numBlocks = (N + blockSize - 1) / blockSize;

        simpleKernel<<<numBlocks, blockSize, 0, streams[i]>>>(d_data[i], N);

        // Check launch error immediately (doesn't wait for completion)
        streamErrors[i] = cudaGetLastError();
        if (streamErrors[i] != cudaSuccess) {
            printf("Stream %d launch error: %s\n", i,
                   cudaGetErrorString(streamErrors[i]));
        }
    }

    // Query stream status (non-blocking)
    printf("\nQuerying stream status...\n");
    for (int i = 0; i < numStreams; i++) {
        if (!streamCreated[i]) {
            printf("Stream %d: Not created\n", i);
            continue;
        }
        cudaError_t status = cudaStreamQuery(streams[i]);
        printf("Stream %d: %s\n", i,
               status == cudaSuccess ? "Complete" :
               status == cudaErrorNotReady ? "Running" :
               cudaGetErrorString(status));
    }

    // Synchronize each stream and check for errors
    printf("\nSynchronizing streams...\n");
    for (int i = 0; i < numStreams; i++) {
        if (!streamCreated[i]) {
            continue;
        }
        cudaError_t err = cudaStreamSynchronize(streams[i]);
        if (err != cudaSuccess) {
            printf("Stream %d execution error: %s\n", i,
                   cudaGetErrorString(err));
            // Keep the earliest error for the summary: it is the root cause.
            if (streamErrors[i] == cudaSuccess) {
                streamErrors[i] = err;
            }
        } else if (streamErrors[i] == cudaSuccess) {
            printf("Stream %d: Success\n", i);
        } else {
            printf("Stream %d: No work executed (earlier error)\n", i);
        }
    }

    // Summary
    printf("\n--- Error Summary ---\n");
    int errorCount = 0;
    for (int i = 0; i < numStreams; i++) {
        if (streamErrors[i] != cudaSuccess) {
            errorCount++;
            printf("Stream %d FAILED: %s\n", i,
                   cudaGetErrorString(streamErrors[i]));
        }
    }

    if (errorCount == 0) {
        printf("All %d streams completed successfully!\n", numStreams);
    } else {
        printf("%d/%d streams had errors\n", errorCount, numStreams);
    }

    // Cleanup: release only what was actually created
    for (int i = 0; i < numStreams; i++) {
        if (d_data[i] != nullptr) {
            cudaFree(d_data[i]);
        }
        if (streamCreated[i]) {
            cudaStreamDestroy(streams[i]);
        }
    }

    return errorCount;
}

In [None]:
!nvcc -O3 -arch=sm_80 stream_errors.cu -o stream_errors && ./stream_errors

## CUDA Debugging Environment Variables

```bash
# Enable synchronous launches (helps locate errors)
export CUDA_LAUNCH_BLOCKING=1

# Memory checking
export CUDA_MEMCHECK_PATCH_MODULE=1

# Device selection
export CUDA_VISIBLE_DEVICES=0

# Device enumeration order (PCI bus order, matching nvidia-smi)
export CUDA_DEVICE_ORDER=PCI_BUS_ID

# For compute-sanitizer
compute-sanitizer --tool memcheck ./my_program
compute-sanitizer --tool racecheck ./my_program
compute-sanitizer --tool initcheck ./my_program
compute-sanitizer --tool synccheck ./my_program
```

## Key Takeaways

1. **Always check errors** - Both launch and execution errors
2. **Use cudaGetLastError()** - Clears non-sticky errors
3. **Sticky errors require reset** - cudaDeviceReset()
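4. **Check errors per stream** - Launch and synchronization errors can be attributed to a single stream, but a sticky error (e.g. an illegal address) corrupts the whole context and affects every stream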
5. **Logging is essential** - Include timestamps and context
6. **Use compute-sanitizer** - For memory and race condition checking