## What is MIG?

Multi-Instance GPU (MIG) partitions a single GPU into multiple isolated instances:

```
┌─────────────────────────────────────────────────┐
│                 Full H100 GPU                   │
│  80GB HBM3  |  132 SMs  |  Full Memory BW       │
└─────────────────────────────────────────────────┘
                      │
            MIG Partitioning
                      │
          ┌───────────┴───────────┐
          ▼                       ▼
┌─────────────────┐     ┌─────────────────┐
│   MIG 3g.40gb   │     │   MIG 3g.40gb   │
│  40GB | ~60 SMs │     │  40GB | ~60 SMs │
└─────────────────┘     └─────────────────┘
     Instance 0              Instance 1
```

**Key benefits:**
- Hardware-level isolation (memory, SMs, cache)
- Quality of Service (QoS) guarantees
- Multi-tenant GPU sharing
- Right-sizing GPU for workloads

## Querying MIG Support

In [None]:
%%writefile mig_query.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include <nvml.h>

// Evaluates a CUDA runtime call exactly once. On any result other than
// cudaSuccess it prints the error string together with the call site to
// stderr and terminates the process with exit code 1.
// The argument is parenthesized and the temporary uses a reserved-looking
// name so the macro cannot collide with a caller variable called `err`.
#define CHECK_CUDA(call) \
    do { \
        cudaError_t checkCudaErr_ = (call); \
        if (checkCudaErr_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(checkCudaErr_)); \
            exit(1); \
        } \
    } while(0)

// Evaluates an NVML call exactly once and, on any result other than
// NVML_SUCCESS, reports the NVML error string and the call site on stderr.
// Deliberately non-fatal: execution continues after the message, so a caller
// that needs the call's output values must check the result itself.
#define CHECK_NVML(call) \
    do { \
        nvmlReturn_t checkNvmlRc_ = (call); \
        if (checkNvmlRc_ != NVML_SUCCESS) { \
            fprintf(stderr, "NVML error at %s:%d: %s\n", __FILE__, __LINE__, \
                    nvmlErrorString(checkNvmlRc_)); \
        } \
    } while(0)

// Counts the GPU instances that currently exist on a MIG-enabled device,
// summed over every GPU-instance profile the device reports.
//
// nvmlDeviceGetGpuInstances() expects a profile *ID*, which has to be looked
// up with nvmlDeviceGetGpuInstanceProfileInfo(); the NVML_GPU_INSTANCE_PROFILE_*
// constants are only selectors for that lookup.
//
// Returns NVML_SUCCESS and stores the total in *total, otherwise the first
// unexpected NVML error (and *total holds the partial sum).
static nvmlReturn_t countGpuInstances(nvmlDevice_t nvmlDevice, unsigned int* total) {
    enum { kMaxInstancesPerProfile = 16 };
    *total = 0;

    for (unsigned int profile = 0; profile < NVML_GPU_INSTANCE_PROFILE_COUNT; profile++) {
        nvmlGpuInstanceProfileInfo_t info;
        nvmlReturn_t rc = nvmlDeviceGetGpuInstanceProfileInfo(nvmlDevice, profile, &info);
        if (rc == NVML_ERROR_NOT_SUPPORTED || rc == NVML_ERROR_INVALID_ARGUMENT) {
            continue;  // this profile is not offered by the device
        }
        if (rc != NVML_SUCCESS) {
            return rc;
        }

        // The output array must hold every instance the profile can have.
        if (info.instanceCount > kMaxInstancesPerProfile) {
            return NVML_ERROR_INSUFFICIENT_SIZE;
        }

        nvmlGpuInstance_t gpuInstances[kMaxInstancesPerProfile];
        unsigned int count = 0;
        rc = nvmlDeviceGetGpuInstances(nvmlDevice, info.id, gpuInstances, &count);
        if (rc != NVML_SUCCESS) {
            return rc;
        }
        *total += count;
    }
    return NVML_SUCCESS;
}

// Prints the current/pending MIG mode of a CUDA device (and, when MIG is
// enabled, the number of GPU instances) using NVML.
//
// The NVML handle is resolved through the device's PCI bus ID rather than by
// reusing the CUDA ordinal, because CUDA and NVML may enumerate GPUs in a
// different order (e.g. under CUDA_VISIBLE_DEVICES). Every failure is
// reported and skipped; this function never terminates the process.
static void printMigModeViaNvml(int device) {
    nvmlReturn_t rc = nvmlInit();
    if (rc != NVML_SUCCESS) {
        printf("\nNVML initialization failed: %s\n", nvmlErrorString(rc));
        return;
    }

    char pciBusId[32];
    nvmlDevice_t nvmlDevice;
    cudaError_t cudaRc = cudaDeviceGetPCIBusId(pciBusId, (int)sizeof(pciBusId), device);

    if (cudaRc != cudaSuccess) {
        printf("\nCould not get PCI bus ID for device %d: %s\n",
               device, cudaGetErrorString(cudaRc));
    } else if ((rc = nvmlDeviceGetHandleByPciBusId(pciBusId, &nvmlDevice)) != NVML_SUCCESS) {
        printf("\nNVML error: %s\n", nvmlErrorString(rc));
    } else {
        unsigned int currentMode = 0, pendingMode = 0;
        nvmlReturn_t migResult = nvmlDeviceGetMigMode(nvmlDevice, &currentMode, &pendingMode);

        if (migResult == NVML_SUCCESS) {
            printf("\nMIG Mode:\n");
            printf("  Current: %s\n", currentMode ? "ENABLED" : "DISABLED");
            printf("  Pending: %s\n", pendingMode ? "ENABLED" : "DISABLED");

            if (currentMode) {
                unsigned int instanceTotal = 0;
                nvmlReturn_t giResult = countGpuInstances(nvmlDevice, &instanceTotal);
                if (giResult == NVML_SUCCESS) {
                    printf("  GPU Instances found: %u\n", instanceTotal);
                } else {
                    printf("  Could not list GPU instances: %s\n", nvmlErrorString(giResult));
                }
            }
        } else if (migResult == NVML_ERROR_NOT_SUPPORTED) {
            printf("\nMIG query not supported on this configuration\n");
        } else {
            printf("\nMIG mode query failed: %s\n", nvmlErrorString(migResult));
        }
    }

    nvmlShutdown();
}

// Prints a MIG capability report for one CUDA device.
//
// Parameters:
//   device - CUDA device ordinal, as accepted by cudaGetDeviceProperties().
//
// Output (stdout): device name and compute capability; whether the device is
// considered MIG-capable (compute capability >= 8.0 AND the name contains one
// of a fixed list of datacenter model strings); for capable devices the MIG
// mode reported by NVML followed by total memory, SM count and bus width.
// Exits the process (via CHECK_CUDA) only if the device properties cannot be
// read; NVML problems are reported and skipped.
void printMIGCapabilities(int device) {
    cudaDeviceProp prop;
    CHECK_CUDA(cudaGetDeviceProperties(&prop, device));

    printf("\n=== MIG Capabilities for %s ===\n", prop.name);

    // Check architecture
    int major = prop.major, minor = prop.minor;
    printf("Compute Capability: %d.%d\n", major, minor);

    // MIG requires Ampere (8.0) or newer
    if (major < 8) {
        printf("MIG NOT SUPPORTED - Requires Ampere (8.0) or newer\n");
        return;
    }

    // Name-based allow-list of known MIG-capable models. This is a heuristic:
    // a model missing from the list is reported as not capable.
    const char* name = prop.name;
    bool isMIGCapable = false;
    if (strstr(name, "A100") || strstr(name, "H100") ||
        strstr(name, "H200") || strstr(name, "B100") ||
        strstr(name, "B200") || strstr(name, "A30")) {
        isMIGCapable = true;
    }

    printf("GPU: %s\n", name);
    printf("MIG Capable: %s\n", isMIGCapable ? "YES" : "NO (consumer GPU)");

    if (!isMIGCapable) {
        printf("  Note: Only datacenter GPUs (A100, H100, etc.) support MIG\n");
        return;
    }

    // Query MIG mode via NVML (initializes and shuts NVML down internally).
    printMigModeViaNvml(device);

    // Memory info
    printf("\nGPU Resources:\n");
    printf("  Total Memory: %.2f GB\n", prop.totalGlobalMem / (1024.0 * 1024.0 * 1024.0));
    printf("  SMs: %d\n", prop.multiProcessorCount);
    printf("  Memory Bus Width: %d bits\n", prop.memoryBusWidth);
}

// Entry point of the MIG query tool: prints a banner and the number of CUDA
// devices, then emits the MIG capability report for every device ordinal in
// ascending order. Always returns 0 (CHECK_CUDA exits on a CUDA failure).
int main() {
    printf("=== MIG Query Tool ===\n");

    int numDevices;
    CHECK_CUDA(cudaGetDeviceCount(&numDevices));
    printf("Found %d CUDA device(s)\n", numDevices);

    int dev = 0;
    while (dev < numDevices) {
        printMIGCapabilities(dev);
        ++dev;
    }

    return 0;
}

In [None]:
# Compile mig_query.cu with -O3 for sm_80, linking the NVML library
# (-lnvidia-ml), and run the binary only if compilation succeeds.
!nvcc -O3 -arch=sm_80 mig_query.cu -o mig_query -lnvidia-ml && ./mig_query

## nvidia-smi MIG Commands

Check MIG status from the command line (changing the MIG configuration typically requires administrator access):

In [None]:
# Query each GPU's name plus its current and pending MIG mode, as CSV
!nvidia-smi --query-gpu=name,mig.mode.current,mig.mode.pending --format=csv

In [None]:
# List available MIG profiles (if on MIG-capable GPU).
# stderr is discarded; the fallback message is printed when the command fails.
!nvidia-smi mig -lgip 2>/dev/null || echo "MIG not available or not enabled"

In [None]:
# List existing GPU instances.
# stderr is discarded; the fallback message is printed when the command fails.
!nvidia-smi mig -lgi 2>/dev/null || echo "No GPU instances or MIG not enabled"

## MIG Instance Profiles

H100 SXM (80GB) supports these profiles:

| Profile | GPCs | Memory | Use Case |
|---------|------|--------|----------|
| 1g.10gb | 1/7 | 10GB | Small inference |
| 2g.20gb | 2/7 | 20GB | Medium workloads |
| 3g.40gb | 3/7 | 40GB | Training |
| 4g.40gb | 4/7 | 40GB | Large training |
| 7g.80gb | 7/7 | 80GB | Full GPU (no isolation) |

A100 (80GB) profiles:

| Profile | GPCs | Memory |
|---------|------|--------|
| 1g.10gb | 1/7 | 10GB |
| 2g.20gb | 2/7 | 20GB |
| 3g.40gb | 3/7 | 40GB |
| 4g.40gb | 4/7 | 40GB |
| 7g.80gb | 7/7 | 80GB |

## Detecting MIG Instance in Code

In [None]:
%%writefile detect_mig_instance.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Evaluates a CUDA runtime call exactly once. On any result other than
// cudaSuccess it prints the error string together with the call site to
// stderr and terminates the process with exit code 1.
// The argument is parenthesized and the temporary uses a reserved-looking
// name so the macro cannot collide with a caller variable called `err`.
#define CHECK_CUDA(call) \
    do { \
        cudaError_t checkCudaErr_ = (call); \
        if (checkCudaErr_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(checkCudaErr_)); \
            exit(1); \
        } \
    } while(0)

// Trivial kernel used to prove that code can actually execute on the visible
// device. Expected launch: <<<1, 1>>>; writes `value` to out[0].
__global__ void verifyKernel(int* out, int value) {
    *out = value;
}

// Entry point of the MIG instance detector.
//
// Prints CUDA_VISIBLE_DEVICES (flagging a "MIG-" identifier), then for every
// visible device its name, UUID, memory, SM count and compute capability, and
// whether it looks like a MIG partition. Finally launches a one-thread kernel
// on the current device and checks the value it wrote.
//
// Returns 0 on success, 1 if the verification kernel produced a wrong value.
// Any CUDA API failure exits the process through CHECK_CUDA.
int main() {
    printf("=== MIG Instance Detection ===\n\n");

    // Check CUDA_VISIBLE_DEVICES
    const char* visibleDevices = getenv("CUDA_VISIBLE_DEVICES");
    printf("CUDA_VISIBLE_DEVICES: %s\n", visibleDevices ? visibleDevices : "(not set)");

    // Check for MIG UUID format
    if (visibleDevices && strstr(visibleDevices, "MIG-")) {
        printf("Running on MIG instance!\n");
        printf("MIG UUID: %s\n", visibleDevices);
    }

    int deviceCount;
    CHECK_CUDA(cudaGetDeviceCount(&deviceCount));
    printf("\nVisible devices: %d\n", deviceCount);

    for (int i = 0; i < deviceCount; i++) {
        cudaDeviceProp prop;
        CHECK_CUDA(cudaGetDeviceProperties(&prop, i));

        printf("\nDevice %d: %s\n", i, prop.name);

        // Print the 16 UUID bytes as hex in 8-4-4-4-12 grouping.
        printf("  UUID: ");
        for (int j = 0; j < 16; j++) {
            printf("%02x", (unsigned char)prop.uuid.bytes[j]);
            if (j == 3 || j == 5 || j == 7 || j == 9) printf("-");
        }
        printf("\n");

        printf("  Total Memory: %.2f GB\n",
               prop.totalGlobalMem / (1024.0 * 1024.0 * 1024.0));
        printf("  SMs: %d\n", prop.multiProcessorCount);
        printf("  Compute: %d.%d\n", prop.major, prop.minor);

        // Detect if this looks like a MIG partition.
        // Primary signal: the device name carries a "MIG" marker
        // (NOTE(review): assumes the driver names MIG devices like
        // "<GPU name> MIG 1g.10gb" - confirm on the target driver).
        // Fallback heuristic: MIG instances have fewer SMs than the full GPU
        // (H100: 132 SMs full, A100: 108 SMs full).
        bool nameSaysMIG = strstr(prop.name, "MIG") != NULL;
        bool reducedSMs =
            (strstr(prop.name, "H100") || strstr(prop.name, "A100")) &&
            prop.multiProcessorCount < 100;

        if (nameSaysMIG) {
            printf("  >>> Appears to be a MIG partition (device name contains \"MIG\")\n");
        } else if (reducedSMs) {
            printf("  >>> Appears to be a MIG partition (reduced SM count)\n");
        }
    }

    // Run a simple kernel to verify that the device really executes code.
    printf("\nRunning verification kernel...\n");
    const int kSentinel = 0x4d4947;  // arbitrary non-zero marker value
    int* d_data;
    CHECK_CUDA(cudaMalloc(&d_data, sizeof(int)));
    CHECK_CUDA(cudaMemset(d_data, 0, sizeof(int)));

    verifyKernel<<<1, 1>>>(d_data, kSentinel);
    CHECK_CUDA(cudaGetLastError());  // launch-configuration / no-image errors

    // Blocking copy: also surfaces any error raised while the kernel ran.
    int h_data = 0;
    CHECK_CUDA(cudaMemcpy(&h_data, d_data, sizeof(int), cudaMemcpyDeviceToHost));
    CHECK_CUDA(cudaFree(d_data));

    if (h_data != kSentinel) {
        printf("Kernel verification FAILED (read 0x%x, expected 0x%x)\n",
               (unsigned int)h_data, (unsigned int)kSentinel);
        return 1;
    }
    printf("Kernel execution successful!\n");

    return 0;
}

In [None]:
# Compile detect_mig_instance.cu with -O3 for sm_80 and run it if the build succeeds
!nvcc -O3 -arch=sm_80 detect_mig_instance.cu -o detect_mig_instance && ./detect_mig_instance

## MIG Architecture Deep Dive

```
┌──────────────────────────────────────────────────────────────────┐
│                          H100 Full GPU                           │
├──────────────────────────────────────────────────────────────────┤
│  GPC 0  │  GPC 1  │  GPC 2  │  GPC 3  │  GPC 4  │  GPC 5  │ GPC 6│
│ (16 SM) │ (16 SM) │ (16 SM) │ (16 SM) │ (16 SM) │ (16 SM) │(20SM)│
├──────────────────────────────────────────────────────────────────┤
│                    L2 Cache (50 MB)                              │
├──────────────────────────────────────────────────────────────────┤
│  HBM3 Stack 0  │  Stack 1  │  Stack 2  │  Stack 3  │  Stack 4   │
│    (16 GB)     │  (16 GB)  │  (16 GB)  │  (16 GB)  │  (16 GB)   │
└──────────────────────────────────────────────────────────────────┘

MIG creates isolated "slices" with:
- Dedicated GPCs (Graphics Processing Clusters)
- Dedicated memory bandwidth
- Isolated L2 cache partition
- Separate error isolation domain
```

**Key isolation properties:**
1. **Memory isolation** - Each instance has private memory
2. **SM isolation** - Dedicated compute resources
3. **Bandwidth isolation** - QoS guaranteed memory BW
4. **Error isolation** - Faults don't affect other instances

## Benchmark on MIG Instance

In [None]:
%%writefile mig_benchmark.cu
#include <stdio.h>
#include <cuda_runtime.h>

// Evaluates a CUDA runtime call exactly once. On any result other than
// cudaSuccess it prints the error string together with the call site to
// stderr and terminates the process with exit code 1.
// The argument is parenthesized and the temporary uses a reserved-looking
// name so the macro cannot collide with a caller variable called `err`.
#define CHECK_CUDA(call) \
    do { \
        cudaError_t checkCudaErr_ = (call); \
        if (checkCudaErr_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(checkCudaErr_)); \
            exit(1); \
        } \
    } while(0)

// Compute-bound benchmark kernel.
//
// For each of the first n elements of `data`, applies `iterations` rounds of
//     val = sinf(val) * cosf(val) + 0.1f
// and writes the result back in place.
//
// Launch layout: 1D grid of 1D blocks, no shared memory. Any grid size is
// valid: each thread starts at its global index and advances by the total
// number of launched threads, so a grid smaller than n still covers all
// elements. Indices are computed in size_t so that
// blockIdx.x * blockDim.x cannot wrap or turn negative for n close to INT_MAX.
// A non-positive n processes nothing.
__global__ void computeKernel(float* data, int n, int iterations) {
    const size_t count = (n > 0) ? (size_t)n : 0;
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         idx < count;
         idx += stride) {
        float val = data[idx];
        for (int i = 0; i < iterations; i++) {
            val = sinf(val) * cosf(val) + 0.1f;
        }
        data[idx] = val;
    }
}

// Entry point of the MIG instance benchmark (runs on device 0).
//
// 1. Prints the device name, SM count and total memory.
// 2. Allocates half of the currently free device memory.
// 3. Times 10 launches of computeKernel (after one warm-up launch) and
//    reports total and per-SM throughput.
// 4. Times 10 cudaMemset calls over the whole buffer and reports the
//    resulting write bandwidth.
//
// Returns 0 on success, 1 if there is not enough free memory for a single
// element. Any CUDA API failure exits the process through CHECK_CUDA.
int main() {
    printf("=== MIG Instance Benchmark ===\n\n");

    cudaDeviceProp prop;
    CHECK_CUDA(cudaGetDeviceProperties(&prop, 0));

    printf("GPU: %s\n", prop.name);
    printf("SMs: %d\n", prop.multiProcessorCount);
    printf("Memory: %.2f GB\n\n", prop.totalGlobalMem / (1024.0*1024.0*1024.0));

    // Use available memory (scale to instance size)
    size_t freeMemory, totalMemory;
    CHECK_CUDA(cudaMemGetInfo(&freeMemory, &totalMemory));

    // Use 50% of available memory
    size_t dataSize = freeMemory / 2;

    // computeKernel takes an int element count. Half of a large GPU's memory
    // holds more floats than an int can represent, so the compute test is
    // capped at 2^30 elements (4 GiB); the bandwidth test below still uses
    // the whole buffer.
    const size_t kMaxKernelElems = (size_t)1 << 30;
    size_t totalElems = dataSize / sizeof(float);
    int n = (int)(totalElems < kMaxKernelElems ? totalElems : kMaxKernelElems);
    if (n == 0) {
        printf("Not enough free device memory to run the benchmark\n");
        return 1;
    }

    printf("Allocating %.2f GB (%.2f GB available)\n",
           dataSize / (1024.0*1024.0*1024.0),
           freeMemory / (1024.0*1024.0*1024.0));
    printf("Compute test covers %d elements (%.2f GB)\n",
           n, (double)n * sizeof(float) / (1024.0*1024.0*1024.0));

    float* d_data;
    CHECK_CUDA(cudaMalloc(&d_data, dataSize));
    CHECK_CUDA(cudaMemset(d_data, 0, dataSize));

    // Benchmark compute throughput
    int blockSize = 256;
    int numBlocks = (n + blockSize - 1) / blockSize;  // ceil-div; n <= 2^30 so no overflow
    int iterations = 100;

    cudaEvent_t start, stop;
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));

    // Warmup
    computeKernel<<<numBlocks, blockSize>>>(d_data, n, iterations);
    CHECK_CUDA(cudaGetLastError());       // launch errors are not returned by <<<>>>
    CHECK_CUDA(cudaDeviceSynchronize());  // execution errors surface here

    // Benchmark
    CHECK_CUDA(cudaEventRecord(start));
    for (int i = 0; i < 10; i++) {
        computeKernel<<<numBlocks, blockSize>>>(d_data, n, iterations);
    }
    CHECK_CUDA(cudaGetLastError());
    CHECK_CUDA(cudaEventRecord(stop));
    CHECK_CUDA(cudaEventSynchronize(stop));

    float ms;
    CHECK_CUDA(cudaEventElapsedTime(&ms, start, stop));

    // One "op" = one sinf*cosf+0.1f update of one element.
    double throughput = (double)n * 10 * iterations / (ms / 1000.0) / 1e9;
    printf("\nCompute: %.2f ms for 10 iterations\n", ms);
    printf("Throughput: %.2f billion ops/sec\n", throughput);
    printf("Per-SM throughput: %.2f billion ops/sec/SM\n",
           throughput / prop.multiProcessorCount);

    // Memory bandwidth test (device-side writes via cudaMemset)
    CHECK_CUDA(cudaEventRecord(start));
    for (int i = 0; i < 10; i++) {
        CHECK_CUDA(cudaMemset(d_data, i, dataSize));
    }
    CHECK_CUDA(cudaEventRecord(stop));
    CHECK_CUDA(cudaEventSynchronize(stop));

    CHECK_CUDA(cudaEventElapsedTime(&ms, start, stop));
    double bw = (double)dataSize * 10 / (ms / 1000.0) / 1e9;
    printf("\nMemory Bandwidth: %.2f GB/s\n", bw);

    // Release every resource created above.
    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));
    CHECK_CUDA(cudaFree(d_data));

    return 0;
}

In [None]:
# Compile mig_benchmark.cu with -O3 for sm_80 and run it if the build succeeds
!nvcc -O3 -arch=sm_80 mig_benchmark.cu -o mig_benchmark && ./mig_benchmark

## Key Takeaways

1. **MIG provides hardware isolation** - Not just time-slicing
2. **Available on datacenter GPUs** - A30, A100, H100, H200, B100, B200
3. **Check CUDA_VISIBLE_DEVICES** - MIG instances have MIG-UUID format
4. **SM count indicates partition size** - Fewer SMs = smaller MIG instance
5. **Memory is proportionally allocated** - 1g.10gb gets ~10GB