## Part 1: CUDA C++ (Primary)

### GPU Check

In [None]:
!nvcc --version
!nvidia-smi --query-gpu=name,memory.total --format=csv

### What are Bank Conflicts?

Shared memory is divided into **32 banks**, each 4 bytes wide (the bank count matches the 32 threads of a warp). When multiple threads in a warp access *different addresses* that fall in the same bank, those accesses are **serialized** - this is a bank conflict.

```
SHARED MEMORY BANKS (32 banks, 4-byte words)
┌──────┬──────┬──────┬──────┬─────┬──────┐
│Bank 0│Bank 1│Bank 2│Bank 3│ ... │Bank31│
├──────┼──────┼──────┼──────┼─────┼──────┤
│  [0] │  [1] │  [2] │  [3] │ ... │ [31] │  ← Row 0
│ [32] │ [33] │ [34] │ [35] │ ... │ [63] │  ← Row 1
│ [64] │ [65] │ [66] │ [67] │ ... │ [95] │  ← Row 2
│  ... │  ... │  ... │  ... │ ... │  ... │
└──────┴──────┴──────┴──────┴─────┴──────┘

4-byte word i (byte address / 4) is in Bank (i % 32)
```

**No conflict:** Each thread accesses a different bank  
**2-way conflict:** 2 threads access same bank → 2x slower  
**32-way conflict:** All threads hit same bank → 32x slower!

In [None]:
%%writefile bank_conflicts.cu
/**
 * Bank Conflicts Demonstration
 * 
 * Shows the performance impact of different access patterns.
 */

#include <stdio.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 32
#define N (1 << 20)  // 1M elements
#define ITERATIONS 1000

/**
 * Baseline kernel: stride-1 shared-memory access (no bank conflict).
 *
 * Every thread writes its own thread index into the shared-memory word with
 * the same index, waits at a block-wide barrier, then copies that word to
 * global memory.
 *
 * Launch layout: 1D grid of 1D blocks. There is no bounds check, so `out`
 * must hold at least gridDim.x * blockDim.x floats and blockDim.x must not
 * exceed BLOCK_SIZE * BLOCK_SIZE.
 */
__global__ void noConflict(float *out) {
    __shared__ float smem[BLOCK_SIZE * BLOCK_SIZE];

    const int t = threadIdx.x;
    const int slot = t;  // consecutive threads -> consecutive words -> distinct banks

    smem[slot] = (float)t;
    __syncthreads();

    // Copy the value back out to this thread's global element.
    const int globalIdx = blockIdx.x * blockDim.x + t;
    out[globalIdx] = smem[slot];
}

/**
 * Stride-2 shared-memory access (2-way bank conflict).
 *
 * Thread t touches word 2*t, so within a 32-thread warp threads t and t+16
 * land in the same bank (e.g. threads 0 and 16 both hit bank 0, threads 1
 * and 17 both hit bank 2).
 *
 * Launch layout: 1D grid of 1D blocks. There is no bounds check, so `out`
 * must hold at least gridDim.x * blockDim.x floats and 2 * blockDim.x must
 * not exceed BLOCK_SIZE * BLOCK_SIZE.
 */
__global__ void twoWayConflict(float *out) {
    __shared__ float smem[BLOCK_SIZE * BLOCK_SIZE];

    const int t = threadIdx.x;
    const int slot = t * 2;  // only even words are used -> two threads per bank

    smem[slot] = (float)t;
    __syncthreads();

    const int globalIdx = blockIdx.x * blockDim.x + t;
    out[globalIdx] = smem[slot];
}

/**
 * Stride-32 shared-memory access (32-way bank conflict).
 *
 * Thread t touches word 32*t. Because the bank of a word is (word % 32),
 * every thread of a warp lands in bank 0 and the accesses are serialized.
 *
 * The index wraps on the actual array size (BLOCK_SIZE * BLOCK_SIZE) instead
 * of a hard-coded 1024, so the access stays in bounds if BLOCK_SIZE is ever
 * changed. With BLOCK_SIZE == 32 the indices are identical to the previous
 * `tid * 32 % 1024`.
 *
 * Launch layout: 1D grid of 1D blocks. There is no bounds check on `out`,
 * which must hold at least gridDim.x * blockDim.x floats.
 */
__global__ void fullConflict(float *out) {
    // Number of 4-byte shared-memory banks; a stride equal to this value
    // maps every thread to the same bank.
    const int kBankStride = 32;
    const int kSmemWords = BLOCK_SIZE * BLOCK_SIZE;

    __shared__ float smem[BLOCK_SIZE * BLOCK_SIZE];

    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + tid;

    // Stride-32: ALL threads of the warp hit bank 0!
    const int slot = (tid * kBankStride) % kSmemWords;

    smem[slot] = (float)tid;
    __syncthreads();

    out[idx] = smem[slot];
}

/**
 * Measure the average execution time of a kernel.
 *
 * Launches `kernel` once as a warmup, then ITERATIONS more times between two
 * CUDA events, using n / BLOCK_SIZE blocks of BLOCK_SIZE threads.
 *
 * @param kernel  Kernel taking a single output pointer.
 * @param d_out   Device buffer passed to the kernel.
 * @param n       Number of output elements; only whole blocks are launched.
 * @return Average milliseconds per launch, or -1.0f if any CUDA call or
 *         kernel launch fails (the error is printed to stderr).
 */
float benchmark(void (*kernel)(float*), float *d_out, int n) {
    // Truncating division: partial blocks are never launched, so a kernel
    // without its own bounds check cannot write past element n.
    const int blocks = n / BLOCK_SIZE;
    if (kernel == NULL || d_out == NULL || blocks <= 0) {
        fprintf(stderr, "benchmark: invalid arguments (n=%d)\n", n);
        return -1.0f;
    }

    cudaEvent_t start = NULL;
    cudaEvent_t stop = NULL;
    float ms = 0.0f;  // stays defined even if the elapsed-time query fails

    cudaError_t err = cudaEventCreate(&start);
    if (err == cudaSuccess) err = cudaEventCreate(&stop);

    // Warmup
    if (err == cudaSuccess) {
        kernel<<<blocks, BLOCK_SIZE>>>(d_out);
        err = cudaGetLastError();  // launch-configuration errors show up here
    }
    if (err == cudaSuccess) err = cudaDeviceSynchronize();

    // Timed launches
    if (err == cudaSuccess) err = cudaEventRecord(start);
    if (err == cudaSuccess) {
        for (int i = 0; i < ITERATIONS; i++) {
            kernel<<<blocks, BLOCK_SIZE>>>(d_out);
        }
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) err = cudaEventRecord(stop);
    if (err == cudaSuccess) err = cudaEventSynchronize(stop);
    if (err == cudaSuccess) err = cudaEventElapsedTime(&ms, start, stop);

    // Release the events on every path.
    if (start != NULL) cudaEventDestroy(start);
    if (stop != NULL) cudaEventDestroy(stop);

    if (err != cudaSuccess) {
        fprintf(stderr, "benchmark: CUDA error: %s\n", cudaGetErrorString(err));
        return -1.0f;
    }

    return ms / ITERATIONS;
}

/**
 * Entry point: benchmarks the three access patterns and prints each average
 * time plus the slowdown relative to the conflict-free kernel.
 *
 * Returns 0 on success and 1 if the device buffer cannot be allocated, a
 * CUDA error is pending after the benchmarks, or a measured time is not
 * positive.
 */
int main() {
    printf("=== Bank Conflict Benchmark ===\n\n");

    float *d_out = NULL;
    cudaError_t err = cudaMalloc(&d_out, N * sizeof(float));
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    float no_conflict_time = benchmark(noConflict, d_out, N);
    float two_way_time = benchmark(twoWayConflict, d_out, N);
    float full_conflict_time = benchmark(fullConflict, d_out, N);

    // Surface any launch or execution error left pending by the runs above.
    err = cudaDeviceSynchronize();
    if (err == cudaSuccess) err = cudaGetLastError();

    // A non-positive (or NaN) time is not a valid measurement and would make
    // the "x slower" ratios meaningless.
    const bool times_ok = (no_conflict_time > 0.0f) &&
                          (two_way_time > 0.0f) &&
                          (full_conflict_time > 0.0f);

    if (err != cudaSuccess || !times_ok) {
        if (err != cudaSuccess) {
            fprintf(stderr, "CUDA error during benchmark: %s\n",
                    cudaGetErrorString(err));
        } else {
            fprintf(stderr, "Benchmark produced an invalid timing\n");
        }
        cudaFree(d_out);
        return 1;
    }

    printf("✅ No Conflict (stride=1):    %.4f ms\n", no_conflict_time);
    printf("⚠️  2-Way Conflict (stride=2): %.4f ms (%.1fx slower)\n", 
           two_way_time, two_way_time / no_conflict_time);
    printf("❌ 32-Way Conflict (stride=32): %.4f ms (%.1fx slower)\n", 
           full_conflict_time, full_conflict_time / no_conflict_time);

    cudaFree(d_out);

    return 0;
}

In [None]:
!nvcc -arch=sm_75 -O3 bank_conflicts.cu -o bank_conflicts && ./bank_conflicts

### Padding to Avoid Bank Conflicts

A common technique: add padding to change the bank mapping.

```cpp
// Problem: 32x32 array, column access has 32-way conflict
__shared__ float smem[32][32];
smem[threadIdx.x][0];  // All threads hit bank 0!

// Solution: Add 1 column of padding
__shared__ float smem[32][33];  // 33 columns!
smem[threadIdx.x][0];  // Now threads hit different banks
```

**Visual: How +1 Padding Fixes Bank Conflicts**

```
WITHOUT PADDING: __shared__ float tile[32][32];
─────────────────────────────────────────────────
Memory layout (linear): tile[row][col] → index = row * 32 + col

Column 0 access: tile[0][0], tile[1][0], tile[2][0], ...
                    │           │           │
                    ↓           ↓           ↓
Index:             0          32          64         ...
Bank (idx % 32):   0           0           0         ...  ← ALL BANK 0! ❌

Thread 0 → Bank 0
Thread 1 → Bank 0  ← 32-way conflict!
Thread 2 → Bank 0
   ...

WITH PADDING: __shared__ float tile[32][33];
─────────────────────────────────────────────────
Memory layout (linear): tile[row][col] → index = row * 33 + col

Column 0 access: tile[0][0], tile[1][0], tile[2][0], ...
                    │           │           │
                    ↓           ↓           ↓
Index:             0          33          66         ...
Bank (idx % 32):   0           1           2         ...  ← DIFFERENT BANKS! ✅

Thread 0 → Bank 0
Thread 1 → Bank 1  ← No conflict!
Thread 2 → Bank 2
   ...
```

**Why it works:** Adding 1 to row width (32 → 33) shifts each row by 1 bank relative to the previous row.

In [None]:
%%writefile transpose_no_conflicts.cu
/**
 * Matrix Transpose WITHOUT Bank Conflicts
 * 
 * Uses padding to eliminate bank conflicts during shared memory access.
 */

#include <stdio.h>
#include <cuda_runtime.h>

#define TILE_DIM 32
#define WIDTH 4096
#define HEIGHT 4096
#define ITERATIONS 100

/**
 * Tiled transpose through an unpadded shared-memory tile (bank conflicts).
 *
 * `in` is read as a row-major matrix with `height` rows and `width` columns;
 * `out` is written as a row-major matrix with `width` rows and `height`
 * columns.
 *
 * Launch layout: 2D grid of TILE_DIM x TILE_DIM thread blocks, one block per
 * tile. Shared memory: TILE_DIM * TILE_DIM floats.
 */
__global__ void transposeWithConflicts(float *out, float *in, int width, int height) {
    __shared__ float tile[TILE_DIM][TILE_DIM];  // row pitch of 32 words: a column sits in one bank

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Load phase: each thread copies one input element into the tile.
    const int srcCol = blockIdx.x * TILE_DIM + tx;
    const int srcRow = blockIdx.y * TILE_DIM + ty;
    if (srcCol < width && srcRow < height) {
        tile[ty][tx] = in[srcRow * width + srcCol];
    }

    __syncthreads();

    // Store phase: block indices are swapped so the tile lands at its
    // transposed position in the output.
    const int dstCol = blockIdx.y * TILE_DIM + tx;
    const int dstRow = blockIdx.x * TILE_DIM + ty;
    if (dstCol < height && dstRow < width) {
        // threadIdx.x selects the tile ROW here, so the threads of a warp
        // read down a column: 32-way bank conflict.
        out[dstRow * height + dstCol] = tile[tx][ty];
    }
}

/**
 * Tiled transpose through a padded shared-memory tile (no bank conflicts).
 *
 * Same algorithm as the unpadded variant; the only difference is one extra
 * column in the tile, which makes the row pitch 33 words so that consecutive
 * rows of a column fall into consecutive banks.
 *
 * `in` is read as a row-major matrix with `height` rows and `width` columns;
 * `out` is written as a row-major matrix with `width` rows and `height`
 * columns.
 *
 * Launch layout: 2D grid of TILE_DIM x TILE_DIM thread blocks, one block per
 * tile. Shared memory: TILE_DIM * (TILE_DIM + 1) floats.
 */
__global__ void transposeNoBankConflicts(float *out, float *in, int width, int height) {
    __shared__ float tile[TILE_DIM][TILE_DIM + 1];  // the extra column is padding only

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Load phase: each thread copies one input element into the tile.
    const int srcCol = blockIdx.x * TILE_DIM + tx;
    const int srcRow = blockIdx.y * TILE_DIM + ty;
    if (srcCol < width && srcRow < height) {
        tile[ty][tx] = in[srcRow * width + srcCol];
    }

    __syncthreads();

    // Store phase: block indices are swapped so the tile lands at its
    // transposed position in the output.
    const int dstCol = blockIdx.y * TILE_DIM + tx;
    const int dstRow = blockIdx.x * TILE_DIM + ty;
    if (dstCol < height && dstRow < width) {
        // Column read again, but the 33-word pitch spreads it over all banks.
        out[dstRow * height + dstCol] = tile[tx][ty];
    }
}

/** Signature shared by both transpose kernels in this file. */
typedef void (*TransposeKernelFn)(float *, float *, int, int);

/**
 * Time one transpose kernel: a warmup launch followed by ITERATIONS timed
 * launches between the two supplied events.
 *
 * @param kernel   Kernel to launch with (d_out, d_in, WIDTH, HEIGHT).
 * @param grid     Grid dimensions for every launch.
 * @param block    Block dimensions for every launch.
 * @param d_out    Device output buffer.
 * @param d_in     Device input buffer.
 * @param start    Already-created event recorded before the timed launches.
 * @param stop     Already-created event recorded after the timed launches.
 * @param avg_ms   Receives the average milliseconds per launch (0 on error).
 * @return cudaSuccess, or the first CUDA error encountered.
 */
static cudaError_t timeTransposeKernel(TransposeKernelFn kernel, dim3 grid, dim3 block,
                                       float *d_out, float *d_in,
                                       cudaEvent_t start, cudaEvent_t stop,
                                       float *avg_ms) {
    *avg_ms = 0.0f;

    // Warmup launch, checked so that a bad configuration is reported early.
    kernel<<<grid, block>>>(d_out, d_in, WIDTH, HEIGHT);
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess) err = cudaDeviceSynchronize();

    if (err == cudaSuccess) err = cudaEventRecord(start);
    if (err == cudaSuccess) {
        for (int i = 0; i < ITERATIONS; i++) {
            kernel<<<grid, block>>>(d_out, d_in, WIDTH, HEIGHT);
        }
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) err = cudaEventRecord(stop);
    if (err == cudaSuccess) err = cudaEventSynchronize(stop);

    float total_ms = 0.0f;
    if (err == cudaSuccess) err = cudaEventElapsedTime(&total_ms, start, stop);
    if (err == cudaSuccess) *avg_ms = total_ms / ITERATIONS;

    return err;
}

/**
 * Entry point: times the conflicting and the padded transpose on a
 * WIDTH x HEIGHT matrix and prints time, effective bandwidth and speedup.
 *
 * Returns 0 on success, 1 if any CUDA call or kernel launch fails. Device
 * buffers and events are released on every path.
 */
int main() {
    printf("=== Bank Conflict-Free Transpose ===\n");
    printf("Matrix: %d x %d\n\n", WIDTH, HEIGHT);

    const size_t size = (size_t)WIDTH * HEIGHT * sizeof(float);

    float *d_in = NULL;
    float *d_out = NULL;
    cudaEvent_t start = NULL;
    cudaEvent_t stop = NULL;
    float with_time = 0.0f;
    float without_time = 0.0f;

    cudaError_t err = cudaMalloc(&d_in, size);
    if (err == cudaSuccess) err = cudaMalloc(&d_out, size);
    // Give the input defined contents; the values do not affect the timing.
    if (err == cudaSuccess) err = cudaMemset(d_in, 0, size);
    if (err == cudaSuccess) err = cudaEventCreate(&start);
    if (err == cudaSuccess) err = cudaEventCreate(&stop);

    dim3 block(TILE_DIM, TILE_DIM);
    dim3 grid((WIDTH + TILE_DIM - 1) / TILE_DIM, (HEIGHT + TILE_DIM - 1) / TILE_DIM);

    // With conflicts
    if (err == cudaSuccess) {
        err = timeTransposeKernel(transposeWithConflicts, grid, block,
                                  d_out, d_in, start, stop, &with_time);
    }

    // Without conflicts
    if (err == cudaSuccess) {
        err = timeTransposeKernel(transposeNoBankConflicts, grid, block,
                                  d_out, d_in, start, stop, &without_time);
    }

    if (err == cudaSuccess) {
        // One read plus one write of the matrix; bytes / (ms * 1e6) == GB/s.
        float with_bw = 2.0f * size / (with_time * 1e6);
        float without_bw = 2.0f * size / (without_time * 1e6);

        printf("⚠️  With Bank Conflicts:    %.3f ms, %.2f GB/s\n", with_time, with_bw);
        printf("✅ No Bank Conflicts (+1):  %.3f ms, %.2f GB/s\n", without_time, without_bw);
        printf("\nSpeedup from removing conflicts: %.2fx\n", with_time / without_time);
    } else {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }

    // Cleanup; the events were previously never destroyed.
    if (start != NULL) cudaEventDestroy(start);
    if (stop != NULL) cudaEventDestroy(stop);
    cudaFree(d_in);   // cudaFree(NULL) is a no-op
    cudaFree(d_out);

    return (err == cudaSuccess) ? 0 : 1;
}

In [None]:
!nvcc -arch=sm_75 -O3 transpose_no_conflicts.cu -o transpose_no_conflicts && ./transpose_no_conflicts

### Bank Conflict Rules

| Access Pattern | Conflict Type | Solution |
|---------------|---------------|----------|
| `smem[tid]` | None | ✅ Good |
| `smem[tid * 2]` | 2-way | Rearrange or pad |
| `smem[tid * 32]` | 32-way | Add +1 padding |
| `smem[const]` | Broadcast | ✅ No conflict (special case) |
| Column access `smem[tid][0]` on `float smem[32][32]` | 32-way | Declare as `smem[32][33]` (+1 padding column) |

**Exception:** When threads in a warp read the SAME address (all 32 of them or any subset), the word is **broadcast** to them - no conflict.

### Visualizing Bank Access

In [None]:
%%writefile visualize_banks.cu
/**
 * Visualize which bank each thread accesses
 */

#include <stdio.h>

/**
 * Print which shared-memory bank each thread of a 32-thread warp touches
 * when thread `tid` accesses word index `tid * stride`, followed by the
 * resulting conflict level.
 *
 * The conflict level is the largest number of DISTINCT word addresses that
 * map to one bank. Threads reading the same word are served by a broadcast
 * and do not conflict, so stride 0 reports 1-way rather than 32-way.
 *
 * @param name    Label printed in the heading ("(unnamed)" if NULL).
 * @param stride  Distance in 4-byte words between consecutive threads.
 *                Negative values are accepted; the bank is normalised into
 *                the range 0..31.
 */
void showBankAccess(const char* name, int stride) {
    enum { kWarpSize = 32, kNumBanks = 32, kRowsShown = 8 };

    printf("\n%s (stride=%d):\n", name ? name : "(unnamed)", stride);
    printf("Thread → Address → Bank\n");
    printf("------------------------\n");

    int addr_of[kWarpSize];
    int distinct_in_bank[kNumBanks] = {0};

    for (int tid = 0; tid < kWarpSize; tid++) {
        int addr = tid * stride;
        // C's % keeps the sign of the dividend; fold negatives back into
        // 0..31 so the bank is always a valid array index.
        int bank = ((addr % kNumBanks) + kNumBanks) % kNumBanks;
        addr_of[tid] = addr;

        // Count an address only the first time it appears: repeated reads of
        // the same word are a broadcast, not a conflict.
        int already_seen = 0;
        for (int prev = 0; prev < tid && !already_seen; prev++) {
            already_seen = (addr_of[prev] == addr);
        }
        if (!already_seen) {
            distinct_in_bank[bank]++;
        }

        if (tid < kRowsShown) {  // Show first 8
            printf("  T%2d  →  [%4d]  → Bank %2d\n", tid, addr, bank);
        }
    }
    printf("  ...\n");

    // The busiest bank determines how many serialized passes are needed.
    int max_conflict = 0;
    for (int b = 0; b < kNumBanks; b++) {
        if (distinct_in_bank[b] > max_conflict) max_conflict = distinct_in_bank[b];
    }

    printf("\nConflict level: %d-way\n", max_conflict);
}

/**
 * Entry point: prints the bank mapping for four representative strides
 * (1, 2, 32 and the padded 33).
 */
int main() {
    // Label / stride pairs, shown in this order.
    static const struct {
        const char* label;
        int stride;
    } kCases[] = {
        {"✅ No conflict", 1},
        {"⚠️  2-way conflict", 2},
        {"❌ 32-way conflict", 32},
        {"✅ With +1 padding", 33},  // 32+1
    };
    const unsigned kCaseCount = sizeof(kCases) / sizeof(kCases[0]);

    printf("=== Bank Access Visualization ===\n");

    for (unsigned i = 0; i < kCaseCount; i++) {
        showBankAccess(kCases[i].label, kCases[i].stride);
    }

    return 0;
}

In [None]:
!nvcc -arch=sm_75 visualize_banks.cu -o visualize_banks && ./visualize_banks

### Bank Conflicts Summary

**Key Points:**
1. Shared memory has 32 banks (bank = 4-byte word index % 32)
2. Bank conflicts serialize accesses within a warp
3. Stride-32 access = maximum conflict
4. Solution: Add +1 padding to 2D arrays

**Optimization checklist:**
- [ ] Check for power-of-2 strides
- [ ] Add padding to 2D shared arrays: `[N][N+1]`
- [ ] Consider access patterns when designing kernels

---

## Part 2: Python/Numba (Optional)

In [None]:
# Setup: install Numba when running on Colab, then report the active GPU.
import subprocess, sys
try:
    import google.colab
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "numba"])
except ImportError:
    # Not on Colab: assume Numba is already installed in this environment.
    pass

from numba import cuda
import numpy as np

# Device.name is `bytes` with Numba's ctypes driver binding but already `str`
# with the NVIDIA cuda-python binding, where calling .decode() would raise
# AttributeError. Accept both.
device_name = cuda.get_current_device().name
if isinstance(device_name, bytes):
    device_name = device_name.decode()

print(f"CUDA Device: {device_name}")

In [None]:
# Bank conflict demonstration in Numba
# Note: Numba uses the same underlying shared memory, same rules apply

@cuda.jit
def with_conflicts(out):
    """Column access on an unpadded 32x32 shared tile (32-way bank conflict).

    Each thread stores its thread index into column 0 of its own row, waits
    at a block-wide barrier, then copies that element to
    ``out[blockIdx.x * 32 + threadIdx.x]``. There is no bounds check on
    ``out``.
    """
    t = cuda.threadIdx.x
    tile = cuda.shared.array(shape=(32, 32), dtype=np.float32)

    # Rows are 32 words apart, so every thread lands in bank 0.
    tile[t, 0] = float(t)
    cuda.syncthreads()

    out_index = cuda.blockIdx.x * 32 + t
    out[out_index] = tile[t, 0]

@cuda.jit
def no_conflicts(out):
    """Column access on a padded 32x33 shared tile (no bank conflict).

    Same access as the unpadded kernel, but the extra column makes rows 33
    words apart, so consecutive rows of column 0 fall into consecutive banks.
    Writes to ``out[blockIdx.x * 32 + threadIdx.x]`` with no bounds check.
    """
    t = cuda.threadIdx.x
    tile = cuda.shared.array(shape=(32, 33), dtype=np.float32)  # +1 padding column

    # Thread t now touches word 33 * t, i.e. bank t.
    tile[t, 0] = float(t)
    cuda.syncthreads()

    out_index = cuda.blockIdx.x * 32 + t
    out[out_index] = tile[t, 0]

# Recap: the +1 padding trick carries over to Numba unchanged.
summary_lines = [
    "Bank conflict avoidance in Numba:",
    "  With conflicts:    smem = cuda.shared.array((32, 32), ...)",
    "  No conflicts:      smem = cuda.shared.array((32, 33), ...)",
    "\nSame +1 padding technique works!",
]
for summary_line in summary_lines:
    print(summary_line)