# CPU Kernels 2D Basics - Understanding Image Processing Operations
This notebook demonstrates how image operations work pixel by pixel.

Each operation inside the nested loops is a "kernel" - the core computation that will later be parallelised on the GPU.

In 2D, we have rows (y-axis) and columns (x-axis).

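On GPU, within a single thread block: **col = threadIdx.x**, **row = threadIdx.y**. When the image spans several blocks, the global indices are **col = blockIdx.x * blockDim.x + threadIdx.x** and **row = blockIdx.y * blockDim.y + threadIdx.y**.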

In [None]:
import numpy as np
import matplotlib.pyplot as plt

## Create Test Image

In [None]:
# create a small test image with a gradient pattern
rows, cols = 6, 8
test_image = np.zeros((rows, cols), dtype=np.float32)

# fill with gradient values: the tens digit encodes the row and the units
# digit the column, so a pixel's position can be read off its value
for r in range(rows):
    for c in range(cols):
        test_image[r, c] = r * 10 + c

print(f"Test image ({rows}×{cols}):")
print(test_image)

# display on a fixed 0-255 grey scale (8-bit range) rather than auto-scaling
fig, ax = plt.subplots()
im = ax.matshow(test_image, cmap='gray', vmin=0, vmax=255)
ax.set_title('Test image (gradient)')
plt.colorbar(im, ax=ax, fraction=0.046)
# end on plt.show() so the cell does not echo the Colorbar object's repr
plt.show()

## Pixel-wise Operations
### Method 1: Using NumPy
NumPy hides the nested loops.

In [None]:
# numpy applies the same arithmetic to every pixel automatically (vectorised)
numpy_result = test_image * 2 + 5
print("Result (NumPy):")
print(numpy_result)

# display on a fixed 0-255 grey scale (8-bit range) rather than auto-scaling
fig, ax = plt.subplots()
im = ax.matshow(numpy_result, cmap='gray', vmin=0, vmax=255)
ax.set_title('NumPy result: pixel × 2 + 5')
plt.colorbar(im, ax=ax, fraction=0.046)
# end on plt.show() so the cell does not echo the Colorbar object's repr
plt.show()

### Method 2: Explicit nested loops
This demonstrates the concept of a 2D kernel.

In [None]:
def kernel_transform_2d(input_array, scale, offset):
    """Apply ``pixel * scale + offset`` to every pixel of a 2D array.

    Args:
        input_array: 2D array of pixel values.
        scale: multiplier applied to each pixel.
        offset: value added to each pixel after scaling.

    Returns:
        A new float32 array with the same shape as ``input_array``.
    """
    def transform_pixel(value):
        # the kernel: the work one gpu thread would do for its own pixel
        return value * scale + offset

    height, width = input_array.shape
    result = np.zeros((height, width), dtype=np.float32)

    # on gpu these two loops disappear: every (y, x) pair becomes one thread
    for y in range(height):  # gpu: threadIdx.y + blockIdx.y * blockDim.y
        for x in range(width):  # gpu: threadIdx.x + blockIdx.x * blockDim.x
            result[y, x] = transform_pixel(input_array[y, x])

    return result

In [None]:
loop_result = kernel_transform_2d(test_image, scale=2, offset=5)
print("Result (explicit loops):")
print(loop_result)

# verify both methods give same result; the assertion makes a mismatch stop
# the notebook instead of relying on someone reading the printed flag
np.testing.assert_allclose(loop_result, numpy_result)
print(f"\nResults match: {np.allclose(numpy_result, loop_result)}")

## Different Image Operations
Brightness adjustment, contrast, and thresholding.

In [None]:
def kernel_brightness_2d(input_array, brightness):
    """Shift every pixel by ``brightness`` and clamp the result to [0, 255].

    Args:
        input_array: 2D array of pixel values.
        brightness: amount added to each pixel (negative values darken).

    Returns:
        A new float32 array with the same shape as ``input_array``.
    """
    height, width = input_array.shape
    result = np.zeros((height, width), dtype=np.float32)

    for y in range(height):
        for x in range(width):
            # kernel: shift this pixel, then clamp into the valid range [0, 255]
            shifted = input_array[y, x] + brightness
            result[y, x] = 0 if shifted < 0 else (255 if shifted > 255 else shifted)

    return result

def kernel_contrast_2d(input_array, factor):
    """Adjust contrast by scaling around middle value, with clamping.

    Each pixel's distance from the mid-grey value 128 is multiplied by
    ``factor`` (``factor > 1`` increases contrast, ``factor < 1`` reduces it).
    The result is clamped to [0, 255], the same range kernel_brightness_2d
    clamps to, so the output never leaves the 8-bit range this function
    assumes.

    Args:
        input_array: 2D array of pixel values.
        factor: contrast multiplier.

    Returns:
        A new float32 array with the same shape as ``input_array``.
    """
    num_rows, num_cols = input_array.shape
    output_array = np.zeros((num_rows, num_cols), dtype=np.float32)
    middle = 128.0  # assuming 8-bit image range

    for row in range(num_rows):
        for col in range(num_cols):
            # kernel: contrast adjustment
            new_value = (input_array[row, col] - middle) * factor + middle
            # clamp to valid range [0, 255]; without this a dark pixel with
            # factor > 1 would go negative (e.g. 0 -> -64 at factor 1.5)
            if new_value < 0:
                new_value = 0
            elif new_value > 255:
                new_value = 255
            output_array[row, col] = new_value

    return output_array

def kernel_threshold_2d(input_array, threshold):
    """Binarise an image: 255 where a pixel exceeds ``threshold``, else 0.

    Args:
        input_array: 2D array of pixel values.
        threshold: cut-off; only pixels strictly greater than it become 255.

    Returns:
        A new float32 array with the same shape as ``input_array``.
    """
    height, width = input_array.shape
    result = np.zeros((height, width), dtype=np.float32)

    for y in range(height):
        for x in range(width):
            # kernel: each thread compares its own pixel against the threshold
            result[y, x] = 255.0 if input_array[y, x] > threshold else 0.0

    return result

In [None]:
# create a larger test image for better visualisation;
# the fixed seed makes the random image (and the figure) reproducible
rng = np.random.default_rng(42)
larger_image = rng.integers(low=0, high=100, size=(8, 10)).astype(np.float32)

# apply different kernels
brightened = kernel_brightness_2d(larger_image, 50)
contrasted = kernel_contrast_2d(larger_image, 1.5)
thresholded = kernel_threshold_2d(larger_image, 50)

# visualise results: one panel per image, all on the same 0-255 grey scale
panels = [
    ('Original', larger_image),
    ('Brightness +50', brightened),
    ('Contrast ×1.5', contrasted),
    ('Threshold > 50', thresholded),
]

fig, ax = plt.subplots(2, 2, figsize=(10, 8))

# ax.flat walks the 2×2 grid row by row, matching the order of `panels`
for panel_ax, (panel_title, panel_image) in zip(ax.flat, panels):
    im = panel_ax.matshow(panel_image, cmap='gray', vmin=0, vmax=255)
    panel_ax.set_title(panel_title)
    panel_ax.axis('off')
    plt.colorbar(im, ax=panel_ax, fraction=0.046)

plt.tight_layout()
plt.show()

## Independence of Pixel Operations
This demonstrates that each pixel is processed independently of every other pixel, i.e. the problem is _embarrassingly parallel_.

In [None]:
def demonstrate_pixel_independence(input_array):
    """
    Show that each pixel is processed independently.
    In GPU terms, each thread only needs its own pixel value.

    Only the top-left corner of ``input_array`` (at most 3×3 pixels) is
    processed; each pixel is transformed as ``value * 2 + 10`` and a line is
    printed per simulated thread.

    Args:
        input_array: 2D array of pixel values; may be smaller than 3×3.

    Returns:
        float32 array holding the transformed corner region.
    """
    # work with a small (at most 3×3) region for clarity
    small_region = input_array[:3, :3].copy()
    # take the loop bounds from the slice itself: an input smaller than 3×3
    # yields a smaller region, and fixed bounds of 3 would index past its edge
    demo_rows, demo_cols = small_region.shape
    output_region = np.zeros((demo_rows, demo_cols), dtype=np.float32)

    print("Processing each pixel independently:")
    print(f"Input region:\n{small_region.astype(int)}\n")

    # simulate parallel processing - each "thread" handles one pixel
    thread_id = 0
    for row in range(demo_rows):
        for col in range(demo_cols):
            # each thread only accesses its assigned pixel
            input_value = small_region[row, col]
            output_value = input_value * 2 + 10
            output_region[row, col] = output_value

            print(f"Thread {thread_id} at ({row},{col}): "
                  f"{input_value:.0f} -> {output_value:.0f}")
            thread_id += 1

    print(f"\nOutput region:\n{output_region.astype(int)}")
    # thread_id now equals the number of pixels processed (9 for a full 3×3)
    print(f"\nNote: All {thread_id} threads could run simultaneously on GPU!")

    return output_region

In [None]:
# the function prints its own report; the trailing semicolon stops the
# notebook from echoing the returned array a second time
demonstrate_pixel_independence(test_image);

## Visualise Thread Assignment in 2D
How a GPU would assign threads to pixels.

In [None]:
def visualise_2d_thread_blocks(image_shape=(16, 20), block_shape=(4, 4)):
    """Draw an image grid coloured by the thread block covering each pixel.

    Every pixel is drawn as a unit square labelled with its block-local
    thread index "y,x", and every block gets a "Block(row,col)" label near
    its centre. Blocks are cut off at the image boundary, so only in-bounds
    threads are drawn. A summary is printed afterwards; its "Total threads"
    figure is blocks × threads-per-block, so it also counts the threads of
    partial blocks that fall outside the image.

    Args:
        image_shape: (rows, cols) of the image.
        block_shape: (rows, cols) of a single thread block.
    """
    img_rows, img_cols = image_shape
    blk_rows, blk_cols = block_shape

    # round up so partially filled blocks at the edges are still counted
    grid_rows = (img_rows + blk_rows - 1) // blk_rows
    grid_cols = (img_cols + blk_cols - 1) // blk_cols
    num_blocks = grid_rows * grid_cols
    threads_per_block = blk_rows * blk_cols

    # one colour per block, sampled evenly from the tab20 colour map
    palette = plt.cm.tab20(np.linspace(0, 1, num_blocks))

    fig, ax = plt.subplots(figsize=(10, 8))

    centred = {'ha': 'center', 'va': 'center'}
    label_box = {'boxstyle': 'round,pad=0.3', 'facecolor': 'white', 'alpha': 0.8}

    for block_row in range(grid_rows):
        for block_col in range(grid_cols):
            # blocks are numbered row-major across the grid
            block_id = block_row * grid_cols + block_col

            # pixel range covered by this block, clipped to the image
            row_lo = block_row * blk_rows
            row_hi = min(row_lo + blk_rows, img_rows)
            col_lo = block_col * blk_cols
            col_hi = min(col_lo + blk_cols, img_cols)

            for r in range(row_lo, row_hi):
                for c in range(col_lo, col_hi):
                    # the plot's y axis points up, so flip to put row 0 on top
                    square = plt.Rectangle(
                        (c, img_rows - r - 1), 1, 1,
                        facecolor=palette[block_id],
                        edgecolor='black',
                        linewidth=0.5,
                    )
                    ax.add_patch(square)

                    # thread index relative to the block's top-left pixel
                    ax.text(c + 0.5, img_rows - r - 0.5,
                            f"{r - row_lo},{c - col_lo}",
                            fontsize=8, **centred)

            # block label, nudged up slightly from the block's centre
            label_x = (col_lo + col_hi) / 2
            label_y = img_rows - (row_lo + row_hi) / 2
            ax.text(label_x, label_y + 0.3,
                    f"Block({block_row},{block_col})",
                    fontsize=10, weight='bold', bbox=label_box, **centred)

    ax.set_xlim(0, img_cols)
    ax.set_ylim(0, img_rows)
    ax.set_aspect('equal')
    ax.set_xlabel('Column (x)')
    ax.set_ylabel('Row (y)')
    heading = (f'2D Thread Block Organisation\n'
               f'Image: {img_rows}×{img_cols}, Block size: {blk_rows}×{blk_cols}, '
               f'Grid: {grid_rows}×{grid_cols} blocks')
    ax.set_title(heading)

    # faint grid lines on every pixel boundary
    line_style = {'color': 'gray', 'linewidth': 0.5, 'alpha': 0.3}
    for y in range(img_rows + 1):
        ax.axhline(y, **line_style)
    for x in range(img_cols + 1):
        ax.axvline(x, **line_style)

    # tick labels would not match the flipped row numbering, so hide them
    ax.set_xticklabels([])
    ax.set_yticklabels([])
    plt.tight_layout()
    plt.show()

    print(f"Total pixels: {img_rows * img_cols}")
    print(f"Threads per block: {threads_per_block}")
    print(f"Total blocks: {num_blocks}")
    print(f"Total threads: {num_blocks * threads_per_block}")

In [None]:
# show the thread layout for a small 8×10 image tiled with 4×4 blocks;
# 10 columns do not divide evenly by 4, so the last block column is partial
visualise_2d_thread_blocks(
    image_shape=(8, 10),
    block_shape=(4, 4),
)