<a href="https://colab.research.google.com/github/sappyb/Codes-Doc/blob/master/Gaussian_Blur.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Notebook setup cell: install the pinned numba-cuda 0.4.0 package into the
# system environment (-q keeps the installer output quiet).
!uv pip install -q --system numba-cuda==0.4.0
# NOTE(review): `os` is not used in any of the cells visible here.
import os
from numba import config
# Switch on Numba's CUDA_ENABLE_PYNVJITLINK option.
# NOTE(review): presumably this has to be set before `numba.cuda` is first
# imported (the next cell does that) — confirm against the numba-cuda docs.
config.CUDA_ENABLE_PYNVJITLINK = 1

In [None]:
import numpy as np
from numba import cuda
import matplotlib.pyplot as plt
from scipy.ndimage import convolve  # for CPU reference

# 1. Prepare a test image (you can also load via plt.imread)
height, width = 512, 512
img = np.random.randint(0, 256, size=(height, width)).astype(np.uint8)

# 2. CPU reference: 3×3 Gaussian kernel
gauss3 = np.array([[1, 2, 1],
                   [2, 4, 2],
                   [1, 2, 1]], dtype=np.float32) / 16.0
# Filter in floating point and round half-up explicitly before converting to
# uint8.  Passing the uint8 image straight to `convolve` makes ndimage write a
# uint8 result whose fractional part is dropped instead of rounded, which
# leaves the reference up to one grey level below a round-to-nearest
# implementation such as the GPU kernel's `s + 0.5`.
# mode='nearest' replicates the edge pixels, i.e. clamp-to-edge borders.
cpu_blur = np.floor(
    convolve(img.astype(np.float32), gauss3, mode='nearest') + 0.5
).astype(np.uint8)

# 3. CUDA kernel: threads sweep the image in grid-sized strides, one output pixel per step
@cuda.jit
def gaussian_blur_kernel(in_img, out_img):
    """Apply the 3×3 kernel [[1, 2, 1], [2, 4, 2], [1, 2, 1]] / 16 to ``in_img``.

    Parameters
    ----------
    in_img : 2-D array indexed as ``[row, col]``; its shape defines the
        region that is processed.
    out_img : 2-D array indexed as ``[row, col]``; every index valid for
        ``in_img`` is written.

    Neighbours that fall outside the image are replaced by the nearest edge
    pixel (clamp-to-edge).  Each weighted sum has 0.5 added before the
    ``np.uint8`` conversion so that truncation rounds to the nearest integer.

    The loops advance by the total grid size along each axis, so the whole
    image is covered whatever launch configuration is used: a grid with one
    thread per pixel runs each loop body at most once per thread, and a
    smaller grid simply iterates more.
    """
    # Grid axis 0 varies fastest between neighbouring threads, so it is mapped
    # to the column index: neighbouring threads then touch adjacent elements
    # of one row (contiguous in memory for C-ordered arrays).
    first_col, first_row = cuda.grid(2)
    col_step, row_step = cuda.gridsize(2)
    height, width = in_img.shape

    for row in range(first_row, height, row_step):
        for col in range(first_col, width, col_step):
            s = 0.0
            for dy in range(-1, 2):
                # clamp the neighbour row to the image
                r = min(max(row + dy, 0), height - 1)
                for dx in range(-1, 2):
                    # clamp the neighbour column to the image
                    c = min(max(col + dx, 0), width - 1)
                    # (2 - |dy|) * (2 - |dx|) yields 1 (corner), 2 (edge),
                    # 4 (centre): the same table as the 3×3 kernel above.
                    w = (2 - abs(dy)) * (2 - abs(dx)) / 16.0
                    s += in_img[r, c] * w

            out_img[row, col] = np.uint8(s + 0.5)  # round to nearest

# 4. Launch geometry: 16×16 threads per block and, via ceiling division,
#    enough blocks along each grid axis to span the image
threads_per_block = (16, 16)
blocks_per_grid_x = -(-height // threads_per_block[0])
blocks_per_grid_y = -(-width // threads_per_block[1])
blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)

# 5. Upload the input, allocate a device output buffer shaped like the input,
#    run the kernel, wait for it to finish, then download the result
d_in = cuda.to_device(img)
d_out = cuda.device_array_like(img)
gaussian_blur_kernel[blocks_per_grid, threads_per_block](d_in, d_out)
cuda.synchronize()
gpu_blur = d_out.copy_to_host()

# 6. Verify correctness: largest absolute per-pixel gap between the two results
#    (widened to int first so the uint8 subtraction cannot wrap around)
max_diff = np.abs(cpu_blur.astype(int) - gpu_blur.astype(int)).max()
print("Max difference between CPU and GPU:", max_diff)

# 7. Display results side by side
fig, ax = plt.subplots(1, 3, figsize=(15, 5))
panels = (
    (img, "Original"),
    (cpu_blur, "CPU Gaussian"),
    (gpu_blur, "GPU Gaussian"),
)
for axis, (picture, title) in zip(ax, panels):
    axis.imshow(picture, cmap='gray')
    axis.set_title(title)
    axis.axis('off')
plt.show()
