# Add Constant - CPU Kernel Operation
This script demonstrates the fundamental concept of kernel operations by adding a constant value to each element of an array.

A kernel is a function applied to every element of data independently. This independence is what makes GPU acceleration possible.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import time

## Preparing Data

In [None]:
# kernel input: the float32 values 0.0 through 7.0
data = np.arange(0, 8, dtype=np.float32)
# scalar that the kernel adds to every element
constant = 10.0

## Sequential Implementation

In [None]:
def add_constant_sequential(data, constant):
    """
    Add ``constant`` to every element of ``data``, one element per step.

    The explicit Python loop stands in for a CPU walking through the array
    and applying the kernel to a single element at a time.

    Returns a new array created with ``np.zeros_like(data)``; ``data``
    itself is not modified.
    """
    output = np.zeros_like(data)

    # visit the elements in index order and apply the kernel to each in turn
    for position, value in enumerate(data):
        output[position] = value + constant

    return output

In [None]:
# show the input, run the element-by-element kernel, then show the output
print(f"Input array:  {data}")
result = add_constant_sequential(data, constant)
print(f"Output array: {result}")

## Simulating Parallel Processing

In [None]:
def simulate_parallel_processing(data, constant, num_threads=4):
    """
    Simulate how the add-constant kernel would be split across GPU threads.

    The array is cut into ``num_threads`` contiguous chunks of
    ``len(data) // num_threads`` elements each; the last thread also takes
    any leftover elements. The chunks are processed one after another in
    a plain Python loop, so nothing actually runs concurrently; the point
    is to show which thread would own which elements.

    Args:
        data: NumPy array to process (the size check uses ``data.size``).
        constant: Value added to every element.
        num_threads: Number of simulated threads; must be between 1 and
            the number of elements.

    Returns:
        A tuple ``(result, thread_assignments)`` where ``result`` is a new
        array shaped like ``data`` and ``thread_assignments`` is a list of
        ``(thread_id, start_idx, end_idx)`` tuples with ``end_idx``
        exclusive.

    Raises:
        ValueError: If ``num_threads`` is less than 1 or greater than the
            number of elements.
    """
    if num_threads < 1:
        # zero would divide by zero below, and a negative count would skip
        # the loop entirely and silently return an all-zero result
        raise ValueError("Number of threads must be at least 1.")
    if num_threads > data.size:
        raise ValueError("Number of threads greater than number of elements. "
                         "Not supported in this simulation.")

    result = np.zeros_like(data)
    elements_per_thread = len(data) // num_threads

    # simulate each thread processing its assigned elements
    thread_assignments = []

    for thread_id in range(num_threads):
        # calculate which elements this thread processes
        start_idx = thread_id * elements_per_thread
        if thread_id == num_threads - 1:
            # last thread handles any remaining elements for simplicity
            end_idx = len(data)
        else:
            end_idx = start_idx + elements_per_thread

        # record assignment for visualisation
        thread_assignments.append((thread_id, start_idx, end_idx))

        # process this thread's elements
        for i in range(start_idx, end_idx):
            result[i] = data[i] + constant

    return result, thread_assignments

In [None]:
# run the simulation with four threads on the same input array
result_parallel, assignments = simulate_parallel_processing(data, constant, num_threads=4)

print(f"Using {len(assignments)} threads:")
# each assignment's end index is exclusive, so the last element a thread
# handles is end - 1
for thread_id, start, end in assignments:
    print(f"  Thread {thread_id}: processes elements {start}-{end-1}")
print(f"Result: {result_parallel}\n")

## Visualise Parallel Processing

In [None]:
def visualise_processing(array_size=16, num_threads=4):
    """
    Draw a two-panel figure contrasting sequential and parallel processing.

    The upper panel shows ``array_size`` cells in a single row with a red
    arrow running from left to right. The lower panel gives each of the
    ``num_threads`` threads its own row and colour and places every cell in
    the row of the thread it is assigned to (contiguous chunks, with the
    last thread also taking any remainder). The figure is displayed with
    ``plt.show()``; nothing is returned.
    """
    _, (seq_ax, par_ax) = plt.subplots(2, 1, figsize=(10, 6))

    # both panels share the same horizontal extent and caption position
    x_min, x_max = -0.5, array_size - 0.5
    centre_x = array_size / 2

    # upper panel: a single row of cells
    seq_ax.set_title("Sequential Processing (CPU)", fontsize=12)
    seq_ax.set_xlim(x_min, x_max)
    seq_ax.set_ylim(-0.5, 1.5)
    seq_ax.set_xlabel("Array Index")
    seq_ax.set_ylabel("Time")

    for idx in range(array_size):
        cell = plt.Rectangle(
            (idx - 0.4, 0), 0.8, 1,
            facecolor="lightblue", edgecolor="black",
        )
        seq_ax.add_patch(cell)
        seq_ax.text(idx, 0.5, str(idx), ha="center", va="center")

    # arrow across the row to indicate the one-after-another order
    arrow_style = {"arrowstyle": "->", "color": "red", "lw": 2}
    seq_ax.annotate("", xy=(x_max, 0.5), xytext=(x_min, 0.5),
                    arrowprops=arrow_style)
    seq_ax.text(centre_x, -0.3, "Processes one element at a time",
                ha="center", color="red")

    # lower panel: one row per thread
    par_ax.set_title("Parallel Processing (GPU)", fontsize=12)
    par_ax.set_xlim(x_min, x_max)
    par_ax.set_ylim(-0.5, num_threads + 0.5)
    par_ax.set_xlabel("Array Index")
    par_ax.set_ylabel("Thread ID")

    # one Set3 colour per thread
    palette = plt.cm.Set3(np.linspace(0, 1, num_threads))

    # contiguous chunks; the final thread's chunk runs to the end of the array
    chunk = array_size // num_threads
    for thread_id in range(num_threads):
        first = thread_id * chunk
        stop = array_size if thread_id == num_threads - 1 else first + chunk

        for idx in range(first, stop):
            cell = plt.Rectangle(
                (idx - 0.4, thread_id + 0.1), 0.8, 0.8,
                facecolor=palette[thread_id], edgecolor="black", alpha=0.7,
            )
            par_ax.add_patch(cell)
            par_ax.text(idx, thread_id + 0.5, str(idx),
                        ha="center", va="center", fontsize=10)

    # row labels placed just left of the plotted range
    for thread_id in range(num_threads):
        par_ax.text(-1, thread_id + 0.5, f"Thread {thread_id}",
                    ha="right", va="center")

    par_ax.text(centre_x, -0.3, "All threads process simultaneously",
                ha="center", color="green")

    plt.tight_layout()
    plt.show()

In [None]:
# draw the comparison figure for a 16-element array split across 4 threads
visualise_processing(array_size=16, num_threads=4)

## Performance Comparison

In [None]:
def benchmark_implementations(sizes):
    """
    Time the sequential loop against NumPy's vectorised addition.

    For every entry in ``sizes`` a random float32 array of that length is
    generated, ``add_constant_sequential`` and ``data + constant`` are each
    timed once with ``time.perf_counter``, the two outputs are checked for
    agreement, and one row of a results table is printed.

    Args:
        sizes: Iterable of array lengths to benchmark.

    Returns:
        A dict with keys ``"sequential"`` and ``"vectorised"`` (lists of
        elapsed seconds, one entry per size) and ``"sizes"`` (the ``sizes``
        argument itself).

    Raises:
        AssertionError: If the sequential and vectorised outputs differ.
    """
    results = {
        "sequential": [],
        "vectorised": [],
        "sizes": sizes
    }
    constant = 42.0

    print("\nPerformance Comparison")
    print("=" * 50)
    print(f"{'Array Size':>12} {'Sequential':>12} {'Vectorised':>12} {'Speed-up':>12}")
    print("-" * 50)

    rng = np.random.default_rng()
    for size in sizes:
        # create test data
        data = rng.random(size).astype(np.float32)

        # time sequential implementation (single run, so expect some noise)
        start = time.perf_counter()
        result_seq = add_constant_sequential(data, constant)
        time_seq = time.perf_counter() - start
        results["sequential"].append(time_seq)

        # time vectorised implementation (single run, so expect some noise)
        start = time.perf_counter()
        result_vec = data + constant
        time_vec = time.perf_counter() - start
        results["vectorised"].append(time_vec)

        # verify results match; raised explicitly so the check still runs
        # when Python is started with -O (which strips assert statements)
        if not np.allclose(result_seq, result_vec):
            raise AssertionError(
                f"sequential and vectorised results differ for size {size}"
            )

        # the vectorised run can be shorter than the clock resolution, in
        # which case the measured time is 0 and the ratio is unbounded
        speed_up = time_seq / time_vec if time_vec > 0 else float("inf")

        print(f"{size:>12,} {time_seq:>12.6f} {time_vec:>12.6f} {speed_up:>12.2f}x")

    return results

In [None]:
# array lengths from one thousand to one million, growing tenfold each step
sizes = [10 ** exponent for exponent in range(3, 7)]
results = benchmark_implementations(sizes)