In [1]:
# Install Triton (only needed once in Colab)
!pip install triton

# Import necessary libraries
import torch  # For creating and managing tensors (lists of numbers)
import triton  # For writing GPU code
import triton.language as tl  # For special Triton operations

# Set the GPU device (Colab usually has one GPU, so this is "cuda:0")
# The wrapper `add` checks that its tensors live on this device, and
# `test_add` creates its inputs on it.
DEVICE = torch.device("cuda:0")

# Element-wise addition kernel: each program instance (one "block" of
# workers) adds one BLOCK_SIZE-wide slice of x and y and writes it to output.
@triton.jit  # Compile this function into GPU code with Triton
def add_kernel(
    x_ptr,  # Start of the first input list (x) in GPU memory
    y_ptr,  # Start of the second input list (y) in GPU memory
    output_ptr,  # Start of the list that receives the sums
    n_elements,  # How many numbers each list holds
    BLOCK_SIZE: tl.constexpr,  # Numbers handled per program instance (compile-time constant)
):
    # Which slice belongs to this instance? IDs along axis 0 identify the block.
    block_id = tl.program_id(axis=0)

    # Indices covered by this instance: BLOCK_SIZE consecutive positions
    # starting at block_id * BLOCK_SIZE.
    # Example (BLOCK_SIZE=4): block_id=0 -> [0, 1, 2, 3]
    #                         block_id=1 -> [4, 5, 6, 7]
    idx = block_id * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)

    # The last block may stick out past the end of the lists, so flag which
    # indices are real.
    # Example: n_elements=5 and idx=[4, 5, 6, 7] -> in_bounds=[True, False, False, False]
    in_bounds = idx < n_elements

    # Read this block's slice of each input; positions flagged False are not read
    lhs = tl.load(x_ptr + idx, mask=in_bounds, other=None)
    rhs = tl.load(y_ptr + idx, mask=in_bounds, other=None)

    # Add the two slices and write the sums back, again skipping positions
    # beyond n_elements
    # Example: If x[4]=1 and y[4]=6, then output[4]=7
    tl.store(output_ptr + idx, lhs + rhs, mask=in_bounds)

# Define the wrapper (validates inputs, sets up and launches the kernel)
def add(x: torch.Tensor, y: torch.Tensor, *, block_size: int = 4) -> torch.Tensor:
    """Return the element-wise sum of ``x`` and ``y`` computed by ``add_kernel``.

    Args:
        x: First input tensor; must live on ``DEVICE``.
        y: Second input tensor; must live on ``DEVICE`` and have the same
            shape as ``x``.
        block_size: How many elements each program instance handles. It is
            forwarded to the kernel as ``BLOCK_SIZE`` and used as the length
            of ``tl.arange``, which Triton requires to be a power of two.

    Returns:
        A new tensor with the same shape and dtype as ``x`` holding ``x + y``.

    Raises:
        ValueError: If ``x`` and ``y`` do not have the same shape.
        AssertionError: If any tensor is not on ``DEVICE``.
    """
    # The kernel walks both inputs with the same flat index, so a shorter y
    # would be read past its end. Refuse mismatched shapes up front.
    if x.shape != y.shape:
        raise ValueError(
            f"Shapes don't match: x={tuple(x.shape)}, y={tuple(y.shape)}"
        )

    # The kernel treats each tensor as one flat run of memory. Strided views
    # (slices, transposes) would pair up the wrong elements, so lay them out
    # contiguously first; already-contiguous tensors are returned unchanged.
    x = x.contiguous()
    y = y.contiguous()

    # Create an empty tensor to store the results (same size and type as x)
    output = torch.empty_like(x)

    # Make sure all tensors are on the GPU.
    # Raised explicitly (instead of `assert`) so the check still runs under
    # `python -O`; the exception type and message are unchanged.
    if not (x.device == DEVICE and y.device == DEVICE and output.device == DEVICE):
        raise AssertionError(
            f"Devices don't match: x={x.device}, y={y.device}, output={output.device}, DEVICE={DEVICE}"
        )

    # Get the total number of elements to process
    n_elements = output.numel()
    if n_elements == 0:
        # Nothing to add, so there is nothing to launch
        return output

    # Decide how many program instances (blocks) we need
    # Example: If n_elements=10 and BLOCK_SIZE=4, we need 3 blocks (10/4 = 2.5, rounded up to 3)
    grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]), )

    # Launch the kernel on the GPU
    # - Use the grid to decide how many blocks to launch
    # - Pass the tensors (x, y, output), their length (n_elements), and the block size
    add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=block_size)

    # Return the result tensor
    return output

# Sanity check: compare the Triton addition against PyTorch on a tiny input
def test_add():
    """Add two random 10-element tensors with `add` and verify against `x + y`."""
    # Fixed seed so every run sees the same random inputs
    torch.manual_seed(0)

    # Two short lists of random numbers, created directly on the GPU.
    # Kept small so the printed tensors stay readable.
    num_values = 10
    x = torch.rand(num_values, device=DEVICE)
    y = torch.rand(num_values, device=DEVICE)

    # Result from our Triton kernel
    triton_sum = add(x, y)

    # Reference result from PyTorch's built-in addition
    reference_sum = x + y

    # Show the inputs and both results side by side
    print("Input x:", x)
    print("Input y:", y)
    print("GPU result:", triton_sum)
    print("PyTorch result:", reference_sum)

    # The two results must agree up to a small floating-point tolerance;
    # assert_close raises if they do not
    torch.testing.assert_close(triton_sum, reference_sum, atol=1e-3, rtol=1e-3)
    print("Test passed!")

# Run the test when this file is executed directly (not when it is imported)
if __name__ == "__main__":
    test_add()

Input x: tensor([0.3990, 0.5167, 0.0249, 0.9401, 0.9459, 0.7967, 0.4150, 0.8203, 0.2290,
        0.9096], device='cuda:0')
Input y: tensor([0.9722, 0.7910, 0.4690, 0.3300, 0.3345, 0.3783, 0.7640, 0.6405, 0.1103,
        0.3594], device='cuda:0')
GPU result: tensor([1.3713, 1.3076, 0.4940, 1.2701, 1.2803, 1.1750, 1.1790, 1.4607, 0.3393,
        1.2689], device='cuda:0')
PyTorch result: tensor([1.3713, 1.3076, 0.4940, 1.2701, 1.2803, 1.1750, 1.1790, 1.4607, 0.3393,
        1.2689], device='cuda:0')
Test passed!
