In [2]:
!pip install triton

Collecting triton
  Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (209.5 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 209.5/209.5 MB 6.2 MB/s eta 0:00:00
Installing collected packages: triton
Successfully installed triton-3.1.0


In [3]:
import triton
import triton.language as tl
import torch
import numpy as np

In [4]:
@triton.jit
def vector_add_kernel(
    x_ptr,
    y_ptr,
    output_ptr,
    n_elements,
    BLOCK_SIZE: tl.constexpr,
):
    """Elementwise addition: writes ``x + y`` to ``output`` one block per program.

    Args:
        x_ptr: Pointer to the first input vector.
        y_ptr: Pointer to the second input vector.
        output_ptr: Pointer to the output vector.
        n_elements: Number of elements in the vectors.
        BLOCK_SIZE: Compile-time number of elements each program processes.
    """
    # Each program along axis 0 covers BLOCK_SIZE consecutive indices,
    # beginning at program_index * BLOCK_SIZE.
    program_index = tl.program_id(axis=0)
    lane_offsets = program_index * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)

    # The final block may extend past the end of the vectors; the mask
    # disables loads and stores for indices >= n_elements.
    in_bounds = lane_offsets < n_elements

    lhs = tl.load(x_ptr + lane_offsets, mask=in_bounds)
    rhs = tl.load(y_ptr + lane_offsets, mask=in_bounds)

    tl.store(output_ptr + lane_offsets, lhs + rhs, mask=in_bounds)

In [5]:
# Wrapper function to handle the kernel launch
def add_vectors(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    """Add two same-shaped CUDA tensors elementwise with ``vector_add_kernel``.

    Args:
        x: First input tensor; must live on a CUDA device.
        y: Second input tensor; same shape and same CUDA device as ``x``.

    Returns:
        A new contiguous tensor with the shape and dtype of ``x`` holding
        ``x + y``.

    Raises:
        AssertionError: If the shapes differ, either tensor is not on a CUDA
            device, or the tensors are on different devices.
    """
    # Validate inputs before touching the GPU.
    assert x.shape == y.shape, f"shape mismatch: {tuple(x.shape)} vs {tuple(y.shape)}"
    assert x.is_cuda and y.is_cuda, "both inputs must be CUDA tensors"
    assert x.device == y.device, f"device mismatch: {x.device} vs {y.device}"

    # The kernel addresses every buffer as a flat array (pointer + linear
    # offset). Strided or transposed views would be read in storage order and
    # silently produce wrong sums, so make the layouts contiguous first.
    # .contiguous() is a no-op for tensors that are already contiguous.
    x = x.contiguous()
    y = y.contiguous()

    # Get vector size
    n_elements = x.numel()

    # Create output tensor with the same flat layout the kernel writes to
    output = torch.empty_like(x, memory_format=torch.contiguous_format)

    # Nothing to compute for empty inputs; skip the launch entirely.
    if n_elements == 0:
        return output

    # One program per BLOCK_SIZE elements, rounding up to cover the tail
    BLOCK_SIZE = 1024
    grid = (triton.cdiv(n_elements, BLOCK_SIZE),)

    # Launch on the device that owns the tensors, which is not necessarily
    # the process's current CUDA device.
    with torch.cuda.device(x.device):
        vector_add_kernel[grid](
            x_ptr=x,
            y_ptr=y,
            output_ptr=output,
            n_elements=n_elements,
            BLOCK_SIZE=BLOCK_SIZE,
        )

    return output

In [7]:
# Example usage
def main():
    """Run the Triton vector add on random data and compare with a CPU sum.

    Prints the maximum absolute difference between the Triton result and the
    CPU reference, and whether the two agree under ``torch.allclose``.

    Raises:
        RuntimeError: If no CUDA device is available.
    """
    # Fail early with a clear message instead of an error from inside .cuda()
    if not torch.cuda.is_available():
        raise RuntimeError("A CUDA-capable GPU is required to run the Triton kernel")

    # Set problem size
    N = 1_000_000

    # Seeded local generator: inputs are reproducible across runs without
    # modifying the global RNG state.
    rng = torch.Generator().manual_seed(0)

    # Create input vectors on CPU
    x_cpu = torch.randn(N, generator=rng)
    y_cpu = torch.randn(N, generator=rng)

    # Move vectors to GPU
    x_gpu = x_cpu.cuda()
    y_gpu = y_cpu.cuda()

    # Run Triton kernel
    output_gpu = add_vectors(x_gpu, y_gpu)

    # Verify results against the CPU reference
    output_cpu = x_cpu + y_cpu
    output_triton = output_gpu.cpu()

    print("Max difference:", torch.max(torch.abs(output_cpu - output_triton)))
    print("Correct:", torch.allclose(output_cpu, output_triton))

if __name__ == "__main__":
    main()

Max difference: tensor(0.)
Correct: True
