# Basics of Triton

In [9]:
!nvidia-smi
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
import torch
import triton
import triton.language as tl
print(torch.cuda.is_available())


Sun Sep 21 05:37:19 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   58C    P0             30W /   70W |     680MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

### triton kernel itself

In [10]:
@triton.jit     # this is a GPU kernel that needs to be launched by Triton
def vector_add_kernel(
    A,  # 32-bit pointer to input vector A
    B,
    C,  # 32-bit pointer to output vector B
    N,  # total number of elements
    BLOCK_SIZE: tl.constexpr      # compile time constant; how many elements does a single program handle
):
  # identify the program we are in along the 0-axis | program here is a CUDA-like block
  pid = tl.program_id(axis=0)

  offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
  # tl.arange() generates a tensor which stays constant throughout this process and works as a local indexor for each offset
  # pid - program id, here, keeps incrementing
  # BLOCK_SIZE stays constant

  # compute a boolean mask to guard against out-of-bounds when N is not a multiple of BLOCK_SIZE

  # the significance of a mask is to book-keep which workers/ GPU blocks are valid/active and which are not! [true, true, true, false, false]
  mask = offsets < N

  # load A[offsets] and B[offsets]
  a = tl.load(A + offsets, mask=mask)    # the reason why we add offsets to the picture is because we want to index just the right numbers
  b = tl.load(B + offsets, mask=mask)

  # element wise addition happens on registers
  tl.store(C + offsets, a + b, mask=mask)

### helper for benchmarking

In [11]:
def time_op_gpu(fn, sync=True, warmup=5, iters =20):

  # warmup does JIT and warms caches
  for _ in range(warmup):
    fn()
  if sync:
    torch.cuda.synchronize()

  # Event is just a CUDA synchronization marker, and we have two, for start and end
  start = torch.cuda.Event(enable_timing=True)
  end = torch.cuda.Event(enable_timing=True)
  elapsed_ms = 0.0
  for _ in range(iters):
    start.record()
    fn()
    end.record()

    # wait for the events to be recorded and measure GPU time
    torch.cuda.synchronize()
    elapsed_ms += start.elapsed_time(end)
  return elapsed_ms / iters

### driver main logic

In [12]:
def main():
  assert torch.cuda.is_available(), "CUDA device not found, please run on a machine with NVIDIA GPU"

  N = 1 << 24  # 16 million elements

  # allocation of I/O directly on the GPU
  a = torch.rand(N, device="cuda", dtype=torch.float32)
  b = torch.rand(N, device="cuda", dtype=torch.float32)
  c = torch.empty_like(a)

  BLOCK_SIZE = 1024

  grid = lambda META: (triton.cdiv(N, META["BLOCK_SIZE"]),)

  # 3.1) Correctness Check
  vector_add_kernel[grid](a, b, c, N, BLOCK_SIZE=BLOCK_SIZE)
  # Synchronize to ensure kernel has finished before checking results.
  torch.cuda.synchronize()
  ok = torch.allclose(c, a + b, rtol=1e-5, atol=1e-6)
  print(f"[Correctness] Triton result matches PyTorch: {ok}")

  # 3.2) Triton v PyTorch
  def launch_triton():
        vector_add_kernel[grid](a, b, c, N, BLOCK_SIZE=BLOCK_SIZE)

  # PyTorch baseline
  def launch_torch():
        # We write to c to emulate the same output pattern as Triton; in practice you might reuse ref = a + b.
        c.copy_(a + b)
  triton_ms = time_op_gpu(launch_triton)
  torch_ms = time_op_gpu(launch_torch)

  # Throughput: number of bytes moved per second.
  # Each element reads A[i] and B[i] (2 * 4 bytes) and writes C[i] (4 bytes) => 12 bytes/element.
  bytes_moved = N * 12
  triton_bw = bytes_moved / (triton_ms / 1e3) / 1e9  # GB/s
  torch_bw = bytes_moved / (torch_ms / 1e3) / 1e9    # GB/s

  print(f"[Perf] Triton: {triton_ms:.3f} ms  (~{triton_bw:.1f} GB/s)")
  print(f"[Perf] PyTorch: {torch_ms:.3f} ms  (~{torch_bw:.1f} GB/s)")

    # 3.3) (Optional) Try different BLOCK_SIZE to see the effect on performance
    # for bs in [128, 256, 512, 1024, 2048]:
    #     grid2 = lambda META: (triton.cdiv(N, META["BLOCK_SIZE"]),)
    #     def launch():
    #         vector_add_kernel[grid2](a, b, c, N, BLOCK_SIZE=bs)
    #     t_ms = time_op_gpu(launch)
    #     bw = bytes_moved / (t_ms / 1e3) / 1e9
    #     print(f"BLOCK_SIZE={bs:4d}  ->  {t_ms:.3f} ms  (~{bw:.1f} GB/s)")

if __name__ == "__main__":
    main()

[Correctness] Triton result matches PyTorch: True
[Perf] Triton: 0.855 ms  (~235.5 GB/s)
[Perf] PyTorch: 1.377 ms  (~146.2 GB/s)
