<a href="https://colab.research.google.com/github/simar-rekhi/triton/blob/main/Neel_Triton_Demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install torch triton



In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
import torch, triton
print(torch.cuda.is_available())
# it has to be True, triton does not support CPU!

True


In [8]:
#Performing operation alpha * A + B = C, where A and B are vectors and alpha is a constant.
#Attempt 1
import torch
import triton
import triton.language as tl

@triton.jit
def add_scaled_vector_kernel(A, B, C, N, alpha, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < N
    a = tl.load(A + offsets, mask=mask)
    b = tl.load(B + offsets, mask=mask)
    a = a * alpha
    c = a + b
    tl.store(C + offsets, c, mask=mask)

# ---------------------------
# 1) The Triton kernel itself
# ---------------------------

# ---------------------------------
# 2) A small helper for benchmarking
# ---------------------------------
def time_op_gpu(fn, sync=True, warmup=5, iters=20):
    """
    Time a GPU operation using CUDA events for better accuracy (no CPU scheduling noise).
    - fn: a callable that launches GPU work
    - sync: whether to synchronize after each iteration (True recommended)
    - warmup: warm-up iterations to let JIT/caches settle
    - iters: timed iterations

    Returns: average time in milliseconds over 'iters' runs.
    """
    # warm-up does JIT and warms caches
    for _ in range(warmup):
        fn()
    if sync:
        torch.cuda.synchronize()

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    elapsed_ms = 0.0
    for _ in range(iters):
        start.record()
        fn()
        end.record()
        # Wait for the events to be recorded & measure GPU time
        torch.cuda.synchronize()
        elapsed_ms += start.elapsed_time(end)
    return elapsed_ms / iters


# ---------------
# 3) Driver code
# ---------------
def main():
    assert torch.cuda.is_available(), "CUDA device not found. Please run on a machine with an NVIDIA GPU."
    print("Testing vector addition with scaled vector:")
    N = 1 << 24


    a = torch.rand(N, device="cuda", dtype=torch.float32)
    b = torch.rand(N, device="cuda", dtype=torch.float32)
    c = torch.empty_like(a)
    alpha = 3

    BLOCK_SIZE = 1024

    grid = lambda META: (triton.cdiv(N, META["BLOCK_SIZE"]),)

    add_scaled_vector_kernel[grid](a, b, c, N, alpha, BLOCK_SIZE=BLOCK_SIZE)
    torch.cuda.synchronize()
    ok = torch.allclose(c, a + b, rtol=1e-5, atol=1e-6)
    print(f"[Correctness] Triton result matches PyTorch: {ok}")


    def launch_triton():
        add_scaled_vector_kernel[grid](a, b, c, N, alpha, BLOCK_SIZE=BLOCK_SIZE)

    def launch_torch():
        c.copy_(a + b)

    triton_ms = time_op_gpu(launch_triton)
    torch_ms = time_op_gpu(launch_torch)


    bytes_moved = N * 12
    triton_bw = bytes_moved / (triton_ms / 1e3) / 1e9  # GB/s
    torch_bw = bytes_moved / (torch_ms / 1e3) / 1e9    # GB/s

    print(f"[Perf] Triton: {triton_ms:.3f} ms  (~{triton_bw:.1f} GB/s)")
    print(f"[Perf] PyTorch: {torch_ms:.3f} ms  (~{torch_bw:.1f} GB/s)")




if __name__ == "__main__":
    main()




Testing vector addition with scaled vector:
[Correctness] Triton result matches PyTorch: False
[Perf] Triton: 0.894 ms  (~225.3 GB/s)
[Perf] PyTorch: 1.444 ms  (~139.4 GB/s)


In [9]:
#Performing operation alpha * A + B = C, where A and B are vectors and alpha is a constant.
#Attempt 2
import torch
import triton
import triton.language as tl

@triton.jit
def add_scaled_vector_kernel(A, B, C, N, alpha, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < N
    a = tl.load(A + offsets, mask=mask)
    b = tl.load(B + offsets, mask=mask)
    a = a * alpha
    c = a + b
    tl.store(C + offsets, c, mask=mask)

# ---------------------------
# 1) The Triton kernel itself
# ---------------------------

# ---------------------------------
# 2) A small helper for benchmarking
# ---------------------------------
def time_op_gpu(fn, sync=True, warmup=5, iters=20):
    """
    Time a GPU operation using CUDA events for better accuracy (no CPU scheduling noise).
    - fn: a callable that launches GPU work
    - sync: whether to synchronize after each iteration (True recommended)
    - warmup: warm-up iterations to let JIT/caches settle
    - iters: timed iterations

    Returns: average time in milliseconds over 'iters' runs.
    """
    # warm-up does JIT and warms caches
    for _ in range(warmup):
        fn()
    if sync:
        torch.cuda.synchronize()

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    elapsed_ms = 0.0
    for _ in range(iters):
        start.record()
        fn()
        end.record()
        # Wait for the events to be recorded & measure GPU time
        torch.cuda.synchronize()
        elapsed_ms += start.elapsed_time(end)
    return elapsed_ms / iters


# ---------------
# 3) Driver code
# ---------------
def main():
    assert torch.cuda.is_available(), "CUDA device not found. Please run on a machine with an NVIDIA GPU."
    print("Testing vector addition with scaled vector:")
    N = 1 << 24


    a = torch.rand(N, device="cuda", dtype=torch.float32)
    b = torch.rand(N, device="cuda", dtype=torch.float32)
    c = torch.empty_like(a)
    alpha = 3

    BLOCK_SIZE = 1024

    grid = lambda META: (triton.cdiv(N, META["BLOCK_SIZE"]),)

    add_scaled_vector_kernel[grid](a, b, c, N, alpha, BLOCK_SIZE=BLOCK_SIZE)
    torch.cuda.synchronize()
    ok = torch.allclose(c, a * alpha + b, rtol=1e-5, atol=1e-6)
    print(f"[Correctness] Triton result matches PyTorch: {ok}")


    def launch_triton():
        add_scaled_vector_kernel[grid](a, b, c, N, alpha, BLOCK_SIZE=BLOCK_SIZE)

    def launch_torch():
        c.copy_(a * alpha + b)

    triton_ms = time_op_gpu(launch_triton)
    torch_ms = time_op_gpu(launch_torch)


    bytes_moved = N * 12
    triton_bw = bytes_moved / (triton_ms / 1e3) / 1e9  # GB/s
    torch_bw = bytes_moved / (torch_ms / 1e3) / 1e9    # GB/s

    print(f"[Perf] Triton: {triton_ms:.3f} ms  (~{triton_bw:.1f} GB/s)")
    print(f"[Perf] PyTorch: {torch_ms:.3f} ms  (~{torch_bw:.1f} GB/s)")




if __name__ == "__main__":
    main()




Testing vector addition with scaled vector:
[Correctness] Triton result matches PyTorch: True
[Perf] Triton: 0.893 ms  (~225.5 GB/s)
[Perf] PyTorch: 2.007 ms  (~100.3 GB/s)


In [10]:
#Performing element-wise matrix multiplication. Each element in A is multiplied by its corresponding value in B, and the result is stored in C.
#Attempt 1
import torch
import triton
import triton.language as tl

@triton.jit
def element_wise_matrix_mult_kernel(A, B, C, N, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < N
    a = tl.load(A + offsets, mask=mask)
    b = tl.load(B + offsets, mask=mask)
    c = a * b
    tl.store(C + offsets, c, mask=mask)

# ---------------------------
# 1) The Triton kernel itself
# ---------------------------

# ---------------------------------
# 2) A small helper for benchmarking
# ---------------------------------
def time_op_gpu(fn, sync=True, warmup=5, iters=20):
    """
    Time a GPU operation using CUDA events for better accuracy (no CPU scheduling noise).
    - fn: a callable that launches GPU work
    - sync: whether to synchronize after each iteration (True recommended)
    - warmup: warm-up iterations to let JIT/caches settle
    - iters: timed iterations

    Returns: average time in milliseconds over 'iters' runs.
    """
    # warm-up does JIT and warms caches
    for _ in range(warmup):
        fn()
    if sync:
        torch.cuda.synchronize()

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    elapsed_ms = 0.0
    for _ in range(iters):
        start.record()
        fn()
        end.record()
        # Wait for the events to be recorded & measure GPU time
        torch.cuda.synchronize()
        elapsed_ms += start.elapsed_time(end)
    return elapsed_ms / iters


# ---------------
# 3) Driver code
# ---------------
def main():
    assert torch.cuda.is_available(), "CUDA device not found. Please run on a machine with an NVIDIA GPU."
    print("Testing element-wise matrix multiplication:")
    N = 1 << 24


    a = torch.rand(N, device="cuda", dtype=torch.float32)
    b = torch.rand(N, device="cuda", dtype=torch.float32)
    c = torch.empty_like(a)

    BLOCK_SIZE = 1024

    grid = lambda META: (triton.cdiv(N, META["BLOCK_SIZE"]),)

    element_wise_matrix_mult_kernel[grid](a, b, c, N, BLOCK_SIZE=BLOCK_SIZE)
    torch.cuda.synchronize()
    ok = torch.allclose(c, a * b, rtol=1e-5, atol=1e-6)
    print(f"[Correctness] Triton result matches PyTorch: {ok}")


    def launch_triton():
        element_wise_matrix_mult_kernel[grid](a, b, c, N, BLOCK_SIZE=BLOCK_SIZE)

    def launch_torch():
        c.copy_(a * b)

    triton_ms = time_op_gpu(launch_triton)
    torch_ms = time_op_gpu(launch_torch)


    bytes_moved = N * 12
    triton_bw = bytes_moved / (triton_ms / 1e3) / 1e9  # GB/s
    torch_bw = bytes_moved / (torch_ms / 1e3) / 1e9    # GB/s

    print(f"[Perf] Triton: {triton_ms:.3f} ms  (~{triton_bw:.1f} GB/s)")
    print(f"[Perf] PyTorch: {torch_ms:.3f} ms  (~{torch_bw:.1f} GB/s)")




if __name__ == "__main__":
    main()




Testing element-wise matrix multiplication:
[Correctness] Triton result matches PyTorch: True
[Perf] Triton: 0.889 ms  (~226.5 GB/s)
[Perf] PyTorch: 1.424 ms  (~141.3 GB/s)
