# In [None]:
import torch
import time

def _time_matmul(n, dtype, device, repeats, warmup=3):
    """Return the mean time in milliseconds of one ``n x n`` matmul.

    The operands live only inside this helper, so their GPU memory is
    released as soon as it returns and does not add to the footprint of the
    next (larger) size.
    """
    a = torch.randn((n, n), dtype=dtype, device=device)
    b = torch.randn((n, n), dtype=dtype, device=device)

    # Warm-up: absorbs one-time costs (library handle creation, kernel
    # selection) so they are not attributed to the timed region.
    for _ in range(warmup):
        torch.matmul(a, b)

    # CUDA events are enqueued on the stream, so they bracket exactly the
    # matmuls queued between them; host-side timers would mostly measure
    # launch overhead because kernel launches are asynchronous.
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    start.record()
    for _ in range(repeats):
        torch.matmul(a, b)
    end.record()

    # elapsed_time() is only valid once both events have completed.
    torch.cuda.synchronize()

    return start.elapsed_time(end) / repeats


def benchmark_matmul(max_pow=12, dtype=torch.float32, min_pow=6, repeats=10):
    """Benchmark square matrix multiplication on the GPU.

    For every ``N = 2**p`` with ``min_pow <= p <= max_pow`` two random
    ``N x N`` matrices are multiplied on the CUDA device and the mean time of
    one multiplication is printed as a table row.

    Args:
        max_pow: Largest exponent; the biggest matrix is ``2**max_pow`` wide.
        dtype: Element type of the matrices.
        min_pow: Smallest exponent (defaults to 6, i.e. 64 x 64).
        repeats: Number of timed multiplications averaged per size. A single
            run is dominated by noise for small matrices.

    Returns:
        A list of ``(N, elapsed_ms)`` tuples, one per size that completed.

    Raises:
        ValueError: If ``repeats`` is smaller than 1.
        RuntimeError: If CUDA is not available, or for any CUDA error other
            than running out of memory.
    """
    if repeats < 1:
        raise ValueError("repeats must be at least 1")
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is not available!")

    device = torch.device("cuda")
    print(f"\nRunning on: {torch.cuda.get_device_name(device)}\n")

    print(f"{'Matrix Size (N x N)':>20} | {'Execution Time (ms)':>20}")
    print("-" * 45)

    results = []
    for power in range(min_pow, max_pow + 1):
        N = 2 ** power

        try:
            elapsed_time_ms = _time_matmul(N, dtype, device, repeats)
        except RuntimeError as err:
            # torch.cuda.OutOfMemoryError subclasses RuntimeError; older
            # releases raise a plain RuntimeError with this message instead.
            if "out of memory" not in str(err).lower():
                raise
            # Every larger size needs even more memory, so end the sweep
            # here and keep the measurements collected so far.
            print(f"{N:>20} | {'out of memory':>20}")
            break

        results.append((N, elapsed_time_ms))
        print(f"{N:>20} | {elapsed_time_ms:>20.3f}")

    return results

if __name__ == "__main__":
    # 2**14 == 16384, so the sweep covers 64x64 up to 16384x16384.
    # (max_pow=16 would mean 65536x65536: 16 GiB per float32 matrix.)
    benchmark_matmul(max_pow=14)
