# Benchmarking & Profiling

In [1]:
import torch
from typing import Callable
from torch.profiler import ProfilerActivity

def get_device(index: int = 0) -> torch.device:
    """Pick the compute device: CUDA device `index` when CUDA is available, else the CPU."""
    device_name = f"cuda:{index}" if torch.cuda.is_available() else "cpu"
    return torch.device(device_name)
 
def run_operation1(dim: int, operation: Callable) -> Callable:
    """Build a zero-argument callable that applies `operation` to one random matrix.

    The `dim` x `dim` input is allocated once, here, on the device returned by
    `get_device()`; every call of the returned function reuses that same tensor.
    """
    # Setup: create one random dim x dim matrix
    matrix = torch.randn(dim, dim, device=get_device())

    def run():
        return operation(matrix)

    return run

def run_operation2(dim: int, operation: Callable) -> Callable:
    """Build a zero-argument callable that applies `operation` to two random matrices.

    Both `dim` x `dim` inputs are allocated once, here, on the device returned by
    `get_device()`; every call of the returned function reuses the same pair.
    """
    # Setup: create two random dim x dim matrices on the same device
    device = get_device()
    lhs = torch.randn(dim, dim, device=device)
    rhs = torch.randn(dim, dim, device=device)

    def run():
        return operation(lhs, rhs)

    return run

def profile(description: str, run: Callable, num_warmups: int = 1, with_stack: bool = False):
    """Profile one call of `run` with the PyTorch profiler and return a summary table.

    Args:
        description: Label for this run; used to name the stack-trace file when
            `with_stack` is true.
        run: Zero-argument callable to profile.
        num_warmups: Number of unprofiled calls made before profiling starts.
        with_stack: When true, record stack traces and export them to
            `var/stacks_{description}.txt`.

    Returns:
        The `key_averages()` table as a string (at most 10 rows), sorted by total
        CUDA time when CUDA is available and by total CPU time otherwise.
    """
    import os  # Local import: only needed to create the export directory.

    use_cuda = torch.cuda.is_available()

    # Warmup
    for _ in range(num_warmups):
        run()
    if use_cuda:
        torch.cuda.synchronize()  # Wait for CUDA threads to finish (important!)

    # Only request CUDA activity when there is a CUDA device to trace.
    activities = [ProfilerActivity.CPU]
    if use_cuda:
        activities.append(ProfilerActivity.CUDA)

    # Run the code with the profiler
    with torch.profiler.profile(
            activities=activities,
            # Output stack trace for visualization
            with_stack=with_stack,
            # Needed to export stack trace for visualization
            experimental_config=torch._C._profiler._ExperimentalConfig(verbose=True)) as prof:
        run()
        if use_cuda:
            torch.cuda.synchronize()  # Wait for CUDA threads to finish (important!)

    # Build the table; CUDA timings are all zero without a GPU, so sort by CPU time there.
    table = prof.key_averages().table(
        sort_by="cuda_time_total" if use_cuda else "cpu_time_total",
        max_name_column_width=80,
        row_limit=10)

    # Write stack trace visualization
    if with_stack:
        # export_stacks does not create missing directories.
        os.makedirs("var", exist_ok=True)
        text_path = f"var/stacks_{description}.txt"
        stack_metric = "self_cuda_time_total" if use_cuda else "self_cpu_time_total"
        prof.export_stacks(text_path, stack_metric)
    return table

In [2]:
def matmul_function_128(a, b):
    """Multiply the two inputs with the `@` operator."""
    return a @ b


# Profile one matrix multiplication of two random 128 x 128 matrices.
matmul_profile_128 = profile(
    "matmul(dim=128)",
    run_operation2(dim=128, operation=matmul_function_128),
)

In [3]:
# print() rather than a bare expression, so the table's newlines render as rows.
print(matmul_profile_128)

-------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                     aten::matmul        82.12%      29.066ms        99.92%      35.368ms      35.368ms       0.000us         0.00%       6.016us       6.016us             1  
                                         aten::mm        17.52%       6.203ms        17.81%       6.303ms       6.303ms       6.016us       100.00%       6.016us       6.016us             1  
                  ampere_sgemm_32x32_sl

In [4]:
def gelu_function(a, b):
    """Add the two inputs, then apply GELU to the sum."""
    return torch.nn.functional.gelu(a + b)


# Profile add + GELU on two random 2048 x 2048 matrices.
gelu_profile = profile(
    "gelu",
    run_operation2(dim=2048, operation=gelu_function),
)

In [5]:
# print() rather than a bare expression, so the table's newlines render as rows.
print(gelu_profile)

--------------------------------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                                            Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
--------------------------------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                                       aten::add        92.90%       3.919ms        94.20%       3.973ms       3.973ms      60.704us        59.58%      60.704us      60.704us             1  
void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add...         0.00%       0.000u

# benchmarking_script

In [10]:
# Problem statement for this section, stored as a string literal bound to `_`;
# nothing in the visible cells reads it back.
# NOTE(review): IPython also uses `_` for the last cell output, so a markdown
# cell may be a better home for this text — confirm before moving it.
_ =  """
(a) Write a script to perform basic end-to-end benchmarking of the forward and backward passes in
your model. Specifically, your script should support the following:
• Given hyperparameters (e.g., number of layers), initialize a model.
• Generate a random batch of data.
• Run w warm-up steps (before you start measuring time), then time the execution of n steps
(either only forward, or both forward and backward passes, depending on an argument). For
timing, you can use the Python timeit module (e.g., either using the `timeit` function, or
using `timeit.default_timer()`, which gives you the system's highest resolution clock, thus
a better default for benchmarking than `time.time ()`).
• Call torch. cuda.synchronize () after each step.
Deliverable: A script that will initialize a basics Transformer model with the given hyperpa-
rameters, create a random batch of data, and time forward and backward passes.
"""

In [1]:
import time
import timeit
from typing import Callable

import torch

import cs336_basics as lib
import cs336_basics.model as nn

In [66]:
def mean(x: list[float]) -> float:
    """Return the arithmetic mean of the values in `x`.

    An empty list raises ZeroDivisionError, because the sum is divided by `len(x)`.
    """
    total = sum(x)
    count = len(x)
    return total / count

def benchmark(description: str, run: Callable, num_warmups: int = 1, num_trials: int = 3):
    """Time `run` over `num_trials` calls and return the mean wall-clock time.

    Args:
        description: Label for the benchmark; it does not affect the measurement.
        run: Zero-argument callable to time.
        num_warmups: Number of untimed calls made before measuring.
        num_trials: Number of timed calls; must be at least 1.

    Returns:
        Mean time per call, in milliseconds.

    Raises:
        ValueError: If `num_trials` is less than 1 (there would be nothing to average).
    """
    if num_trials < 1:
        raise ValueError(f"num_trials must be at least 1, got {num_trials}")

    use_cuda = torch.cuda.is_available()

    # Warmup: first times might be slower due to compilation, things not cached.
    # Since we will run the kernel multiple times, the timing that matters is steady state.
    for _ in range(num_warmups):
        run()
    if use_cuda:
        torch.cuda.synchronize()  # Wait for CUDA threads to finish (important!)

    # Time it for real now!
    times: list[float] = []
    for _ in range(num_trials):  # Do it multiple times to capture variance
        start_time = timeit.default_timer()
        run()  # Actually perform computation
        if use_cuda:
            torch.cuda.synchronize()  # Wait for CUDA threads to finish (important!)
        end_time = timeit.default_timer()
        times.append((end_time - start_time) * 1000)  # seconds -> milliseconds
    return mean(times)

def benchmark_model(
    model,
    batch_size: int = 1,
    sequence_length: int = 64,
    end_to_end: bool = False,
    num_warmups: int = 1,
    num_trials: int = 3,
):
    """Time forward (or forward + backward) steps of `model` on a random token batch.

    Args:
        model: Callable exposing `vocab_size` and `parameters()`; it is called as
            `model(X)` with an integer tensor of shape (batch_size, sequence_length).
        batch_size: Number of sequences in the random batch.
        sequence_length: Number of token ids per sequence.
        end_to_end: When true, each step runs forward, a dummy loss, and backward;
            otherwise each step is a forward pass under `torch.no_grad()`.
        num_warmups: Number of untimed steps run before measuring.
        num_trials: Number of timed steps.

    Returns:
        Mean time per step in milliseconds, as computed by `benchmark`.
    """
    # Initialization: build the batch directly on the model's device (no CPU copy).
    device = next(model.parameters()).device
    vocab_size = model.vocab_size
    X = torch.randint(high=vocab_size, size=(batch_size, sequence_length), device=device)
    # Dummy target for the loss below.
    # Assumes the model output's last dimension is vocab_size — TODO confirm.
    Y = torch.ones(vocab_size, device=device)

    def forward():
        with torch.no_grad():
            model(X)

    def backward():
        # Gradients are not zeroed between steps, so they accumulate across calls.
        output = model(X)
        loss = ((Y - output) ** 2).sum()
        loss.backward()

    # Warm-up, synchronization and timing live in `benchmark`; reuse them here.
    if end_to_end:
        return benchmark("forward+backward", backward,
                         num_warmups=num_warmups, num_trials=num_trials)
    return benchmark("forward", forward,
                     num_warmups=num_warmups, num_trials=num_trials)

In [2]:
# Model configuration dictionaries, keyed by size name.
# Each row lists the hyperparameters that vary between sizes:
# (d_model, num_layers, num_heads, d_ff). The vocabulary size, context length
# and RoPE theta are the same for every size.
_SIZE_TABLE = {
    'small': (768, 12, 12, 3072),
    'medium': (1024, 24, 16, 4096),
    'large': (1280, 36, 20, 5120),
    'xl': (1600, 48, 25, 6400),
    '2.7B': (2560, 32, 32, 10240),
}

configs = {
    size_name: {
        'vocab_size': 50257,
        'context_length': 2048,
        'd_model': d_model,
        'num_layers': num_layers,
        'num_heads': num_heads,
        'd_ff': d_ff,
        'rope_theta': 10000.0,
    }
    for size_name, (d_model, num_layers, num_heads, d_ff) in _SIZE_TABLE.items()
}


In [3]:
# Instantiate the language model with the "small" hyperparameters from `configs`.
model = nn.BasicsTransformerLM(**configs["small"])

In [44]:
# Move the model to the first GPU when one is available; otherwise keep it on the
# CPU so this cell also runs on machines without CUDA.
model = model.to("cuda:0" if torch.cuda.is_available() else "cpu")

In [75]:
# Time forward + backward steps on `model` (end_to_end=True): 5 untimed warm-up
# steps, then the mean over 10 timed steps, reported in milliseconds.
benchmark_model(
    model,
    batch_size=5,
    sequence_length=32,
    end_to_end=True,
    num_warmups=5,
    num_trials=10
)

74.16990557685494

In [None]:
%%bash
# Sweep the measurement script over warm-up counts and forward-only / end-to-end modes.
# The %%bash cell magic is required: without it the Python kernel tries to parse
# this shell loop as Python.
for warmups in 0 5; do
    for end_to_end in "" "--end-to-end"; do
        echo "Running: warmups=$warmups, end_to_end=${end_to_end:-False}"

        # $end_to_end is left unquoted on purpose: its empty value must expand to
        # no argument at all rather than to an empty-string argument.
        python cs336_systems/measure.py \
            --num-warmups "$warmups" \
            $end_to_end \
            --quiet \
            --log-level INFO
    done
done