In [3]:
import torch
import torch.nn as nn
import time
import pandas as pd
#import ace_tools as tools

# Configuration
depth = 10
width = 1024  # Fixed width to observe effect of batch size
batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128, 256]
runs = 30
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define the model
class MLP(nn.Module):
    def __init__(self, dim, depth):
        super().__init__()
        layers = []
        for _ in range(depth):
            layers.append(nn.Linear(dim, dim))
            layers.append(nn.ReLU())
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)

# Initialize model
model = MLP(width, depth).to(device).eval()

# Benchmark across batch sizes
results = []
for batch_size in batch_sizes:
    x = torch.randn(batch_size, width, device=device)

    # Warm-up
    for _ in range(5):
        _ = model(x)
    if device.type == 'cuda':
        torch.cuda.synchronize()

    # Timing
    start = time.time()
    for _ in range(runs):
        _ = model(x)
    if device.type == 'cuda':
        torch.cuda.synchronize()
    elapsed = time.time() - start

    avg_batch_latency_ms = (elapsed / runs) * 1000
    avg_per_sample_latency_ms = avg_batch_latency_ms / batch_size
    throughput = 1000 * batch_size / avg_batch_latency_ms  # samples/sec

    results.append({
        "Batch Size": batch_size,
        "Batch Latency (ms)": round(avg_batch_latency_ms, 3),
        "Per-Sample Latency (ms)": round(avg_per_sample_latency_ms, 3),
        "Throughput (samples/sec)": round(throughput, 2)
    })

# Display the results
df = pd.DataFrame(results)
#tools.display_dataframe_to_user("Latency vs Batch Size", df)
df


Unnamed: 0,Batch Size,Batch Latency (ms),Per-Sample Latency (ms),Throughput (samples/sec)
0,1,0.5,0.5,1999.99
1,2,0.367,0.183,5454.59
2,4,0.333,0.083,11998.3
3,8,0.333,0.042,23999.45
4,16,0.367,0.023,43619.67
5,32,0.333,0.01,96000.09
6,64,0.4,0.006,159979.81
7,128,0.403,0.003,317825.55
8,256,0.333,0.001,768073.98
