In [None]:
import json
import glob
import pandas as pd
import re
import matplotlib.pyplot as plt

from collections import OrderedDict

In [None]:
# Collect every benchmark summary JSON anywhere under output/.
# glob returns matches in filesystem-dependent order, so sort the list to keep
# downstream results (row order for ties, the exported CSV) reproducible.
files = sorted(glob.glob("output/**/*summary.json", recursive=True))
len(files)

In [None]:
# Tensor-parallel degree as encoded in a result path (a "tp" followed by digits,
# e.g. a path containing "tp4"). Compiled once instead of searching twice per file.
tp_pattern = re.compile(r'tp(\d+)')

data = []

for file in files:
    # JSON is UTF-8 by specification; do not depend on the platform's locale encoding.
    with open(file, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    # Paths without a "tp<N>" marker are treated as tensor-parallel size 1.
    tp_match = tp_pattern.search(file)
    json_data['tp'] = int(tp_match.group(1)) if tp_match else 1
    # Keep the source path so every row can be traced back to its summary file.
    json_data['file'] = file

    data.append(json_data)

if not data:
    # Without this guard the sort below fails with an opaque KeyError: 'engine'.
    raise FileNotFoundError("No *summary.json files were found under output/; nothing to aggregate.")

# One row per summary file, ordered so that each (engine, tp, dtype) series is
# already sorted by concurrency when it is plotted later.
df = pd.DataFrame(data)
df = df.sort_values(['engine', 'tp', 'dtype', 'num_concurrent_requests'])
df.to_csv('llm_load_test_results.csv', index=False)

TP = 1
- mean throughput per request
- p95 TTFT (time to first token)
- p95 TPOT (time per output token)
- end-to-end (e2e) latency

In [None]:
# List the engines present in the loaded results.
# Reads from `df` because `subset` is only assigned inside a later plotting
# cell, so referencing it here fails on a fresh top-to-bottom run.
df["engine"].unique()

In [None]:
# Per-request mean output throughput by concurrency for the trtllm and vllm
# engines, restricted to tp == 4 and dtype == 'bf16'.
tp4_bf16_rows = df.query("tp == 4 and engine in ['trtllm', 'vllm'] and dtype == 'bf16'")
tp4_bf16_rows[[
    'engine',
    'num_concurrent_requests',
    'results_request_output_throughput_token_per_s_mean',
]]

In [None]:
# Compare all engines at a single tensor-parallel size: one subplot per metric,
# one line per (engine, dtype) combination that actually has data.
tp_size = 4

metrics = [
    "results_request_output_throughput_token_per_s_mean",
    "results_inter_token_latency_s_quantiles_p95",
    "results_ttft_s_quantiles_p95",
    "results_end_to_end_latency_s_quantiles_p95",
]

titles = [
    "Mean throughput vs Concurrency",
    "Inter-token Latency (p95) vs Concurrency",
    "TTFT (p95) vs Concurrency",
    "End-to-End Latency (p95) vs Concurrency",
]

y_labels = [
    "Throughput (tokens/s)",
    "Inter-token Latency (s)",
    "TTFT (s)",
    "End-to-End Latency (s)",
]

# Reference levels drawn as dashed horizontal lines on the matching subplot.
thresholds = {
    "results_request_output_throughput_token_per_s_mean": 40,
    "results_inter_token_latency_s_quantiles_p95": 0.025,
    "results_ttft_s_quantiles_p95": 0.2,
}

# Latency metrics are drawn on a logarithmic y-axis.
log_scale_metrics = {
    "results_inter_token_latency_s_quantiles_p95",
    "results_ttft_s_quantiles_p95",
    "results_end_to_end_latency_s_quantiles_p95",
}

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

for ax, metric, title, y_label in zip(axes.flat, metrics, titles, y_labels):
    for engine in df["engine"].unique():
        # `subset` is kept as a cell-level name because a later cell inspects it.
        subset = df.query("tp == @tp_size and engine == @engine").copy()

        for dtype in ("bf16", "fp8"):
            subset_dtype = subset.query("dtype == @dtype")
            # Skip combinations with no runs so the legend only lists lines
            # that are actually drawn.
            if subset_dtype.empty:
                continue
            ax.plot(
                subset_dtype["num_concurrent_requests"],
                subset_dtype[metric],
                marker='o',
                label=f"engine={engine}, dtype={dtype}",
            )

    # Add horizontal threshold lines
    if metric in thresholds:
        # Pick the unit from the metric itself rather than its list position.
        unit = "tokens/s" if "throughput" in metric else "s"
        ax.axhline(
            y=thresholds[metric],
            color='r',
            linestyle='--',
            label=f"Threshold: {thresholds[metric]} {unit}",
        )

    ax.set_xlabel("Number of Concurrent Requests")
    ax.set_ylabel(y_label)
    ax.grid(True, linestyle="--", alpha=0.6)
    ax.set_title(f"TP={tp_size}, {title}")

    if metric in log_scale_metrics:
        ax.set_yscale("log")

    ax.set_xscale("log")

    ax.legend(title="Engine and dtype")

fig.tight_layout()
fig.savefig(f'tp{tp_size}_comparisons.png')
plt.show()

In [None]:
# Inspect the per-request mean output throughput of whatever `subset` the
# preceding plotting cell assigned last.
subset["results_request_output_throughput_token_per_s_mean"]

In [None]:
# Compare tensor-parallel sizes for the vllm engine (bf16 only): one subplot
# per metric, one line per TP value that has data.
metrics = [
    "results_request_output_throughput_token_per_s_mean",
    "results_inter_token_latency_s_quantiles_p95",
    "results_ttft_s_quantiles_p95",
    "results_mean_output_throughput_token_per_s",
]

titles = [
    "Throughput vs Concurrency",
    "Inter-token Latency (p95) vs Concurrency",
    "TTFT (p95) vs Concurrency",
    "Mean total throughput vs Concurrency",
]

y_labels = [
    "Throughput (tokens/s)",
    "Inter-token Latency (s)",
    "TTFT (s)",
    "Throughput (tokens/s)",
]

# Reference levels drawn as dashed horizontal lines on the matching subplot.
thresholds = {
    "results_request_output_throughput_token_per_s_mean": 40,
    "results_inter_token_latency_s_quantiles_p95": 0.025,
    "results_ttft_s_quantiles_p95": 0.2,
}

# Latency metrics are drawn on a logarithmic y-axis.
log_scale_metrics = {
    "results_inter_token_latency_s_quantiles_p95",
    "results_ttft_s_quantiles_p95",
    "results_end_to_end_latency_s_quantiles_p95",
}

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

for ax, metric, title, y_label in zip(axes.flat, metrics, titles, y_labels):
    # Sort the TP values so legend entries appear in numeric order instead of
    # the order in which they happen to occur in df.
    for tp in sorted(df["tp"].unique()):
        subset = df.query("engine == 'vllm' and tp == @tp").copy()
        subset_bf16 = subset.query("dtype=='bf16'")

        # TP sizes that only other engines were run at would otherwise add an
        # empty line and a misleading legend entry.
        if subset_bf16.empty:
            continue

        ax.plot(
            subset_bf16["num_concurrent_requests"],
            subset_bf16[metric],
            marker='o',
            label=f"TP={tp}, dtype=bf16",
        )

    # Add horizontal threshold lines
    if metric in thresholds:
        # Pick the unit from the metric itself rather than its list position.
        unit = "tokens/s" if "throughput" in metric else "s"
        ax.axhline(
            y=thresholds[metric],
            color='r',
            linestyle='--',
            label=f"Threshold: {thresholds[metric]} {unit}",
        )

    ax.set_xlabel("Number of Concurrent Requests")
    ax.set_ylabel(y_label)
    ax.grid(True, linestyle="--", alpha=0.6)
    ax.set_title(f"vLLM {title}")

    if metric in log_scale_metrics:
        ax.set_yscale("log")

    ax.set_xscale("log")

    # The lines in this figure differ by TP size, not by engine.
    ax.legend(title="TP and dtype")

fig.tight_layout()
# Uncomment to write the figure to disk.
# fig.savefig('vllm_comparisons.png')
plt.show()

In [None]:
# Concurrency level plus every currently selected metric for the
# 'trtllm_lookahead' engine.
lookahead_rows = df.query("engine=='trtllm_lookahead'")
lookahead_rows[['num_concurrent_requests', *metrics]]

In [None]:
# Names of the columns that mention input or output token counts.
token_count_markers = ('number_input_tokens', 'number_output_tokens')
cols = [
    column_name
    for column_name in df.columns
    if any(marker in column_name for marker in token_count_markers)
]