### RQ1 - Comparing different quantization levels

**Quantization Dimension**

How does quantization in different models and architectures affect system and task-specific metrics?

In [3]:
from benchmark.benchmark import ModelBenchmark
import torch


def run_benchmark(backend, model_name, task, base_path, samples=500, verbose=False, batch_size=100):
    """Run one benchmark configuration and report the outcome on stdout.

    A ``ModelBenchmark`` is built for the model stored under
    ``{base_path}/models/{model_name}`` and run once. The benchmark is always
    closed and the CUDA cache emptied afterwards, including when the run
    raises or the cell is interrupted, so a failed configuration does not keep
    its model resident while the next one loads.

    Args:
        backend: Backend name forwarded to ``ModelBenchmark`` (e.g. "vllm").
        model_name: Model directory name below ``{base_path}/models``.
        task: Task name forwarded to ``ModelBenchmark``.
        base_path: Project root forwarded to ``ModelBenchmark``.
        samples: Forwarded to ``ModelBenchmark.run``.
        verbose: Forwarded to ``ModelBenchmark``.
        batch_size: Forwarded to ``ModelBenchmark.run``.

    Returns:
        None. Errors derived from ``Exception`` are printed rather than
        raised so that a sweep over many configurations keeps going.
    """
    print(f"Running benchmark for {model_name} with {backend} on {task}")
    bm = None
    succeeded = False
    try:
        bm = ModelBenchmark(
            backend=backend,
            model_name=model_name,
            model_path=f"{base_path}/models/{model_name}",
            base_path=base_path,
            task=task,
            verbose=verbose,
        )
        bm.run(samples=samples, batch_size=batch_size)
        succeeded = True
    except Exception as e:
        print(f"❌ Failed: {model_name} | {backend} | {task} -- {e}")
    finally:
        # Runs on success, on failure and on KeyboardInterrupt alike.
        if bm is not None:
            try:
                bm.close()
            except Exception as close_error:
                succeeded = False
                print(f"❌ Failed: {model_name} | {backend} | {task} -- {close_error}")
            # Drop the last reference before emptying the cache.
            bm = None
        torch.cuda.empty_cache()

    if succeeded:
        print(f"✅ Completed: {model_name} | {backend} | {task}")

2025-05-12 15:48:54.090065: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747064934.110013  260255 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747064934.115811  260255 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747064934.135655  260255 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747064934.135683  260255 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747064934.135685  260255 computation_placer.cc:177] computation placer alr

INFO 05-12 15:48:57 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 05-12 15:48:57 [__init__.py:239] Automatically detected platform cuda.
[2025-05-12 15:49:00,664] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [7]:
# Project root; run_benchmark looks for each model under "<base_path>/models/<name>".
base_path = "/home/ubuntu/fast_llm_inference/"

# Only vLLM is active for RQ1; "huggingface", "deepspeed_mii" and "llama.cpp" are disabled.
backends = ["vllm"]

# Model directory names, grouped by family. Within a family the order is the run order.
llama_models = [
    "llama-3.1-8B-Instruct",
    "llama-3.1-8B-Instruct-4bit",
    "llama-3.1-8B-Instruct-8bit",
    "llama-3.2-3b-instruct",
    "llama-3.2-1b-instruct",
    "llama-3.2-3b-instruct-4bit",
    "llama-3.2-1b-instruct-4bit",
    "llama-3.2-3b-instruct-8bit",
    "llama-3.2-1b-instruct-8bit",
]

qwen_models = [
    "Qwen2.5-7B-Instruct",
    "Qwen2.5-7B-Instruct-4bit",
    "Qwen2.5-7B-Instruct-8bit",  # author's note: "some weird error"
    "Qwen2.5-3B-Instruct",
    "Qwen2.5-1.5B-Instruct",
    "Qwen2.5-0.5B-Instruct",
    "Qwen2.5-3B-Instruct-4bit",
    "Qwen2.5-1.5B-Instruct-4bit",
    "Qwen2.5-0.5B-Instruct-4bit",
    "Qwen2.5-3B-Instruct-8bit",
    "Qwen2.5-1.5B-Instruct-8bit",
    "Qwen2.5-0.5B-Instruct-8bit",
]

gemma_models = [
    # NOTE(review): the completeness-check cell lists this model as
    # "gemma-2-9b-it-4bit" -- confirm which directory name is correct.
    "gemma-2-9b-it-bnb4",
    "gemma-2-9b-it-8bit",
    "gemma-2-9b-it",  # author's note: "too large"
    "gemma-2-2b-it-4bit",
    "gemma-2-2b-it-8bit",
    "gemma-2-2b-it",
]

models = [*llama_models, *qwen_models, *gemma_models]

tasks = ["summarization", "qa", "sql"]

first run

In [None]:
# Full RQ1 sweep: every backend / model / task combination, in that nesting order.
sweep = [
    (backend, model, task)
    for backend in backends
    for model in models
    for task in tasks
]

# Settings shared by every run of the sweep.
shared_run_args = dict(
    base_path=base_path,
    samples=100,
    verbose=False,
    batch_size=100,
)

for backend, model, task in sweep:
    run_benchmark(backend=backend, model_name=model, task=task, **shared_run_args)

check if anything is missing

In [12]:
import os

# Define your parameters
backends = ["vllm"]
models = [
    "llama-3.1-8B-Instruct",
    "llama-3.1-8B-Instruct-4bit",
    "llama-3.1-8B-Instruct-8bit",
    "llama-3.2-3b-instruct",
    "llama-3.2-1b-instruct",
    "llama-3.2-3b-instruct-4bit",
    "llama-3.2-1b-instruct-4bit",
    "llama-3.2-3b-instruct-8bit",
    "llama-3.2-1b-instruct-8bit",
    "Qwen2.5-7B-Instruct",
    "Qwen2.5-7B-Instruct-4bit",
    "Qwen2.5-7B-Instruct-8bit",
    "Qwen2.5-3B-Instruct",
    "Qwen2.5-1.5B-Instruct",
    "Qwen2.5-0.5B-Instruct",
    "Qwen2.5-3B-Instruct-4bit",
    "Qwen2.5-1.5B-Instruct-4bit",
    "Qwen2.5-0.5B-Instruct-4bit",
    "Qwen2.5-3B-Instruct-8bit",
    "Qwen2.5-1.5B-Instruct-8bit",
    "Qwen2.5-0.5B-Instruct-8bit",
    "gemma-2-9b-it-4bit",
    "gemma-2-9b-it-8bit",
    "gemma-2-9b-it",
    "gemma-2-2b-it-4bit",
    "gemma-2-2b-it-8bit",
    "gemma-2-2b-it",
]
tasks = ["summarization", "qa", "sql"]

# Result files are expected at "<results_dir>/<backend>_<model>_<task>.csv".
results_dir = "./results/experiment_1/"

# A model counts as missing as soon as one of its per-task CSVs is absent
# for any backend; any() stops at the first absent file.
missing_models = {
    model
    for backend in backends
    for model in models
    if any(
        not os.path.exists(os.path.join(results_dir, f"{backend}_{model}_{task}.csv"))
        for task in tasks
    )
}

# Report the incomplete models in alphabetical order.
if not missing_models:
    print("✅ All models are complete.")
else:
    print("Models with missing files:")
    print("\n".join(sorted(missing_models)))

Models with missing files:
Qwen2.5-7B-Instruct-8bit
gemma-2-9b-it


try again with the missing models

In [None]:
# Re-run every task for the models that the completeness check flagged.
# sorted() makes the retry order reproducible (set iteration order is not).
for backend in backends:
    for model in sorted(missing_models):
        for task in tasks:
            run_benchmark(
                backend=backend,
                model_name=model,
                task=task,
                base_path=base_path,
                # Same sample count as the first run, so that retried models
                # stay comparable with the rest of the sweep.
                samples=100,
                verbose=False,
                batch_size=100,
            )

### RQ2 - Comparing different inference engines

**Framework Dimension** 

Which inference framework (Transformers, vLLM, DeepSpeed MII,
LMDeploy, llama.cpp) strikes the best balance between system resource usage (e.g., GPU
utilization, joules/token) and system performance (tokens/s)?

In [None]:
# Project root. run_benchmark appends "/models/<model_name>" itself, so this
# must not already end in "/models".
base_path = "/home/ubuntu/fast_llm_inference"

# "deepspeed_mii" is currently disabled.
backends = ["vllm", "huggingface", "llama.cpp"]

# Full-precision checkpoints used by every backend except llama.cpp.
models   = [
    "gemma-2-9b-it",
    "gemma-2-2b-it",

    "llama-3.1-8B-Instruct",
    "llama-3.2-3b-instruct",
    "llama-3.2-1b-instruct",

    "Qwen2.5-7B-Instruct",
    "Qwen2.5-3B-Instruct",
    "Qwen2.5-1.5B-Instruct",
    "Qwen2.5-0.5B-Instruct",
]

# GGUF files of the same models, used by llama.cpp.
gguf_models = [
    "gemma-2-2b-it-fp16.gguf",
    "gemma-2-9b-it-fp16.gguf",

    "llama-3.1-8B-Instruct-f16.gguf",
    "Llama-3.2-1B-Instruct-f16.gguf",
    "Llama-3.2-3B-Instruct-f16.gguf",

    "qwen2.5-0.5b-instruct-fp16.gguf",
    "qwen2.5-1.5b-instruct-fp16.gguf",
    "qwen2.5-3b-instruct-fp16.gguf",
    "qwen2.5-7B-instruct-fp16.gguf",
]

tasks    = ["summarization", "qa", "sql",]

for backend in backends:
    # Choose the list per backend instead of rebinding `models`, so the result
    # does not depend on "llama.cpp" being the last entry in `backends`.
    backend_models = gguf_models if backend == "llama.cpp" else models

    for model in backend_models:
        for task in tasks:
            run_benchmark(
                backend=backend,
                model_name=model,
                task=task,
                base_path=base_path,
                samples=100,
                verbose=False,
                batch_size=20,
            )

### RQ3 - Comparing different use cases

**Scenario/Workload Dimension**

How do locally deployed LLMs and inference backends
perform and scale across the three dominant inference scenarios — single-stream (single user),
batched offline processing, and multi-user server workloads? Do system metrics — throughput,
GPU utilization, joules/token — evolve as the average number of queries per second varies
over time?

In [1]:
from typing import Optional
from benchmark.benchmark import ModelBenchmark
import torch


def run_benchmark(
    backend: str,
    model_name: str,
    task: str,
    base_path: str,
    scenario: str = "batch",                  # "single", "batch", or "server"
    run_time: Optional[float] = None,         # only for server: total time in seconds
    requests_per_sec: Optional[float] = None, # only for server: λ (req/s)
    batch_size: int = 100,                    # only for batch
    max_batch_size: Optional[int] = None,     # only for server: cap per-batch size
    sample_interval: float = 0.1,             # telemetry interval (s)
    export_path: Optional[str] = None,        # forwarded to ModelBenchmark.run
    verbose: bool = False
):
    """Run one benchmark configuration in the given scenario.

    The arguments are validated first, so an invalid request fails before the
    model is loaded. A ``ModelBenchmark`` is then built for
    ``{base_path}/models/{model_name}`` and run once. The benchmark is always
    closed and the CUDA cache emptied afterwards, also when the run raises or
    is interrupted.

    Args:
        backend: Backend name forwarded to ``ModelBenchmark``.
        model_name: Model directory or file name below ``{base_path}/models``.
        task: Task name forwarded to ``ModelBenchmark``.
        base_path: Project root forwarded to ``ModelBenchmark``.
        scenario: One of "single", "batch" or "server".
        run_time: Required in server mode; forwarded to ``run``.
        requests_per_sec: Required in server mode; forwarded to ``run``.
        batch_size: Forwarded to ``run`` in batch mode; single mode uses 1.
        max_batch_size: Forwarded to ``run`` in server mode only.
        sample_interval: Forwarded to ``run`` in every scenario.
        export_path: Forwarded to ``run`` in every scenario.
        verbose: Forwarded to ``ModelBenchmark``.

    Returns:
        Whatever ``ModelBenchmark.run`` returns, or None when validation,
        loading, running or closing fails. Failures are printed, not raised.
    """
    print(f"Running benchmark for {model_name} with {backend} on {task} [{scenario}]")
    bm = None
    df = None
    succeeded = False
    try:
        # Build the run() arguments up front; this validates the request
        # without paying for a model load.
        if scenario == "server":
            if run_time is None:
                raise ValueError("Must set run_time in server mode")
            if requests_per_sec is None:
                raise ValueError("Must set requests_per_sec in server mode")
            run_kwargs = dict(
                scenario="server",
                run_time=run_time,
                requests_per_sec=requests_per_sec,
                sample_interval=sample_interval,
                max_batch_size=max_batch_size,
                export_path=export_path,
            )
        elif scenario == "single":
            run_kwargs = dict(
                samples=100,        # samples ignored
                batch_size=1,
                scenario="single",
                sample_interval=sample_interval,
                export_path=export_path,
            )
        elif scenario == "batch":
            run_kwargs = dict(
                samples=100,        # samples ignored
                batch_size=batch_size,
                scenario="batch",
                sample_interval=sample_interval,
                export_path=export_path,
            )
        else:
            raise ValueError(f"Unknown scenario: {scenario}")

        bm = ModelBenchmark(
            backend=backend,
            model_name=model_name,
            model_path=f"{base_path}/models/{model_name}",
            base_path=base_path,
            task=task,
            verbose=verbose,
        )
        df = bm.run(**run_kwargs)
        succeeded = True
    except Exception as e:
        print(f"❌ Failed: {model_name} | {backend} | {task} | {scenario} -- {e}")
    finally:
        # Runs on success, on failure and on KeyboardInterrupt alike.
        if bm is not None:
            try:
                bm.close()
            except Exception as close_error:
                succeeded = False
                print(f"❌ Failed: {model_name} | {backend} | {task} | {scenario} -- {close_error}")
            # Drop the last reference before emptying the cache.
            bm = None
        torch.cuda.empty_cache()

    if not succeeded:
        return None
    print(f"✅ Completed: {model_name} | {backend} | {task} | {scenario}")
    return df


2025-05-16 03:42:40.554419: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747366960.579138 1160723 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747366960.586688 1160723 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747366960.607277 1160723 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747366960.607300 1160723 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747366960.607303 1160723 computation_placer.cc:177] computation placer alr

INFO 05-16 03:42:48 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 05-16 03:42:49 [__init__.py:239] Automatically detected platform cuda.
[2025-05-16 03:42:55,873] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [None]:
# --- Server-scenario sweep settings ---
base_path = "/home/ubuntu/fast_llm_inference"
backends = ["vllm", "llama.cpp", "huggingface"]
tasks = ["sql"]
server_rps = [1, 2, 4, 8]   # request rates (QPS) to test
run_time = 120.0            # seconds per run
sample_interval = 0.05      # telemetry sampling interval in seconds
max_batch_size = 64         # cap per batch

# GGUF files used by llama.cpp.
gguf_models = [
    "gemma-2-2b-it-fp16.gguf",
    "gemma-2-9b-it-fp16.gguf",
    "llama-3.1-8B-Instruct-f16.gguf",
    "Llama-3.2-1B-Instruct-f16.gguf",
    "Llama-3.2-3B-Instruct-f16.gguf",
    "qwen2.5-0.5b-instruct-fp16.gguf",
    "qwen2.5-1.5b-instruct-fp16.gguf",
    "qwen2.5-3b-instruct-fp16.gguf",
    "qwen2.5-7B-instruct-fp16.gguf",
]

# Checkpoints shared by the vLLM and Hugging Face backends.
hf_models = [
    "gemma-2-2b-it",
    "llama-3.1-8B-Instruct",
    "llama-3.2-3b-instruct",
    "llama-3.2-1b-instruct",
    "Qwen2.5-7B-Instruct",
    "Qwen2.5-3B-Instruct",
    "Qwen2.5-1.5B-Instruct",
    "Qwen2.5-0.5B-Instruct",
]

for backend in backends:
    if backend == "llama.cpp":
        models = list(gguf_models)
    elif backend == "vllm":
        models = list(hf_models)
    else:
        # gemma-2-9b-it is left out for vLLM only (author's note: too large for vllm).
        models = hf_models + ["gemma-2-9b-it"]

    for model in models:
        for task in tasks:
            for rps in server_rps:
                export_path = f"{base_path}/results/{backend}_{model}_{task}_{rps}QPS_{int(run_time)}s_server.csv"
                print(f"→ {backend} | {model} | {task} @ {rps} QPS for {run_time}s -> {export_path}")
                server_args = dict(
                    scenario="server",
                    run_time=run_time,
                    requests_per_sec=rps,
                    sample_interval=sample_interval,
                    max_batch_size=max_batch_size,
                    export_path=export_path,
                    verbose=False,
                )
                run_benchmark(
                    backend=backend,
                    model_name=model,
                    task=task,
                    base_path=base_path,
                    **server_args,
                )

: 

part 2 - batch processing

In [None]:
# --- Batch-scenario sweep settings ---
base_path = "/home/ubuntu/fast_llm_inference"
backends = ["huggingface"]          # "vllm" and "llama.cpp" are disabled here
tasks = ["summarization"]
batch_sizes = [1, 8, 16, 32, 64]    # batch sizes to compare
sample_interval = 0.05              # telemetry sampling interval in seconds
max_batch_size = 64                 # same cap as the server sweep; run_benchmark
                                    # only forwards it in server mode

# GGUF files used by llama.cpp.
gguf_models = [
    "gemma-2-2b-it-fp16.gguf",
    "gemma-2-9b-it-fp16.gguf",
    "llama-3.1-8B-Instruct-f16.gguf",
    "Llama-3.2-1B-Instruct-f16.gguf",
    "Llama-3.2-3B-Instruct-f16.gguf",
    "qwen2.5-0.5b-instruct-fp16.gguf",
    "qwen2.5-1.5b-instruct-fp16.gguf",
    "qwen2.5-3b-instruct-fp16.gguf",
    "qwen2.5-7B-instruct-fp16.gguf",
]

# Checkpoints for the other backends ("gemma-2-2b-it" is excluded from this sweep).
hf_models = [
    "llama-3.1-8B-Instruct",
    "llama-3.2-3b-instruct",
    "llama-3.2-1b-instruct",
    "Qwen2.5-7B-Instruct",
    "Qwen2.5-3B-Instruct",
    "Qwen2.5-1.5B-Instruct",
    "Qwen2.5-0.5B-Instruct",
]

for backend in backends:
    if backend == "llama.cpp":
        models = list(gguf_models)
    elif backend == "vllm":
        models = list(hf_models)
    else:
        # gemma-2-9b-it is left out for vLLM only (author's note: too large for vllm).
        models = hf_models + ["gemma-2-9b-it"]

    for model in models:
        for task in tasks:
            for bs in batch_sizes:
                export_path = f"{base_path}/results/{backend}_{model}_{task}_{bs}batch.csv"
                print(f"→ {backend} | {model} | {task} @ batch={bs} -> {export_path}")
                batch_args = dict(
                    scenario="batch",
                    batch_size=bs,
                    sample_interval=sample_interval,
                    max_batch_size=max_batch_size,
                    export_path=export_path,
                    verbose=False,
                )
                run_benchmark(
                    backend=backend,
                    model_name=model,
                    task=task,
                    base_path=base_path,
                    **batch_args,
                )


In [1]:
from benchmark.backends.vllm_backend import VLLMBackend

# Load the llama-3.1-8B-Instruct-4bit checkpoint through the project's vLLM backend wrapper.
llama_4bit_path = "/home/ubuntu/fast_llm_inference/models/llama-3.1-8B-Instruct-4bit"
model = VLLMBackend(llama_4bit_path)
model.load_model()

INFO 05-19 08:54:15 [__init__.py:239] Automatically detected platform cuda.


2025-05-19 08:54:16.711341: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747644856.737085   58511 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747644856.744882   58511 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747644856.767440   58511 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747644856.767460   58511 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747644856.767463   58511 computation_placer.cc:177] computation placer alr

INFO 05-19 08:54:35 [config.py:717] This model supports multiple tasks: {'score', 'generate', 'embed', 'reward', 'classify'}. Defaulting to 'generate'.
INFO 05-19 08:54:37 [config.py:2003] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 05-19 08:54:38 [core.py:58] Initializing a V1 LLM engine (v0.8.5.post1) with config: model='/home/ubuntu/fast_llm_inference/models/llama-3.1-8B-Instruct-4bit', speculative_config=None, tokenizer='/home/ubuntu/fast_llm_inference/models/llama-3.1-8B-Instruct-4bit', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.BITSANDBYTES, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=bitsandbytes, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='auto', reasoning

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 05-19 08:54:45 [gpu_model_runner.py:1347] Model loading took 5.3132 GiB and 4.338470 seconds
INFO 05-19 08:54:55 [backends.py:420] Using cache directory: /home/ubuntu/.cache/vllm/torch_compile_cache/a89f85ea99/rank_0_0 for vLLM's torch.compile
INFO 05-19 08:54:55 [backends.py:430] Dynamo bytecode transform time: 9.70 s
INFO 05-19 08:55:04 [backends.py:118] Directly load the compiled graph(s) for shape None from the cache, took 6.534 s
INFO 05-19 08:55:07 [monitor.py:33] torch.compile takes 9.70 s in total
INFO 05-19 08:55:10 [kv_cache_utils.py:634] GPU KV cache size: 106,880 tokens
INFO 05-19 08:55:10 [kv_cache_utils.py:637] Maximum concurrency for 8,192 tokens per request: 13.05x
INFO 05-19 08:56:21 [gpu_model_runner.py:1686] Graph capturing finished in 71 secs, took 1.54 GiB
INFO 05-19 08:56:21 [core.py:159] init engine (profile, create kv cache, warmup model) took 95.70 seconds
INFO 05-19 08:56:21 [core_client.py:439] Core engine process 0 ready.


In [None]:
# Single-prompt generation with the backend's perplexity option switched on.
question = "What is the purpose of life?"
model.generate(question, perplexity=True)

In [None]:
# Cell 1: Imports and setup
import os
import math
from vllm import LLM, SamplingParams

# Checkpoint directory handed to vLLM; change this to try another model.
MODEL_PATH = "/home/ubuntu/fast_llm_inference/models/llama-3.1-8B-Instruct-4bit"

# Engine options, kept together so they are easy to tweak.
llm_options = dict(
    model=MODEL_PATH,
    trust_remote_code=True,
    gpu_memory_utilization=0.9,
    max_model_len=4096,
)
model = LLM(**llm_options)

# Prompts used by the log-probability experiments in the following cells.
prompts = ["The quick brown fox jumps over the lazy dog.", "What is the purpose of life?"]


INFO 05-18 11:26:48 [__init__.py:239] Automatically detected platform cuda.


2025-05-18 11:26:49.212117: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747567609.229561   12031 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747567609.235248   12031 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747567609.250681   12031 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747567609.250693   12031 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747567609.250695   12031 computation_placer.cc:177] computation placer alr

In [None]:
# Sampling settings: request logprobs for generated tokens and for prompt tokens.
sampling_options = dict(
    temperature=0.1,
    max_tokens=32,
    logprobs=1,
    prompt_logprobs=1,
)
params = SamplingParams(**sampling_options)

# Generate once for all prompts.
outputs = model.generate(prompts, params)

# Walk the results; after the loop `sample`, `text` and `lp_dict` refer to
# the LAST prompt's first completion, which is what the next cell inspects.
for i in range(len(outputs)):
    gen_out = outputs[i]
    sample = gen_out.outputs[0]
    text = sample.text.lstrip()
    lp_dict = sample.logprobs

Processed prompts:   0%|          | 0/2 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

In [None]:
# Show the logprob entries of the last completion processed by the loop in the previous cell.
lp_dict

[{2209: Logprob(logprob=-1.4004144668579102, rank=1, decoded_token='ĠIs')},
 {433: Logprob(logprob=-0.17977960407733917, rank=1, decoded_token='Ġit')},
 {311: Logprob(logprob=-0.0869283527135849, rank=1, decoded_token='Ġto')},
 {1505: Logprob(logprob=-0.9659126996994019, rank=1, decoded_token='Ġfind')},
 {23871: Logprob(logprob=-0.04803086444735527, rank=1, decoded_token='Ġhappiness')},
 {11: Logprob(logprob=-0.058653172105550766, rank=1, decoded_token=',')},
 {311: Logprob(logprob=-0.8425228595733643, rank=1, decoded_token='Ġto')},
 {11322: Logprob(logprob=-0.7215710878372192, rank=1, decoded_token='Ġachieve')},
 {2450: Logprob(logprob=-0.043824948370456696, rank=1, decoded_token='Ġsuccess')},
 {11: Logprob(logprob=-0.0013250865740701556, rank=1, decoded_token=',')},
 {477: Logprob(logprob=-0.7743627429008484, rank=2, decoded_token='Ġor'),
  311: Logprob(logprob=-0.6181127429008484, rank=1, decoded_token='Ġto')},
 {311: Logprob(logprob=-0.031453102827072144, rank=1, decoded_token='Ġto

In [None]:
# Run generation and print, per prompt, a per-token logprob table plus the
# sequence-level perplexity of the generated continuation.
outputs = model.generate(prompts, params)

for i, gen_out in enumerate(outputs):
    sample    = gen_out.outputs[0]
    text      = sample.text.lstrip()
    lp_list   = sample.logprobs or []      # list of {token_id: Logprob(...)} dicts
    token_ids = sample.token_ids           # token ids that were actually generated

    # 1) Extract the chosen-token strings & logprobs.
    #    Look each entry up by the generated token id. Picking the rank-1 entry
    #    is wrong whenever sampling chose a lower-ranked token: such an entry
    #    holds both the sampled token and the top-1 token.
    tokens, logps = [], []
    for tid, entry in zip(token_ids, lp_list):
        lp_obj = entry[tid]
        decoded = lp_obj.decoded_token
        # Fall back to the id so the table formatting never receives None.
        tokens.append(decoded if decoded is not None else str(tid))
        logps.append(lp_obj.logprob)

    # 2) Compute per-token perplexity (inverse probability of each token)
    ppl = [math.exp(-lp) for lp in logps]

    # 3) Print per-token table
    print(f"\n=== Prompt {i+1}: {prompts[i]} ===")
    print(f"Generated: {text}\n")
    print(f"{'Token':>12} | {'LogProb':>8} | {'PPL':>8}")
    print("-" * 34)
    for tok, lp, p in zip(tokens, logps, ppl):
        print(f"{tok:>12} | {lp:8.4f} | {p:8.4f}")

    # 4) Compute sequence-level perplexity: exp of the mean negative logprob.
    if logps:
        ppl_seq = math.exp(- sum(logps) / len(logps))
        print(f"\nSequence-level Perplexity: {ppl_seq:.4f}")
    else:
        # Nothing was generated (or logprobs were not returned): avoid 0/0.
        print("\nSequence-level Perplexity: n/a (no generated-token logprobs)")

Processed prompts:   0%|          | 0/2 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


=== Prompt 1: The quick brown fox jumps over the lazy dog. ===
Generated: The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick

       Token |  LogProb |      PPL
----------------------------------
        ĠThe |  -0.9727 |   2.6450
      Ġquick |  -0.3296 |   1.3905
      Ġbrown |  -0.0029 |   1.0029
        Ġfox |  -0.0016 |   1.0016
      Ġjumps |  -0.0055 |   1.0055
       Ġover |  -0.0005 |   1.0005
        Ġthe |  -0.0005 |   1.0005
       Ġlazy |  -0.0005 |   1.0005
        Ġdog |  -0.0004 |   1.0004
           . |  -0.3360 |   1.3993
        ĠThe |  -0.0372 |   1.0379
      Ġquick |  -0.0015 |   1.0015
      Ġbrown |  -0.0012 |   1.0012
        Ġfox |  -0.0007 |   1.0007
      Ġjumps |  -0.0013 |   1.0013
       Ġover |  -0.0008 |   1.0008
        Ġthe |  -0.0010 |   1.0010
       Ġlazy |  -0.0010 |   1.0010
        Ġdog |  -0.0005 |   1.0005
           . |  -0.2609 |   1.2981
        Ġ

In [1]:
from benchmark.benchmark import ModelBenchmark
import torch

# Hugging Face benchmark for llama-3.2-3b-instruct; the task is passed to
# each bm.run() call instead of the constructor.
benchmark_config = {
    "backend": "huggingface",
    "model_name": "llama-3.2-3b-instruct",
    "model_path": "/home/ubuntu/fast_llm_inference/models/llama-3.2-3b-instruct",
    "base_path": "/home/ubuntu/fast_llm_inference/",
    "verbose": False,
}
bm = ModelBenchmark(**benchmark_config)

INFO 05-25 11:04:42 [__init__.py:239] Automatically detected platform cuda.


2025-05-25 11:04:42.809369: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748171082.830786   43313 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748171082.836811   43313 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1748171082.853168   43313 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748171082.853191   43313 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748171082.853193   43313 computation_placer.cc:177] computation placer alr

In [2]:
# Batch scenario on the summarization task: 16 samples in batches of 8,
# with quality metrics switched on.
batch_run_options = dict(
    task="summarization",
    scenario="batch",
    samples=16,
    batch_size=8,
    sample_interval=0.1,
    quality_metric=True,
)
run_report, details_df = bm.run(**batch_run_options)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


=== Batch Run Report ===
   startup_time_sec  load_model_time_sec  ttft_sec  cold_start_sec  \
0            0.0028               3.6709    0.1596          3.8332   

              model_name  model_size_mb      backend  batch_size  num_queries  \
0  llama-3.2-3b-instruct    6138.690614  huggingface           8           16   

   batch_id  ...   avg_ATL    avg_GL  avg_TPS  avg_SPS  avg_energy_per_token  \
0         0  ...  0.044919  2.943486    22.27  1.08375              3.162506   

   avg_energy_per_sentence  avg_estimated_cost_usd  avg_ROUGE-1  avg_ROUGE-2  \
0                66.314669                0.002354     0.362087     0.118788   

   avg_ROUGE-L  
0     0.224013  

[1 rows x 33 columns]

=== Per-Query Details ===
   batch_id                                             prompt  \
0         0  ### SYSTEM\nYou are a news-summarization assis...   
1         0  ### SYSTEM\nYou are a news-summarization assis...   
2         0  ### SYSTEM\nYou are a news-summarization assis...   
3

In [None]:
# Mean of every numeric column of the batch run report.
run_report.mean(numeric_only=True)

startup_time_sec              0.002800
load_model_time_sec           3.670900
ttft_sec                      0.159600
cold_start_sec                3.833200
model_size_mb              6138.690614
batch_size                    8.000000
num_queries                  16.000000
batch_id                      0.000000
batch_time_s                 24.550367
batch_tokens                537.000000
batch_sentences              25.000000
avg_gpu_mem_mb             7213.020000
peak_gpu_mem_mb            7237.880000
overhead_mb                1099.190000
avg_gpu_util_pct             90.580000
peak_gpu_util_pct           100.000000
avg_cpu_util_pct              6.220000
peak_cpu_util_pct             6.840000
avg_power_w                  70.370000
peak_power_w                 73.810000
total_energy_wh               0.479924
avg_ATL                       0.044919
avg_GL                        2.943486
avg_TPS                      22.270000
avg_SPS                       1.083750
avg_energy_per_token     

: 

In [4]:
# Per-query details returned by the batch run.
details_df

Unnamed: 0,batch_id,prompt,generated_answer,reference_answer,ATL,GL,TPS,SPS,energy_per_token,energy_per_sentence,estimated_cost_usd,ROUGE-1,ROUGE-2,ROUGE-L
0,0,### SYSTEM\nYou are a news-summarization assis...,President Barack Obama honored the New England...,Brady cited 'prior family commitments' in bowi...,0.049031,3.236063,20.4,1.24,3.443882,73.974595,0.002632,0.208333,0.06383,0.125
1,0,### SYSTEM\nYou are a news-summarization assis...,Rangers FC has promised to investigate claims ...,Reports emerged on social media suggesting Mi...,0.049031,3.138,20.4,0.96,3.443882,73.974595,0.002632,0.333333,0.09434,0.185185
2,0,### SYSTEM\nYou are a news-summarization assis...,"Adam Gadahn, a 36-year-old American who was kn...","In his final known video, Adam Gadahn called f...",0.049031,2.794781,20.4,1.07,3.443882,73.974595,0.002632,0.462963,0.188679,0.314815
3,0,### SYSTEM\nYou are a news-summarization assis...,Former Liverpool players John Barnes and Jamie...,John Barnes appeared as a guest on Sky's A Lea...,0.049031,3.824438,20.4,0.78,3.443882,73.974595,0.002632,0.414815,0.180451,0.237037
4,0,### SYSTEM\nYou are a news-summarization assis...,John Higgins and Graeme Dott led the Scottish ...,John Higgins beats Robert Milkins 10-5 to reac...,0.049031,3.285094,20.4,0.91,3.443882,73.974595,0.002632,0.327586,0.140351,0.155172
5,0,### SYSTEM\nYou are a news-summarization assis...,Arsenal will wear their yellow and blue away s...,Arsenal face Aston Villa in the FA Cup final a...,0.049031,3.579281,20.4,0.84,3.443882,73.974595,0.002632,0.59854,0.266667,0.262774
6,0,### SYSTEM\nYou are a news-summarization assis...,Liverpool advanced to the FA Cup semi-finals w...,Coutinho hit the only goal of the game as Live...,0.049031,3.285094,20.4,0.91,3.443882,73.974595,0.002632,0.367347,0.068966,0.217687
7,0,### SYSTEM\nYou are a news-summarization assis...,West Coast Shaving created a project blending ...,Members of 30 rock band members merged into on...,0.049031,3.187031,20.4,0.94,3.443882,73.974595,0.002632,0.395833,0.106383,0.291667
8,1,### SYSTEM\nYou are a news-summarization assis...,A young mother was humiliated after a dental a...,Mother-of-one Tayler Chaice Buzbee called Aspe...,0.044328,3.368903,22.56,1.19,3.125693,63.889157,0.002265,0.440678,0.091429,0.20339
9,1,### SYSTEM\nYou are a news-summarization assis...,"A top breeder in Devon, Tony Tancock, had 12 p...","Heart-broken Tony Tancock, 56, has won prizes ...",0.044328,2.48235,22.56,1.21,3.125693,63.889157,0.002265,0.389381,0.144144,0.265487


In [3]:
# Server scenario on the QA task: 60 s at 5 requests/s, quality metrics off.
try:
    run_report2, details_df2 = bm.run(
        task="qa",
        scenario="server",
        run_time=60.0,
        requests_per_sec=5.0,
        max_batch_size=64,
        sample_interval=0.1,
        quality_metric=False
    )
finally:
    # Release the benchmark even when the run fails or the cell is
    # interrupted; otherwise the loaded model stays resident.
    bm.close()
    del bm
    torch.cuda.empty_cache()  # clear GPU memory after closing the benchmark

print("=== Server Run Report ===")
print(run_report2)
print("\n=== Per-Query Details ===")
print(details_df2.head())

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


KeyboardInterrupt: 

In [None]:
# Summary report of the server-scenario run.
# NOTE(review): the saved output shows a vllm / llama-3.1-8B-Instruct-4bit run,
# not the huggingface benchmark built in this notebook -- re-run to confirm.
run_report2

Unnamed: 0,startup_time_sec,load_model_time_sec,ttft_sec,cold_start_sec,model_name,model_size_mb,backend,scenario,batch_size,num_queries,...,peak_power_w,total_energy_wh,avg_scheduled_ts,avg_wait_time_s,avg_ATL,avg_GL,avg_TPS,avg_SPS,avg_energy_per_token,avg_energy_per_sentence
0,0.5278,79.8238,0.0769,80.4285,llama-3.1-8B-Instruct-4bit,15327.360256,vllm,server,,,...,72.27,0.110956,29.054802,1748086000.0,0.028772,0.091197,34.833268,19.409052,2.015119,6.219429


In [None]:
# Per-query details of the server-scenario run.
details_df2

Unnamed: 0,batch_id,prompt,generated_answer,reference_answer,scheduled_ts,wait_time_s,ATL,GL,TPS,SPS,energy_per_token,energy_per_sentence
0,1,### SYSTEM\nYou are a question-answering assis...,Lothar de Maizière,"[Lothar de Maizière, Lothar de Maizière, Lotha...",0.093854,1.748086e+09,0.026505,0.079514,37.73,12.58,1.767446,6.241295
1,1,### SYSTEM\nYou are a question-answering assis...,Complexity classes.,"[complexity classes, complexity classes, some ...",0.695878,1.748086e+09,0.026505,0.053010,37.73,18.86,1.767446,6.241295
2,1,### SYSTEM\nYou are a question-answering assis...,GTE,[Telenet was incorporated in 1973 and started ...,0.959227,1.748086e+09,0.026505,0.026505,37.73,37.73,1.767446,6.241295
3,1,### SYSTEM\nYou are a question-answering assis...,Water flow through the body cavity.,"[water flow through the body cavity, κτείς kte...",1.141816,1.748086e+09,0.026505,0.159029,37.73,6.29,1.767446,6.241295
4,1,### SYSTEM\nYou are a question-answering assis...,12 May 1705.,"[12 May 1705, 1705, 12 May 1705]",1.175741,1.748086e+09,0.026505,0.079514,37.73,12.58,1.767446,6.241295
...,...,...,...,...,...,...,...,...,...,...,...,...
301,5,### SYSTEM\nYou are a question-answering assis...,Jean Ribault,"[Jean Ribault, Jean Ribault, Jean Ribault]",58.529722,1.748086e+09,0.028178,0.056356,35.49,17.74,1.981041,6.120001
302,5,### SYSTEM\nYou are a question-answering assis...,Pinedale,"[Pinedale, Pinedale, Pinedale]",58.685304,1.748086e+09,0.028178,0.028178,35.49,35.49,1.981041,6.120001
303,5,### SYSTEM\nYou are a question-answering assis...,Spontaneous combustion.,"[spontaneous, spontaneous combustion, spontane...",58.888211,1.748086e+09,0.028178,0.056356,35.49,17.74,1.981041,6.120001
304,5,### SYSTEM\nYou are a question-answering assis...,Vistula Valley,"[Vistula Valley, geomorphologic, Vistula Valley]",59.147203,1.748086e+09,0.028178,0.056356,35.49,17.74,1.981041,6.120001
