### RQ1 - Comparing different quantization levels

In [3]:
from benchmark.benchmark import ModelBenchmark
import torch


def run_benchmark(backend, model_name, task, base_path, samples=500, verbose=False, batch_size=100):
    """Run one benchmark for a (backend, model, task) combination and report the outcome.

    Builds a ``ModelBenchmark`` for the model stored at
    ``<base_path>/models/<model_name>``, runs it, and prints a one-line
    success or failure message. Any ``Exception`` is caught and reported
    instead of propagated, so a sweep over many combinations keeps going
    after a single failure.

    Args:
        backend: Backend name forwarded to ``ModelBenchmark``.
        model_name: Model name; also the directory name below ``<base_path>/models``.
        task: Task name forwarded to ``ModelBenchmark``.
        base_path: Project root; used to build the model path and also
            forwarded to ``ModelBenchmark`` unchanged.
        samples: Forwarded to ``ModelBenchmark.run``.
        verbose: Forwarded to ``ModelBenchmark``.
        batch_size: Forwarded to ``ModelBenchmark.run``.

    Returns:
        None. The outcome is only printed.
    """
    print(f"Running benchmark for {model_name} with {backend} on {task}")
    bm = None
    try:
        try:
            bm = ModelBenchmark(
                backend=backend,
                model_name=model_name,
                model_path=f"{base_path}/models/{model_name}",
                base_path=base_path,
                task=task,
                verbose=verbose,
            )
            bm.run(samples=samples, batch_size=batch_size)
        finally:
            # Close the benchmark even when run() raised; previously a failed
            # run skipped close() entirely. If the constructor itself failed
            # there is nothing to close.
            # NOTE(review): if both run() and close() raise, the message below
            # shows the close() error (the run() error is chained as context).
            if bm is not None:
                bm.close()
        print(f"✅ Completed: {model_name} | {backend} | {task}")
    except Exception as e:
        print(f"❌ Failed: {model_name} | {backend} | {task} -- {e}")
    finally:
        # Drop the last reference *before* emptying the cache, so memory still
        # held by the benchmark object can actually be released.
        del bm
        torch.cuda.empty_cache()

[2025-05-12 06:59:57,327] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
INFO 05-12 06:59:58 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 05-12 06:59:58 [__init__.py:239] Automatically detected platform cuda.


In [4]:
# Project root; run_benchmark loads models from "<base_path>/models/<model_name>".
base_path = "/home/rag/fast_llm_inference/"

# Only vLLM is active for RQ1 (other backend names seen in this notebook:
# "huggingface", "deepspeed_mii", "llama.cpp").
backends = ["vllm"]

# Model directory names, grouped by family; order is the run order.
models = (
    # Llama 3.1 / 3.2
    [
        "llama-3.1-8B-Instruct",
        "llama-3.1-8B-Instruct-4bit",
        "llama-3.1-8B-Instruct-8bit",
        "llama-3.2-3b-instruct",
        "llama-3.2-1b-instruct",
        "llama-3.2-3b-instruct-4bit",
        "llama-3.2-1b-instruct-4bit",
        "llama-3.2-3b-instruct-8bit",
        "llama-3.2-1b-instruct-8bit",
    ]
    # Qwen 2.5
    + [
        "Qwen2.5-7B-Instruct",
        "Qwen2.5-7B-Instruct-4bit",
        "Qwen2.5-7B-Instruct-8bit",  # author's note: some weird error
        "Qwen2.5-3B-Instruct",
        "Qwen2.5-1.5B-Instruct",
        "Qwen2.5-0.5B-Instruct",
        "Qwen2.5-3B-Instruct-4bit",
        "Qwen2.5-1.5B-Instruct-4bit",
        "Qwen2.5-0.5B-Instruct-4bit",
        "Qwen2.5-3B-Instruct-8bit",
        "Qwen2.5-1.5B-Instruct-8bit",
        "Qwen2.5-0.5B-Instruct-8bit",
    ]
    # Gemma 2
    + [
        # NOTE(review): the missing-results check cell lists this model as
        # "gemma-2-9b-it-4bit" -- confirm which directory name is correct.
        "gemma-2-9b-it-bnb4",
        "gemma-2-9b-it-8bit",
        "gemma-2-9b-it",  # author's note: too large
        "gemma-2-2b-it-4bit",
        "gemma-2-2b-it-8bit",
        "gemma-2-2b-it",
    ]
)

tasks = ["summarization", "qa", "sql"]

First run: benchmark every backend/model/task combination.

In [None]:
# Sweep the full grid: every backend, then every model, then every task.
# NOTE(review): this cell uses samples=100 while the retry cell further down
# uses samples=500 -- confirm which sample count is intended.
for backend, model, task in (
    (b, m, t) for b in backends for m in models for t in tasks
):
    run_benchmark(
        backend=backend,
        model_name=model,
        task=task,
        base_path=base_path,
        samples=100,
        batch_size=100,
        verbose=False,
    )

Check whether any result files are missing.

In [6]:
import os

# Grid of runs whose result files are expected to exist.
# Note: this rebinds `backends`, `models` and `tasks` from the config cell.
backends = ["vllm"]
models = (
    # Llama 3.1 / 3.2
    [
        "llama-3.1-8B-Instruct",
        "llama-3.1-8B-Instruct-4bit",
        "llama-3.1-8B-Instruct-8bit",
        "llama-3.2-3b-instruct",
        "llama-3.2-1b-instruct",
        "llama-3.2-3b-instruct-4bit",
        "llama-3.2-1b-instruct-4bit",
        "llama-3.2-3b-instruct-8bit",
        "llama-3.2-1b-instruct-8bit",
    ]
    # Qwen 2.5
    + [
        "Qwen2.5-7B-Instruct",
        "Qwen2.5-7B-Instruct-4bit",
        "Qwen2.5-7B-Instruct-8bit",
        "Qwen2.5-3B-Instruct",
        "Qwen2.5-1.5B-Instruct",
        "Qwen2.5-0.5B-Instruct",
        "Qwen2.5-3B-Instruct-4bit",
        "Qwen2.5-1.5B-Instruct-4bit",
        "Qwen2.5-0.5B-Instruct-4bit",
        "Qwen2.5-3B-Instruct-8bit",
        "Qwen2.5-1.5B-Instruct-8bit",
        "Qwen2.5-0.5B-Instruct-8bit",
    ]
    # Gemma 2
    + [
        # NOTE(review): the config cell above lists this model as
        # "gemma-2-9b-it-bnb4" -- confirm which name the result files use.
        "gemma-2-9b-it-4bit",
        "gemma-2-9b-it-8bit",
        "gemma-2-9b-it",
        "gemma-2-2b-it-4bit",
        "gemma-2-2b-it-8bit",
        "gemma-2-2b-it",
    ]
)
tasks = ["summarization", "qa", "sql"]

# Relative path: resolved against the kernel's current working directory.
results_dir = "./results/experiment_1/"

# A model counts as missing as soon as one of its
# "<backend>_<model>_<task>.csv" files is absent; all() stops at the first gap.
missing_models = set()
for backend in backends:
    for model in models:
        expected_files = (
            os.path.join(results_dir, f"{backend}_{model}_{task}.csv")
            for task in tasks
        )
        if not all(map(os.path.exists, expected_files)):
            missing_models.add(model)

# Report the incomplete models in alphabetical order.
if not missing_models:
    print("✅ All models are complete.")
else:
    print("Models with missing files:")
    for model in sorted(missing_models):
        print(model)

Models with missing files:
Qwen2.5-1.5B-Instruct-4bit
Qwen2.5-3B-Instruct-4bit
Qwen2.5-7B-Instruct-4bit
Qwen2.5-7B-Instruct-8bit
gemma-2-9b-it-4bit
gemma-2-9b-it-8bit
llama-3.1-8B-Instruct-4bit
llama-3.2-3b-instruct-4bit


Retry the benchmark for the models with missing results.

In [None]:
# Re-run every task for each model flagged by the missing-results check.
# `missing_models` is a set, so the order in which models are retried is arbitrary.
for backend, model, task in (
    (b, m, t) for b in backends for m in list(missing_models) for t in tasks
):
    run_benchmark(
        backend=backend,
        model_name=model,
        task=task,
        base_path=base_path,
        samples=500,
        batch_size=100,
        verbose=False,
    )

INFO 05-12 07:04:31 [__init__.py:239] Automatically detected platform cuda.
INFO 05-12 07:04:34 [core.py:58] Initializing a V1 LLM engine (v0.8.5.post1) with config: model='/home/rag/fast_llm_inference//models/Qwen2.5-7B-Instruct-8bit', speculative_config=None, tokenizer='/home/rag/fast_llm_inference//models/Qwen2.5-7B-Instruct-8bit', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.BITSANDBYTES, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=bitsandbytes, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='auto', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), se

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.37it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.62it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.57it/s]

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.33it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.58it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.54it/s]



INFO 05-12 07:04:38 [gpu_model_runner.py:1347] Model loading took 8.1424 GiB and 2.707208 seconds




INFO 05-12 07:04:40 [kv_cache_utils.py:634] GPU KV cache size: 194,432 tokens
INFO 05-12 07:04:40 [kv_cache_utils.py:637] Maximum concurrency for 32,768 tokens per request: 5.93x
INFO 05-12 07:04:40 [core.py:159] init engine (profile, create kv cache, warmup model) took 2.55 seconds
INFO 05-12 07:04:40 [core_client.py:439] Core engine process 0 ready.


Processed prompts:   0%|          | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Stats for Qwen2.5-7B-Instruct-8bit on vllm/sql:
prompt_length                        1201.928000 ± 296.321737
TTFT                                      0.145320 ± 0.007999
ATL                                       0.870983 ± 0.576903
GL                                       10.709580 ± 0.371103
TPS                                       1.601360 ± 0.840256
SPS                                       0.314820 ± 0.277602
Avg GPU Mem (MB)                    23475.730000 ± 142.954940
Peak GPU Mem (MB)                    23511.160000 ± 92.892939
Avg GPU Util (%)                         54.116000 ± 2.713727
Peak GPU Util (%)                        99.000000 ± 0.000000
Total Energy (Wh)                         0.296732 ± 0.004935
Avg Power (W)                            99.840000 ± 3.102569
Peak Power (W)                          177.940000 ± 2.034009
Energy per Token (J/token)              86.825554 ± 57.317674
Energy per Sentence (J/sentence)      691.601725 ± 443.435410
Memory Usage (MB)     



✅ Completed: Qwen2.5-7B-Instruct-8bit | vllm | sql
Running benchmark for llama-3.2-3b-instruct-4bit with vllm on summarization
INFO 05-12 07:05:57 [config.py:717] This model supports multiple tasks: {'reward', 'embed', 'classify', 'generate', 'score'}. Defaulting to 'generate'.
INFO 05-12 07:05:57 [config.py:2003] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 05-12 07:06:01 [__init__.py:239] Automatically detected platform cuda.
INFO 05-12 07:06:03 [core.py:58] Initializing a V1 LLM engine (v0.8.5.post1) with config: model='/home/rag/fast_llm_inference//models/llama-3.2-3b-instruct-4bit', speculative_config=None, tokenizer='/home/rag/fast_llm_inference//models/llama-3.2-3b-instruct-4bit', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.BITSANDBYTES, tensor_parallel_size=1, pipeline_parallel_si

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.45it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.45it/s]

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.20it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.20it/s]



INFO 05-12 07:06:05 [gpu_model_runner.py:1347] Model loading took 2.1269 GiB and 1.199941 seconds
INFO 05-12 07:06:13 [backends.py:420] Using cache directory: /home/rag/.cache/vllm/torch_compile_cache/802b26bcda/rank_0_0 for vLLM's torch.compile
INFO 05-12 07:06:13 [backends.py:430] Dynamo bytecode transform time: 7.96 s
INFO 05-12 07:06:19 [backends.py:118] Directly load the compiled graph(s) for shape None from the cache, took 5.730 s


Exception in thread Thread-10 (_metrics_monitor):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/home/rag/.venv/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "/home/rag/fast_llm_inference/benchmark/benchmark.py", line 80, in _metrics_monitor
    readings["power"].append(self._get_gpu_power_usage())
                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/rag/fast_llm_inference/benchmark/benchmark.py", line 62, in _get_gpu_power_usage
    power_mw = nvmlDeviceGetPowerUsage(self.handle)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/rag/.venv/lib/python3.11/site-packages/pynvml.py", line 3486, in nvmlDeviceGetPowerUsage
    _nvmlCheckReturn(ret)
  File "/home/rag/.venv/lib/python3.11/site-packages

INFO 05-12 07:06:25 [monitor.py:33] torch.compile takes 7.96 s in total
INFO 05-12 07:06:25 [kv_cache_utils.py:634] GPU KV cache size: 164,096 tokens
INFO 05-12 07:06:25 [kv_cache_utils.py:637] Maximum concurrency for 131,072 tokens per request: 1.25x
INFO 05-12 07:06:53 [gpu_model_runner.py:1686] Graph capturing finished in 28 secs, took 1.21 GiB
INFO 05-12 07:06:53 [core.py:159] init engine (profile, create kv cache, warmup model) took 47.89 seconds
INFO 05-12 07:06:53 [core_client.py:439] Core engine process 0 ready.


Processed prompts:   0%|          | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[2025-05-12 07:07:22] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:07:22] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:07:22] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:07:22] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:07:22] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:07:22] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:07:23] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:07:23] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:07:23] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:07:23] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:07:23] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:07:23] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:07:23] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:07:23] INFO rouge_scorer.py:83: Using default tok

Processed prompts:   0%|          | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[2025-05-12 07:08:27] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:08:28] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:08:28] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:08:28] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:08:28] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:08:28] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:08:28] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:08:28] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:08:28] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:08:29] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:08:29] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:08:29] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:08:29] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:08:29] INFO rouge_scorer.py:83: Using default tok

Processed prompts:   0%|          | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[2025-05-12 07:09:32] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:09:33] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:09:33] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:09:33] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:09:33] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:09:33] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:09:33] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:09:33] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:09:33] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:09:34] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:09:34] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:09:34] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:09:34] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:09:34] INFO rouge_scorer.py:83: Using default tok

Processed prompts:   0%|          | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[2025-05-12 07:10:38] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:10:38] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:10:38] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:10:38] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:10:38] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:10:38] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:10:38] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:10:39] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:10:39] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:10:39] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:10:39] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:10:39] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:10:39] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:10:39] INFO rouge_scorer.py:83: Using default tok

Processed prompts:   0%|          | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[2025-05-12 07:11:43] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:11:43] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:11:43] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:11:43] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:11:43] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:11:43] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:11:44] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:11:44] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:11:44] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:11:44] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:11:44] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:11:44] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:11:44] INFO rouge_scorer.py:83: Using default tokenizer.
[2025-05-12 07:11:44] INFO rouge_scorer.py:83: Using default tok

Stats for llama-3.2-3b-instruct-4bit on vllm/summarization:
prompt_length                       5387.808000 ± 1970.082684
TTFT                                      0.069840 ± 0.000734
ATL                                       0.464657 ± 0.103959
GL                                       24.034520 ± 0.334911
TPS                                       2.266560 ± 0.595230
SPS                                       0.159560 ± 0.029590
Avg GPU Mem (MB)                      23435.446000 ± 0.228228
Peak GPU Mem (MB)                     23435.560000 ± 0.000000
Avg GPU Util (%)                         95.626000 ± 0.204518
Peak GPU Util (%)                       100.000000 ± 0.000000
Total Energy (Wh)                         1.048984 ± 0.018107
Avg Power (W)                           157.116000 ± 0.951270
Peak Power (W)                          185.734000 ± 6.375286
Energy per Token (J/token)              73.018018 ± 16.397062
Energy per Sentence (J/sentence)     1020.849448 ± 185.338261
Memory Usa



✅ Completed: llama-3.2-3b-instruct-4bit | vllm | summarization
Running benchmark for llama-3.2-3b-instruct-4bit with vllm on qa
INFO 05-12 07:12:26 [config.py:717] This model supports multiple tasks: {'reward', 'embed', 'classify', 'generate', 'score'}. Defaulting to 'generate'.
INFO 05-12 07:12:26 [config.py:2003] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 05-12 07:12:30 [__init__.py:239] Automatically detected platform cuda.
INFO 05-12 07:12:32 [core.py:58] Initializing a V1 LLM engine (v0.8.5.post1) with config: model='/home/rag/fast_llm_inference//models/llama-3.2-3b-instruct-4bit', speculative_config=None, tokenizer='/home/rag/fast_llm_inference//models/llama-3.2-3b-instruct-4bit', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.BITSANDBYTES, tensor_parallel_size=1, pipeline_parallel_s

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.54it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.54it/s]

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.32it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.32it/s]



INFO 05-12 07:12:35 [gpu_model_runner.py:1347] Model loading took 2.1269 GiB and 1.238727 seconds
INFO 05-12 07:12:43 [backends.py:420] Using cache directory: /home/rag/.cache/vllm/torch_compile_cache/802b26bcda/rank_0_0 for vLLM's torch.compile
INFO 05-12 07:12:43 [backends.py:430] Dynamo bytecode transform time: 8.73 s
INFO 05-12 07:12:49 [backends.py:118] Directly load the compiled graph(s) for shape None from the cache, took 5.589 s
INFO 05-12 07:12:54 [monitor.py:33] torch.compile takes 8.73 s in total
INFO 05-12 07:12:54 [kv_cache_utils.py:634] GPU KV cache size: 164,096 tokens
INFO 05-12 07:12:54 [kv_cache_utils.py:637] Maximum concurrency for 131,072 tokens per request: 1.25x
INFO 05-12 07:13:22 [gpu_model_runner.py:1686] Graph capturing finished in 28 secs, took 1.21 GiB
INFO 05-12 07:13:22 [core.py:159] init engine (profile, create kv cache, warmup model) took 47.68 seconds
INFO 05-12 07:13:22 [core_client.py:439] Core engine process 0 ready.


Processed prompts:   0%|          | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Stats for llama-3.2-3b-instruct-4bit on vllm/qa:
prompt_length                       1376.792000 ± 308.648558
TTFT                                     0.078380 ± 0.013088
ATL                                      1.898850 ± 1.032705
GL                                       3.060620 ± 0.400142
TPS                                      0.861760 ± 0.980049
SPS                                      0.505300 ± 0.188440
Avg GPU Mem (MB)                     23435.270000 ± 0.580581
Peak GPU Mem (MB)                    23435.560000 ± 0.000000
Avg GPU Util (%)                        86.552000 ± 4.107886
Peak GPU Util (%)                      100.000000 ± 0.000000
Total Energy (Wh)                        0.126989 ± 0.017909
Avg Power (W)                          149.112000 ± 3.313072
Peak Power (W)                        183.004000 ± 12.252346
Energy per Token (J/token)           283.686501 ± 155.286853
Energy per Sentence (J/sentence)     338.252621 ± 123.123354
Memory Usage (MB)                   



✅ Completed: llama-3.2-3b-instruct-4bit | vllm | qa
Running benchmark for llama-3.2-3b-instruct-4bit with vllm on sql
INFO 05-12 07:13:44 [config.py:717] This model supports multiple tasks: {'reward', 'embed', 'classify', 'generate', 'score'}. Defaulting to 'generate'.
INFO 05-12 07:13:44 [config.py:2003] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 05-12 07:13:48 [__init__.py:239] Automatically detected platform cuda.
INFO 05-12 07:13:51 [core.py:58] Initializing a V1 LLM engine (v0.8.5.post1) with config: model='/home/rag/fast_llm_inference//models/llama-3.2-3b-instruct-4bit', speculative_config=None, tokenizer='/home/rag/fast_llm_inference//models/llama-3.2-3b-instruct-4bit', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.BITSANDBYTES, tensor_parallel_size=1, pipeline_parallel_size=1, dis

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.53it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.53it/s]

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.19it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.19it/s]



INFO 05-12 07:13:53 [gpu_model_runner.py:1347] Model loading took 2.1269 GiB and 1.300287 seconds
INFO 05-12 07:14:00 [backends.py:420] Using cache directory: /home/rag/.cache/vllm/torch_compile_cache/802b26bcda/rank_0_0 for vLLM's torch.compile
INFO 05-12 07:14:00 [backends.py:430] Dynamo bytecode transform time: 7.60 s
INFO 05-12 07:14:08 [backends.py:118] Directly load the compiled graph(s) for shape None from the cache, took 6.753 s
INFO 05-12 07:14:13 [monitor.py:33] torch.compile takes 7.60 s in total
INFO 05-12 07:14:14 [kv_cache_utils.py:634] GPU KV cache size: 164,096 tokens
INFO 05-12 07:14:14 [kv_cache_utils.py:637] Maximum concurrency for 131,072 tokens per request: 1.25x
INFO 05-12 07:14:42 [gpu_model_runner.py:1686] Graph capturing finished in 29 secs, took 1.21 GiB
INFO 05-12 07:14:42 [core.py:159] init engine (profile, create kv cache, warmup model) took 49.57 seconds
INFO 05-12 07:14:42 [core_client.py:439] Core engine process 0 ready.


Processed prompts:   0%|          | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/100 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Stats for llama-3.2-3b-instruct-4bit on vllm/sql:
prompt_length                       1201.928000 ± 296.321737
TTFT                                     0.070320 ± 0.001595
ATL                                      0.518158 ± 0.766214
GL                                       5.527560 ± 0.037272
TPS                                      3.326220 ± 1.812227
SPS                                      0.636620 ± 0.550298
Avg GPU Mem (MB)                     23433.550000 ± 0.020020
Peak GPU Mem (MB)                    23433.560000 ± 0.000000
Avg GPU Util (%)                        92.284000 ± 2.753002
Peak GPU Util (%)                      100.000000 ± 0.000000
Total Energy (Wh)                        0.242757 ± 0.001850
Avg Power (W)                          158.110000 ± 1.824043
Peak Power (W)                         176.438000 ± 6.182558
Energy per Token (J/token)            81.987809 ± 121.745986
Energy per Sentence (J/sentence)     552.859421 ± 367.235474
Memory Usage (MB)                  



✅ Completed: llama-3.2-3b-instruct-4bit | vllm | sql
Running benchmark for Qwen2.5-1.5B-Instruct-4bit with vllm on summarization
INFO 05-12 07:15:23 [config.py:717] This model supports multiple tasks: {'reward', 'embed', 'classify', 'generate', 'score'}. Defaulting to 'generate'.
INFO 05-12 07:15:23 [config.py:2003] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 05-12 07:15:27 [__init__.py:239] Automatically detected platform cuda.
INFO 05-12 07:15:29 [core.py:58] Initializing a V1 LLM engine (v0.8.5.post1) with config: model='/home/rag/fast_llm_inference//models/Qwen2.5-1.5B-Instruct-4bit', speculative_config=None, tokenizer='/home/rag/fast_llm_inference//models/Qwen2.5-1.5B-Instruct-4bit', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.BITSANDBYTES, tensor_parallel_size=1, pipeline_parallel_s

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:06<00:00,  6.56s/it]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:06<00:00,  6.56s/it]

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.46it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.46it/s]



INFO 05-12 07:15:38 [gpu_model_runner.py:1347] Model loading took 1.0961 GiB and 7.170091 seconds
INFO 05-12 07:15:47 [backends.py:420] Using cache directory: /home/rag/.cache/vllm/torch_compile_cache/0830c47310/rank_0_0 for vLLM's torch.compile
INFO 05-12 07:15:47 [backends.py:430] Dynamo bytecode transform time: 9.64 s
INFO 05-12 07:15:51 [backends.py:136] Cache the graph of shape None for later use
INFO 05-12 07:16:23 [backends.py:148] Compiling a graph for general shape takes 35.78 s
INFO 05-12 07:16:39 [monitor.py:33] torch.compile takes 45.42 s in total
INFO 05-12 07:16:40 [kv_cache_utils.py:634] GPU KV cache size: 689,904 tokens
INFO 05-12 07:16:40 [kv_cache_utils.py:637] Maximum concurrency for 32,768 tokens per request: 21.05x
INFO 05-12 07:17:08 [gpu_model_runner.py:1686] Graph capturing finished in 28 secs, took 1.17 GiB
INFO 05-12 07:17:08 [core.py:159] init engine (profile, create kv cache, warmup model) took 90.32 seconds
INFO 05-12 07:17:08 [core_client.py:439] Core engi

### RQ2 - Comparing different inference engines

In [None]:
from benchmark.benchmark import ModelBenchmark
import torch


def run_benchmark(backend, model_name, task, base_path="/home/ubuntu/fast_llm_inference/models", samples=500, verbose=False, batch_size=100):
    """Run one benchmark for a single (backend, model, task) combination.

    Builds a ``ModelBenchmark`` for the model located at
    ``{base_path}/{model_name}``, runs it and closes it. Exceptions are caught
    and reported with a printed message instead of being propagated, so a
    sweep over many combinations keeps going after one failure.

    Args:
        backend: Backend identifier forwarded to ``ModelBenchmark``.
        model_name: Model directory or file name, relative to ``base_path``.
        task: Task identifier forwarded to ``ModelBenchmark``.
        base_path: Directory that contains the models.
        samples: Forwarded to ``ModelBenchmark.run`` as ``samples``.
        verbose: Forwarded to ``ModelBenchmark`` as ``verbose``.
        batch_size: Forwarded to ``ModelBenchmark.run`` as ``batch_size``.

    Returns:
        None. Progress and failures are reported via ``print``.
    """
    import gc  # local import: only needed for the cleanup step below

    print(f"Running benchmark for {model_name} with {backend} on {task}")
    bm = None
    close_attempted = False
    try:
        bm = ModelBenchmark(
            backend=backend,
            model_name=model_name,
            model_path=f"{base_path}/{model_name}",
            task=task,
            verbose=verbose,
        )
        bm.run(samples=samples, batch_size=batch_size)
        # Flag first so that a close() which itself raises is not retried below.
        close_attempted = True
        bm.close()
        print(f"✅ Completed: {model_name} | {backend} | {task}")
    except Exception as e:
        print(f"❌ Failed: {model_name} | {backend} | {task} -- {e}")
    finally:
        # If construction succeeded but run() failed, the benchmark was never
        # closed; do it now, best-effort, so one failure does not leave the
        # backend open for the rest of the sweep.
        if bm is not None and not close_attempted:
            try:
                bm.close()
            except Exception as close_err:
                print(f"⚠️ Cleanup failed: {model_name} | {backend} | {task} -- {close_err}")
        # Drop our reference and collect cycles *before* emptying the cache:
        # empty_cache() can only return memory that no live tensor still uses.
        bm = None
        gc.collect()
        torch.cuda.empty_cache()


# Directory containing every model used in the RQ2 sweep.
base_path = "/home/ubuntu/fast_llm_inference/models"

backends = ["huggingface"] #"llama.cpp"] #"vllm" ,"deepspeed_mii", "huggingface"]

# Model directories used by every backend except llama.cpp.
models   = [
    "gemma-2-9b-it",
    "gemma-2-2b-it",

    "llama-3.1-8B-Instruct",
    "llama-3.2-3b-instruct",
    "llama-3.2-1b-instruct",

    "Qwen2.5-7B-Instruct",
    "Qwen2.5-3B-Instruct",
    "Qwen2.5-1.5B-Instruct",
    "Qwen2.5-0.5B-Instruct",
]

# GGUF files used when the backend is llama.cpp.
gguf_models = [
    "gemma-2-2b-it-fp16.gguf",
    "gemma-2-9b-it-fp16.gguf",

    "llama-3.1-8B-Instruct-f16.gguf",
    "Llama-3.2-1B-Instruct-f16.gguf",
    "Llama-3.2-3B-Instruct-f16.gguf",

    "qwen2.5-0.5b-instruct-fp16.gguf",
    "qwen2.5-1.5b-instruct-fp16.gguf",
    "qwen2.5-3b-instruct-fp16.gguf",
    "qwen2.5-7B-instruct-fp16.gguf",
]

tasks    = ["summarization", "qa", "sql",]

for backend in backends:
    # Pick the list per backend instead of rebinding `models`: rebinding would
    # hand the GGUF file names to every backend listed after llama.cpp and
    # leave `models` changed for later cells.
    backend_models = gguf_models if backend == "llama.cpp" else models

    for model in backend_models:
        for task in tasks:
            run_benchmark(
                backend=backend,
                model_name=model,
                task=task,
                base_path=base_path,
                samples=100,
                verbose=False,
                batch_size=20,
            )

2025-05-11 01:47:22.335784: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746928042.686955  715944 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746928042.790989  715944 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746928043.633385  715944 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746928043.633515  715944 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746928043.633520  715944 computation_placer.cc:177] computation placer alr

INFO 05-11 01:47:36 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 05-11 01:47:36 [__init__.py:239] Automatically detected platform cuda.
[2025-05-11 01:47:43,255] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
Running benchmark for qwen2.5-3b-instruct-fp16.gguf with llama.cpp on summarization


llama_init_from_model: n_ctx_per_seq (8192) < n_ctx_train (32768) -- the full capacity of the model will not be utilized


Stats for qwen2.5-3b-instruct-fp16.gguf on llama.cpp/summarization:
prompt_length                       5467.110000 ± 1780.621183
TTFT                                      0.037240 ± 0.002196
ATL                                       1.907586 ± 0.404739
GL                                      103.282380 ± 6.380167
TPS                                       0.548400 ± 0.119880
SPS                                       0.034100 ± 0.007797
Avg GPU Mem (MB)                       7188.030000 ± 5.935774
Peak GPU Mem (MB)                      7189.480000 ± 6.407067
Avg GPU Util (%)                         91.788000 ± 0.394477
Peak GPU Util (%)                       100.000000 ± 0.000000
Total Energy (Wh)                         2.051205 ± 0.128664
Avg Power (W)                            71.492000 ± 0.212099
Peak Power (W)                           74.938000 ± 0.328658
Energy per Token (J/token)             136.380099 ± 28.955550
Energy per Sentence (J/sentence)     2268.556692 ± 555.147749
Me

llama_init_from_model: n_ctx_per_seq (8192) < n_ctx_train (32768) -- the full capacity of the model will not be utilized


Stats for qwen2.5-3b-instruct-fp16.gguf on llama.cpp/qa:
prompt_length                       1393.740000 ± 302.397244
TTFT                                     0.035200 ± 0.000360
ATL                                    24.702549 ± 11.375563
GL                                      35.124600 ± 0.881287
TPS                                      0.063900 ± 0.071264
SPS                                      0.031200 ± 0.005908
Avg GPU Mem (MB)                      7167.946000 ± 2.506683
Peak GPU Mem (MB)                     7168.680000 ± 2.412091
Avg GPU Util (%)                        91.738000 ± 0.576138
Peak GPU Util (%)                       95.000000 ± 0.000000
Total Energy (Wh)                        0.698264 ± 0.018237
Avg Power (W)                           71.566000 ± 0.394590
Peak Power (W)                          74.466000 ± 0.552701
Energy per Token (J/token)          1768.265073 ± 815.255899
Energy per Sentence (J/sentence)    2463.341444 ± 255.088205
Memory Usage (MB)           

llama_init_from_model: n_ctx_per_seq (8192) < n_ctx_train (32768) -- the full capacity of the model will not be utilized


Stats for qwen2.5-3b-instruct-fp16.gguf on llama.cpp/sql:
prompt_length                        1163.740000 ± 264.891883
TTFT                                      0.035960 ± 0.000972
ATL                                       4.107402 ± 3.616989
GL                                       56.953800 ± 5.185162
TPS                                       0.400000 ± 0.247055
SPS                                       0.096800 ± 0.069904
Avg GPU Mem (MB)                       7165.032000 ± 1.042286
Peak GPU Mem (MB)                      7165.080000 ± 0.984732
Avg GPU Util (%)                         91.832000 ± 0.368968
Peak GPU Util (%)                        95.000000 ± 0.000000
Total Energy (Wh)                         1.135422 ± 0.104077
Avg Power (W)                            71.766000 ± 0.182198
Peak Power (W)                           73.934000 ± 0.462475
Energy per Token (J/token)            294.793370 ± 259.616610
Energy per Sentence (J/sentence)    1812.023837 ± 1647.471307
Memory Usage