### RQ1 - Comparing different quantization levels

**Quantization Dimension**

How does quantization across different models and architectures affect system and task-specific metrics?

In [3]:
from benchmark.benchmark import ModelBenchmark
import torch


def run_benchmark(backend, model_name, task, base_path, samples=500, verbose=False, batch_size=100):
    """Run one benchmark for a (backend, model, task) combination.

    Builds a ``ModelBenchmark`` whose weights are expected at
    ``{base_path}/models/{model_name}``, runs it, and always releases the
    benchmark object and the CUDA cache afterwards. Failures are reported on
    stdout instead of being raised, so a sweep over many models keeps going.

    Args:
        backend: Inference engine identifier passed to ``ModelBenchmark``.
        model_name: Directory/file name of the model below ``{base_path}/models``.
        task: Task identifier passed to ``ModelBenchmark``.
        base_path: Project root passed to ``ModelBenchmark`` and used to build
            the model path.
        samples: Forwarded to ``ModelBenchmark.run``.
        verbose: Forwarded to ``ModelBenchmark``.
        batch_size: Forwarded to ``ModelBenchmark.run``.

    Returns:
        None. The outcome is only printed.
    """
    print(f"Running benchmark for {model_name} with {backend} on {task}")
    try:
        bm = ModelBenchmark(
            backend=backend,
            model_name=model_name,
            model_path=f"{base_path}/models/{model_name}",
            base_path=base_path,
            task=task,
            verbose=verbose,
        )
        try:
            bm.run(samples=samples, batch_size=batch_size)
        finally:
            # Release the engine even when the run itself fails; otherwise the
            # loaded model stays alive and the next run in a sweep can hit OOM.
            bm.close()
            del bm
            torch.cuda.empty_cache()
        print(f"✅ Completed: {model_name} | {backend} | {task}")
    except Exception as e:
        print(f"❌ Failed: {model_name} | {backend} | {task} -- {e}")
        torch.cuda.empty_cache()  # ensure no memory leak on error

2025-05-12 15:48:54.090065: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747064934.110013  260255 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747064934.115811  260255 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747064934.135655  260255 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747064934.135683  260255 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747064934.135685  260255 computation_placer.cc:177] computation placer alr

INFO 05-12 15:48:57 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 05-12 15:48:57 [__init__.py:239] Automatically detected platform cuda.
[2025-05-12 15:49:00,664] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [7]:
# Project root on the benchmark machine.
base_path = "/home/ubuntu/fast_llm_inference/"

# Only vLLM is active for RQ1; the other engines ("huggingface",
# "deepspeed_mii", "llama.cpp") are intentionally left out of this sweep.
backends = ["vllm"]

# Llama checkpoints: full precision plus 4-bit / 8-bit variants.
llama_family = [
    "llama-3.1-8B-Instruct",
    "llama-3.1-8B-Instruct-4bit",
    "llama-3.1-8B-Instruct-8bit",
    "llama-3.2-3b-instruct",
    "llama-3.2-1b-instruct",
    "llama-3.2-3b-instruct-4bit",
    "llama-3.2-1b-instruct-4bit",
    "llama-3.2-3b-instruct-8bit",
    "llama-3.2-1b-instruct-8bit",
]

# Qwen 2.5 checkpoints.
qwen_family = [
    "Qwen2.5-7B-Instruct",
    "Qwen2.5-7B-Instruct-4bit",
    "Qwen2.5-7B-Instruct-8bit",  # some weird error
    "Qwen2.5-3B-Instruct",
    "Qwen2.5-1.5B-Instruct",
    "Qwen2.5-0.5B-Instruct",
    "Qwen2.5-3B-Instruct-4bit",
    "Qwen2.5-1.5B-Instruct-4bit",
    "Qwen2.5-0.5B-Instruct-4bit",
    "Qwen2.5-3B-Instruct-8bit",
    "Qwen2.5-1.5B-Instruct-8bit",
    "Qwen2.5-0.5B-Instruct-8bit",
]

# Gemma 2 checkpoints.
# NOTE(review): the 9B 4-bit entry is spelled "-bnb4" here while every other
# 4-bit model uses "-4bit" -- confirm this matches the directory on disk.
gemma_family = [
    "gemma-2-9b-it-bnb4",
    "gemma-2-9b-it-8bit",
    "gemma-2-9b-it",  # too large
    "gemma-2-2b-it-4bit",
    "gemma-2-2b-it-8bit",
    "gemma-2-2b-it",
]

# Sweep order: Llama, then Qwen, then Gemma.
models = llama_family + qwen_family + gemma_family

tasks = ["summarization", "qa", "sql"]

First run: benchmark every backend/model/task combination.

In [None]:
# Expand the benchmark grid up front (backend-major, then model, then task)
# and execute the runs one after another.
run_grid = [
    (backend_name, model_id, task_name)
    for backend_name in backends
    for model_id in models
    for task_name in tasks
]

# Settings shared by every run of this sweep.
shared_settings = dict(base_path=base_path, samples=100, verbose=False, batch_size=100)

for backend, model, task in run_grid:
    run_benchmark(backend=backend, model_name=model, task=task, **shared_settings)

Check whether any result files are missing.

In [12]:
import os

# Benchmark grid whose result files are expected on disk.
backends = ["vllm"]
models = [
    # Llama
    "llama-3.1-8B-Instruct",
    "llama-3.1-8B-Instruct-4bit",
    "llama-3.1-8B-Instruct-8bit",
    "llama-3.2-3b-instruct",
    "llama-3.2-1b-instruct",
    "llama-3.2-3b-instruct-4bit",
    "llama-3.2-1b-instruct-4bit",
    "llama-3.2-3b-instruct-8bit",
    "llama-3.2-1b-instruct-8bit",
    # Qwen 2.5
    "Qwen2.5-7B-Instruct",
    "Qwen2.5-7B-Instruct-4bit",
    "Qwen2.5-7B-Instruct-8bit",
    "Qwen2.5-3B-Instruct",
    "Qwen2.5-1.5B-Instruct",
    "Qwen2.5-0.5B-Instruct",
    "Qwen2.5-3B-Instruct-4bit",
    "Qwen2.5-1.5B-Instruct-4bit",
    "Qwen2.5-0.5B-Instruct-4bit",
    "Qwen2.5-3B-Instruct-8bit",
    "Qwen2.5-1.5B-Instruct-8bit",
    "Qwen2.5-0.5B-Instruct-8bit",
    # Gemma 2
    "gemma-2-9b-it-4bit",
    "gemma-2-9b-it-8bit",
    "gemma-2-9b-it",
    "gemma-2-2b-it-4bit",
    "gemma-2-2b-it-8bit",
    "gemma-2-2b-it",
]
tasks = ["summarization", "qa", "sql"]

# Result CSVs are looked up as "<backend>_<model>_<task>.csv" in this folder.
results_dir = "./results/experiment_1/"

# A model is flagged as soon as one of its per-task CSVs is absent; `any`
# stops at the first missing file, so later tasks are not probed.
missing_models = set()
for backend in backends:
    incomplete = {
        model
        for model in models
        if any(
            not os.path.exists(os.path.join(results_dir, f"{backend}_{model}_{task}.csv"))
            for task in tasks
        )
    }
    missing_models |= incomplete

# Report the incomplete models in alphabetical order, one per line.
if not missing_models:
    print("✅ All models are complete.")
else:
    print("Models with missing files:")
    print("\n".join(sorted(missing_models)))

Models with missing files:
Qwen2.5-7B-Instruct-8bit
gemma-2-9b-it


Retry the models with missing results.

In [None]:
# Re-run every task for the models flagged as incomplete.
# `missing_models` is a set, whose iteration order is not stable between
# kernel sessions; sorting gives a reproducible run order that matches the
# order in which the missing models were reported.
# NOTE(review): this retry uses samples=500 while the first sweep uses
# samples=100 -- confirm the sample counts are meant to differ, otherwise the
# retried models are not directly comparable with the rest.
for backend in backends:
    for model in sorted(missing_models):
        for task in tasks:
            run_benchmark(
                backend=backend,
                model_name=model,
                task=task,
                base_path=base_path,
                samples=500,
                verbose=False,
                batch_size=100,
            )

### RQ2 - Comparing different inference engines

**Framework Dimension** 

Which inference framework (Transformers, vLLM, DeepSpeed MII,
LMDeploy, llama.cpp) strikes the best balance between system resource usage (e.g., GPU
utilization, joules/token) and system performance (tokens/s)?

In [None]:
# Project root. run_benchmark appends "/models/<model_name>" itself, so the
# root must not already end in "/models" (that would resolve to
# ".../models/models/<model_name>").
base_path = "/home/ubuntu/fast_llm_inference"

# Engines compared in RQ2 ("deepspeed_mii" is currently disabled).
backends = ["vllm", "huggingface", "llama.cpp"]

# Full-precision checkpoints used by every engine except llama.cpp.
models = [
    "gemma-2-9b-it",
    "gemma-2-2b-it",

    "llama-3.1-8B-Instruct",
    "llama-3.2-3b-instruct",
    "llama-3.2-1b-instruct",

    "Qwen2.5-7B-Instruct",
    "Qwen2.5-3B-Instruct",
    "Qwen2.5-1.5B-Instruct",
    "Qwen2.5-0.5B-Instruct",
]

# llama.cpp loads GGUF files, so it gets its own list of fp16 exports.
gguf_models = [
    "gemma-2-2b-it-fp16.gguf",
    "gemma-2-9b-it-fp16.gguf",

    "llama-3.1-8B-Instruct-f16.gguf",
    "Llama-3.2-1B-Instruct-f16.gguf",
    "Llama-3.2-3B-Instruct-f16.gguf",

    "qwen2.5-0.5b-instruct-fp16.gguf",
    "qwen2.5-1.5b-instruct-fp16.gguf",
    "qwen2.5-3b-instruct-fp16.gguf",
    "qwen2.5-7B-instruct-fp16.gguf",
]

tasks = ["summarization", "qa", "sql"]

for backend in backends:
    # Pick the list per backend instead of overwriting `models`; overwriting
    # would hand GGUF files to any backend listed after "llama.cpp".
    backend_models = gguf_models if backend == "llama.cpp" else models

    for model in backend_models:
        for task in tasks:
            run_benchmark(
                backend=backend,
                model_name=model,
                task=task,
                base_path=base_path,
                samples=100,
                verbose=False,
                batch_size=20,
            )

### RQ3 - Comparing different use cases

**Scenario/Workload Dimension**

How do locally deployed LLMs and inference backends
perform and scale across the three dominant inference scenarios — single-stream (single user),
batched offline processing, and multi-user server workloads? Do system metrics — throughput,
GPU utilization, joules/token — evolve as the average number of queries per second varies
over time?

In [1]:
from typing import Optional
from benchmark.benchmark import ModelBenchmark
import torch


def run_benchmark(
    backend: str,
    model_name: str,
    task: str,
    base_path: str,
    scenario: str = "batch",                   # "single", "batch", or "server"
    run_time: Optional[float] = None,          # only for server: total time in seconds
    requests_per_sec: Optional[float] = None,  # only for server: λ (req/s)
    batch_size: int = 100,                     # only for batch
    max_batch_size: Optional[int] = None,      # only for server: cap per-batch size
    sample_interval: float = 0.1,              # telemetry interval (s)
    export_path: Optional[str] = None,         # custom export path, forwarded to ModelBenchmark.run
    verbose: bool = False,
):
    """Run one benchmark in the requested scenario and return its result.

    The model is expected at ``{base_path}/models/{model_name}``. Arguments are
    validated before the model is loaded, and the benchmark object is always
    closed once it has been created, even when the run fails. Errors are
    printed rather than raised so that a sweep can continue.

    Args:
        backend: Inference engine identifier passed to ``ModelBenchmark``.
        model_name: Directory/file name of the model below ``{base_path}/models``.
        task: Task identifier passed to ``ModelBenchmark``.
        base_path: Project root passed to ``ModelBenchmark``.
        scenario: ``"single"`` (batch size 1), ``"batch"`` or ``"server"``.
        run_time: Required in server mode; forwarded to ``ModelBenchmark.run``.
        requests_per_sec: Required in server mode; forwarded to ``ModelBenchmark.run``.
        batch_size: Batch size used in the ``"batch"`` scenario.
        max_batch_size: Forwarded to ``ModelBenchmark.run`` in server mode.
        sample_interval: Forwarded to ``ModelBenchmark.run`` in every scenario.
        export_path: Forwarded to ``ModelBenchmark.run`` in every scenario.
        verbose: Forwarded to ``ModelBenchmark``.

    Returns:
        Whatever ``ModelBenchmark.run`` returns on success, or ``None`` when
        validation or the run fails.
    """
    print(f"Running benchmark for {model_name} with {backend} on {task} [{scenario}]")
    try:
        # Validate first so a bad call does not pay for loading a model.
        # Explicit raises are used because `assert` is stripped under `python -O`.
        if scenario not in ("server", "single", "batch"):
            raise ValueError(f"Unknown scenario: {scenario}")
        if scenario == "server":
            if run_time is None:
                raise ValueError("Must set run_time in server mode")
            if requests_per_sec is None:
                raise ValueError("Must set requests_per_sec in server mode")

        bm = ModelBenchmark(
            backend=backend,
            model_name=model_name,
            model_path=f"{base_path}/models/{model_name}",
            base_path=base_path,
            task=task,
            verbose=verbose,
        )

        try:
            if scenario == "server":
                df = bm.run(
                    scenario="server",
                    run_time=run_time,
                    requests_per_sec=requests_per_sec,
                    sample_interval=sample_interval,
                    max_batch_size=max_batch_size,
                    export_path=export_path
                )
            elif scenario == "single":
                df = bm.run(
                    samples=None,        # samples ignored
                    batch_size=1,
                    scenario="single",
                    sample_interval=sample_interval,
                    export_path=export_path
                )
            else:  # "batch"
                df = bm.run(
                    samples=None,        # samples ignored
                    batch_size=batch_size,
                    scenario="batch",
                    sample_interval=sample_interval,
                    export_path=export_path
                )
        finally:
            # Release the engine whether or not the run succeeded; a leaked
            # model would otherwise stay loaded for the rest of the sweep.
            bm.close()
            del bm
            torch.cuda.empty_cache()

        print(f"✅ Completed: {model_name} | {backend} | {task} | {scenario}")
        return df

    except Exception as e:
        print(f"❌ Failed: {model_name} | {backend} | {task} | {scenario} -- {e}")
        torch.cuda.empty_cache()
        return None


2025-05-15 09:13:22.627181: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747300402.646876  908162 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747300402.653198  908162 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747300402.671978  908162 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747300402.671990  908162 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747300402.671991  908162 computation_placer.cc:177] computation placer alr

INFO 05-15 09:13:26 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 05-15 09:13:26 [__init__.py:239] Automatically detected platform cuda.
[2025-05-15 09:13:29,214] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [None]:
# --- Server-scenario sweep settings ---
base_path = "/home/ubuntu/fast_llm_inference"
backends = ["vllm", "llama.cpp", "huggingface"]
tasks = ["sql"]
server_rps = [1, 2, 4, 8]   # request rates to test
run_time = 120.0            # seconds
sample_interval = 0.05      # s
max_batch_size = 64         # cap per batch

# GGUF exports used by llama.cpp.
gguf_models = [
    "gemma-2-2b-it-fp16.gguf",
    "gemma-2-9b-it-fp16.gguf",
    "llama-3.1-8B-Instruct-f16.gguf",
    "Llama-3.2-1B-Instruct-f16.gguf",
    "Llama-3.2-3B-Instruct-f16.gguf",
    "qwen2.5-0.5b-instruct-fp16.gguf",
    "qwen2.5-1.5b-instruct-fp16.gguf",
    "qwen2.5-3b-instruct-fp16.gguf",
    "qwen2.5-7B-instruct-fp16.gguf",
]

# Checkpoints shared by the remaining engines.
hf_models = [
    "gemma-2-2b-it",
    "llama-3.1-8B-Instruct",
    "llama-3.2-3b-instruct",
    "llama-3.2-1b-instruct",
    "Qwen2.5-7B-Instruct",
    "Qwen2.5-3B-Instruct",
    "Qwen2.5-1.5B-Instruct",
    "Qwen2.5-0.5B-Instruct",
]

for backend in backends:
    # Choose the model list for this engine.
    if backend == "llama.cpp":
        models = list(gguf_models)
    elif backend == "vllm":
        models = list(hf_models)
    else:
        # gemma-2-9b-it is too large for vllm, so only the other engines get it.
        models = hf_models + ["gemma-2-9b-it"]

    for model in models:
        for task in tasks:
            for rps in server_rps:
                # One CSV per (backend, model, task, request rate) combination.
                csv_name = f"{backend}_{model}_{task}_{rps}QPS_{int(run_time)}s_server.csv"
                export_path = f"{base_path}/results/{csv_name}"
                print(f"→ {backend} | {model} | {task} @ {rps} QPS for {run_time}s -> {export_path}")
                run_benchmark(
                    backend=backend,
                    model_name=model,
                    task=task,
                    base_path=base_path,
                    scenario="server",
                    run_time=run_time,
                    requests_per_sec=rps,
                    sample_interval=sample_interval,
                    max_batch_size=max_batch_size,
                    export_path=export_path,
                    verbose=False,
                )

Stats for llama-3.1-8B-Instruct-f16.gguf on llama.cpp/sql (server‐real‐time):
prompt_length                          1201.328358 ± 293.119243
queue_size                               207.417910 ± 44.591802
batch_size                                61.268657 ± 12.710747
wait_time                                  18.211364 ± 4.520303
response_time                           506.116948 ± 105.411269
scheduled_ts                                3.904397 ± 2.329000
start_ts                                   22.115788 ± 4.804496
GL                                      487.905603 ± 101.424678
ATL                                      79.403387 ± 116.597777
TTFT                                        0.090900 ± 0.000000
TPS                                         0.052985 ± 0.128547
SPS                                         0.006119 ± 0.027577
Avg GPU Mem (MB)                        16593.755522 ± 0.579277
Peak GPU Mem (MB)                       16593.880000 ± 0.000000
Avg GPU Util (%)          

llama_init_from_model: n_ctx_per_seq (8192) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


Stats for Llama-3.2-1B-Instruct-f16.gguf on llama.cpp/sql (server‐real‐time):
prompt_length                        1171.452381 ± 278.071431
queue_size                              21.166667 ± 13.074240
batch_size                              21.166667 ± 13.074240
wait_time                                 8.682090 ± 8.431408
response_time                           39.773515 ± 25.492254
scheduled_ts                            42.775548 ± 20.741945
start_ts                                51.457632 ± 25.468583
GL                                      31.091429 ± 18.972756
ATL                                       2.182264 ± 2.418025
TTFT                                      0.022700 ± 0.000000
TPS                                       1.875714 ± 2.863882
SPS                                       0.157857 ± 0.317175
Avg GPU Mem (MB)                       3850.460476 ± 1.086220
Peak GPU Mem (MB)                      3850.618095 ± 1.233264
Avg GPU Util (%)                         85.928571 ± 3

llama_init_from_model: n_ctx_per_seq (8192) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


Stats for Llama-3.2-1B-Instruct-f16.gguf on llama.cpp/sql (server‐real‐time):
prompt_length                        1174.679245 ± 273.968491
queue_size                              48.886792 ± 23.287978
batch_size                              47.075472 ± 21.869177
wait_time                               15.151030 ± 12.499842
response_time                           84.413990 ± 41.332346
scheduled_ts                            30.094266 ± 17.912141
start_ts                                45.245295 ± 23.623012
GL                                      69.262962 ± 31.970436
ATL                                       4.695110 ± 4.550376
TTFT                                      0.016500 ± 0.000000
TPS                                       0.862358 ± 2.167227
SPS                                       0.056887 ± 0.112896
Avg GPU Mem (MB)                       3850.757642 ± 0.978709
Peak GPU Mem (MB)                      3851.049811 ± 1.099538
Avg GPU Util (%)                         87.372264 ± 2

llama_init_from_model: n_ctx_per_seq (8192) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


Stats for Llama-3.2-1B-Instruct-f16.gguf on llama.cpp/sql (server‐real‐time):
prompt_length                        1170.253623 ± 278.600519
queue_size                            204.652174 ± 157.824982
batch_size                              59.956522 ± 14.533560
wait_time                               44.017252 ± 40.986128
response_time                          132.241839 ± 50.943881
scheduled_ts                             14.514193 ± 8.918438
start_ts                                58.531475 ± 48.639284
GL                                      88.224630 ± 21.297353
ATL                                       6.057458 ± 5.121387
TTFT                                      0.105000 ± 0.000000
TPS                                       0.529275 ± 1.414255
SPS                                       0.038623 ± 0.072647
Avg GPU Mem (MB)                       3851.022174 ± 0.925959
Peak GPU Mem (MB)                      3851.706087 ± 0.703620
Avg GPU Util (%)                         87.807899 ± 1

llama_init_from_model: n_ctx_per_seq (8192) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


Stats for Llama-3.2-1B-Instruct-f16.gguf on llama.cpp/sql (server‐real‐time):
prompt_length                        1168.659574 ± 275.843533
queue_size                            442.673759 ± 377.528514
batch_size                              59.127660 ± 15.369195
wait_time                               50.941438 ± 43.900898
response_time                          138.040221 ± 55.793783
scheduled_ts                             10.843368 ± 6.024778
start_ts                                61.784801 ± 49.174143
GL                                      87.098793 ± 22.478313
ATL                                       6.099038 ± 5.315782
TTFT                                      0.015300 ± 0.000000
TPS                                       0.497305 ± 1.248260
SPS                                       0.037021 ± 0.064140
Avg GPU Mem (MB)                       3851.041773 ± 0.905747
Peak GPU Mem (MB)                      3851.667234 ± 0.744585
Avg GPU Util (%)                         88.006738 ± 1

llama_init_from_model: n_ctx_per_seq (8192) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
