### RQ1 - Comparing different quantization levels

**Quantization Dimension**

How does quantization across different models and architectures affect system and task-specific metrics?

In [3]:
from benchmark.benchmark import ModelBenchmark
import torch


def run_benchmark(backend, model_name, task, base_path, samples=500, verbose=False, batch_size=100):
    """Run a single benchmark for one backend / model / task combination.

    A ``ModelBenchmark`` is created with the model path
    ``{base_path}/models/{model_name}``, executed, and then always closed.
    Errors are reported on stdout instead of being re-raised so that a sweep
    over many combinations keeps going after one of them fails.

    Args:
        backend: Backend identifier forwarded to ``ModelBenchmark``.
        model_name: Name of the model below ``{base_path}/models``.
        task: Task identifier forwarded to ``ModelBenchmark``.
        base_path: Directory that contains the ``models`` folder; also
            forwarded to ``ModelBenchmark`` as ``base_path``.
        samples: Forwarded to ``ModelBenchmark.run``.
        verbose: Forwarded to ``ModelBenchmark``.
        batch_size: Forwarded to ``ModelBenchmark.run``.

    Returns:
        None. Success or failure is only printed.
    """
    print(f"Running benchmark for {model_name} with {backend} on {task}")
    bm = None
    try:
        bm = ModelBenchmark(
            backend=backend,
            model_name=model_name,
            model_path=f"{base_path}/models/{model_name}",
            base_path=base_path,
            task=task,
            verbose=verbose,
        )
        bm.run(samples=samples, batch_size=batch_size)
        print(f"✅ Completed: {model_name} | {backend} | {task}")
    except Exception as e:
        print(f"❌ Failed: {model_name} | {backend} | {task} -- {e}")
    finally:
        # Close the benchmark on the failure path too; previously a failing
        # run() skipped close(), leaving the benchmark open while the sweep
        # continued with the next model.
        if bm is not None:
            try:
                bm.close()
            except Exception as close_error:
                print(f"⚠️ Cleanup failed: {model_name} | {backend} | {task} -- {close_error}")
            del bm
        torch.cuda.empty_cache()

2025-05-12 15:48:54.090065: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747064934.110013  260255 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747064934.115811  260255 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747064934.135655  260255 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747064934.135683  260255 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747064934.135685  260255 computation_placer.cc:177] computation placer alr

INFO 05-12 15:48:57 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 05-12 15:48:57 [__init__.py:239] Automatically detected platform cuda.
[2025-05-12 15:49:00,664] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [7]:
# Base directory handed to run_benchmark (kept with its trailing slash).
base_path = "/home/ubuntu/fast_llm_inference/"

# Only vLLM is active for RQ1; "huggingface", "deepspeed_mii" and
# "llama.cpp" are switched off for this sweep.
backends = ["vllm"]

# Models under test, grouped by family. The order is the execution order.
models = (
    # --- Llama 3.1 / 3.2 ---
    [
        "llama-3.1-8B-Instruct",
        "llama-3.1-8B-Instruct-4bit",
        "llama-3.1-8B-Instruct-8bit",
        "llama-3.2-3b-instruct",
        "llama-3.2-1b-instruct",
        "llama-3.2-3b-instruct-4bit",
        "llama-3.2-1b-instruct-4bit",
        "llama-3.2-3b-instruct-8bit",
        "llama-3.2-1b-instruct-8bit",
    ]
    # --- Qwen 2.5 ---
    + [
        "Qwen2.5-7B-Instruct",
        "Qwen2.5-7B-Instruct-4bit",
        "Qwen2.5-7B-Instruct-8bit",  # original note: "some weird error"
        "Qwen2.5-3B-Instruct",
        "Qwen2.5-1.5B-Instruct",
        "Qwen2.5-0.5B-Instruct",
        "Qwen2.5-3B-Instruct-4bit",
        "Qwen2.5-1.5B-Instruct-4bit",
        "Qwen2.5-0.5B-Instruct-4bit",
        "Qwen2.5-3B-Instruct-8bit",
        "Qwen2.5-1.5B-Instruct-8bit",
        "Qwen2.5-0.5B-Instruct-8bit",
    ]
    # --- Gemma 2 ---
    + [
        # NOTE(review): every other 4-bit entry uses the "-4bit" suffix and the
        # completeness check in this notebook looks for "gemma-2-9b-it-4bit";
        # confirm which directory name actually exists.
        "gemma-2-9b-it-bnb4",
        "gemma-2-9b-it-8bit",
        "gemma-2-9b-it",  # original note: "too large"
        "gemma-2-2b-it-4bit",
        "gemma-2-2b-it-8bit",
        "gemma-2-2b-it",
    ]
)

tasks = ["summarization", "qa", "sql"]

first run

In [None]:
# Full RQ1 sweep: backend is the slowest-varying axis, task the fastest.
for backend, model, task in (
    (b, m, t) for b in backends for m in models for t in tasks
):
    run_benchmark(
        backend,
        model,
        task,
        base_path,
        samples=100,
        verbose=False,
        batch_size=100,
    )

check if anything is missing

In [12]:
import os

# Combinations whose result files are expected to exist.
backends = ["vllm"]
models = [
    "llama-3.1-8B-Instruct",
    "llama-3.1-8B-Instruct-4bit",
    "llama-3.1-8B-Instruct-8bit",
    "llama-3.2-3b-instruct",
    "llama-3.2-1b-instruct",
    "llama-3.2-3b-instruct-4bit",
    "llama-3.2-1b-instruct-4bit",
    "llama-3.2-3b-instruct-8bit",
    "llama-3.2-1b-instruct-8bit",
    "Qwen2.5-7B-Instruct",
    "Qwen2.5-7B-Instruct-4bit",
    "Qwen2.5-7B-Instruct-8bit",
    "Qwen2.5-3B-Instruct",
    "Qwen2.5-1.5B-Instruct",
    "Qwen2.5-0.5B-Instruct",
    "Qwen2.5-3B-Instruct-4bit",
    "Qwen2.5-1.5B-Instruct-4bit",
    "Qwen2.5-0.5B-Instruct-4bit",
    "Qwen2.5-3B-Instruct-8bit",
    "Qwen2.5-1.5B-Instruct-8bit",
    "Qwen2.5-0.5B-Instruct-8bit",
    "gemma-2-9b-it-4bit",
    "gemma-2-9b-it-8bit",
    "gemma-2-9b-it",
    "gemma-2-2b-it-4bit",
    "gemma-2-2b-it-8bit",
    "gemma-2-2b-it",
]
tasks = ["summarization", "qa", "sql"]

# Result files are looked up relative to the current working directory.
results_dir = "./results/experiment_1/"

# A model counts as missing as soon as one "<backend>_<model>_<task>.csv" file
# is absent; all() stops at the first absent file for a backend/model pair.
missing_models = {
    model_name
    for backend_name in backends
    for model_name in models
    if not all(
        os.path.exists(
            os.path.join(results_dir, f"{backend_name}_{model_name}_{task_name}.csv")
        )
        for task_name in tasks
    )
}

# Report the incomplete models in alphabetical order, one per line.
if not missing_models:
    print("✅ All models are complete.")
else:
    print("Models with missing files:")
    print("\n".join(sorted(missing_models)))

Models with missing files:
Qwen2.5-7B-Instruct-8bit
gemma-2-9b-it


try again with the missing models

In [None]:
# Re-run only the models that the completeness check flagged as missing.
# sorted() gives a reproducible execution order; iterating the set directly
# can change order between kernel sessions.
for backend in backends:
    for model in sorted(missing_models):
        for task in tasks:
            run_benchmark(
                backend=backend,
                model_name=model,
                task=task,
                base_path=base_path,
                # Same sample count as the first run (100) so that retried
                # models are measured like the rest. This retry previously
                # used 500 — NOTE(review): confirm 100 is the intended
                # setting for the whole experiment.
                samples=100,
                verbose=False,
                batch_size=100,
            )

### RQ2 - Comparing different inference engines

**Framework Dimension** 

Which inference framework (Transformers, vLLM, DeepSpeed MII,
LMDeploy, llama.cpp) strikes the best balance between system resource usage (e.g., GPU
utilization, joules/token) and system performance (tokens/s)?

In [None]:
# run_benchmark() builds the model path as f"{base_path}/models/{model_name}",
# so base_path must be the directory that contains "models". The former value
# ended in "/models" and resolved to ".../models/models/<model_name>".
base_path = "/home/ubuntu/fast_llm_inference"

backends = ["vllm", "huggingface", "llama.cpp"]  # "deepspeed_mii" is disabled

# Model names used by every backend except llama.cpp.
models = [
    "gemma-2-9b-it",
    "gemma-2-2b-it",

    "llama-3.1-8B-Instruct",
    "llama-3.2-3b-instruct",
    "llama-3.2-1b-instruct",

    "Qwen2.5-7B-Instruct",
    "Qwen2.5-3B-Instruct",
    "Qwen2.5-1.5B-Instruct",
    "Qwen2.5-0.5B-Instruct",
]

# GGUF file names used for the llama.cpp backend.
gguf_models = [
    "gemma-2-2b-it-fp16.gguf",
    "gemma-2-9b-it-fp16.gguf",

    "llama-3.1-8B-Instruct-f16.gguf",
    "Llama-3.2-1B-Instruct-f16.gguf",
    "Llama-3.2-3B-Instruct-f16.gguf",

    "qwen2.5-0.5b-instruct-fp16.gguf",
    "qwen2.5-1.5b-instruct-fp16.gguf",
    "qwen2.5-3b-instruct-fp16.gguf",
    "qwen2.5-7B-instruct-fp16.gguf",
]

tasks = ["summarization", "qa", "sql"]

for backend in backends:
    # Pick the list per backend instead of overwriting `models` inside the
    # loop; the overwrite only worked while "llama.cpp" was the last backend.
    backend_models = gguf_models if backend == "llama.cpp" else models

    for model in backend_models:
        for task in tasks:
            run_benchmark(
                backend=backend,
                model_name=model,
                task=task,
                base_path=base_path,
                samples=100,
                verbose=False,
                batch_size=20,
            )

### RQ3 - Comparing different use cases

**Scenario/Workload Dimension**

How do locally deployed LLMs and inference backends
perform and scale across the three dominant inference scenarios — single-stream (single user),
batched offline processing, and multi-user server workloads? Do system metrics — throughput,
GPU utilization, joules/token — evolve as the average number of queries per second varies
over time?

In [1]:
from typing import Optional
from benchmark.benchmark import ModelBenchmark
import torch


def run_benchmark(
    backend: str,
    model_name: str,
    task: str,
    base_path: str,
    scenario: str = "batch",                    # "single", "batch", or "server"
    run_time: Optional[float] = None,           # only for server: total time in seconds
    requests_per_sec: Optional[float] = None,   # only for server: λ (req/s)
    batch_size: int = 100,                      # only for batch
    max_batch_size: Optional[int] = None,       # only for server: cap per-batch size
    sample_interval: float = 0.1,               # telemetry interval (s)
    export_path: Optional[str] = None,          # custom export path, forwarded to run()
    verbose: bool = False
):
    """Run one benchmark in the given scenario and return its result.

    The scenario arguments are validated first, then a ``ModelBenchmark`` is
    created with the model path ``{base_path}/models/{model_name}`` and run.
    The benchmark is always closed and the CUDA cache emptied afterwards, on
    failure as well. Errors are printed instead of raised so that a sweep over
    many combinations keeps going.

    Args:
        backend: Backend identifier forwarded to ``ModelBenchmark``.
        model_name: Name of the model below ``{base_path}/models``.
        task: Task identifier forwarded to ``ModelBenchmark``.
        base_path: Directory that contains the ``models`` folder; also
            forwarded to ``ModelBenchmark`` as ``base_path``.
        scenario: ``"single"``, ``"batch"`` or ``"server"``.
        run_time: Required for ``"server"``; forwarded to ``run``.
        requests_per_sec: Required for ``"server"``; forwarded to ``run``.
        batch_size: Forwarded to ``run`` for ``"batch"``; ``"single"`` always
            uses a batch size of 1.
        max_batch_size: Forwarded to ``run`` for ``"server"`` only.
        sample_interval: Forwarded to ``run`` in every scenario.
        export_path: Forwarded to ``run`` in every scenario.
        verbose: Forwarded to ``ModelBenchmark``.

    Returns:
        Whatever ``ModelBenchmark.run`` returns, or ``None`` if validation or
        the benchmark itself failed.
    """
    print(f"Running benchmark for {model_name} with {backend} on {task} [{scenario}]")
    bm = None
    try:
        # Validate and assemble the run() arguments before loading the model,
        # so a bad call fails without paying for the model load.
        if scenario == "server":
            if run_time is None:
                raise ValueError("Must set run_time in server mode")
            if requests_per_sec is None:
                raise ValueError("Must set requests_per_sec in server mode")
            run_kwargs = dict(
                scenario="server",
                run_time=run_time,
                requests_per_sec=requests_per_sec,
                sample_interval=sample_interval,
                max_batch_size=max_batch_size,
                export_path=export_path,
            )
        elif scenario == "single":
            run_kwargs = dict(
                samples=100,        # original note: "samples ignored" — TODO confirm
                batch_size=1,
                scenario="single",
                sample_interval=sample_interval,
                export_path=export_path,
            )
        elif scenario == "batch":
            run_kwargs = dict(
                samples=100,        # original note: "samples ignored" — TODO confirm
                batch_size=batch_size,
                scenario="batch",
                sample_interval=sample_interval,
                export_path=export_path,
            )
        else:
            raise ValueError(f"Unknown scenario: {scenario}")

        bm = ModelBenchmark(
            backend=backend,
            model_name=model_name,
            model_path=f"{base_path}/models/{model_name}",
            base_path=base_path,
            task=task,
            verbose=verbose,
        )
        df = bm.run(**run_kwargs)
        print(f"✅ Completed: {model_name} | {backend} | {task} | {scenario}")
        return df

    except Exception as e:
        print(f"❌ Failed: {model_name} | {backend} | {task} | {scenario} -- {e}")
        return None

    finally:
        # Close on the failure path too; previously a failing run() skipped
        # close(), leaving the benchmark open while the sweep continued.
        if bm is not None:
            try:
                bm.close()
            except Exception as close_error:
                print(f"⚠️ Cleanup failed: {model_name} | {backend} | {task} | {scenario} -- {close_error}")
            del bm
        torch.cuda.empty_cache()


2025-05-16 03:42:40.554419: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747366960.579138 1160723 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747366960.586688 1160723 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747366960.607277 1160723 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747366960.607300 1160723 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747366960.607303 1160723 computation_placer.cc:177] computation placer alr

INFO 05-16 03:42:48 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 05-16 03:42:49 [__init__.py:239] Automatically detected platform cuda.
[2025-05-16 03:42:55,873] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [None]:
# --- Server-scenario sweep configuration ---
base_path = "/home/ubuntu/fast_llm_inference"
backends = ["vllm", "llama.cpp", "huggingface"]
tasks = ["sql"]
server_rps = [1, 2, 4, 8]    # request rates, one run per value
run_time = 120.0             # seconds per run
sample_interval = 0.05       # telemetry interval in seconds
max_batch_size = 64          # cap per batch

for backend in backends:
    if backend != "llama.cpp":
        models = [
            "gemma-2-2b-it",
            "llama-3.1-8B-Instruct",
            "llama-3.2-3b-instruct",
            "llama-3.2-1b-instruct",
            "Qwen2.5-7B-Instruct",
            "Qwen2.5-3B-Instruct",
            "Qwen2.5-1.5B-Instruct",
            "Qwen2.5-0.5B-Instruct",
        ]
        # gemma-2-9b-it is skipped on vLLM (original note: too large for vllm).
        if backend != "vllm":
            models.append("gemma-2-9b-it")
    else:
        # llama.cpp uses GGUF file names.
        models = [
            "gemma-2-2b-it-fp16.gguf",
            "gemma-2-9b-it-fp16.gguf",
            "llama-3.1-8B-Instruct-f16.gguf",
            "Llama-3.2-1B-Instruct-f16.gguf",
            "Llama-3.2-3B-Instruct-f16.gguf",
            "qwen2.5-0.5b-instruct-fp16.gguf",
            "qwen2.5-1.5b-instruct-fp16.gguf",
            "qwen2.5-3b-instruct-fp16.gguf",
            "qwen2.5-7B-instruct-fp16.gguf",
        ]

    # Model is the slowest-varying axis, request rate the fastest.
    for model, task, rps in (
        (m, t, r) for m in models for t in tasks for r in server_rps
    ):
        export_path = f"{base_path}/results/{backend}_{model}_{task}_{rps}QPS_{int(run_time)}s_server.csv"
        print(f"→ {backend} | {model} | {task} @ {rps} QPS for {run_time}s -> {export_path}")
        run_benchmark(
            backend,
            model,
            task,
            base_path,
            scenario="server",
            run_time=run_time,
            requests_per_sec=rps,
            max_batch_size=max_batch_size,
            sample_interval=sample_interval,
            export_path=export_path,
            verbose=False,
        )

: 

part 2 - batch processing

In [2]:
# --- Batch-scenario sweep configuration ---
base_path = "/home/ubuntu/fast_llm_inference"
backends = ["huggingface"]          # "vllm" and "llama.cpp" are switched off here
tasks = ["summarization"]
batch_sizes = [1, 8, 16, 32, 64]    # one run per batch size
sample_interval = 0.05              # telemetry interval in seconds
max_batch_size = 64                 # same cap as in the server sweep

for backend in backends:
    models = (
        # llama.cpp uses GGUF file names.
        [
            "gemma-2-2b-it-fp16.gguf",
            "gemma-2-9b-it-fp16.gguf",
            "llama-3.1-8B-Instruct-f16.gguf",
            "Llama-3.2-1B-Instruct-f16.gguf",
            "Llama-3.2-3B-Instruct-f16.gguf",
            "qwen2.5-0.5b-instruct-fp16.gguf",
            "qwen2.5-1.5b-instruct-fp16.gguf",
            "qwen2.5-3b-instruct-fp16.gguf",
            "qwen2.5-7B-instruct-fp16.gguf",
        ]
        if backend == "llama.cpp"
        # "gemma-2-2b-it" is currently disabled for this sweep.
        else [
            "llama-3.1-8B-Instruct",
            "llama-3.2-3b-instruct",
            "llama-3.2-1b-instruct",
            "Qwen2.5-7B-Instruct",
            "Qwen2.5-3B-Instruct",
            "Qwen2.5-1.5B-Instruct",
            "Qwen2.5-0.5B-Instruct",
        ]
    )
    # gemma-2-9b-it is added for every non-GGUF backend except vLLM
    # (original note: too large for vllm).
    if backend not in ("llama.cpp", "vllm"):
        models.append("gemma-2-9b-it")

    for model in models:
        for task in tasks:
            for bs in batch_sizes:
                export_path = (
                    f"{base_path}/results/"
                    f"{backend}_{model}_{task}_{bs}batch.csv"
                )
                print(f"→ {backend} | {model} | {task} @ batch={bs} -> {export_path}")
                run_benchmark(
                    backend,
                    model,
                    task,
                    base_path,
                    scenario="batch",
                    batch_size=bs,
                    max_batch_size=max_batch_size,
                    sample_interval=sample_interval,
                    export_path=export_path,
                    verbose=False,
                )


Stats for Qwen2.5-3B-Instruct on huggingface/summarization:
prompt_length                       5467.110000 ± 1780.621183
TTFT                                      0.043336 ± 0.012165
ATL                                       0.890493 ± 0.207921
GL                                       44.109264 ± 5.849148
TPS                                       1.199600 ± 0.367668
SPS                                       0.076600 ± 0.026713
Avg GPU Mem (MB)                      7177.901200 ± 44.251661
Peak GPU Mem (MB)                     7251.640000 ± 69.536579
Avg GPU Util (%)                         80.702400 ± 1.501442
Peak GPU Util (%)                       100.000000 ± 0.000000
Total Energy (Wh)                         0.865047 ± 0.115001
Avg Power (W)                            70.602000 ± 0.336614
Peak Power (W)                           74.170000 ± 0.509284
Energy per Token (J/token)              62.871192 ± 14.701504
Energy per Sentence (J/sentence)     1014.099423 ± 256.092187
Memory Usa

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Stats for Qwen2.5-3B-Instruct on huggingface/summarization:
prompt_length                       5467.110000 ± 1780.621183
TTFT                                      0.041168 ± 0.003897
ATL                                       1.775063 ± 0.438429
GL                                      87.953128 ± 14.472356
TPS                                       0.643700 ± 0.403494
SPS                                       0.038900 ± 0.027703
Avg GPU Mem (MB)                      7255.056400 ± 51.561808
Peak GPU Mem (MB)                     7358.040000 ± 95.962821
Avg GPU Util (%)                         80.292400 ± 1.618048
Peak GPU Util (%)                       100.000000 ± 0.000000
Total Energy (Wh)                         1.724037 ± 0.284837
Avg Power (W)                            70.550000 ± 0.602379
Peak Power (W)                           74.577600 ± 0.521778
Energy per Token (J/token)             125.276888 ± 31.096206
Energy per Sentence (J/sentence)     2018.691576 ± 529.264784
Memory Usa

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Stats for Qwen2.5-3B-Instruct on huggingface/summarization:
prompt_length                       5467.110000 ± 1780.621183
TTFT                                      0.044432 ± 0.004252
ATL                                       3.636051 ± 0.973919
GL                                     179.934928 ± 34.050334
TPS                                       0.366300 ± 0.453330
SPS                                       0.025100 ± 0.028902
Avg GPU Mem (MB)                      7306.067200 ± 79.515569
Peak GPU Mem (MB)                    7413.080000 ± 104.449688
Avg GPU Util (%)                         78.290000 ± 2.641340
Peak GPU Util (%)                       100.000000 ± 0.000000
Total Energy (Wh)                         3.531741 ± 0.662888
Avg Power (W)                            70.664400 ± 0.529792
Peak Power (W)                           74.552400 ± 0.224432
Energy per Token (J/token)             256.913694 ± 68.432126
Energy per Sentence (J/sentence)    4142.222212 ± 1187.406503
Memory Usa

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Stats for Qwen2.5-3B-Instruct on huggingface/summarization:
prompt_length                       5467.110000 ± 1780.621183
TTFT                                      0.037976 ± 0.000434
ATL                                       6.283387 ± 2.243199
GL                                     307.553112 ± 82.721832
TPS                                       0.184500 ± 0.079039
SPS                                       0.014000 ± 0.005505
Avg GPU Mem (MB)                      7369.782800 ± 24.955491
Peak GPU Mem (MB)                     7609.160000 ± 97.448467
Avg GPU Util (%)                         80.193600 ± 3.864169
Peak GPU Util (%)                       100.000000 ± 0.000000
Total Energy (Wh)                         6.061515 ± 1.602698
Avg Power (W)                            71.063200 ± 0.419704
Peak Power (W)                           75.269600 ± 0.173671
Energy per Token (J/token)            445.727315 ± 157.439062
Energy per Sentence (J/sentence)    7179.554992 ± 2702.742526
Memory Usa

Device set to use cuda:0


Stats for Qwen2.5-1.5B-Instruct on huggingface/summarization:
prompt_length                       5467.110000 ± 1780.621183
TTFT                                      0.035559 ± 0.005939
ATL                                       0.042970 ± 0.006811
GL                                        3.392327 ± 1.127456
TPS                                      23.714500 ± 2.927343
SPS                                       1.537300 ± 0.314655
Avg GPU Mem (MB)                      3991.456900 ± 46.438190
Peak GPU Mem (MB)                     3996.080000 ± 46.549203
Avg GPU Util (%)                         49.088300 ± 4.804947
Peak GPU Util (%)                       65.730000 ± 14.013597
Total Energy (Wh)                         0.055163 ± 0.019309
Avg Power (W)                            58.254600 ± 2.794041
Peak Power (W)                           63.401400 ± 2.292122
Energy per Token (J/token)                2.496276 ± 0.363754
Energy per Sentence (J/sentence)         39.614598 ± 9.095874
Memory U

Device set to use cuda:0


Stats for Qwen2.5-1.5B-Instruct on huggingface/summarization:
prompt_length                       5467.110000 ± 1780.621183
TTFT                                      0.032260 ± 0.008642
ATL                                       0.329537 ± 0.111393
GL                                       24.033352 ± 3.511791
TPS                                       3.412000 ± 1.287177
SPS                                       0.215300 ± 0.078707
Avg GPU Mem (MB)                      4112.424800 ± 31.876998
Peak GPU Mem (MB)                     4170.680000 ± 55.320553
Avg GPU Util (%)                         58.124800 ± 2.463072
Peak GPU Util (%)                        93.440000 ± 5.424077
Total Energy (Wh)                         0.424253 ± 0.062221
Avg Power (W)                            63.555200 ± 1.321664
Peak Power (W)                           68.734400 ± 0.660383
Energy per Token (J/token)               20.933554 ± 7.047389
Energy per Sentence (J/sentence)       320.091692 ± 79.144415
Memory U

Device set to use cuda:0


Stats for Qwen2.5-1.5B-Instruct on huggingface/summarization:
prompt_length                       5467.110000 ± 1780.621183
TTFT                                      0.029964 ± 0.000225
ATL                                       0.650746 ± 0.231205
GL                                       47.572868 ± 8.435814
TPS                                       1.848900 ± 1.180956
SPS                                       0.119000 ± 0.086264
Avg GPU Mem (MB)                      4172.945200 ± 46.604553
Peak GPU Mem (MB)                     4257.400000 ± 85.547610
Avg GPU Util (%)                         58.374400 ± 1.893248
Peak GPU Util (%)                        94.520000 ± 4.051387
Total Energy (Wh)                         0.846878 ± 0.148791
Avg Power (W)                            64.044800 ± 1.150470
Peak Power (W)                           68.911200 ± 0.866109
Energy per Token (J/token)              41.722605 ± 14.888631
Energy per Sentence (J/sentence)      636.791454 ± 167.810346
Memory U

Device set to use cuda:0


Stats for Qwen2.5-1.5B-Instruct on huggingface/summarization:
prompt_length                       5467.110000 ± 1780.621183
TTFT                                      0.030064 ± 0.000789
ATL                                       1.264664 ± 0.468833
GL                                      92.380072 ± 17.284702
TPS                                       1.081000 ± 1.238386
SPS                                       0.067800 ± 0.093610
Avg GPU Mem (MB)                      4213.000400 ± 70.936104
Peak GPU Mem (MB)                     4302.200000 ± 95.655408
Avg GPU Util (%)                         59.971200 ± 0.589609
Peak GPU Util (%)                        96.920000 ± 4.234729
Total Energy (Wh)                         1.677576 ± 0.315001
Avg Power (W)                            65.276400 ± 0.562353
Peak Power (W)                           70.020400 ± 0.200776
Energy per Token (J/token)              82.677885 ± 30.718723
Energy per Sentence (J/sentence)     1262.447219 ± 356.811951
Memory U

Device set to use cuda:0


Stats for Qwen2.5-1.5B-Instruct on huggingface/summarization:
prompt_length                       5467.110000 ± 1780.621183
TTFT                                      0.036652 ± 0.005162
ATL                                       2.467923 ± 0.950971
GL                                     179.710404 ± 45.317686
TPS                                       0.475800 ± 0.212228
SPS                                       0.028600 ± 0.012228
Avg GPU Mem (MB)                      4266.044400 ± 18.471791
Peak GPU Mem (MB)                     4459.560000 ± 78.151741
Avg GPU Util (%)                         53.317200 ± 0.955188
Peak GPU Util (%)                       100.000000 ± 0.000000
Total Energy (Wh)                         3.112621 ± 0.808937
Avg Power (W)                            62.202800 ± 0.612671
Peak Power (W)                           68.163200 ± 0.786342
Energy per Token (J/token)             153.835704 ± 60.030974
Energy per Sentence (J/sentence)     2347.973179 ± 738.469805
Memory U

Device set to use cuda:0


Stats for Qwen2.5-0.5B-Instruct on huggingface/summarization:
prompt_length                       5467.110000 ± 1780.621183
TTFT                                      0.026622 ± 0.001984
ATL                                       0.067387 ± 0.029782
GL                                        5.607441 ± 1.349760
TPS                                      17.968700 ± 8.073792
SPS                                       1.110000 ± 0.626945
Avg GPU Mem (MB)                      1768.333700 ± 24.619551
Peak GPU Mem (MB)                     1770.480000 ± 24.456104
Avg GPU Util (%)                         29.925900 ± 1.980484
Peak GPU Util (%)                        35.490000 ± 6.039424
Total Energy (Wh)                         0.073994 ± 0.018091
Avg Power (W)                            47.372500 ± 1.198639
Peak Power (W)                           49.504900 ± 1.527811
Energy per Token (J/token)                3.197635 ± 1.418702
Energy per Sentence (J/sentence)        54.104209 ± 30.684095
Memory U

Device set to use cuda:0


Stats for Qwen2.5-0.5B-Instruct on huggingface/summarization:
prompt_length                       5467.110000 ± 1780.621183
TTFT                                      0.026556 ± 0.001170
ATL                                       0.547821 ± 0.255998
GL                                       43.732508 ± 5.382846
TPS                                       2.208100 ± 0.998969
SPS                                       0.131500 ± 0.058368
Avg GPU Mem (MB)                      1813.926400 ± 25.423092
Peak GPU Mem (MB)                     1834.120000 ± 27.445122
Avg GPU Util (%)                         31.418800 ± 0.743512
Peak GPU Util (%)                       57.840000 ± 10.604668
Total Energy (Wh)                         0.590118 ± 0.076151
Avg Power (W)                            48.525200 ± 0.500538
Peak Power (W)                           52.366400 ± 1.193951
Energy per Token (J/token)              26.616395 ± 12.515467
Energy per Sentence (J/sentence)      439.510849 ± 234.097730
Memory U

Device set to use cuda:0


Stats for Qwen2.5-0.5B-Instruct on huggingface/summarization:
prompt_length                       5467.110000 ± 1780.621183
TTFT                                      0.030912 ± 0.003834
ATL                                       1.168319 ± 0.566697
GL                                      93.387536 ± 16.622109
TPS                                       1.123000 ± 0.841155
SPS                                       0.066800 ± 0.050350
Avg GPU Mem (MB)                      1832.974800 ± 20.807874
Peak GPU Mem (MB)                     1870.600000 ± 40.930115
Avg GPU Util (%)                         29.392400 ± 1.948860
Peak GPU Util (%)                        52.160000 ± 6.784891
Total Energy (Wh)                         1.229582 ± 0.207937
Avg Power (W)                            47.502000 ± 1.226897
Peak Power (W)                           52.322400 ± 2.028124
Energy per Token (J/token)              55.413391 ± 26.752605
Energy per Sentence (J/sentence)      916.878039 ± 517.769281
Memory U

Device set to use cuda:0


Stats for Qwen2.5-0.5B-Instruct on huggingface/summarization:
prompt_length                       5467.110000 ± 1780.621183
TTFT                                      0.031344 ± 0.003949
ATL                                       2.195871 ± 1.074638
GL                                     175.279304 ± 31.890578
TPS                                       0.666200 ± 0.794084
SPS                                       0.039900 ± 0.047662
Avg GPU Mem (MB)                      1870.241600 ± 34.011821
Peak GPU Mem (MB)                     1918.600000 ± 48.995997
Avg GPU Util (%)                         30.971600 ± 0.738306
Peak GPU Util (%)                        55.360000 ± 2.524706
Total Energy (Wh)                         2.381031 ± 0.432582
Avg Power (W)                            48.868800 ± 0.407568
Peak Power (W)                           54.216000 ± 0.161808
Energy per Token (J/token)             107.384290 ± 52.508403
Energy per Sentence (J/sentence)     1773.906019 ± 997.931059
Memory U

Device set to use cuda:0


Stats for Qwen2.5-0.5B-Instruct on huggingface/summarization:
prompt_length                       5467.110000 ± 1780.621183
TTFT                                      0.029860 ± 0.001688
ATL                                       3.836238 ± 2.075804
GL                                     305.392596 ± 75.699320
TPS                                       0.331700 ± 0.167839
SPS                                       0.018600 ± 0.009746
Avg GPU Mem (MB)                      1882.446400 ± 15.311952
Peak GPU Mem (MB)                     1974.440000 ± 26.050580
Avg GPU Util (%)                         30.927200 ± 0.352165
Peak GPU Util (%)                        59.800000 ± 2.412091
Total Energy (Wh)                         4.138161 ± 1.042800
Avg Power (W)                            48.719200 ± 0.255682
Peak Power (W)                           54.310400 ± 0.067539
Energy per Token (J/token)            187.144790 ± 101.747870
Energy per Sentence (J/sentence)    3104.012676 ± 1986.630358
Memory U

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0
The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.


Stats for gemma-2-9b-it on huggingface/summarization:
prompt_length                       5467.110000 ± 1780.621183
TTFT                                      0.109611 ± 0.008780
ATL                                       0.151071 ± 0.015849
GL                                        7.422561 ± 1.087148
TPS                                       6.689900 ± 0.684713
SPS                                       0.437200 ± 0.075893
Avg GPU Mem (MB)                    19963.914400 ± 376.959717
Peak GPU Mem (MB)                   19969.400000 ± 378.250531
Avg GPU Util (%)                         93.962000 ± 3.928514
Peak GPU Util (%)                       100.000000 ± 0.000000
Total Energy (Wh)                         0.144312 ± 0.021550
Avg Power (W)                            69.969700 ± 0.750354
Peak Power (W)                           73.197800 ± 0.582042
Energy per Token (J/token)               10.568225 ± 1.092814
Energy per Sentence (J/sentence)       164.402765 ± 25.974715
Memory Usage (MB

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


Stats for gemma-2-9b-it on huggingface/summarization:
prompt_length                       5467.110000 ± 1780.621183
TTFT                                      0.107660 ± 0.003864
ATL                                       1.192401 ± 0.220626
GL                                       57.734260 ± 7.015240
TPS                                       0.875700 ± 0.222167
SPS                                       0.056700 ± 0.016271
Avg GPU Mem (MB)                    20426.182400 ± 449.369862
Peak GPU Mem (MB)                   20727.720000 ± 462.983533
Avg GPU Util (%)                         97.914000 ± 1.064627
Peak GPU Util (%)                       100.000000 ± 0.000000
Total Energy (Wh)                         1.145000 ± 0.140313
Avg Power (W)                            71.380800 ± 0.153239
Peak Power (W)                           74.233600 ± 0.760786
Energy per Token (J/token)              85.131952 ± 15.810740
Energy per Sentence (J/sentence)     1312.420154 ± 218.123601
Memory Usage (MB

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


Stats for gemma-2-9b-it on huggingface/summarization:
prompt_length                       5467.110000 ± 1780.621183
TTFT                                      0.109636 ± 0.003800
ATL                                       2.356189 ± 0.519138
GL                                     113.976040 ± 18.238956
TPS                                       0.472400 ± 0.263604
SPS                                       0.031500 ± 0.020170
Avg GPU Mem (MB)                    20658.990000 ± 500.739588
Peak GPU Mem (MB)                   21112.360000 ± 736.629349
Avg GPU Util (%)                         98.144000 ± 0.237801
Peak GPU Util (%)                       100.000000 ± 0.000000
Total Energy (Wh)                         2.269748 ± 0.364277
Avg Power (W)                            71.669200 ± 0.191325
Peak Power (W)                           74.831600 ± 1.066650
Energy per Token (J/token)             168.916413 ± 37.282173
Energy per Sentence (J/sentence)     2605.250354 ± 513.770657
Memory Usage (MB

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


Stats for gemma-2-9b-it on huggingface/summarization:
prompt_length                       5467.110000 ± 1780.621183
TTFT                                      0.111012 ± 0.000566
ATL                                       4.723013 ± 1.135989
GL                                     228.547548 ± 42.232385
TPS                                       0.268700 ± 0.296950
SPS                                       0.016200 ± 0.022820
Avg GPU Mem (MB)                    20921.379200 ± 570.148475
Peak GPU Mem (MB)                   21401.240000 ± 673.089718
Avg GPU Util (%)                         97.641200 ± 0.144287
Peak GPU Util (%)                       100.000000 ± 0.000000
Total Energy (Wh)                         4.557379 ± 0.843918
Avg Power (W)                            71.754000 ± 0.185973
Peak Power (W)                           75.909600 ± 2.076478
Energy per Token (J/token)             339.040985 ± 81.629997
Energy per Sentence (J/sentence)    5233.226280 ± 1133.632431
Memory Usage (MB

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


Stats for gemma-2-9b-it on huggingface/summarization:
prompt_length                       5467.110000 ± 1780.621183
TTFT                                      0.115560 ± 0.001688
ATL                                       8.234410 ± 2.302352
GL                                     397.219252 ± 88.580077
TPS                                       0.132300 ± 0.040323
SPS                                       0.010000 ± 0.000000
Avg GPU Mem (MB)                    21224.536400 ± 516.544411
Peak GPU Mem (MB)                     22247.480000 ± 4.824182
Avg GPU Util (%)                         96.945600 ± 0.221912
Peak GPU Util (%)                       100.000000 ± 0.000000
Total Energy (Wh)                         7.913674 ± 1.758998
Avg Power (W)                            71.736800 ± 0.062714
Peak Power (W)                           75.580000 ± 0.603023
Energy per Token (J/token)            590.585689 ± 164.773715
Energy per Sentence (J/sentence)    9036.273685 ± 2187.584903
Memory Usage (MB

In [1]:
from benchmark.backends.vllm_backend import VLLMBackend

# Point the project's vLLM backend wrapper at the local checkpoint directory
# and load it; later cells use the resulting `model` handle.
checkpoint_dir = "/home/ubuntu/fast_llm_inference/models/llama-3.1-8B-Instruct-4bit"
model = VLLMBackend(checkpoint_dir)
model.load_model()

INFO 05-18 11:20:03 [__init__.py:239] Automatically detected platform cuda.


2025-05-18 11:20:04.154985: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747567204.173094   11479 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747567204.178287   11479 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747567204.194063   11479 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747567204.194074   11479 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747567204.194076   11479 computation_placer.cc:177] computation placer alr

INFO 05-18 11:20:24 [config.py:717] This model supports multiple tasks: {'generate', 'reward', 'score', 'embed', 'classify'}. Defaulting to 'generate'.
INFO 05-18 11:20:26 [config.py:2003] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 05-18 11:20:27 [core.py:58] Initializing a V1 LLM engine (v0.8.5.post1) with config: model='/home/ubuntu/fast_llm_inference/models/llama-3.1-8B-Instruct-4bit', speculative_config=None, tokenizer='/home/ubuntu/fast_llm_inference/models/llama-3.1-8B-Instruct-4bit', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.BITSANDBYTES, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=bitsandbytes, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='auto', reasoning

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 05-18 11:20:33 [gpu_model_runner.py:1347] Model loading took 5.3132 GiB and 3.308005 seconds
INFO 05-18 11:20:44 [backends.py:420] Using cache directory: /home/ubuntu/.cache/vllm/torch_compile_cache/a89f85ea99/rank_0_0 for vLLM's torch.compile
INFO 05-18 11:20:44 [backends.py:430] Dynamo bytecode transform time: 11.28 s
INFO 05-18 11:20:51 [backends.py:118] Directly load the compiled graph(s) for shape None from the cache, took 5.912 s
INFO 05-18 11:20:54 [monitor.py:33] torch.compile takes 11.28 s in total
INFO 05-18 11:20:57 [kv_cache_utils.py:634] GPU KV cache size: 106,880 tokens
INFO 05-18 11:20:57 [kv_cache_utils.py:637] Maximum concurrency for 8,192 tokens per request: 13.05x
INFO 05-18 11:22:05 [gpu_model_runner.py:1686] Graph capturing finished in 68 secs, took 1.54 GiB
INFO 05-18 11:22:06 [core.py:159] init engine (profile, create kv cache, warmup model) took 92.94 seconds
INFO 05-18 11:22:06 [core_client.py:439] Core engine process 0 ready.


In [2]:
# Smoke-test the backend wrapper with perplexity reporting switched on.
# NOTE(review): the captured output of this cell is
# "TypeError: list indices must be integers or slices, not str"; presumably it
# is raised inside VLLMBackend.generate (not visible here) -- TODO investigate.
question = "What is the purpose of life?"
model.generate(question, perplexity=True)

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

TypeError: list indices must be integers or slices, not str

In [None]:
# Cell 1: Imports and setup
import os
import math
from vllm import LLM, SamplingParams

# Checkpoint directory handed to vLLM. To use a different checkpoint without
# editing the notebook, set the FAST_LLM_MODEL_PATH environment variable before
# starting the kernel; when it is unset the original location is used.
MODEL_PATH = os.environ.get(
    "FAST_LLM_MODEL_PATH",
    "/home/ubuntu/fast_llm_inference/models/llama-3.1-8B-Instruct-4bit",
)

# Cell 2: Load model and define prompts
# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# to run -- only point MODEL_PATH at checkpoints you trust.
model = LLM(
    model=MODEL_PATH,
    trust_remote_code=True,
    gpu_memory_utilization=0.9,
    max_model_len=4096,
)

# Prompts reused by the generation cells that follow.
prompts = [
    "The quick brown fox jumps over the lazy dog.",
    "What is the purpose of life?"
]


INFO 05-18 11:26:48 [__init__.py:239] Automatically detected platform cuda.


2025-05-18 11:26:49.212117: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747567609.229561   12031 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747567609.235248   12031 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747567609.250681   12031 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747567609.250693   12031 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747567609.250695   12031 computation_placer.cc:177] computation placer alr

In [5]:
# Cell 3: Configure SamplingParams for logprobs & perplexity
# Low temperature, at most 32 new tokens, and one logprob requested per
# position for both the generated tokens and the prompt tokens.
params = SamplingParams(temperature=0.1, max_tokens=32, logprobs=1, prompt_logprobs=1)

# Cell 4: Run generation and keep the pieces needed for inspection
outputs = model.generate(prompts, params)

# The loop only rebinds names, so once it finishes `sample`, `text` and
# `lp_dict` refer to the first completion of the last prompt.
for i, gen_out in enumerate(outputs):
    sample = gen_out.outputs[0]
    text, lp_dict = sample.text.lstrip(), sample.logprobs

Processed prompts:   0%|          | 0/2 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

In [6]:
# Display the logprobs kept from the final iteration of the preceding loop.
# Despite the name, the displayed value is a list of {token_id: Logprob} dicts.
lp_dict

[{2209: Logprob(logprob=-1.4004144668579102, rank=1, decoded_token='ĠIs')},
 {433: Logprob(logprob=-0.17977960407733917, rank=1, decoded_token='Ġit')},
 {311: Logprob(logprob=-0.0869283527135849, rank=1, decoded_token='Ġto')},
 {1505: Logprob(logprob=-0.9659126996994019, rank=1, decoded_token='Ġfind')},
 {23871: Logprob(logprob=-0.04803086444735527, rank=1, decoded_token='Ġhappiness')},
 {11: Logprob(logprob=-0.058653172105550766, rank=1, decoded_token=',')},
 {311: Logprob(logprob=-0.8425228595733643, rank=1, decoded_token='Ġto')},
 {11322: Logprob(logprob=-0.7215710878372192, rank=1, decoded_token='Ġachieve')},
 {2450: Logprob(logprob=-0.043824948370456696, rank=1, decoded_token='Ġsuccess')},
 {11: Logprob(logprob=-0.0013250865740701556, rank=1, decoded_token=',')},
 {477: Logprob(logprob=-0.7743627429008484, rank=2, decoded_token='Ġor'),
  311: Logprob(logprob=-0.6181127429008484, rank=1, decoded_token='Ġto')},
 {311: Logprob(logprob=-0.031453102827072144, rank=1, decoded_token='Ġto

In [15]:
# Cell 4: Run generation and display results in a table
outputs = model.generate(prompts, params)

for i, gen_out in enumerate(outputs):
    sample    = gen_out.outputs[0]
    text      = sample.text.lstrip()
    # One {token_id: Logprob} dict per generated token; vLLM leaves this as
    # None when logprobs were not requested, so fall back to an empty list.
    lp_list   = sample.logprobs or []
    token_ids = sample.token_ids

    # Extract the strings & logprobs of the tokens that were actually sampled.
    # Each entry holds the top-k candidates plus the sampled token, so the
    # sampled token is not always the rank-1 candidate (with temperature > 0 a
    # lower-ranked token can be drawn). Looking the entry up by the sampled
    # token id keeps the table consistent with the generated text.
    tokens, logps = [], []
    for tid, entry in zip(token_ids, lp_list):
        lp_obj = entry[tid]
        tokens.append(lp_obj.decoded_token)
        logps.append(lp_obj.logprob)

    # Per-token perplexity, i.e. exp(-logprob) = 1 / p(token)
    ppl = [math.exp(-lp) for lp in logps]

    print(f"\n=== Prompt {i+1}: {prompts[i]} ===")
    print(f"Generated: {text}\n")
    print(f"{'Token':>12} | {'LogProb':>8} | {'PPL':>8}")
    print("-" * 34)
    for tok, lp, p in zip(tokens, logps, ppl):
        print(f"{tok:>12} | {lp:8.4f} | {p:8.4f}")

Processed prompts:   0%|          | 0/2 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


=== Prompt 1: The quick brown fox jumps over the lazy dog. ===
Generated: The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick

       Token |  LogProb |      PPL
----------------------------------
        ĠThe |  -0.9727 |   2.6450
      Ġquick |  -0.3296 |   1.3905
      Ġbrown |  -0.0029 |   1.0029
        Ġfox |  -0.0016 |   1.0016
      Ġjumps |  -0.0055 |   1.0055
       Ġover |  -0.0005 |   1.0005
        Ġthe |  -0.0005 |   1.0005
       Ġlazy |  -0.0005 |   1.0005
        Ġdog |  -0.0004 |   1.0004
           . |  -0.3360 |   1.3993
        ĠThe |  -0.0372 |   1.0379
      Ġquick |  -0.0015 |   1.0015
      Ġbrown |  -0.0012 |   1.0012
        Ġfox |  -0.0007 |   1.0007
      Ġjumps |  -0.0013 |   1.0013
       Ġover |  -0.0008 |   1.0008
        Ġthe |  -0.0010 |   1.0010
       Ġlazy |  -0.0010 |   1.0010
        Ġdog |  -0.0005 |   1.0005
           . |  -0.2609 |   1.2981
        Ġ