### RQ1 - Comparing different quantization levels

In [None]:
from benchmark.benchmark import ModelBenchmark
import torch


def run_benchmark(backend, model_name, task, base_path="/home/ubuntu/fast_llm_inference/models", samples=500, verbose=False, batch_size=100):
    """Run one benchmark configuration and always release its resources.

    Builds a ``ModelBenchmark`` for the model located at
    ``{base_path}/{model_name}``, runs it, and reports the outcome on stdout.
    Any ``Exception`` raised while constructing, running, or closing the
    benchmark is printed instead of propagated, so a sweep over many
    configurations keeps going after a single failure.

    Args:
        backend: Backend identifier forwarded to ``ModelBenchmark``.
        model_name: Model name; also appended to ``base_path`` to form the
            ``model_path`` passed to ``ModelBenchmark``.
        task: Task identifier forwarded to ``ModelBenchmark``.
        base_path: Directory prefix used to build ``model_path``.
        samples: Forwarded to ``ModelBenchmark.run`` as ``samples``.
        verbose: Forwarded to ``ModelBenchmark`` as ``verbose``.
        batch_size: Forwarded to ``ModelBenchmark.run`` as ``batch_size``.

    Returns:
        None.
    """
    print(f"Running benchmark for {model_name} with {backend} on {task}")
    bm = None  # stays None when the constructor itself fails
    try:
        bm = ModelBenchmark(
            backend=backend,
            model_name=model_name,
            model_path=f"{base_path}/{model_name}",
            task=task,
            verbose=verbose,
        )
        bm.run(samples=samples, batch_size=batch_size)
        print(f"✅ Completed: {model_name} | {backend} | {task}")
    except Exception as e:
        print(f"❌ Failed: {model_name} | {backend} | {task} -- {e}")
    finally:
        # Close on every path: a failure inside run() must not leave the
        # benchmark open for the next configuration in the sweep.
        if bm is not None:
            try:
                bm.close()
            except Exception as e:
                print(f"⚠️ Cleanup failed: {model_name} | {backend} | {task} -- {e}")
        # torch.cuda.empty_cache() only releases unoccupied cached memory, so
        # drop our reference to the benchmark before calling it.
        bm = None
        torch.cuda.empty_cache()


# Directory prefix under which each model is looked up by name.
base_path = "/home/ubuntu/fast_llm_inference/models"

# Only vLLM is active for this sweep; "huggingface", "deepspeed_mii" and
# "llama.cpp" were listed alongside it but are disabled.
backends = ["vllm"]

# Every base model is listed under its unsuffixed name first, then as its
# "-4bit" and "-8bit" variants, one model family after the other.
quant_suffixes = ["", "-4bit", "-8bit"]
llama_bases = ["llama-3.2-3b-instruct", "llama-3.2-1b-instruct"]
qwen_bases = [f"Qwen2.5-{size}-Instruct" for size in ("3B", "1.5B", "0.5B")]

# Disabled entries, kept here for the record:
#   - llama-3.1-8B-Instruct, llama-3.1-8B-Instruct-4bit, llama-3.1-8B-Instruct-8bit
#   - Qwen2.5-7B-Instruct, Qwen2.5-7B-Instruct-4bit
#   - Qwen2.5-7B-Instruct-8bit (noted as hitting "some weird error")
#   - gemma-2-9b-it-bnb4, gemma-2-9b-it-8bit
#   - gemma-2-9b-it (noted as "too large")
#   - gemma-2-2b-it-4bit, gemma-2-2b-it-8bit, gemma-2-2b-it
models = [
    f"{base}{suffix}"
    for family in (llama_bases, qwen_bases)
    for suffix in quant_suffixes
    for base in family
]

tasks = ["summarization", "qa", "sql"]

# Enumerate every (backend, model, task) combination up front, backend-major
# and task-minor, then benchmark them one at a time.
sweep = [(b, m, t) for b in backends for m in models for t in tasks]

for backend, model, task in sweep:
    run_benchmark(
        backend=backend,
        model_name=model,
        task=task,
        base_path=base_path,
        samples=500,
        verbose=False,
        batch_size=100,
    )

### RQ2 - Comparing different inference engines

In [None]:
from benchmark.benchmark import ModelBenchmark
import torch


def run_benchmark(backend, model_name, task, base_path="/home/ubuntu/fast_llm_inference/models", samples=500, verbose=False, batch_size=100):
    """Run one benchmark configuration and always release its resources.

    Builds a ``ModelBenchmark`` for the model located at
    ``{base_path}/{model_name}``, runs it, and reports the outcome on stdout.
    Any ``Exception`` raised while constructing, running, or closing the
    benchmark is printed instead of propagated, so a sweep over many
    configurations keeps going after a single failure.

    Args:
        backend: Backend identifier forwarded to ``ModelBenchmark``.
        model_name: Model name; also appended to ``base_path`` to form the
            ``model_path`` passed to ``ModelBenchmark``.
        task: Task identifier forwarded to ``ModelBenchmark``.
        base_path: Directory prefix used to build ``model_path``.
        samples: Forwarded to ``ModelBenchmark.run`` as ``samples``.
        verbose: Forwarded to ``ModelBenchmark`` as ``verbose``.
        batch_size: Forwarded to ``ModelBenchmark.run`` as ``batch_size``.

    Returns:
        None.
    """
    print(f"Running benchmark for {model_name} with {backend} on {task}")
    bm = None  # stays None when the constructor itself fails
    try:
        bm = ModelBenchmark(
            backend=backend,
            model_name=model_name,
            model_path=f"{base_path}/{model_name}",
            task=task,
            verbose=verbose,
        )
        bm.run(samples=samples, batch_size=batch_size)
        print(f"✅ Completed: {model_name} | {backend} | {task}")
    except Exception as e:
        print(f"❌ Failed: {model_name} | {backend} | {task} -- {e}")
    finally:
        # Close on every path: a failure inside run() must not leave the
        # benchmark open for the next configuration in the sweep.
        if bm is not None:
            try:
                bm.close()
            except Exception as e:
                print(f"⚠️ Cleanup failed: {model_name} | {backend} | {task} -- {e}")
        # torch.cuda.empty_cache() only releases unoccupied cached memory, so
        # drop our reference to the benchmark before calling it.
        bm = None
        torch.cuda.empty_cache()


# Directory prefix under which each model is looked up by name.
base_path = "/home/ubuntu/fast_llm_inference/models"

# Backend active for this run. "llama.cpp", "vllm" and "deepspeed_mii" were
# listed next to it as disabled alternatives.
backends = ["huggingface"]

# Unquantized checkpoints, grouped by model family.
gemma_models = ["gemma-2-9b-it", "gemma-2-2b-it"]
llama_models = [
    "llama-3.1-8B-Instruct",
    "llama-3.2-3b-instruct",
    "llama-3.2-1b-instruct",
]
qwen_models = [f"Qwen2.5-{size}-Instruct" for size in ("7B", "3B", "1.5B", "0.5B")]

models = gemma_models + llama_models + qwen_models

tasks = ["summarization", "qa", "sql"]

# File names used instead of `models` when the backend is "llama.cpp".
gguf_models = [
    "gemma-2-2b-it-fp16.gguf",
    "gemma-2-9b-it-fp16.gguf",

    "llama-3.1-8B-Instruct-f16.gguf",
    "Llama-3.2-1B-Instruct-f16.gguf",
    "Llama-3.2-3B-Instruct-f16.gguf",

    "qwen2.5-0.5b-instruct-fp16.gguf",
    "qwen2.5-1.5b-instruct-fp16.gguf",
    "qwen2.5-3b-instruct-fp16.gguf",
    "qwen2.5-7B-instruct-fp16.gguf",
]

for backend in backends:
    # Pick the model list per backend without rebinding `models`: rebinding it
    # inside the loop made every backend listed after "llama.cpp" (and any
    # later cell) silently run against the GGUF file names.
    backend_models = gguf_models if backend == "llama.cpp" else models

    for model in backend_models:
        for task in tasks:
            run_benchmark(
                backend=backend,
                model_name=model,
                task=task,
                base_path=base_path,
                samples=100,
                verbose=False,
                batch_size=20,
            )

2025-05-11 01:47:22.335784: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746928042.686955  715944 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746928042.790989  715944 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746928043.633385  715944 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746928043.633515  715944 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746928043.633520  715944 computation_placer.cc:177] computation placer alr

INFO 05-11 01:47:36 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 05-11 01:47:36 [__init__.py:239] Automatically detected platform cuda.
[2025-05-11 01:47:43,255] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
Running benchmark for qwen2.5-3b-instruct-fp16.gguf with llama.cpp on summarization


llama_init_from_model: n_ctx_per_seq (8192) < n_ctx_train (32768) -- the full capacity of the model will not be utilized


Stats for qwen2.5-3b-instruct-fp16.gguf on llama.cpp/summarization:
prompt_length                       5467.110000 ± 1780.621183
TTFT                                      0.037240 ± 0.002196
ATL                                       1.907586 ± 0.404739
GL                                      103.282380 ± 6.380167
TPS                                       0.548400 ± 0.119880
SPS                                       0.034100 ± 0.007797
Avg GPU Mem (MB)                       7188.030000 ± 5.935774
Peak GPU Mem (MB)                      7189.480000 ± 6.407067
Avg GPU Util (%)                         91.788000 ± 0.394477
Peak GPU Util (%)                       100.000000 ± 0.000000
Total Energy (Wh)                         2.051205 ± 0.128664
Avg Power (W)                            71.492000 ± 0.212099
Peak Power (W)                           74.938000 ± 0.328658
Energy per Token (J/token)             136.380099 ± 28.955550
Energy per Sentence (J/sentence)     2268.556692 ± 555.147749
Me

llama_init_from_model: n_ctx_per_seq (8192) < n_ctx_train (32768) -- the full capacity of the model will not be utilized


Stats for qwen2.5-3b-instruct-fp16.gguf on llama.cpp/qa:
prompt_length                       1393.740000 ± 302.397244
TTFT                                     0.035200 ± 0.000360
ATL                                    24.702549 ± 11.375563
GL                                      35.124600 ± 0.881287
TPS                                      0.063900 ± 0.071264
SPS                                      0.031200 ± 0.005908
Avg GPU Mem (MB)                      7167.946000 ± 2.506683
Peak GPU Mem (MB)                     7168.680000 ± 2.412091
Avg GPU Util (%)                        91.738000 ± 0.576138
Peak GPU Util (%)                       95.000000 ± 0.000000
Total Energy (Wh)                        0.698264 ± 0.018237
Avg Power (W)                           71.566000 ± 0.394590
Peak Power (W)                          74.466000 ± 0.552701
Energy per Token (J/token)          1768.265073 ± 815.255899
Energy per Sentence (J/sentence)    2463.341444 ± 255.088205
Memory Usage (MB)           

llama_init_from_model: n_ctx_per_seq (8192) < n_ctx_train (32768) -- the full capacity of the model will not be utilized


Stats for qwen2.5-3b-instruct-fp16.gguf on llama.cpp/sql:
prompt_length                        1163.740000 ± 264.891883
TTFT                                      0.035960 ± 0.000972
ATL                                       4.107402 ± 3.616989
GL                                       56.953800 ± 5.185162
TPS                                       0.400000 ± 0.247055
SPS                                       0.096800 ± 0.069904
Avg GPU Mem (MB)                       7165.032000 ± 1.042286
Peak GPU Mem (MB)                      7165.080000 ± 0.984732
Avg GPU Util (%)                         91.832000 ± 0.368968
Peak GPU Util (%)                        95.000000 ± 0.000000
Total Energy (Wh)                         1.135422 ± 0.104077
Avg Power (W)                            71.766000 ± 0.182198
Peak Power (W)                           73.934000 ± 0.462475
Energy per Token (J/token)            294.793370 ± 259.616610
Energy per Sentence (J/sentence)    1812.023837 ± 1647.471307
Memory Usage