In [1]:
# Cell 1: Imports and setup
import os
import math
from vllm import LLM, SamplingParams

# Model location. Defaults to the local checkpoint this notebook was written
# against; set the FAST_LLM_MODEL_PATH environment variable to point at a
# different copy without editing the notebook.
MODEL_PATH = os.environ.get(
    "FAST_LLM_MODEL_PATH",
    "/home/ubuntu/fast_llm_inference/models/llama-3.1-8B-Instruct-4bit",
)

# Cell 2: Load model and define prompts
# NOTE(review): trust_remote_code=True allows Python code shipped with the
# checkpoint to run at load time; keep it only if this checkpoint needs it.
model = LLM(
    model=MODEL_PATH,
    trust_remote_code=True,
    gpu_memory_utilization=0.9,  # fraction of GPU memory vLLM may reserve
    max_model_len=4096,          # maximum sequence length handled per request
)

# Prompts scored by the generation / perplexity cell below.
prompts = [
    "The quick brown fox jumps over the lazy dog.",
    "What is the purpose of life?"
]

INFO 05-28 15:18:51 [__init__.py:239] Automatically detected platform cuda.


2025-05-28 15:18:51.873981: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748445531.898733  176014 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748445531.906032  176014 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1748445531.922127  176014 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748445531.922144  176014 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748445531.922146  176014 computation_placer.cc:177] computation placer alr

INFO 05-28 15:19:13 [config.py:717] This model supports multiple tasks: {'reward', 'generate', 'classify', 'score', 'embed'}. Defaulting to 'generate'.
INFO 05-28 15:19:15 [config.py:2003] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 05-28 15:19:16 [core.py:58] Initializing a V1 LLM engine (v0.8.5.post1) with config: model='/home/ubuntu/fast_llm_inference/models/llama-3.1-8B-Instruct-4bit', speculative_config=None, tokenizer='/home/ubuntu/fast_llm_inference/models/llama-3.1-8B-Instruct-4bit', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.BITSANDBYTES, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=bitsandbytes, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='auto', reasoning

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 05-28 15:19:47 [gpu_model_runner.py:1347] Model loading took 5.3132 GiB and 27.324152 seconds
INFO 05-28 15:20:02 [backends.py:420] Using cache directory: /home/ubuntu/.cache/vllm/torch_compile_cache/0f63e24e8b/rank_0_0 for vLLM's torch.compile
INFO 05-28 15:20:02 [backends.py:430] Dynamo bytecode transform time: 14.88 s
INFO 05-28 15:20:11 [backends.py:118] Directly load the compiled graph(s) for shape None from the cache, took 6.773 s
INFO 05-28 15:20:14 [monitor.py:33] torch.compile takes 14.88 s in total
INFO 05-28 15:20:17 [kv_cache_utils.py:634] GPU KV cache size: 106,880 tokens
INFO 05-28 15:20:17 [kv_cache_utils.py:637] Maximum concurrency for 4,096 tokens per request: 26.09x
INFO 05-28 15:21:20 [gpu_model_runner.py:1686] Graph capturing finished in 64 secs, took 1.54 GiB
INFO 05-28 15:21:21 [core.py:159] init engine (profile, create kv cache, warmup model) took 93.87 seconds
INFO 05-28 15:21:21 [core_client.py:439] Core engine process 0 ready.


In [None]:
# Cell 3: Configure SamplingParams for logprobs & perplexity
params = SamplingParams(
    temperature=0.1,
    max_tokens=32,
    # Ask for the top-1 alternative at each step. vLLM always adds the logprob
    # of the token that was actually sampled, so an entry may hold two items.
    logprobs=1,
    # Requested as in the original setup; the table below does not consume it.
    prompt_logprobs=1
)

# Cell 4: Run generation and display results in a table plus sequence PPL
outputs = model.generate(prompts, params)

for i, gen_out in enumerate(outputs):
    sample    = gen_out.outputs[0]
    text      = sample.text.lstrip()
    lp_list   = sample.logprobs or []      # list of {token_id: Logprob(...)} dicts
    token_ids = sample.token_ids           # ids of the tokens that were sampled

    # 1) Extract the sampled-token strings & logprobs.
    #    Look each entry up by the sampled token id rather than by rank == 1:
    #    with temperature > 0 the sampled token is not always the most likely
    #    one, and scoring the rank-1 token would describe text that was never
    #    generated.
    tokens, logps = [], []
    for tid, entry in zip(token_ids, lp_list):
        lp_obj = entry[tid]
        decoded = lp_obj.decoded_token
        # decoded_token can be missing; fall back to the raw id so the table
        # formatting below never receives None.
        tokens.append(decoded if decoded is not None else f"<{tid}>")
        logps.append(lp_obj.logprob)

    # 2) Compute per-token perplexity (inverse probability of each token)
    ppl = [math.exp(-lp) for lp in logps]

    # 3) Print per-token table
    print(f"\n=== Prompt {i+1}: {prompts[i]} ===")
    print(f"Generated: {text}\n")
    print(f"{'Token':>12} | {'LogProb':>8} | {'PPL':>8}")
    print("-" * 34)
    for tok, lp, p in zip(tokens, logps, ppl):
        print(f"{tok:>12} | {lp:8.4f} | {p:8.4f}")

    # 4) Compute sequence-level perplexity: exp of the mean negative logprob.
    #    Guard against an empty generation to avoid dividing by zero.
    if logps:
        ppl_seq = math.exp(- sum(logps) / len(logps))
        print(f"\nSequence-level Perplexity: {ppl_seq:.4f}")
    else:
        print("\nSequence-level Perplexity: n/a (no tokens generated)")

Processed prompts:   0%|          | 0/2 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


=== Prompt 1: The quick brown fox jumps over the lazy dog. ===
Generated: The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick

       Token |  LogProb |      PPL
----------------------------------
        ĠThe |  -0.9727 |   2.6450
      Ġquick |  -0.3296 |   1.3905
      Ġbrown |  -0.0029 |   1.0029
        Ġfox |  -0.0016 |   1.0016
      Ġjumps |  -0.0055 |   1.0055
       Ġover |  -0.0005 |   1.0005
        Ġthe |  -0.0005 |   1.0005
       Ġlazy |  -0.0005 |   1.0005
        Ġdog |  -0.0004 |   1.0004
           . |  -0.3360 |   1.3993
        ĠThe |  -0.0372 |   1.0379
      Ġquick |  -0.0015 |   1.0015
      Ġbrown |  -0.0012 |   1.0012
        Ġfox |  -0.0007 |   1.0007
      Ġjumps |  -0.0013 |   1.0013
       Ġover |  -0.0008 |   1.0008
        Ġthe |  -0.0010 |   1.0010
       Ġlazy |  -0.0010 |   1.0010
        Ġdog |  -0.0005 |   1.0005
           . |  -0.2609 |   1.2981
        Ġ

In [None]:
import os

# Report the kernel's current working directory; relative paths in this
# notebook are resolved against it.
working_dir = os.getcwd()
print(working_dir)

/home/ubuntu/fast_llm_inference/experiments


In [1]:
import os, sys

# Put the directory one level above the current working directory at the front
# of the import path so the `benchmark` package can be imported. Note that ".."
# is resolved against the working directory, not the notebook file's location.
parent_dir = os.path.abspath("..")
sys.path.insert(0, parent_dir)

from benchmark.backends.tgi_backend import TGIBackend

# Alternative model ids: "Qwen/Qwen2.5-7B-Instruct", "google/gemma-2-9b-it"
tgi = TGIBackend(model_path="meta-llama/Llama-3.1-8B-Instruct")
tgi.load_model()

In [7]:
# Build the question-answering task and request prompts for 3 samples.
from benchmark.tasks.qa import QATask

qa_task = QATask()

# NOTE(review): the following cell indexes queries[0], so generate_prompts
# presumably returns a sequence whose first element holds the prompts —
# confirm against QATask.
queries = qa_task.generate_prompts(3)

In [8]:
# Send the first element of `queries` to the TGI backend as a "qa" request.
# NOTE(review): the recorded responses run past the answer into a repeated
# "### SYSTEM" prompt, which suggests generation is not stopped at the end of
# the answer — check stop sequences / token limits in the backend.
responses = tgi.generate(queries[0], task_type="qa")

In [10]:
# Preview at most the first three responses.
responses[:3]

['Lothar de Maizière.  ### SYSTEM\nYou are a question-answering assistant. Answer in exactly **ONE** line. If the',
 'Complexity classes with complicated definitions.  ### SYSTEM\nYou are a question-answering assistant. Answer in exactly **ONE** line. If the answer is',
 'GTE.  ### SYSTEM\nYou are a question-answering assistant. Answer in exactly **ONE** line. If the answer is not contained in']

In [11]:
# Shut down the TGI backend once generation is finished.
# NOTE(review): the recorded output is a 64-character hex string — presumably
# an id of the resource being stopped; confirm in TGIBackend.close.
tgi.close()

585afb7f8e27854288e91d4d4187a395f6a3d02a4a1fc983a151cad491841342


In [1]:
import os, sys

# Prepend the directory above the current working directory to the import
# path so the `benchmark` package resolves. The path is relative to the
# working directory, not to the notebook file.
parent_dir = os.path.abspath(os.pardir)
sys.path.insert(0, parent_dir)

from benchmark.benchmark import ModelBenchmark

# Benchmark harness configured for the TGI backend.
# Alternative model paths: "Qwen/Qwen2.5-7B-Instruct", "google/gemma-2-9b-it"
bm = ModelBenchmark(
    backend="tgi",
    model_name="Llama-3.1-8B-Instruct",
    model_path="meta-llama/Llama-3.1-8B-Instruct",
    model_size_mb=16_100,
    verbose=False,
)

INFO 05-29 11:47:55 [__init__.py:239] Automatically detected platform cuda.


In [2]:
# Run the summarization benchmark in the "batch" scenario: 16 samples in
# batches of 8, with quality metrics enabled. The call returns two objects:
# the aggregate report (displayed in the next cell) and a second result bound
# to details_df (presumably a per-query frame — confirm in ModelBenchmark.run).
# NOTE(review): sample_interval=0.1 looks like a monitoring sampling period in
# seconds — confirm units against ModelBenchmark.run.
run_report, details_df = bm.run(
    task="summarization",
    scenario="batch",
    samples=16,
    batch_size=8,
    sample_interval=0.1,
    quality_metric=True
)

8375e5b408c7de89710217c32cb43db8137b0b79cff5fd1e4dce23ad9e2dc80c


In [3]:
# Display the aggregate report returned by bm.run.
run_report

Unnamed: 0,model_name,model_size_mb,backend,startup_time_sec,load_model_time_sec,ttft_sec,cold_start_sec,batch_size,num_queries,total_generation_time_s,...,avg_ATL,avg_GL,avg_TPS,avg_SPS,avg_energy_per_token,avg_energy_per_sentence,avg_estimated_query_cost_usd,avg_ROUGE-1,avg_ROUGE-2,avg_ROUGE-L
0,Llama-3.1-8B-Instruct,16100,tgi,0.002,22.0575,0.0755,22.1349,8,16,44.639383,...,0.021154,2.789961,47.725,4.0175,1.440265,17.721448,0.000278,0.298337,0.11193,0.190076
