In [1]:
# Cell 1: Imports and setup
import os
import math
from vllm import LLM, SamplingParams

# Model location. Defaults to the local 4-bit Llama checkpoint, but can be
# overridden with the MODEL_PATH environment variable so the notebook is not
# tied to a single machine's directory layout.
MODEL_PATH = os.environ.get(
    "MODEL_PATH",
    "/home/ubuntu/fast_llm_inference/models/llama-3.1-8B-Instruct-4bit",
)

# Cell 2: Load model and define prompts
# The engine is created once; construction is the expensive step in this
# notebook, so avoid re-running this cell unnecessarily.
model = LLM(
    model=MODEL_PATH,
    trust_remote_code=True,       # allows custom code shipped with the checkpoint to run
    gpu_memory_utilization=0.9,
    max_model_len=4096,
)

# Prompts fed to `model.generate` in the logprob / perplexity cell.
prompts = [
    "The quick brown fox jumps over the lazy dog.",
    "What is the purpose of life?"
]

INFO 05-28 15:18:51 [__init__.py:239] Automatically detected platform cuda.


2025-05-28 15:18:51.873981: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748445531.898733  176014 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748445531.906032  176014 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1748445531.922127  176014 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748445531.922144  176014 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748445531.922146  176014 computation_placer.cc:177] computation placer alr

INFO 05-28 15:19:13 [config.py:717] This model supports multiple tasks: {'reward', 'generate', 'classify', 'score', 'embed'}. Defaulting to 'generate'.
INFO 05-28 15:19:15 [config.py:2003] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 05-28 15:19:16 [core.py:58] Initializing a V1 LLM engine (v0.8.5.post1) with config: model='/home/ubuntu/fast_llm_inference/models/llama-3.1-8B-Instruct-4bit', speculative_config=None, tokenizer='/home/ubuntu/fast_llm_inference/models/llama-3.1-8B-Instruct-4bit', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.BITSANDBYTES, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=bitsandbytes, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='auto', reasoning

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 05-28 15:19:47 [gpu_model_runner.py:1347] Model loading took 5.3132 GiB and 27.324152 seconds
INFO 05-28 15:20:02 [backends.py:420] Using cache directory: /home/ubuntu/.cache/vllm/torch_compile_cache/0f63e24e8b/rank_0_0 for vLLM's torch.compile
INFO 05-28 15:20:02 [backends.py:430] Dynamo bytecode transform time: 14.88 s
INFO 05-28 15:20:11 [backends.py:118] Directly load the compiled graph(s) for shape None from the cache, took 6.773 s
INFO 05-28 15:20:14 [monitor.py:33] torch.compile takes 14.88 s in total
INFO 05-28 15:20:17 [kv_cache_utils.py:634] GPU KV cache size: 106,880 tokens
INFO 05-28 15:20:17 [kv_cache_utils.py:637] Maximum concurrency for 4,096 tokens per request: 26.09x
INFO 05-28 15:21:20 [gpu_model_runner.py:1686] Graph capturing finished in 64 secs, took 1.54 GiB
INFO 05-28 15:21:21 [core.py:159] init engine (profile, create kv cache, warmup model) took 93.87 seconds
INFO 05-28 15:21:21 [core_client.py:439] Core engine process 0 ready.


In [None]:
# Cell 3: Configure SamplingParams for logprobs & perplexity
# logprobs=1 asks for the top-1 alternative per generated position; the
# sampled token's logprob is returned as well, which is what the loop below
# relies on. prompt_logprobs is requested here but not consumed by the loop.
params = SamplingParams(
    temperature=0.1,
    max_tokens=32,
    logprobs=1,
    prompt_logprobs=1
)

# Cell 4: Run generation and display results in a table plus sequence PPL
outputs = model.generate(prompts, params)

for i, gen_out in enumerate(outputs):
    sample    = gen_out.outputs[0]
    text      = sample.text.lstrip()
    lp_list   = sample.logprobs or []      # list of {token_id: Logprob}; may be None
    token_ids = sample.token_ids

    # 1) Extract the *sampled* token strings & logprobs.
    #    Looking the entry up by the sampled token id (instead of taking the
    #    rank-1 entry) keeps the table correct when sampling picked a token
    #    that was not the most likely one.
    tokens, logps = [], []
    for tid, entry in zip(token_ids, lp_list):
        lp_obj = entry.get(tid)
        if lp_obj is None:
            # No logprob recorded for the sampled token at this position.
            continue
        tokens.append(lp_obj.decoded_token)
        logps.append(lp_obj.logprob)

    # 2) Compute per-token perplexity: exp(-logprob) for each token
    ppl = [math.exp(-lp) for lp in logps]

    # 3) Print per-token table
    print(f"\n=== Prompt {i+1}: {prompts[i]} ===")
    print(f"Generated: {text}\n")
    print(f"{'Token':>12} | {'LogProb':>8} | {'PPL':>8}")
    print("-" * 34)
    for tok, lp, p in zip(tokens, logps, ppl):
        print(f"{tok:>12} | {lp:8.4f} | {p:8.4f}")

    # 4) Compute sequence-level perplexity: exp of the mean negative logprob.
    #    Guarded so an empty generation does not raise ZeroDivisionError.
    if logps:
        ppl_seq = math.exp(- sum(logps) / len(logps))
        print(f"\nSequence-level Perplexity: {ppl_seq:.4f}")
    else:
        print("\nSequence-level Perplexity: n/a (no generated tokens)")

Processed prompts:   0%|          | 0/2 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


=== Prompt 1: The quick brown fox jumps over the lazy dog. ===
Generated: The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick

       Token |  LogProb |      PPL
----------------------------------
        ĠThe |  -0.9727 |   2.6450
      Ġquick |  -0.3296 |   1.3905
      Ġbrown |  -0.0029 |   1.0029
        Ġfox |  -0.0016 |   1.0016
      Ġjumps |  -0.0055 |   1.0055
       Ġover |  -0.0005 |   1.0005
        Ġthe |  -0.0005 |   1.0005
       Ġlazy |  -0.0005 |   1.0005
        Ġdog |  -0.0004 |   1.0004
           . |  -0.3360 |   1.3993
        ĠThe |  -0.0372 |   1.0379
      Ġquick |  -0.0015 |   1.0015
      Ġbrown |  -0.0012 |   1.0012
        Ġfox |  -0.0007 |   1.0007
      Ġjumps |  -0.0013 |   1.0013
       Ġover |  -0.0008 |   1.0008
        Ġthe |  -0.0010 |   1.0010
       Ġlazy |  -0.0010 |   1.0010
        Ġdog |  -0.0005 |   1.0005
           . |  -0.2609 |   1.2981
        Ġ

Using the official Docker images to launch the inference engines:


##### TGI (check)

docker run --rm \
  --gpus all \
  -v "$HOME/.cache/huggingface:/data" \
  -v "$HOME/.cache/huggingface:/root/.cache/huggingface" \
  -e HF_TOKEN="$HF_TOKEN" \
  -p 127.0.0.1:23333:23333 \
  ghcr.io/huggingface/text-generation-inference:3.3.1 \
    --model-id mistralai/Mistral-7B-Instruct-v0.3 \
    --trust-remote-code \
    --port 23333 \
    --max-client-batch-size 128


##### vLLM (check)

docker run --rm \
  --runtime=nvidia --gpus all \
  -v "$HOME/.cache/huggingface:/root/.cache/huggingface" \
  -e HUGGING_FACE_HUB_TOKEN="$HF_TOKEN" \
  -p 127.0.0.1:23333:23333 \
  --ipc=host \
  vllm/vllm-openai:latest \
    --model mistralai/Mistral-7B-Instruct-v0.3 \
    --port 23333


##### LMDeploy (check)

docker run --rm \
  --runtime=nvidia --gpus all \
  -v $HOME/.cache/huggingface:/root/.cache/huggingface \
  -e HUGGING_FACE_HUB_TOKEN=$HF_TOKEN \
  -p 127.0.0.1:23333:23333 \
  --ipc=host \
  openmmlab/lmdeploy:latest \
    lmdeploy serve api_server mistralai/Mistral-7B-Instruct-v0.3 \
    --server-port 23333                               


##### SGLang (check)

docker run --gpus all \
  -p 127.0.0.1:23333:23333 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --ipc=host \
  lmsysorg/sglang:latest \
  bash -c "\
    pip install --no-cache-dir protobuf sentencepiece && \
    python3 -m sglang.launch_server \
      --model-path mistralai/Mistral-7B-Instruct-v0.3 \
      --host 0.0.0.0 \
      --port 23333 \
  "


##### DeepSpeed-MII (check)

docker run --runtime=nvidia --gpus all \
  -v $HOME/.cache/huggingface:/root/.cache/huggingface \
  -e HUGGING_FACE_HUB_TOKEN=$HF_TOKEN \
  -p 127.0.0.1:23333:23333 \
  --ipc=host \
  slinusc/deepspeed-mii:latest \
  --model mistralai/Mistral-7B-Instruct-v0.3 \
  --port 23333



In [1]:
from openai import OpenAI

class InferenceEngineClient:
    """
    A simple wrapper for an OpenAI‐compatible server,
    defaulting to Mistral-7B-Instruct-v0.3.
    """

    def __init__(self, base_url="http://localhost:23333/v1", api_key="none"):
        self.client = OpenAI(api_key=api_key, base_url=base_url)
        self.default_model = "mistralai/Mistral-7B-Instruct-v0.3"

    def completion(self,
                   prompt,
                   model: str | None = None,
                   temperature: float = 0.7,
                   max_tokens: int = 512,
                   top_p: float = 0.9,
                   stream: bool = False):
        """
        Send one or more prompts.
        :param prompt: a single string or a list of strings
        :return: if single prompt, returns str; if list, returns List[str]
        """
        model = model or self.default_model
        is_batch = isinstance(prompt, (list, tuple))

        resp = self.client.completions.create(
            model=model,
            prompt=prompt,        # can be str or list[str]
            temperature=temperature,
            max_tokens=max_tokens,
            top_p=top_p,
            stream=stream
        )

        if stream:
            # streaming with batching is a bit more involved; you’ll get interleaved chunks
            return resp

        # non‐streaming: choices is a list with one entry per prompt
        texts = [c.text for c in resp.choices]
        return texts if is_batch else texts[0]

In [1]:
import os, sys

# Make the `benchmark` package importable. Note that abspath("..") resolves
# against the current working directory, not the notebook file's location.
# The membership check keeps re-running this cell from piling duplicate
# entries onto sys.path.
_repo_root = os.path.abspath("..")
if _repo_root not in sys.path:
    sys.path.insert(0, _repo_root)

from benchmark.tasks.qa import QATask
from benchmark.tasks.summarization import SummarizationTask

# One task object per benchmark type; both are constructed with defaults.
sum_task = SummarizationTask()
qa_task = QATask()

# Both tasks are asked for prompts with the same argument (64).
# NOTE(review): the return shape is not visible here; a later cell indexes
# qa_queries[0] and passes it to the client as a batch - confirm.
qa_queries = qa_task.generate_prompts(64)
sum_queries = sum_task.generate_prompts(64)

In [None]:
import os, sys

# Make the `benchmark` package importable. abspath("..") resolves against the
# current working directory; the membership check avoids adding a duplicate
# sys.path entry every time this cell is re-run.
_repo_root = os.path.abspath("..")
if _repo_root not in sys.path:
    sys.path.insert(0, _repo_root)

# NOTE: this import rebinds `InferenceEngineClient`, replacing the class of
# the same name defined in an earlier notebook cell.
from benchmark.inference_engine_client import InferenceEngineClient

# NOTE: `model` is also the name bound to the vLLM `LLM` object in the first
# cells; after this cell runs it refers to this string instead.
model = "mistralai/Mistral-7B-Instruct-v0.3"  # or any other model you want to use
backend = "sglang"  # inference engine to launch; accepted values are defined by `launch` (not shown here)

cli = InferenceEngineClient()

cli.launch(backend=backend, model_path=model)

# qa_queries comes from the task-setup cell; its first element is sent as the prompt.
batch_results = cli.completion(qa_queries[0], temperature=0.1)

batch_results

['\nLothar de Maizière',
 '\nComplexity classes',
 '\nGTE',
 '\nWater flow through the body cavity',
 '\n12 May 1705',
 '\nIsraeli poet',
 '\nStrengthening the role of backbenchers in their scrutiny of the government.',
 '\nNP-complete',
 '\nca. 2 million',
 '\nDestruction of the forest',
 '\nTwo poles: revolution/invasion and reformist strategy.',
 '\nBoolean circuits',
 '\nDuisburg',
 '\nArticles 106 and 107',
 '\n"Time derivative of the changing momentum"',
 '\nCuba',
 '\nUnderground',
 '\nTrio Tribe',
 '\nOutcome of most votes',
 '\nUsers via leased lines and the public PAD service Telepad',
 '\nLower levels of inequality',
 '\nExodus',
 '\n2001 study found 1 square kilometer (247 acres)',
 '\nSemantical problems and grammatical niceties',
 '\nGranted the Huguenots equality and a degree of religious and political freedom within their domains.',
 '\nIslamists',
 '\nStore and forward switching',
 '\nAlta California',
 '\nPaul Samuelson',
 '\n9%',
 '\nEast-west',
 '\nAn attorney',
 '\

In [6]:
def clean_prediction(prediction: list[str]) -> list[str]:
    """
    Normalise raw model completions into short single-line answers.

    For every string in ``prediction`` the text is cut at the first '###',
    trimmed of surrounding whitespace, reduced to its first line, trimmed
    again, and finally stripped of any trailing periods.

    :param prediction: raw completion strings
    :return: a new list with one cleaned string per input, in the same order
    """

    def _tidy(text: str) -> str:
        # Keep only what precedes the first '###' marker.
        before_marker, _, _ = text.partition("###")
        # Trim first so a leading newline does not yield an empty first line.
        first_line, _, _ = before_marker.strip().partition("\n")
        # Trailing periods are dropped after the final whitespace trim.
        return first_line.strip().rstrip(".")

    return [_tidy(item) for item in prediction]


# Apply the cleaner to the raw batch completions and display the result.
clean_prediction(batch_results)

['Lothar de Maizière',
 'Complexity classes',
 'GTE',
 'Water flow through the body cavity',
 '12 May 1705',
 'Israeli poet',
 'Strengthening the role of backbenchers in their scrutiny of the government',
 'NP-complete',
 'ca. 2 million',
 'Destruction of the forest',
 'Two poles: revolution/invasion and reformist strategy',
 'Boolean circuits',
 'Duisburg',
 'Articles 106 and 107',
 '"Time derivative of the changing momentum"',
 'Cuba',
 'Underground',
 'Trio Tribe',
 'Outcome of most votes',
 'Users via leased lines and the public PAD service Telepad',
 'Lower levels of inequality',
 'Exodus',
 '2001 study found 1 square kilometer (247 acres)',
 'Semantical problems and grammatical niceties',
 'Granted the Huguenots equality and a degree of religious and political freedom within their domains',
 'Islamists',
 'Store and forward switching',
 'Alta California',
 'Paul Samuelson',
 '9%',
 'East-west',
 'An attorney',
 'A citizen or company can invoke a Directive, not just in a dispute w

In [1]:
from benchmark.benchmark import ModelBenchmark

# Benchmark object reused by the following cells (launch, generate).
bm = ModelBenchmark(
    backend="sglang",
    model_path="mistralai/Mistral-7B-Instruct-v0.3",
    model_name="Mistral-7B-Instruct-v0.3",
    verbose=False,
)

In [2]:
# Launch through the benchmark object's `iec` attribute, using the same
# backend and model path that were given to ModelBenchmark.
bm.iec.launch(
    backend="sglang",
    model_path="mistralai/Mistral-7B-Instruct-v0.3",
)

In [3]:
# Send two ad-hoc questions through the benchmark's generate method; the
# result is unpacked into two values in the next cell.
resp = bm.generate(["What is the capital of France?", "What is the largest mammal?"])

In [4]:
# Unpack the two-element result of bm.generate.
# NOTE(review): `time` would shadow the stdlib `time` module if that is
# imported in this kernel; the second element is presumably a duration - confirm.
raw, time = resp

In [6]:
# NOTE: this import rebinds `clean_prediction`, replacing the function of the
# same name defined earlier in this notebook with the packaged version.
from benchmark.utils import clean_prediction
# `raw` is the first element unpacked from the bm.generate result.
cleaned = clean_prediction(raw)

In [7]:
# Display the cleaned generations.
cleaned

['The capital of France is Paris. It is located in the north-central part of the country. Paris is one of the most famous cities in the world, known for its art, culture, and landmarks such as the Eiffel Tower, the Louvre Museum, and Notre-Dame Cathedral. It is also the political, economic, and cultural center of France',
 "The blue whale is the largest mammal. It can reach lengths of up to 100 feet (30 meters) and weigh as much as 200 tons (181 metric tonnes). The blue whale's heart alone can weigh as much as a car!"]