# Benchmarking!

In [None]:
%uv pip install vllm

In [None]:
from vllm import LLM, SamplingParams

# Shared configuration for the vLLM side of the benchmark.
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
messages = [{"role": "user", "content": "Say hi in 5 words"}]
max_tokens = 128
sp = SamplingParams(max_tokens=max_tokens, temperature=0.3)

# The engine reads the tokenizer, config and weights in the "mistral" format.
# max_model_len=2048 is the only difference from the engine's default context
# setup -- presumably chosen to keep GPU memory use down; confirm before changing.
llm = LLM(
    model=MODEL_ID,
    dtype="bfloat16",
    tokenizer_mode="mistral",
    config_format="mistral",
    load_format="mistral",
    max_model_len=2048,
)


In [None]:
# Render the chat with the Hugging Face chat template and run it through the
# vLLM engine (`llm`, `SamplingParams`, `MODEL_ID` and `max_tokens` come from
# the setup cell above).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
tokenizer.pad_token = tokenizer.eos_token  # Just in case

messages = [
    {"role": "system", "content": "You are concise."},
    {"role": "user", "content": "Say hi in 5 words"},
]
# add_generation_prompt=True keeps this prompt built the same way as the
# Hugging Face baseline cells further down.
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

# NOTE(review): the rendered prompt already starts with the BOS string and the
# engine tokenises it again; if the output looks off, check for a duplicated
# BOS or consider `llm.chat(messages, sp)` instead -- TODO confirm.
stops = ["[INST]"]  # cut generation if the model starts a new user turn
sp = SamplingParams(temperature=0.3, max_tokens=max_tokens, stop=stops)
outputs = llm.generate([prompt], sp)
outputs[0].outputs[0].text.strip()

In [192]:
# Release the vLLM engine so its GPU memory can be reused by the Hugging Face
# model loaded in the following cells.
import gc

import torch

# The engine can only be collected once the notebook no longer references it.
# pop() instead of `del` keeps this cell re-runnable when `llm` is already gone.
globals().pop("llm", None)
gc.collect()
# Hand cached, now-unused CUDA blocks back to the driver.
# NOTE(review): vLLM may keep worker state alive beyond this -- verify with
# nvidia-smi that the memory is actually freed.
torch.cuda.empty_cache()

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Hugging Face baseline: same model, same chat, same sampling settings as the
# vLLM cell, generated with `transformers` directly.
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"

# Load tokenizer
tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,   # same precision as the vLLM engine
    device_map="auto",            # let accelerate place the weights
).eval()

# Example chat messages
messages = [
    {"role": "system", "content": "You are concise."},
    {"role": "user", "content": "Say hi in five words"},
]

# Build the prompt string with the model's chat template
prompt = tok.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,   # ensures assistant turn starts
)

# Encode. The chat template already emitted the special tokens (including
# BOS), so the tokenizer must not add them a second time.
inputs = tok(
    prompt,
    return_tensors="pt",
    add_special_tokens=False,
).to(model.device)

# Generate
with torch.no_grad():
    out = model.generate(
        **inputs,
        max_new_tokens=128,
        temperature=0.3,
        do_sample=True,
        pad_token_id=tok.eos_token_id,
    )

# Decode only the newly generated tokens. Slicing by token count is reliable;
# comparing against the prompt *string* is not, because decoding with
# skip_special_tokens=True no longer reproduces the prompt text exactly.
prompt_len = inputs["input_ids"].shape[1]
completion = tok.decode(out[0, prompt_len:], skip_special_tokens=True).strip()

print("HF baseline output:", repr(completion))

In [None]:
# (Re)load the Hugging Face model on its own, without re-running the whole
# baseline cell. Uses the same precision as the baseline load and the vLLM
# engine so the benchmark compares like with like.
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
).eval()

In [199]:
# Single-prompt generation: tokenise the chat directly to ids and decode only
# the tokens produced after the prompt.
tok = AutoTokenizer.from_pretrained(
    MODEL_ID,
    padding_side="left",
    truncation_side="left",
)
tok.pad_token = tok.eos_token  # pad with the EOS token

messages = [
    {"role": "system", "content": "You are concise."},
    {"role": "user", "content": "Say hi in five words"},
]

# The chat template yields the prompt token ids as a tensor in one step.
input_id = tok.apply_chat_template(
    messages,
    tokenize=True,
    padding=True,
    return_tensors="pt",
    add_generation_prompt=True,   # open the assistant turn
).to(model.device)

with torch.no_grad():
    out = model.generate(
        input_id,
        max_new_tokens=128,
        pad_token_id=tok.eos_token_id,
    )

# `out` holds prompt + continuation; keep only what comes after the prompt.
input_len = input_id.shape[1]
full_text = tok.decode(out[0][input_len:], skip_special_tokens=True)
full_text

'Hello, there!'

In [172]:
# Compare the generated sequence shape with the prompt shape; the difference
# along the last axis is the number of newly generated tokens.
(out.shape, input_id.shape)

(torch.Size([1, 21]), torch.Size([1, 16]))

In [None]:
# Batched generation: several chats are templated, left-padded to a common
# length, generated in one call and decoded one string per chat.
tok = AutoTokenizer.from_pretrained(MODEL_ID, padding_side="left", truncation_side="left")
tok.pad_token = tok.eos_token

queries = [
    "Say hi in five words",
    "What's the capital of France?",
    "Tell me a joke",
    "Explain quantum physics briefly",
]

batch = [
    [
        {"role": "system", "content": "You are concise."},
        {"role": "user", "content": query},
    ]
    for query in queries
]

# return_dict=True also yields the attention mask. It is required here: the
# pad token is the EOS token, so generate() cannot tell padding from content
# on its own and would otherwise attend to the left padding.
encoded = tok.apply_chat_template(
    batch,
    tokenize=True,
    padding=True,
    return_tensors="pt",
    return_dict=True,
    add_generation_prompt=True,   # ensures assistant turn starts
).to(model.device)
input_id = encoded["input_ids"]

# Generate
with torch.no_grad():
    out = model.generate(
        **encoded,
        max_new_tokens=128,
        pad_token_id=tok.eos_token_id,
    )

# Every row shares the same padded prompt length, so one slice strips all
# prompts. batch_decode returns one string per row; decode() handles only a
# single sequence.
input_len = input_id.shape[1]
full_text = tok.batch_decode(out[:, input_len:], skip_special_tokens=True)
full_text