In [None]:
import time
from openvino_genai import LLMPipeline, GenerationConfig
from transformers import AutoTokenizer

# Model folder shared by the pipeline and the tokenizer: it is passed to both
# LLMPipeline(...) and AutoTokenizer.from_pretrained(...) further down.
# NOTE(review): "model-path" looks like a placeholder — point it at the real
# model directory before running.
MODEL_DIR = r"model-path"

In [None]:
# Build the generation pipeline once from MODEL_DIR; the benchmark cell below
# reuses this object. The device name is hard-coded to "GPU".
pipe = LLMPipeline(
    MODEL_DIR,
    device="GPU",
)

In [None]:
# Prompt handed to the pipeline; apply_chat_template=False is set on the
# generation config below.
prompt = "Give me 5 quick facts about eigenvalues."

# Tokenizer loaded from the same model folder, used later for token counting.
# NOTE(review): trust_remote_code=True lets custom code shipped inside the
# model folder execute at load time — only use it with a model you trust.
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, trust_remote_code=True)

# Generation settings for the benchmark run.
# NOTE(review): temperature presumably only takes effect when sampling is
# enabled (do_sample=True); as written the run is likely greedy and the 0.7
# is ignored — confirm against the GenerationConfig documentation.
gen = GenerationConfig(max_new_tokens=256, temperature=0.7, apply_chat_template=False)



In [None]:

# --- streaming through a streamer callback ---
# pipe.generate() only returns once generation has finished, so looping over
# its return value cannot observe when the first output arrived. A streamer
# callback is invoked while generation is still running, which is what makes
# the time-to-first-token (TTFT) measurement below meaningful.
pieces = []        # streamed text chunks, in arrival order
chunk_times = []   # perf_counter() timestamp of every streamed chunk


def _on_chunk(subword):
    """Store one streamed text chunk together with its arrival time.

    Returns False so that generation keeps running.
    """
    chunk_times.append(time.perf_counter())
    pieces.append(subword)
    return False


t_start = time.perf_counter()
result = pipe.generate(prompt, gen, streamer=_on_chunk)
t_end = time.perf_counter()

# Arrival time of the first streamed chunk (None if nothing was streamed).
# NOTE(review): the streamer may buffer a few tokens before emitting text, so
# this is the first *chunk*, which can lag slightly behind the first token.
t_first = chunk_times[0] if chunk_times else None

# Prefer the streamed chunks; fall back to the returned value if none arrived.
decoded_text = "".join(pieces) or str(result)

# --- token accounting (model tokens, not characters) ---
# Approximation: the output is re-tokenized together with the prompt, so the
# count may differ slightly from the ids the model actually produced.
prompt_ids = tokenizer(prompt, add_special_tokens=False).input_ids
full_ids   = tokenizer(prompt + decoded_text, add_special_tokens=False).input_ids

prompt_tokens = len(prompt_ids)
# Clamp at zero: a merge across the prompt/output boundary must not yield a
# negative count when little or nothing was generated.
new_tokens    = max(len(full_ids) - prompt_tokens, 0)

# --- timings / throughput ---
total_time   = t_end - t_start
ttft         = (t_first - t_start) if t_first is not None else None
decode_time  = (t_end - t_first) if t_first is not None else total_time  # time after first chunk arrived

tps_total  = (new_tokens / total_time)  if total_time  > 0 else float("inf")
tps_decode = (new_tokens / decode_time) if decode_time > 0 else float("inf")

print(decoded_text)
print("\n--- Stats ---")
print(f"Prompt tokens: {prompt_tokens}")
print(f"New tokens:    {new_tokens}")

ttft_label = f"{ttft:.3f}s" if ttft is not None else "n/a"
print(f"TTFT:           {ttft_label}")
print(f"Total time:     {total_time:.3f}s   | Tokens/sec (total):  {tps_total:.2f}")
print(f"Decode time:    {decode_time:.3f}s   | Tokens/sec (decode): {tps_decode:.2f}")
