In [None]:
!python3.11 -m pip install --upgrade pip setuptools wheel

!python3.11 -m pip install transformers datasets bert-score llama-cpp-python peft trl

import sys
!{sys.executable} -m pip install bert_score llama-cpp-python

!{sys.executable} -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 

!pip show transformers
!pip show trl
!pip show peft

!pip install ipywidgets

In [1]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    logging,
)
from peft import LoraConfig, get_peft_model, PeftModel

In [None]:
from huggingface_hub import login
login(token="----") # Put in your Hugging Face Token here

#### Load Base model

In [3]:
# Instruct model: meta-llama/Llama-3.2-3B-Instruct
base_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3B-Instruct",
    dtype=torch.float16,
    low_cpu_mem_usage=True,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
base_model.save_pretrained("./base_instruct_model")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
tokenizer.save_pretrained("./base_instruct_model")

('./base_instruct_model/tokenizer_config.json',
 './base_instruct_model/special_tokens_map.json',
 './base_instruct_model/chat_template.jinja',
 './base_instruct_model/tokenizer.json')

In [6]:
# Convert instruct model to GGUF - this will be the baseline comparison
!python llama.cpp/convert_hf_to_gguf.py ./base_instruct_model --outtype f16 --outfile ./quantized_models/fp16-ft-base.gguf

INFO:hf-to-gguf:Loading model: base_instruct_model
INFO:hf-to-gguf:Model architecture: LlamaForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {64}
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model part 'model-00001-of-00002.safetensors'
INFO:hf-to-gguf:token_embd.weight,           torch.float16 --> F16, shape = {3072, 128256}
INFO:hf-to-gguf:blk.0.attn_norm.weight,      torch.float16 --> F32, shape = {3072}
INFO:hf-to-gguf:blk.0.ffn_down.weight,       torch.float16 --> F16, shape = {8192, 3072}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,       torch.float16 --> F16, shape = {3072, 8192}
INFO:hf-to-gguf:blk.0.ffn_up.weight,         torch.float16 --> F16, shape = {3072, 8192}
INFO:hf-to-gguf:blk.0.ffn_norm.weight,       torch.float16 --> F32, shape = {3072}
INFO:hf-to-gguf:blk.0.attn_k

#### Clone llama.cpp to quantize the models

In [46]:
!git clone https://github.com/ggerganov/llama.cpp
!cd llama.cpp && git pull && make clean && LLAMA_CUBLAS=1 make
!pip install -r llama.cpp/requirements.txt

fatal: destination path 'llama.cpp' already exists and is not an empty directory.
Already up to date.
Makefile:6: *** Build system changed:
 The Makefile build has been replaced by CMake.

 For build instructions see:
 https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md

.  Stop.
Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu, https://download.pytorch.org/whl/nightly, https://download.pytorch.org/whl/cpu, https://download.pytorch.org/whl/nightly, https://download.pytorch.org/whl/cpu, https://download.pytorch.org/whl/nightly
Collecting git+https://github.com/huggingface/transformers@v4.56.0-Embedding-Gemma-preview (from -r llama.cpp/./requirements/requirements-convert_legacy_llama.txt (line 8))
  Cloning https://github.com/huggingface/transformers (to revision v4.56.0-Embedding-Gemma-preview) to /tmp/pip-req-build-nz6yja8m
  Running command git clone --filter=blob

In [13]:
# GPU BUILT
!rm -rf llama.cpp/build

!cmake -DLLAMA_CURL=OFF -DGGML_CUDA=ON -DGGML_OPENMP=ON -B llama.cpp/build -S llama.cpp \
    -DCMAKE_BUILD_TYPE=Release \
    -DCMAKE_C_FLAGS="-fopenmp" \
    -DCMAKE_CXX_FLAGS="-fopenmp" \
    -DCMAKE_EXE_LINKER_FLAGS="-fopenmp"

!cmake --build llama.cpp/build --config Release -j

#!cmake -DGGML_OPENMP=OFF -DLLAMA_CURL=OFF -B llama.cpp/build -S llama.cpp
#!cmake --build llama.cpp/build --config Release

-- The C compiler identification is GNU 11.5.0
-- The CXX compiler identification is GNU 11.5.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
[0mCMAKE_BUILD_TYPE=Release[0m
-- Found Git: /usr/bin/git (found version "2.47.3")
-- The ASM compiler identification is GNU
-- Found assembler: /usr/bin/cc
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Success
-- Found Threads: TRUE
-- CMAKE_SYSTEM_PROCESSOR: x86_64
-- GGML_SYSTEM_ARCH: x86
-- Including CPU backend
-- Found OpenMP_C: -fopenmp (found version "4.5")
-- Found OpenMP_CXX: -fopenmp (found version "4.5")
-- Found OpenMP: TRUE (found ve

In [10]:
# CPU BUILT
!rm -rf llama.cpp/build
!cmake -DGGML_OPENMP=OFF -DLLAMA_CURL=OFF -B llama.cpp/build -S llama.cpp
!cmake --build llama.cpp/build --config Release

-- The C compiler identification is GNU 11.5.0
-- The CXX compiler identification is GNU 11.5.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
[0mCMAKE_BUILD_TYPE=Release[0m
-- Found Git: /usr/bin/git (found version "2.47.3")
-- The ASM compiler identification is GNU
-- Found assembler: /usr/bin/cc
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Success
-- Found Threads: TRUE
-- CMAKE_SYSTEM_PROCESSOR: x86_64
-- GGML_SYSTEM_ARCH: x86
-- Including CPU backend
-- x86 detected
-- Adding CPU backend variant ggml-cpu: -march=native 
-- ggml version: 0.9.4
-- ggml commit:  d2ee056e
-- Configurin

#### QLoRA

In [None]:
HF_TOKEN ="-----" # Put in your Hugging Face Token here

In [5]:
#Load fine-tuned model using the PEFT library
model = PeftModel.from_pretrained(base_model,"sharshar20/career-advisory-qlora-llama3.2-3b-instruct-v7", trust_remote_code=True)
#3. Merge the model weights
model = model.merge_and_unload()

In [6]:
#Save the merged model and its tokenizer
model.save_pretrained("./merged_model_instruct_qlora")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
tokenizer.save_pretrained("./merged_model_instruct_qlora")

SafetensorError: Error while serializing: I/O error: No space left on device (os error 28)

In [8]:
# Push the merged model directory
model.push_to_hub("sharshar20/qlora-and-instruct-merged-model-v7",
                      commit_message = "model_Mergin QLoRA and Instruct model v7",
                      token = HF_TOKEN,
                      private=False)
# Push the tokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
tokenizer.push_to_hub("sharshar20/qlora-and-instruct-merged-model-v7",
                      commit_message = "token_Mergin QLoRA and Instruct model v7",
                      token = HF_TOKEN,
                      private=False)

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

No files have been modified since last commit. Skipping to prevent empty commit.


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/sharshar20/qlora-and-instruct-merged-model-v7/commit/63ca988a213869eb09ceba03025bd486edd01feb', commit_message='token_Mergin QLoRA and Instruct model v7', commit_description='', oid='63ca988a213869eb09ceba03025bd486edd01feb', pr_url=None, repo_url=RepoUrl('https://huggingface.co/sharshar20/qlora-and-instruct-merged-model-v7', endpoint='https://huggingface.co', repo_type='model', repo_id='sharshar20/qlora-and-instruct-merged-model-v7'), pr_revision=None, pr_num=None)

In [None]:
# Convert qlora to fp16
#!python llama.cpp/convert_hf_to_gguf.py ./merged_model_instruct_qlora --outtype f16 --outfile ./quantized_models/fp16-ft-qlora.gguf
!python3 llama.cpp/convert_hf_to_gguf.py sharshar20/qlora-and-instruct-merged-model-v7 --remote \
  --outfile ./quantized_models/fp16-ft-qlora.gguf \
  --outtype f16

Fetching 7 files: 100%|██████████████████████████| 7/7 [00:00<00:00, 458.99it/s]
INFO:hf-to-gguf:Downloaded config and tokenizer to /home/FYP/shar0097/.cache/huggingface/hub/models--sharshar20--qlora-and-instruct-merged-model-v7/snapshots/63ca988a213869eb09ceba03025bd486edd01feb
INFO:hf-to-gguf:Loading model: 63ca988a213869eb09ceba03025bd486edd01feb
INFO:hf-to-gguf:Model architecture: LlamaForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {64}
INFO:hf-to-gguf:Using remote model with HuggingFace id: sharshar20/qlora-and-instruct-merged-model-v7
INFO:hf-to-gguf:token_embd.weight,           torch.float16 --> F16, shape = {3072, 128256}
INFO:hf-to-gguf:blk.0.attn_norm.weight,      torch.float16 --> F32, shape = {3072}
INFO:hf-to-gguf:blk.0.ffn_down.weight,       torch.float16 --> F16, shape = {8192, 3072}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,       

In [None]:
!./llama.cpp/build/bin/llama-quantize ./quantized_models/fp16-ft-qlora.gguf \
    --outfile ./quantized_models/ft-qlora-q4_k_m.gguf \
    --outtype Q4_K_M

#### LoRA

In [4]:
#2. Load fine-tuned model using the PEFT library
model = PeftModel.from_pretrained(base_model,"sharshar20/career-advisory-lora-llama3.2-3b-instruct-v7", trust_remote_code=True)
#3. Merge the model weights
model = model.merge_and_unload()

In [34]:
#4. Save the merged model and its tokenizer
model.save_pretrained("./merged_model_instruct_lora")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
tokenizer.save_pretrained("./merged_model_instruct_lora")

('./merged_model_instruct_lora/tokenizer_config.json',
 './merged_model_instruct_lora/special_tokens_map.json',
 './merged_model_instruct_lora/chat_template.jinja',
 './merged_model_instruct_lora/tokenizer.json')

In [None]:
HF_TOKEN = "-----" # Put in your Hugging Face Token here

In [7]:
# Push the merged model directory
model.push_to_hub("sharshar20/lora-and-instruct-merged-model-v8",
                      commit_message = "model_Mergin LoRA and Instruct model v7",
                      token = HF_TOKEN,
                      private=False)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
# Push the tokenizer
tokenizer.push_to_hub("sharshar20/lora-and-instruct-merged-model-v8",
                      commit_message = "token_Mergin LoRA and Instruct model v7",
                      token = HF_TOKEN,
                      private=False)

README.md: 0.00B [00:00, ?B/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

No files have been modified since last commit. Skipping to prevent empty commit.


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/sharshar20/lora-and-instruct-merged-model-v8/commit/73726fa7ef0ee6120f23358531a4862b42133839', commit_message='token_Mergin LoRA and Instruct model v7', commit_description='', oid='73726fa7ef0ee6120f23358531a4862b42133839', pr_url=None, repo_url=RepoUrl('https://huggingface.co/sharshar20/lora-and-instruct-merged-model-v8', endpoint='https://huggingface.co', repo_type='model', repo_id='sharshar20/lora-and-instruct-merged-model-v8'), pr_revision=None, pr_num=None)

In [8]:
QUANTIZATION_METHODS = ["q4_k_m", "q5_k_m", "q8_0"]

In [11]:
# Convert to fp16
!python3 llama.cpp/convert_hf_to_gguf.py sharshar20/lora-and-instruct-merged-model-v8 --remote \
  --outfile ./quantized_models/fp16-ft-lora.gguf \
  --outtype f16

# Quantize the model for each method in the QUANTIZATION_METHODS list
for method in QUANTIZATION_METHODS:
    !./llama.cpp/build/bin/llama-quantize ./quantized_models/fp16-ft-lora.gguf ./quantized_models/ft-{method}-lora.gguf {method}

Fetching 7 files: 100%|█████████████████████████| 7/7 [00:00<00:00, 4407.77it/s]
INFO:hf-to-gguf:Downloaded config and tokenizer to /home/FYP/shar0097/.cache/huggingface/hub/models--sharshar20--lora-and-instruct-merged-model-v8/snapshots/73726fa7ef0ee6120f23358531a4862b42133839
INFO:hf-to-gguf:Loading model: 73726fa7ef0ee6120f23358531a4862b42133839
INFO:hf-to-gguf:Model architecture: LlamaForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {64}
INFO:hf-to-gguf:Using remote model with HuggingFace id: sharshar20/lora-and-instruct-merged-model-v8
INFO:hf-to-gguf:token_embd.weight,           torch.float16 --> F16, shape = {3072, 128256}
INFO:hf-to-gguf:blk.0.attn_norm.weight,      torch.float16 --> F32, shape = {3072}
INFO:hf-to-gguf:blk.0.ffn_down.weight,       torch.float16 --> F16, shape = {8192, 3072}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,       to

In [79]:
model_list = [f for f in os.listdir("quantized_models") if f.endswith(".gguf")]

prompt = input("Enter your prompt: ")
chosen_method = input("Name of the model (options: " + ", ".join(model_list) + "): ")

# Verify the chosen method is in the list
if chosen_method not in model_list:
    print("Invalid name")
else:
    command = f'./llama.cpp/build/bin/llama-cli -m ./quantized_models/{chosen_method} -n 128 --color -ngl 35 -p "{prompt}"'
    !{command}

Enter your prompt:  I have a degree in computer engineering and have skills in data analysis. What are the possible career pathways for me?
Name of the model (options: fp16-ft.gguf, ft-q4_k_m.gguf, ft-q5_k_m.gguf):  fp16-ft.gguf


build: 6713 (d2ee056e) with x86_64-conda-linux-gnu-cc (Anaconda gcc) 11.2.0 for x86_64-conda-linux-gnu
main: llama backend init
main: load the model and apply lora adapter, if any
llama_model_loader: loaded meta data with 28 key-value pairs and 255 tensors from ./quantized_models/fp16-ft.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Merged_Model_Instruct_Lora
llama_model_loader: - kv   3:                         general.size_label str              = 3.2B
llama_model_loader: - kv   4:                          llama.block_count u32              = 28
llama_model_loader: - kv   5:                       llama.context_length u

In [12]:
#!pip install -q huggingface_hub
from huggingface_hub import create_repo, HfApi

api = HfApi(token=HF_TOKEN)

# Upload gguf files
api.upload_folder(
    folder_path="quantized_models",
    repo_id="sharshar20/llama3.2_3B_instruct-GGUF-v7",
    allow_patterns=f"*.gguf",
)

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/sharshar20/llama3.2_3B_instruct-GGUF-v7/commit/0e1758a8288f04794914b7504db3201258b6a2f9', commit_message='Upload folder using huggingface_hub', commit_description='', oid='0e1758a8288f04794914b7504db3201258b6a2f9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/sharshar20/llama3.2_3B_instruct-GGUF-v7', endpoint='https://huggingface.co', repo_type='model', repo_id='sharshar20/llama3.2_3B_instruct-GGUF-v7'), pr_revision=None, pr_num=None)

##### Evaluations

In [12]:
# Get the test tokenized dataset
from datasets import load_from_disk

# Load the tokenized dataset from the specified directory
test_dataset = load_from_disk("./dataset/split_combined_dataset_withoutjd/test")
print(test_dataset)

small_test = test_dataset.shuffle(seed=42).select(range(50))     

Dataset({
    features: ['input_text', 'target_text'],
    num_rows: 454
})


In [11]:
import torch
from llama_cpp import Llama
from bert_score import BERTScorer
from tqdm import tqdm
from transformers import AutoTokenizer

def evaluate_llama_bertscore_withChatTemplate(
    model_path,
    dataset,
    tokenizer,
    max_tokens=128,
    temperature=0.7,
    top_p=0.9,
    repeat_penalty=1.1,
    batch_size=32,
    n_gpu_layers=-1,
    n_threads=8,
    scorer_model="bert-base-uncased"
):

    # Load LLaMA model
    model = Llama(
        model_path=model_path,
        n_ctx=1024,
        n_gpu_layers=n_gpu_layers,
        n_threads=n_threads,
        verbose=False
    )
    
    predictions, references = [], []
    
    for sample in tqdm(dataset, desc="Generating responses", total=len(dataset)):
        user_message = sample["input_text"]
        reference = sample["target_text"]

        # Apply the same chat template used during training
        chat_prompt = tokenizer.apply_chat_template(
            [{"role": "user", "content": user_message}],
            tokenize=False,
            add_generation_prompt=True  # tells model it's the assistant's turn
        )

        try:
            output = model(
                chat_prompt,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
                repeat_penalty=repeat_penalty
            )
            pred = output["choices"][0]["text"].strip()
        except Exception as e:
            print(f"⚠️ Generation error: {e}")
            pred = ""
        
        predictions.append(pred)
        references.append(reference)

    # --- Compute BERTScore ---
    device = "cuda" if torch.cuda.is_available() else "cpu"
    scorer = BERTScorer(model_type=scorer_model, device=device, batch_size=batch_size)
    P, R, F1 = scorer.score(predictions, references)
    
    return {
        "predictions": predictions,
        "references": references,
        "precision": P.mean().item(),
        "recall": R.mean().item(),
        "f1": F1.mean().item()
    }

In [5]:
import torch
from llama_cpp import Llama
from bert_score import BERTScorer
from tqdm import tqdm
import pandas as pd

def evaluate_llama_bertscore(
    model_path,
    dataset,
    max_tokens=128,
    temperature=0.7,
    top_p=0.9,
    repeat_penalty=1.1,
    batch_size=32,
    n_gpu_layers=-1,
    n_threads=8,
    scorer_model="bert-base-uncased"
):

    model = Llama(
        model_path=model_path,
        n_ctx=1024,
        n_gpu_layers=n_gpu_layers,
        n_threads=n_threads,
        verbose=False
    )
    
    predictions = []
    references = []
    
    for sample in tqdm(dataset, desc="Generating responses", total=len(dataset)):
        prompt = sample["input_text"]
        reference = sample["target_text"]
        try:
            output = model(
                prompt,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
                repeat_penalty=repeat_penalty
            )
            pred = output["choices"][0]["text"].strip()
        except Exception as e:
            print(f"⚠️ Generation error: {e}")
            pred = ""
        predictions.append(pred)
        references.append(reference)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    scorer = BERTScorer(model_type=scorer_model, device=device, batch_size=batch_size)
    P, R, F1 = scorer.score(predictions, references)
    
    precision = P.mean().item()
    recall = R.mean().item()
    f1 = F1.mean().item()
    
    return {
        "predictions": predictions,
        "references": references,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

In [3]:
!ls ./quantized_models

fp16-ft-base.gguf   ft-q4_k_m-lora.gguf  ft-qlora-q4_k_m.gguf
fp16-ft-lora.gguf   ft-q5_k_m-lora.gguf
fp16-ft-qlora.gguf  ft-q8_0-lora.gguf


In [6]:
# fp16 model - lora and instruct merged model
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
results = evaluate_llama_bertscore_withChatTemplate(model_path="./quantized_models/fp16-ft-lora.gguf", dataset=small_test, tokenizer=tokenizer)
print(results)

llama_context: n_ctx_per_seq (1024) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
Generating responses: 100%|██████████| 50/50 [54:59<00:00, 65.98s/it]


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

{'predictions': ["?\n\nThe possible positions for B.E. (Computers) are: Data Science Engineer with an experience requirement of At least 5 years. The related skills are NLP, Web Scraping, bs4, LDA Topic Modeling, Spack, MongoDB and Python for data scraping and analysis. The related qualifications are Bachelor's Degree in Engineering. The experience requirements are At Least 1 year. The related skills are API Documentation, Client Code, Server Code, Integations, Test Cases, Repository Management, Task Assignment, Performance Monitoring, Skills Translation, User Protection, Data Integrity, System Security, Compliance, Data Backup, Infrastructure Design,", '?\nThe possible positions for Bachelor of Business Administration are: Mechanical Engineer with an experience requirement of 2 to 5 years. The related skills are Creating dashboards for sales and profits, Establishing key performance indicators (KPIs), Analyzing data to inform business decisions, Designing reports to help stakeholders 

In [6]:
# LoRA Q4_K_M gguf model bertscore
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
results = evaluate_llama_bertscore_withChatTemplate(model_path="./quantized_models/ft-q4_k_m-lora.gguf", dataset=small_test, tokenizer=tokenizer)
print(results)

llama_context: n_ctx_per_seq (1024) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
Generating responses: 100%|██████████| 50/50 [17:00<00:00, 20.41s/it]


{'predictions': ['The possible positions for B.E. (Computers) are: Business Development Executive with an experience requirement of At least 1 year. The related skills are NLP, ML and Amazon Comprehend Medical API. The possible positions for NLP, ML and Amazon Comprehend Medical API are: Data Science Engineer with an experience requirement of At least 1 year. The related skills are NLP, ML and Amazon Comprehend Medical API. The possible positions for NLP, ML and Amazon Comprehend Medical API are: Mechanical Engineer with an experience requirement of At least 1 year. The related skills are NLP, ML', 'The possible positions for Bachelor of Business Administration are: Marketing Officer with an experience requirement of At least 1 year. The related skills are NLP, Machine learning and Amazon Comprehend Medical API. The interview process involves Screening, Zoom meeting, In-person interview and Candidate research. The average salary for this position is between $50,000 and $70,000.', "Entr

In [12]:
# LoRA Q5_K_M gguf model bertscore
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
results = evaluate_llama_bertscore_withChatTemplate(model_path="./quantized_models/ft-q5_k_m-lora.gguf", dataset=small_test, tokenizer=tokenizer)
print(results)

llama_context: n_ctx_per_seq (1024) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
Generating responses: 100%|██████████| 50/50 [21:17<00:00, 25.55s/it]


{'predictions': ['The possible positions for B.E. (Computers) are: Marketing Officer with an experience requirement of At least 1 year. The related skills are Testing, Debugging and Parameter Tuning, Automation, API Integration, LDA Modeling, Data Cleaning, Data Transformation, Predictive Analysis, Machine Learning, Statistical Analysis, Risk Assessment, Decision Support, Operational Excellence, System Administration, Server Management, Data Security, Access Control, Mail Server Management, Linux Operating System, Network Architecture, Disaster Recovery, Backup & Restore, Infrastructure Design, Team Collaboration, Customer Communication, Customer Engagement, User Support, Application Development, Troubleshooting, Field Service,', 'The possible positions for Bachelor of Business Administration are: Marketing Officer with an experience requirement of 1 to 3 years. The related skills are NLP, ML and Amazon Comprehend Medical API. The related qualifications are Degree in Engineering. The i

In [13]:
# LoRA Q5_K_M gguf model bertscore
results = evaluate_llama_bertscore(model_path="./quantized_models/ft-q5_k_m-lora.gguf", dataset=small_test,)
print(results)

llama_context: n_ctx_per_seq (1024) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
Generating responses: 100%|██████████| 50/50 [30:17<00:00, 36.34s/it]


{'predictions': ['The possible positions for B.E. (Computers) are: Network Support Engineer with an experience requirement of At least 1 year. The related skills are Electrical and Mechanical Equipment, System Analysis, Decision Making, Data Communication, Transducer/Converter, Microcomputer/Computer, Electrical/Mechanical Test, Customer Service, Field Technical Services, Repair, Diagnostics, Documentation, Operations, Management, Teamwork, Product Sales, Transfer, Sales, Customer, Error Reporting, Replacement, Troubleshooting, Parts Inventory, Payroll, Account, Field Visits, Debugging, Data Entry, File Management, Supervisory Assistance, Testing,', "? The possible positions for Bachelor of Business Administration are: Mechanical Engineer with an experience requirement of 5 to 8 years. The related skills are Software Development and Application Programming. The related qualifications are Bachelor's Degree. The possible positions for Bachelor's Degree are: Data Science Engineer. The exp

In [7]:
# fp16 model - instruct base model
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
results = evaluate_llama_bertscore_withChatTemplate(model_path="./quantized_models/fp16-ft-base.gguf", dataset=small_test, tokenizer=tokenizer)
print(results)

llama_context: n_ctx_per_seq (1024) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
Generating responses: 100%|██████████| 50/50 [56:11<00:00, 67.42s/it]


{'predictions': ['A Bachelor of Engineering (B.E.) in Computers can lead to a wide range of career opportunities. Here are some positions you can consider:\n\n1. **Software Engineer**: Design, develop, and test software applications for various industries, including gaming, finance, healthcare, and more.\n2. **IT Project Manager**: Oversee the planning, execution, and delivery of IT projects, ensuring they meet deadlines and budget requirements.\n3. **Network Administrator**: Install, configure, and maintain computer networks, ensuring secure and efficient data transfer.\n4. **Database Administrator**: Design, implement, and manage databases to store and retrieve data for various organizations.', 'A Bachelor of Business Administration (BBA) degree is a versatile degree that can lead to various career opportunities across different industries. Here are some positions you can apply for with a BBA degree:\n\n1. **Management Trainee**: Many companies offer management trainee programs for r

In [8]:
# QLoRA Q4_K_M gguf model bertscore
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
results = evaluate_llama_bertscore_withChatTemplate(model_path="./quantized_models/ft-qlora-q4_k_m.gguf", dataset=small_test, tokenizer=tokenizer)
print(results)

llama_context: n_ctx_per_seq (1024) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
Generating responses: 100%|██████████| 50/50 [18:16<00:00, 21.94s/it]


{'predictions': ['The possible positions for B.E. (Computers) are: Marketing Manager - E-commerce/ Retail. The related skills are: NLP, Machine Learning, Deep learning, Python, R, TensorFlow, Keras, PyTorch, Scikit-learn. The related tools are: Keras, TensorFlow, PyTorch, Scikit-learn, NLTK, spaCy, pandas, NumPy, Matplotlib. The related technologies are: NLP, Machine Learning, Deep learning, Python, R, TensorFlow, Keras, PyTorch, Scikit-learn. The related software are: Jupyter Notebook, Google', 'The possible positions for Bachelor of Business Administration are: Sales Consultant. The related skills are Customer Relationship Building, Lead Generation & Execution, Client Communication, Market Research and Data Analysis. The likely responsibilities include Sales Strategy Development, Business Planning, Performance Tracking, Collaboration with Teams, Stakeholder Management, Team Leadership, Strategic Decision Making, Risk Assessment, Budgeting, Marketing Coordination, Operations Oversight

In [9]:
# LoRA Q8_0 gguf model bertscore
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
results = evaluate_llama_bertscore_withChatTemplate(model_path="./quantized_models/ft-q8_0-lora.gguf", dataset=small_test, tokenizer=tokenizer)
print(results)

llama_context: n_ctx_per_seq (1024) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
Generating responses: 100%|██████████| 50/50 [23:07<00:00, 27.75s/it]


{'predictions': ["The possible positions for B.E. (Computers) are: Manager- Human Resource Management (HRM)\n with an experience requirement of 5 to 6 years. The related skills are Market Research and Data Analysis. The related qualifications are Bachelor's Degree in Business Administration. The possible positions for Bachelor's Degree in Business Administration are: Senior Software Engineer. The experience requirement is 4 to 5 years. The related skills are Software Development, System Analysis, Requirement Gathering, Designing Database Schemes, Developing SQL Queries, Performance Monitoring, Troubleshooting, Document Preparation, Code Translation, Communication, Team Collaboration, Task Management, Innovation,", 'The possible positions for Bachelor of Business Administration are: Project Coordinator (Civil) with an experience requirement of 5 to 10 years. The related skills are NLP and Data Analysis. The possible positions for NLP and Data Analysis are: Machine Learning (ML) Engineer

In [6]:
!nvidia-smi

Tue Oct 14 12:42:03 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 575.57.08              Driver Version: 575.57.08      CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla V100-PCIE-32GB           Off |   00000000:D8:00.0 Off |                    0 |
| N/A   34C    P0             35W /  250W |       0MiB /  32768MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [5]:
import psutil
import platform
import os
from llama_cpp import Llama

In [None]:
!llama.cpp/build/bin/llama-cli --list-devices

ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 CUDA devices:
  Device 0: Tesla T4, compute capability 7.5, VMM: yes
Available devices:
  CUDA0: Tesla T4 (15095 MiB, 14992 MiB free)


#### llama-bench results

In [6]:
!llama.cpp/build/bin/llama-bench --flash-attn 1 --model ./quantized_models/ft-q4_k_m-lora.gguf

| model                          |       size |     params | backend    | threads | fa |            test |                  t/s |
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -: | --------------: | -------------------: |
| llama 3B Q4_K - Medium         |   1.87 GiB |     3.21 B | CPU        |      36 |  1 |           pp512 |          4.38 ± 0.02 |
^C


In [7]:
!llama.cpp/build/bin/llama-bench --flash-attn 1 --model ./quantized_models/ft-q5_k_m-lora.gguf

| model                          |       size |     params | backend    | threads | fa |            test |                  t/s |
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -: | --------------: | -------------------: |
^C


In [8]:
!llama.cpp/build/bin/llama-bench --flash-attn 1 --model ./quantized_models/fp16-ft-base.gguf

| model                          |       size |     params | backend    | threads | fa |            test |                  t/s |
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -: | --------------: | -------------------: |
^C


In [9]:
!llama.cpp/build/bin/llama-bench --flash-attn 1 --model ./quantized_models/ft-q8_0-lora.gguf

| model                          |       size |     params | backend    | threads | fa |            test |                  t/s |
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -: | --------------: | -------------------: |
^C


In [10]:
!llama.cpp/build/bin/llama-bench --flash-attn 1 --model ./quantized_models/ft-qlora-q4_k_m.gguf

| model                          |       size |     params | backend    | threads | fa |            test |                  t/s |
| ------------------------------ | ---------: | ---------: | ---------- | ------: | -: | --------------: | -------------------: |
^C


In [None]:
!apt-get update
!apt-get install -y build-essential cmake git libopenblas-dev



0% [Working]
            
Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]

            
Get:2 https://cli.github.com/packages stable InRelease [3,917 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,931 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:11 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [3,209 kB]
Hit:12 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease


#### Perplexity

In [14]:
# Load your dataset
split_dataset = load_from_disk("dataset/split_combined_dataset_withoutjd")

train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]

In [37]:
! ls ./quantized_models

fp16-ft-base.gguf  fp16-ft-qlora.gguf	ft-q8_0-lora.gguf
fp16-ft.gguf	   ft-q4_k_m-lora.gguf	ft-qlora-q4_k_m.gguf
fp16-ft-lora.gguf  ft-q5_k_m-lora.gguf


In [15]:
texts = [
    f"User: {inp}\nAssistant: {tgt}"
    for inp, tgt in zip(test_dataset["input_text"], test_dataset["target_text"])
]

# Save to text file
with open("test_dataset.txt", "w", encoding="utf-8") as f:
    f.write("\n\n".join(texts))

In [16]:
with open("test_dataset.txt", "r", encoding="utf-8") as f:
    print(f.read())

User: What are the entry-level requirements for a Legal Assistant?
Assistant: Entry-level requirements typically include a Bachelor’s degree or completion of a legal assistant certification program. Some positions may accept candidates with relevant experience or internships in the legal field.

User: What is the career growth path for an Automation Engineer?
Assistant: The career growth path typically starts with an entry-level position as an Automation Engineer, progressing to Senior Automation Engineer, Automation Architect, and eventually to roles like DevOps Engineer or Automation Manager.

User: What are the entry-level requirements for a Web Developer?
Assistant: Entry-level requirements typically include knowledge of HTML, CSS, JavaScript, and basic understanding of web development frameworks. Building a portfolio of projects can also help you get started.

User: What skills are required to become a Legal Assistant?
Assistant: Skills required include strong organizational abili

In [13]:
!./llama.cpp/build/bin/llama-perplexity -m ./quantized_models/ft-q4_k_m-lora.gguf -f test_dataset.txt

build: 6713 (d2ee056e) with cc (GCC) 11.5.0 20240719 (Red Hat 11.5.0-5) for x86_64-redhat-linux
llama_model_loader: loaded meta data with 28 key-value pairs and 255 tensors from ./quantized_models/ft-q4_k_m-lora.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = sharshar20/lora-and-instruct-merged-m...
llama_model_loader: - kv   3:                           general.finetune str              = 73726fa7ef0ee6120f23358531a4862b42133839
llama_model_loader: - kv   4:                         general.size_label str              = 3.2B
llama_model_loader: - kv   5:                          llama.block_count u32              = 28
llam

In [14]:
!./llama.cpp/build/bin/llama-perplexity -m ./quantized_models/ft-q5_k_m-lora.gguf -f test_dataset.txt

build: 6713 (d2ee056e) with cc (GCC) 11.5.0 20240719 (Red Hat 11.5.0-5) for x86_64-redhat-linux
llama_model_loader: loaded meta data with 28 key-value pairs and 255 tensors from ./quantized_models/ft-q5_k_m-lora.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = sharshar20/lora-and-instruct-merged-m...
llama_model_loader: - kv   3:                           general.finetune str              = 73726fa7ef0ee6120f23358531a4862b42133839
llama_model_loader: - kv   4:                         general.size_label str              = 3.2B
llama_model_loader: - kv   5:                          llama.block_count u32              = 28
llam

In [15]:
!./llama.cpp/build/bin/llama-perplexity -m ./quantized_models/ft-q8_0-lora.gguf -f test_dataset.txt

build: 6713 (d2ee056e) with cc (GCC) 11.5.0 20240719 (Red Hat 11.5.0-5) for x86_64-redhat-linux
llama_model_loader: loaded meta data with 28 key-value pairs and 255 tensors from ./quantized_models/ft-q8_0-lora.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = sharshar20/lora-and-instruct-merged-m...
llama_model_loader: - kv   3:                           general.finetune str              = 73726fa7ef0ee6120f23358531a4862b42133839
llama_model_loader: - kv   4:                         general.size_label str              = 3.2B
llama_model_loader: - kv   5:                          llama.block_count u32              = 28
llama_

In [16]:
!./llama.cpp/build/bin/llama-perplexity -m ./quantized_models/fp16-ft-base.gguf -f test_dataset.txt

build: 6713 (d2ee056e) with cc (GCC) 11.5.0 20240719 (Red Hat 11.5.0-5) for x86_64-redhat-linux
llama_model_loader: loaded meta data with 28 key-value pairs and 255 tensors from ./quantized_models/fp16-ft-base.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Base_Instruct_Model
llama_model_loader: - kv   3:                         general.size_label str              = 3.2B
llama_model_loader: - kv   4:                          llama.block_count u32              = 28
llama_model_loader: - kv   5:                       llama.context_length u32              = 131072
llama_model_loader: - kv   6:                     llama.embed

In [17]:
!./llama.cpp/build/bin/llama-perplexity -m ./quantized_models/fp16-ft-lora.gguf -f test_dataset.txt

build: 6713 (d2ee056e) with cc (GCC) 11.5.0 20240719 (Red Hat 11.5.0-5) for x86_64-redhat-linux
llama_model_loader: loaded meta data with 28 key-value pairs and 255 tensors from ./quantized_models/fp16-ft-lora.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = sharshar20/lora-and-instruct-merged-m...
llama_model_loader: - kv   3:                           general.finetune str              = 73726fa7ef0ee6120f23358531a4862b42133839
llama_model_loader: - kv   4:                         general.size_label str              = 3.2B
llama_model_loader: - kv   5:                          llama.block_count u32              = 28
llama_

In [18]:
!./llama.cpp/build/bin/llama-perplexity -m ./quantized_models/ft-qlora-q4_k_m.gguf -f test_dataset.txt

build: 6713 (d2ee056e) with cc (GCC) 11.5.0 20240719 (Red Hat 11.5.0-5) for x86_64-redhat-linux
llama_model_loader: loaded meta data with 28 key-value pairs and 255 tensors from ./quantized_models/ft-qlora-q4_k_m.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = sharshar20/qlora-and-instruct-merged-...
llama_model_loader: - kv   3:                           general.finetune str              = 63ca988a213869eb09ceba03025bd486edd01feb
llama_model_loader: - kv   4:                         general.size_label str              = 3.2B
llama_model_loader: - kv   5:                          llama.block_count u32              = 28
lla

#### TTFT and TPOT

In [19]:
test_dataset[0]

{'input_text': 'What are the entry-level requirements for a Legal Assistant?',
 'target_text': 'Entry-level requirements typically include a Bachelor’s degree or completion of a legal assistant certification program. Some positions may accept candidates with relevant experience or internships in the legal field.'}

In [17]:
import time
import numpy as np
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from tqdm import tqdm

def evaluate_llama_latency(repo_id, filename,test_dataset,cache_dir="latency", n_ctx=2048,n_gpu_layers=-1,n_threads=8,max_tokens=50, num_runs=10,use_mlock=True,):
    # Download Model
    model_path = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        cache_dir=cache_dir
    )

    # Load Model
    start_load = time.time()
    llm = Llama(
        model_path=model_path,
        n_ctx=n_ctx,
        n_threads=n_threads,
        #n_gpu_layers=n_gpu_layers,
        use_mlock=use_mlock
    )
    load_time = time.time() - start_load

    # Measure Latency
    ttft_list = []
    token_latency_list = []

    for i in tqdm(range(num_runs), desc="Evaluating prompts"):
        prompt = test_dataset[i % len(test_dataset)]["input_text"]

        start_time = time.time()
        first_token_time = None
        prev_time = None
        token_times = []
        output_tokens = []

        for idx, token in enumerate(llm(prompt, max_tokens=max_tokens, stream=True)):
            now = time.time()
            if idx == 0:
                first_token_time = now - start_time
                prev_time = now
            else:
                token_times.append(now - prev_time)
                prev_time = now
            output_tokens.append(token["choices"][0]["text"])

        # Store metrics
        ttft_list.append(first_token_time * 1000)
        token_latency_list.extend([t * 1000 for t in token_times])

        tpot = np.mean([t * 1000 for t in token_times]) if token_times else 0
        e2e_latency = (time.time() - start_time) * 1000
        formula_e2e = first_token_time * 1000 + tpot * len(token_times)

        print(f"\nPrompt {i+1}:")
        print(f"TTFT: {first_token_time*1000:.2f} ms")
        print(f"TPOT: {tpot:.2f} ms/token")
        print(f"E2E Measured: {e2e_latency:.2f} ms")
        print(f"E2E Formula:  {formula_e2e:.2f} ms")

    # --- Compute Aggregate Statistics ---
    avg_ttft = np.mean(ttft_list)
    avg_token_latency = np.mean(token_latency_list)
    p50, p95, p99 = np.percentile(token_latency_list, [50, 95, 99])

    results = {
        "avg_ttft_ms": avg_ttft,
        "avg_tpot_ms": avg_token_latency,
        "p50_ms": p50,
        "p95_ms": p95,
        "p99_ms": p99,
        "load_time_s": load_time
    }

    print("\n--- Overall Token Latency Stats ---")
    print(f"Average TTFT: {avg_ttft:.2f} ms")
    print(f"Average TPOT: {avg_token_latency:.2f} ms")
    print(f"P50: {p50:.2f} ms, P95: {p95:.2f} ms, P99: {p99:.2f} ms")

    return results

In [26]:
results = evaluate_llama_latency(repo_id="sharshar20/llama3.2_3B_instruct-GGUF-v6",filename="fp16-ft-base.gguf", test_dataset=test_dataset)
print(results)

fp16-ft-base.gguf:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

llama_model_loader: loaded meta data with 28 key-value pairs and 255 tensors from latency/models--sharshar20--llama3.2_3B_instruct-GGUF-v6/snapshots/6404c56ca2726fd54b86b260c71365a86b1bbd95/fp16-ft-base.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Base_Instruct_Model
llama_model_loader: - kv   3:                         general.size_label str              = 3.2B
llama_model_loader: - kv   4:                          llama.block_count u32              = 28
llama_model_loader: - kv   5:                       llama.context_length u32              = 131072
llama_model_loader: - kv   6:                     llama.embedding_le


Prompt 1:
TTFT: 1691.41 ms
TPOT: 494.06 ms/token
E2E Measured: 26394.58 ms
E2E Formula:  26394.45 ms


llama_perf_context_print:        load time =    1690.17 ms
llama_perf_context_print: prompt eval time =    1562.40 ms /    10 tokens (  156.24 ms per token,     6.40 tokens per second)
llama_perf_context_print:        eval time =   24645.57 ms /    49 runs   (  502.97 ms per token,     1.99 tokens per second)
llama_perf_context_print:       total time =   26259.40 ms /    59 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  20%|██        | 2/10 [00:52<03:30, 26.32s/it]Llama.generate: 2 prefix-match hit, remaining 10 prompt tokens to eval



Prompt 2:
TTFT: 1563.79 ms
TPOT: 493.97 ms/token
E2E Measured: 26262.21 ms
E2E Formula:  26262.12 ms


llama_perf_context_print:        load time =    1690.17 ms
llama_perf_context_print: prompt eval time =    1559.01 ms /    10 tokens (  155.90 ms per token,     6.41 tokens per second)
llama_perf_context_print:        eval time =   24641.85 ms /    49 runs   (  502.89 ms per token,     1.99 tokens per second)
llama_perf_context_print:       total time =   26252.35 ms /    59 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  30%|███       | 3/10 [01:18<03:04, 26.29s/it]Llama.generate: 2 prefix-match hit, remaining 9 prompt tokens to eval



Prompt 3:
TTFT: 1560.39 ms
TPOT: 493.90 ms/token
E2E Measured: 26255.47 ms
E2E Formula:  26255.36 ms


llama_perf_context_print:        load time =    1690.17 ms
llama_perf_context_print: prompt eval time =    1495.86 ms /     9 tokens (  166.21 ms per token,     6.02 tokens per second)
llama_perf_context_print:        eval time =   24692.54 ms /    49 runs   (  503.93 ms per token,     1.98 tokens per second)
llama_perf_context_print:       total time =   26239.73 ms /    58 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  40%|████      | 4/10 [01:45<02:37, 26.27s/it]Llama.generate: 2 prefix-match hit, remaining 13 prompt tokens to eval



Prompt 4:
TTFT: 1497.26 ms
TPOT: 494.90 ms/token
E2E Measured: 26242.51 ms
E2E Formula:  26242.41 ms


llama_perf_context_print:        load time =    1690.17 ms
llama_perf_context_print: prompt eval time =    1748.00 ms /    13 tokens (  134.46 ms per token,     7.44 tokens per second)
llama_perf_context_print:        eval time =   24659.97 ms /    49 runs   (  503.26 ms per token,     1.99 tokens per second)
llama_perf_context_print:       total time =   26459.46 ms /    62 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  50%|█████     | 5/10 [02:11<02:11, 26.34s/it]Llama.generate: 1 prefix-match hit, remaining 14 prompt tokens to eval



Prompt 5:
TTFT: 1749.40 ms
TPOT: 494.25 ms/token
E2E Measured: 26461.97 ms
E2E Formula:  26461.86 ms


llama_perf_context_print:        load time =    1690.17 ms
llama_perf_context_print: prompt eval time =    1861.10 ms /    14 tokens (  132.94 ms per token,     7.52 tokens per second)
llama_perf_context_print:        eval time =   24631.96 ms /    49 runs   (  502.69 ms per token,     1.99 tokens per second)
llama_perf_context_print:       total time =   26544.62 ms /    63 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  60%|██████    | 6/10 [02:38<01:45, 26.41s/it]Llama.generate: 1 prefix-match hit, remaining 13 prompt tokens to eval



Prompt 6:
TTFT: 1862.50 ms
TPOT: 493.70 ms/token
E2E Measured: 26547.36 ms
E2E Formula:  26547.27 ms


llama_perf_context_print:        load time =    1690.17 ms
llama_perf_context_print: prompt eval time =    1765.94 ms /    13 tokens (  135.84 ms per token,     7.36 tokens per second)
llama_perf_context_print:        eval time =   24639.77 ms /    49 runs   (  502.85 ms per token,     1.99 tokens per second)
llama_perf_context_print:       total time =   26457.30 ms /    62 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  70%|███████   | 7/10 [03:04<01:19, 26.43s/it]Llama.generate: 1 prefix-match hit, remaining 8 prompt tokens to eval



Prompt 7:
TTFT: 1767.38 ms
TPOT: 493.85 ms/token
E2E Measured: 26459.85 ms
E2E Formula:  26459.76 ms


llama_perf_context_print:        load time =    1690.17 ms
llama_perf_context_print: prompt eval time =    1433.15 ms /     8 tokens (  179.14 ms per token,     5.58 tokens per second)
llama_perf_context_print:        eval time =   24656.64 ms /    49 runs   (  503.20 ms per token,     1.99 tokens per second)
llama_perf_context_print:       total time =   26141.82 ms /    57 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  80%|████████  | 8/10 [03:30<00:52, 26.34s/it]Llama.generate: 2 prefix-match hit, remaining 10 prompt tokens to eval



Prompt 8:
TTFT: 1434.53 ms
TPOT: 494.19 ms/token
E2E Measured: 26144.34 ms
E2E Formula:  26144.24 ms


llama_perf_context_print:        load time =    1690.17 ms
llama_perf_context_print: prompt eval time =    1558.91 ms /    10 tokens (  155.89 ms per token,     6.41 tokens per second)
llama_perf_context_print:        eval time =   24689.05 ms /    49 runs   (  503.86 ms per token,     1.98 tokens per second)
llama_perf_context_print:       total time =   26299.75 ms /    59 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  90%|█████████ | 9/10 [03:57<00:26, 26.33s/it]Llama.generate: 2 prefix-match hit, remaining 11 prompt tokens to eval



Prompt 9:
TTFT: 1560.32 ms
TPOT: 494.84 ms/token
E2E Measured: 26302.23 ms
E2E Formula:  26302.13 ms


llama_perf_context_print:        load time =    1690.17 ms
llama_perf_context_print: prompt eval time =    1605.02 ms /    11 tokens (  145.91 ms per token,     6.85 tokens per second)
llama_perf_context_print:        eval time =   24641.40 ms /    49 runs   (  502.89 ms per token,     1.99 tokens per second)
llama_perf_context_print:       total time =   26297.97 ms /    60 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts: 100%|██████████| 10/10 [04:23<00:00, 26.34s/it]



Prompt 10:
TTFT: 1606.45 ms
TPOT: 493.88 ms/token
E2E Measured: 26300.46 ms
E2E Formula:  26300.36 ms

--- Overall Token Latency Stats ---
Average TTFT: 1629.34 ms
Average TPOT: 494.15 ms
P50: 503.52 ms, P95: 505.90 ms, P99: 507.95 ms
{'avg_ttft_ms': 1629.342246055603, 'avg_tpot_ms': 494.15307092666626, 'p50_ms': 503.517746925354, 'p95_ms': 505.8958053588867, 'p99_ms': 507.95475721359253, 'load_time_s': 14.15773057937622}


In [21]:
results = evaluate_llama_latency(repo_id="sharshar20/llama3.2_3B_instruct-GGUF-v6",filename="ft-qlora-q4_k_m.gguf", test_dataset=test_dataset)
print(results)

ft-qlora-q4_k_m.gguf:   0%|          | 0.00/2.02G [00:00<?, ?B/s]

llama_model_loader: loaded meta data with 28 key-value pairs and 255 tensors from latency/models--sharshar20--llama3.2_3B_instruct-GGUF-v6/snapshots/6404c56ca2726fd54b86b260c71365a86b1bbd95/ft-qlora-q4_k_m.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Merged_Model_Instruct_Qlora
llama_model_loader: - kv   3:                         general.size_label str              = 3.2B
llama_model_loader: - kv   4:                          llama.block_count u32              = 28
llama_model_loader: - kv   5:                       llama.context_length u32              = 131072
llama_model_loader: - kv   6:                     llama.e


Prompt 1:
TTFT: 1464.71 ms
TPOT: 218.92 ms/token
E2E Measured: 12410.94 ms
E2E Formula:  12410.74 ms


llama_perf_context_print:        load time =    1463.46 ms
llama_perf_context_print: prompt eval time =    1414.90 ms /    10 tokens (  141.49 ms per token,     7.07 tokens per second)
llama_perf_context_print:        eval time =   10914.38 ms /    49 runs   (  222.74 ms per token,     4.49 tokens per second)
llama_perf_context_print:       total time =   12380.40 ms /    59 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  20%|██        | 2/10 [00:24<01:39, 12.40s/it]Llama.generate: 2 prefix-match hit, remaining 10 prompt tokens to eval



Prompt 2:
TTFT: 1416.40 ms
TPOT: 219.34 ms/token
E2E Measured: 12383.31 ms
E2E Formula:  12383.20 ms


llama_perf_context_print:        load time =    1463.46 ms
llama_perf_context_print: prompt eval time =    1426.47 ms /    10 tokens (  142.65 ms per token,     7.01 tokens per second)
llama_perf_context_print:        eval time =   10933.25 ms /    49 runs   (  223.13 ms per token,     4.48 tokens per second)
llama_perf_context_print:       total time =   12410.67 ms /    59 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  30%|███       | 3/10 [00:37<01:26, 12.40s/it]Llama.generate: 2 prefix-match hit, remaining 9 prompt tokens to eval



Prompt 3:
TTFT: 1427.95 ms
TPOT: 219.71 ms/token
E2E Measured: 12413.34 ms
E2E Formula:  12413.24 ms


llama_perf_context_print:        load time =    1463.46 ms
llama_perf_context_print: prompt eval time =    1310.45 ms /     9 tokens (  145.61 ms per token,     6.87 tokens per second)
llama_perf_context_print:        eval time =   10982.60 ms /    49 runs   (  224.13 ms per token,     4.46 tokens per second)
llama_perf_context_print:       total time =   12343.97 ms /    58 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  40%|████      | 4/10 [00:49<01:14, 12.38s/it]Llama.generate: 2 prefix-match hit, remaining 13 prompt tokens to eval



Prompt 4:
TTFT: 1311.94 ms
TPOT: 220.70 ms/token
E2E Measured: 12346.79 ms
E2E Formula:  12346.69 ms


llama_perf_context_print:        load time =    1463.46 ms
llama_perf_context_print: prompt eval time =    1562.46 ms /    13 tokens (  120.19 ms per token,     8.32 tokens per second)
llama_perf_context_print:        eval time =   10904.39 ms /    49 runs   (  222.54 ms per token,     4.49 tokens per second)
llama_perf_context_print:       total time =   12517.86 ms /    62 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  50%|█████     | 5/10 [01:02<01:02, 12.43s/it]Llama.generate: 1 prefix-match hit, remaining 14 prompt tokens to eval



Prompt 5:
TTFT: 1564.00 ms
TPOT: 219.13 ms/token
E2E Measured: 12520.70 ms
E2E Formula:  12520.61 ms


llama_perf_context_print:        load time =    1463.46 ms
llama_perf_context_print: prompt eval time =    1671.36 ms /    14 tokens (  119.38 ms per token,     8.38 tokens per second)
llama_perf_context_print:        eval time =   10890.87 ms /    49 runs   (  222.26 ms per token,     4.50 tokens per second)
llama_perf_context_print:       total time =   12614.79 ms /    63 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  60%|██████    | 6/10 [01:14<00:49, 12.50s/it]Llama.generate: 1 prefix-match hit, remaining 13 prompt tokens to eval



Prompt 6:
TTFT: 1672.82 ms
TPOT: 218.89 ms/token
E2E Measured: 12617.41 ms
E2E Formula:  12617.31 ms


llama_perf_context_print:        load time =    1463.46 ms
llama_perf_context_print: prompt eval time =    1567.92 ms /    13 tokens (  120.61 ms per token,     8.29 tokens per second)
llama_perf_context_print:        eval time =   10894.34 ms /    49 runs   (  222.33 ms per token,     4.50 tokens per second)
llama_perf_context_print:       total time =   12513.45 ms /    62 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  70%|███████   | 7/10 [01:27<00:37, 12.50s/it]Llama.generate: 1 prefix-match hit, remaining 8 prompt tokens to eval



Prompt 7:
TTFT: 1569.42 ms
TPOT: 218.93 ms/token
E2E Measured: 12516.08 ms
E2E Formula:  12515.99 ms


llama_perf_context_print:        load time =    1463.46 ms
llama_perf_context_print: prompt eval time =    1197.20 ms /     8 tokens (  149.65 ms per token,     6.68 tokens per second)
llama_perf_context_print:        eval time =   10947.45 ms /    49 runs   (  223.42 ms per token,     4.48 tokens per second)
llama_perf_context_print:       total time =   12195.54 ms /    57 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  80%|████████  | 8/10 [01:39<00:24, 12.41s/it]Llama.generate: 2 prefix-match hit, remaining 10 prompt tokens to eval



Prompt 8:
TTFT: 1198.65 ms
TPOT: 220.00 ms/token
E2E Measured: 12198.74 ms
E2E Formula:  12198.65 ms


llama_perf_context_print:        load time =    1463.46 ms
llama_perf_context_print: prompt eval time =    1416.67 ms /    10 tokens (  141.67 ms per token,     7.06 tokens per second)
llama_perf_context_print:        eval time =   10918.48 ms /    49 runs   (  222.83 ms per token,     4.49 tokens per second)
llama_perf_context_print:       total time =   12386.10 ms /    59 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  90%|█████████ | 9/10 [01:51<00:12, 12.40s/it]Llama.generate: 2 prefix-match hit, remaining 11 prompt tokens to eval



Prompt 9:
TTFT: 1418.13 ms
TPOT: 219.41 ms/token
E2E Measured: 12388.63 ms
E2E Formula:  12388.53 ms


llama_perf_context_print:        load time =    1463.46 ms
llama_perf_context_print: prompt eval time =    1518.70 ms /    11 tokens (  138.06 ms per token,     7.24 tokens per second)
llama_perf_context_print:        eval time =   10910.04 ms /    49 runs   (  222.65 ms per token,     4.49 tokens per second)
llama_perf_context_print:       total time =   12479.76 ms /    60 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts: 100%|██████████| 10/10 [02:04<00:00, 12.43s/it]


Prompt 10:
TTFT: 1520.17 ms
TPOT: 219.25 ms/token
E2E Measured: 12482.55 ms
E2E Formula:  12482.45 ms

--- Overall Token Latency Stats ---
Average TTFT: 1456.42 ms
Average TPOT: 219.43 ms
P50: 223.27 ms, P95: 227.49 ms, P99: 233.89 ms
{'avg_ttft_ms': 1456.4182758331299, 'avg_tpot_ms': 219.4264702796936, 'p50_ms': 223.2745885848999, 'p95_ms': 227.4920105934143, 'p99_ms': 233.88677835464478, 'load_time_s': 4.971662282943726}





In [18]:
results = evaluate_llama_latency(repo_id="sharshar20/llama3.2_3B_instruct-GGUF-v6",filename="ft-q5_k_m-lora.gguf", test_dataset=test_dataset)
print(results)

llama_model_loader: loaded meta data with 28 key-value pairs and 255 tensors from latency/models--sharshar20--llama3.2_3B_instruct-GGUF-v6/snapshots/6404c56ca2726fd54b86b260c71365a86b1bbd95/ft-q5_k_m-lora.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Merged_Model_Instruct_Lora
llama_model_loader: - kv   3:                         general.size_label str              = 3.2B
llama_model_loader: - kv   4:                          llama.block_count u32              = 28
llama_model_loader: - kv   5:                       llama.context_length u32              = 131072
llama_model_loader: - kv   6:                     llama.emb


Prompt 1:
TTFT: 2500.49 ms
TPOT: 258.19 ms/token
E2E Measured: 15410.40 ms
E2E Formula:  15410.22 ms


llama_perf_context_print:        load time =    2498.68 ms
llama_perf_context_print: prompt eval time =    2181.82 ms /    10 tokens (  218.18 ms per token,     4.58 tokens per second)
llama_perf_context_print:        eval time =   12869.08 ms /    49 runs   (  262.63 ms per token,     3.81 tokens per second)
llama_perf_context_print:       total time =   15102.66 ms /    59 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  20%|██        | 2/10 [00:30<02:01, 15.23s/it]Llama.generate: 2 prefix-match hit, remaining 10 prompt tokens to eval



Prompt 2:
TTFT: 2183.29 ms
TPOT: 258.44 ms/token
E2E Measured: 15105.39 ms
E2E Formula:  15105.30 ms


llama_perf_context_print:        load time =    2498.68 ms
llama_perf_context_print: prompt eval time =    2180.93 ms /    10 tokens (  218.09 ms per token,     4.59 tokens per second)
llama_perf_context_print:        eval time =   12863.14 ms /    49 runs   (  262.51 ms per token,     3.81 tokens per second)
llama_perf_context_print:       total time =   15096.62 ms /    59 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  30%|███       | 3/10 [00:45<01:46, 15.17s/it]Llama.generate: 2 prefix-match hit, remaining 9 prompt tokens to eval



Prompt 3:
TTFT: 2182.33 ms
TPOT: 258.34 ms/token
E2E Measured: 15099.27 ms
E2E Formula:  15099.15 ms


llama_perf_context_print:        load time =    2498.68 ms
llama_perf_context_print: prompt eval time =    2036.47 ms /     9 tokens (  226.27 ms per token,     4.42 tokens per second)
llama_perf_context_print:        eval time =   12854.13 ms /    49 runs   (  262.33 ms per token,     3.81 tokens per second)
llama_perf_context_print:       total time =   14942.39 ms /    58 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  40%|████      | 4/10 [01:00<01:30, 15.08s/it]Llama.generate: 2 prefix-match hit, remaining 13 prompt tokens to eval



Prompt 4:
TTFT: 2038.01 ms
TPOT: 258.14 ms/token
E2E Measured: 14944.91 ms
E2E Formula:  14944.83 ms


llama_perf_context_print:        load time =    2498.68 ms
llama_perf_context_print: prompt eval time =    2648.94 ms /    13 tokens (  203.76 ms per token,     4.91 tokens per second)
llama_perf_context_print:        eval time =   12842.91 ms /    49 runs   (  262.10 ms per token,     3.82 tokens per second)
llama_perf_context_print:       total time =   15543.42 ms /    62 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  50%|█████     | 5/10 [01:16<01:16, 15.25s/it]Llama.generate: 1 prefix-match hit, remaining 14 prompt tokens to eval



Prompt 5:
TTFT: 2650.39 ms
TPOT: 257.91 ms/token
E2E Measured: 15546.19 ms
E2E Formula:  15546.09 ms


llama_perf_context_print:        load time =    2498.68 ms
llama_perf_context_print: prompt eval time =    2754.19 ms /    14 tokens (  196.73 ms per token,     5.08 tokens per second)
llama_perf_context_print:        eval time =   12840.89 ms /    49 runs   (  262.06 ms per token,     3.82 tokens per second)
llama_perf_context_print:       total time =   15646.78 ms /    63 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  60%|██████    | 6/10 [01:31<01:01, 15.39s/it]Llama.generate: 1 prefix-match hit, remaining 13 prompt tokens to eval



Prompt 6:
TTFT: 2755.66 ms
TPOT: 257.87 ms/token
E2E Measured: 15649.49 ms
E2E Formula:  15649.40 ms


llama_perf_context_print:        load time =    2498.68 ms
llama_perf_context_print: prompt eval time =    2605.91 ms /    13 tokens (  200.45 ms per token,     4.99 tokens per second)
llama_perf_context_print:        eval time =   12912.18 ms /    49 runs   (  263.51 ms per token,     3.79 tokens per second)
llama_perf_context_print:       total time =   15569.87 ms /    62 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  70%|███████   | 7/10 [01:47<00:46, 15.45s/it]Llama.generate: 1 prefix-match hit, remaining 8 prompt tokens to eval



Prompt 7:
TTFT: 2607.45 ms
TPOT: 259.30 ms/token
E2E Measured: 15572.57 ms
E2E Formula:  15572.47 ms


llama_perf_context_print:        load time =    2498.68 ms
llama_perf_context_print: prompt eval time =    1880.72 ms /     8 tokens (  235.09 ms per token,     4.25 tokens per second)
llama_perf_context_print:        eval time =   12843.13 ms /    49 runs   (  262.10 ms per token,     3.82 tokens per second)
llama_perf_context_print:       total time =   14775.63 ms /    57 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  80%|████████  | 8/10 [02:02<00:30, 15.23s/it]Llama.generate: 2 prefix-match hit, remaining 10 prompt tokens to eval



Prompt 8:
TTFT: 1882.13 ms
TPOT: 257.92 ms/token
E2E Measured: 14778.13 ms
E2E Formula:  14778.04 ms


llama_perf_context_print:        load time =    2498.68 ms
llama_perf_context_print: prompt eval time =    2172.46 ms /    10 tokens (  217.25 ms per token,     4.60 tokens per second)
llama_perf_context_print:        eval time =   12856.64 ms /    49 runs   (  262.38 ms per token,     3.81 tokens per second)
llama_perf_context_print:       total time =   15080.54 ms /    59 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  90%|█████████ | 9/10 [02:17<00:15, 15.19s/it]Llama.generate: 2 prefix-match hit, remaining 11 prompt tokens to eval



Prompt 9:
TTFT: 2173.89 ms
TPOT: 258.19 ms/token
E2E Measured: 15083.24 ms
E2E Formula:  15083.15 ms


llama_perf_context_print:        load time =    2498.68 ms
llama_perf_context_print: prompt eval time =    2323.15 ms /    11 tokens (  211.20 ms per token,     4.73 tokens per second)
llama_perf_context_print:        eval time =   12852.91 ms /    49 runs   (  262.30 ms per token,     3.81 tokens per second)
llama_perf_context_print:       total time =   15227.64 ms /    60 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts: 100%|██████████| 10/10 [02:32<00:00, 15.24s/it]


Prompt 10:
TTFT: 2324.56 ms
TPOT: 258.11 ms/token
E2E Measured: 15230.21 ms
E2E Formula:  15230.12 ms

--- Overall Token Latency Stats ---
Average TTFT: 2329.82 ms
Average TPOT: 258.24 ms
P50: 263.01 ms, P95: 265.49 ms, P99: 268.31 ms
{'avg_ttft_ms': 2329.8202991485596, 'avg_tpot_ms': 258.24116039276123, 'p50_ms': 263.0103826522827, 'p95_ms': 265.4910445213318, 'p99_ms': 268.31345319747925, 'load_time_s': 6.694820880889893}





In [23]:
results = evaluate_llama_latency(repo_id="sharshar20/llama3.2_3B_instruct-GGUF-v6",filename="ft-q4_k_m-lora.gguf", test_dataset=test_dataset)
print(results)

ft-q4_k_m-lora.gguf:   0%|          | 0.00/2.02G [00:00<?, ?B/s]

llama_model_loader: loaded meta data with 28 key-value pairs and 255 tensors from latency/models--sharshar20--llama3.2_3B_instruct-GGUF-v6/snapshots/6404c56ca2726fd54b86b260c71365a86b1bbd95/ft-q4_k_m-lora.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Merged_Model_Instruct_Lora
llama_model_loader: - kv   3:                         general.size_label str              = 3.2B
llama_model_loader: - kv   4:                          llama.block_count u32              = 28
llama_model_loader: - kv   5:                       llama.context_length u32              = 131072
llama_model_loader: - kv   6:                     llama.emb


Prompt 1:
TTFT: 1459.11 ms
TPOT: 218.79 ms/token
E2E Measured: 12398.80 ms
E2E Formula:  12398.70 ms


llama_perf_context_print:        load time =    1457.88 ms
llama_perf_context_print: prompt eval time =    1477.55 ms /    10 tokens (  147.76 ms per token,     6.77 tokens per second)
llama_perf_context_print:        eval time =   10892.32 ms /    49 runs   (  222.29 ms per token,     4.50 tokens per second)
llama_perf_context_print:       total time =   12420.75 ms /    59 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  20%|██        | 2/10 [00:24<01:39, 12.41s/it]Llama.generate: 2 prefix-match hit, remaining 10 prompt tokens to eval



Prompt 2:
TTFT: 1478.95 ms
TPOT: 218.88 ms/token
E2E Measured: 12423.22 ms
E2E Formula:  12423.12 ms


llama_perf_context_print:        load time =    1457.88 ms
llama_perf_context_print: prompt eval time =    1417.93 ms /    10 tokens (  141.79 ms per token,     7.05 tokens per second)
llama_perf_context_print:        eval time =   10893.18 ms /    49 runs   (  222.31 ms per token,     4.50 tokens per second)
llama_perf_context_print:       total time =   12361.93 ms /    59 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  30%|███       | 3/10 [00:37<01:26, 12.39s/it]Llama.generate: 2 prefix-match hit, remaining 9 prompt tokens to eval



Prompt 3:
TTFT: 1419.31 ms
TPOT: 218.90 ms/token
E2E Measured: 12364.48 ms
E2E Formula:  12364.38 ms


llama_perf_context_print:        load time =    1457.88 ms
llama_perf_context_print: prompt eval time =    1310.39 ms /     9 tokens (  145.60 ms per token,     6.87 tokens per second)
llama_perf_context_print:        eval time =   10894.76 ms /    49 runs   (  222.34 ms per token,     4.50 tokens per second)
llama_perf_context_print:       total time =   12255.79 ms /    58 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  40%|████      | 4/10 [00:49<01:14, 12.34s/it]Llama.generate: 2 prefix-match hit, remaining 13 prompt tokens to eval



Prompt 4:
TTFT: 1311.79 ms
TPOT: 218.93 ms/token
E2E Measured: 12258.26 ms
E2E Formula:  12258.17 ms


llama_perf_context_print:        load time =    1457.88 ms
llama_perf_context_print: prompt eval time =    1564.79 ms /    13 tokens (  120.37 ms per token,     8.31 tokens per second)
llama_perf_context_print:        eval time =   10891.31 ms /    49 runs   (  222.27 ms per token,     4.50 tokens per second)
llama_perf_context_print:       total time =   12507.07 ms /    62 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  50%|█████     | 5/10 [01:01<01:02, 12.40s/it]Llama.generate: 1 prefix-match hit, remaining 14 prompt tokens to eval



Prompt 5:
TTFT: 1566.25 ms
TPOT: 218.87 ms/token
E2E Measured: 12509.69 ms
E2E Formula:  12509.59 ms


llama_perf_context_print:        load time =    1457.88 ms
llama_perf_context_print: prompt eval time =    1671.20 ms /    14 tokens (  119.37 ms per token,     8.38 tokens per second)
llama_perf_context_print:        eval time =   10872.06 ms /    49 runs   (  221.88 ms per token,     4.51 tokens per second)
llama_perf_context_print:       total time =   12594.11 ms /    63 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  60%|██████    | 6/10 [01:14<00:49, 12.47s/it]Llama.generate: 1 prefix-match hit, remaining 13 prompt tokens to eval



Prompt 6:
TTFT: 1672.61 ms
TPOT: 218.48 ms/token
E2E Measured: 12596.96 ms
E2E Formula:  12596.86 ms


llama_perf_context_print:        load time =    1457.88 ms
llama_perf_context_print: prompt eval time =    1557.50 ms /    13 tokens (  119.81 ms per token,     8.35 tokens per second)
llama_perf_context_print:        eval time =   10874.26 ms /    49 runs   (  221.92 ms per token,     4.51 tokens per second)
llama_perf_context_print:       total time =   12482.54 ms /    62 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  70%|███████   | 7/10 [01:27<00:37, 12.47s/it]Llama.generate: 1 prefix-match hit, remaining 8 prompt tokens to eval



Prompt 7:
TTFT: 1558.94 ms
TPOT: 218.53 ms/token
E2E Measured: 12485.36 ms
E2E Formula:  12485.26 ms


llama_perf_context_print:        load time =    1457.88 ms
llama_perf_context_print: prompt eval time =    1195.91 ms /     8 tokens (  149.49 ms per token,     6.69 tokens per second)
llama_perf_context_print:        eval time =   10887.20 ms /    49 runs   (  222.19 ms per token,     4.50 tokens per second)
llama_perf_context_print:       total time =   12133.76 ms /    57 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  80%|████████  | 8/10 [01:39<00:24, 12.37s/it]Llama.generate: 2 prefix-match hit, remaining 10 prompt tokens to eval



Prompt 8:
TTFT: 1197.32 ms
TPOT: 218.78 ms/token
E2E Measured: 12136.53 ms
E2E Formula:  12136.43 ms


llama_perf_context_print:        load time =    1457.88 ms
llama_perf_context_print: prompt eval time =    1414.89 ms /    10 tokens (  141.49 ms per token,     7.07 tokens per second)
llama_perf_context_print:        eval time =   10887.29 ms /    49 runs   (  222.19 ms per token,     4.50 tokens per second)
llama_perf_context_print:       total time =   12352.96 ms /    59 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  90%|█████████ | 9/10 [01:51<00:12, 12.36s/it]Llama.generate: 2 prefix-match hit, remaining 11 prompt tokens to eval



Prompt 9:
TTFT: 1416.28 ms
TPOT: 218.78 ms/token
E2E Measured: 12355.47 ms
E2E Formula:  12355.38 ms


llama_perf_context_print:        load time =    1457.88 ms
llama_perf_context_print: prompt eval time =    1520.83 ms /    11 tokens (  138.26 ms per token,     7.23 tokens per second)
llama_perf_context_print:        eval time =   10920.58 ms /    49 runs   (  222.87 ms per token,     4.49 tokens per second)
llama_perf_context_print:       total time =   12492.20 ms /    60 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts: 100%|██████████| 10/10 [02:04<00:00, 12.40s/it]


Prompt 10:
TTFT: 1522.22 ms
TPOT: 219.45 ms/token
E2E Measured: 12494.93 ms
E2E Formula:  12494.83 ms

--- Overall Token Latency Stats ---
Average TTFT: 1460.28 ms
Average TPOT: 218.84 ms
P50: 222.99 ms, P95: 225.53 ms, P99: 226.51 ms
{'avg_ttft_ms': 1460.2785348892212, 'avg_tpot_ms': 218.83984422683716, 'p50_ms': 222.98824787139893, 'p95_ms': 225.52956342697144, 'p99_ms': 226.5090847015381, 'load_time_s': 5.153078317642212}





In [24]:
results = evaluate_llama_latency(repo_id="sharshar20/llama3.2_3B_instruct-GGUF-v6",filename="ft-q8_0-lora.gguf", test_dataset=test_dataset)
print(results)

ft-q8_0-lora.gguf:   0%|          | 0.00/3.42G [00:00<?, ?B/s]

llama_model_loader: loaded meta data with 28 key-value pairs and 255 tensors from latency/models--sharshar20--llama3.2_3B_instruct-GGUF-v6/snapshots/6404c56ca2726fd54b86b260c71365a86b1bbd95/ft-q8_0-lora.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Merged_Model_Instruct_Lora
llama_model_loader: - kv   3:                         general.size_label str              = 3.2B
llama_model_loader: - kv   4:                          llama.block_count u32              = 28
llama_model_loader: - kv   5:                       llama.context_length u32              = 131072
llama_model_loader: - kv   6:                     llama.embed


Prompt 1:
TTFT: 1568.57 ms
TPOT: 305.82 ms/token
E2E Measured: 16859.73 ms
E2E Formula:  16859.63 ms


llama_perf_context_print:        load time =    1567.26 ms
llama_perf_context_print: prompt eval time =    1462.24 ms /    10 tokens (  146.22 ms per token,     6.84 tokens per second)
llama_perf_context_print:        eval time =   15226.22 ms /    49 runs   (  310.74 ms per token,     3.22 tokens per second)
llama_perf_context_print:       total time =   16739.83 ms /    59 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  20%|██        | 2/10 [00:33<02:14, 16.79s/it]Llama.generate: 2 prefix-match hit, remaining 10 prompt tokens to eval



Prompt 2:
TTFT: 1463.65 ms
TPOT: 305.58 ms/token
E2E Measured: 16742.58 ms
E2E Formula:  16742.48 ms


llama_perf_context_print:        load time =    1567.26 ms
llama_perf_context_print: prompt eval time =    1465.47 ms /    10 tokens (  146.55 ms per token,     6.82 tokens per second)
llama_perf_context_print:        eval time =   15242.80 ms /    49 runs   (  311.08 ms per token,     3.21 tokens per second)
llama_perf_context_print:       total time =   16759.72 ms /    59 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  30%|███       | 3/10 [00:50<01:57, 16.78s/it]Llama.generate: 2 prefix-match hit, remaining 9 prompt tokens to eval



Prompt 3:
TTFT: 1466.87 ms
TPOT: 305.91 ms/token
E2E Measured: 16762.23 ms
E2E Formula:  16762.14 ms


llama_perf_context_print:        load time =    1567.26 ms
llama_perf_context_print: prompt eval time =    1411.30 ms /     9 tokens (  156.81 ms per token,     6.38 tokens per second)
llama_perf_context_print:        eval time =   15297.95 ms /    49 runs   (  312.20 ms per token,     3.20 tokens per second)
llama_perf_context_print:       total time =   16761.09 ms /    58 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  40%|████      | 4/10 [01:07<01:40, 16.77s/it]Llama.generate: 2 prefix-match hit, remaining 13 prompt tokens to eval



Prompt 4:
TTFT: 1412.70 ms
TPOT: 307.04 ms/token
E2E Measured: 16764.62 ms
E2E Formula:  16764.52 ms


llama_perf_context_print:        load time =    1567.26 ms
llama_perf_context_print: prompt eval time =    1704.77 ms /    13 tokens (  131.14 ms per token,     7.63 tokens per second)
llama_perf_context_print:        eval time =   15234.59 ms /    49 runs   (  310.91 ms per token,     3.22 tokens per second)
llama_perf_context_print:       total time =   16990.81 ms /    62 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  50%|█████     | 5/10 [01:24<01:24, 16.85s/it]Llama.generate: 1 prefix-match hit, remaining 14 prompt tokens to eval



Prompt 5:
TTFT: 1706.18 ms
TPOT: 305.75 ms/token
E2E Measured: 16993.76 ms
E2E Formula:  16993.66 ms


llama_perf_context_print:        load time =    1567.26 ms
llama_perf_context_print: prompt eval time =    1752.79 ms /    14 tokens (  125.20 ms per token,     7.99 tokens per second)
llama_perf_context_print:        eval time =   15239.13 ms /    49 runs   (  311.00 ms per token,     3.22 tokens per second)
llama_perf_context_print:       total time =   17043.44 ms /    63 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  60%|██████    | 6/10 [01:41<01:07, 16.92s/it]Llama.generate: 1 prefix-match hit, remaining 13 prompt tokens to eval



Prompt 6:
TTFT: 1754.18 ms
TPOT: 305.84 ms/token
E2E Measured: 17046.40 ms
E2E Formula:  17046.29 ms


llama_perf_context_print:        load time =    1567.26 ms
llama_perf_context_print: prompt eval time =    1715.67 ms /    13 tokens (  131.97 ms per token,     7.58 tokens per second)
llama_perf_context_print:        eval time =   15224.07 ms /    49 runs   (  310.70 ms per token,     3.22 tokens per second)
llama_perf_context_print:       total time =   16991.09 ms /    62 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  70%|███████   | 7/10 [01:58<00:50, 16.94s/it]Llama.generate: 1 prefix-match hit, remaining 8 prompt tokens to eval



Prompt 7:
TTFT: 1717.12 ms
TPOT: 305.53 ms/token
E2E Measured: 16993.59 ms
E2E Formula:  16993.49 ms


llama_perf_context_print:        load time =    1567.26 ms
llama_perf_context_print: prompt eval time =    1280.00 ms /     8 tokens (  160.00 ms per token,     6.25 tokens per second)
llama_perf_context_print:        eval time =   15237.88 ms /    49 runs   (  310.98 ms per token,     3.22 tokens per second)
llama_perf_context_print:       total time =   16569.24 ms /    57 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  80%|████████  | 8/10 [02:14<00:33, 16.83s/it]Llama.generate: 2 prefix-match hit, remaining 10 prompt tokens to eval



Prompt 8:
TTFT: 1281.39 ms
TPOT: 305.81 ms/token
E2E Measured: 16571.78 ms
E2E Formula:  16571.68 ms


llama_perf_context_print:        load time =    1567.26 ms
llama_perf_context_print: prompt eval time =    1460.13 ms /    10 tokens (  146.01 ms per token,     6.85 tokens per second)
llama_perf_context_print:        eval time =   15236.18 ms /    49 runs   (  310.94 ms per token,     3.22 tokens per second)
llama_perf_context_print:       total time =   16747.63 ms /    59 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  90%|█████████ | 9/10 [02:31<00:16, 16.80s/it]Llama.generate: 2 prefix-match hit, remaining 11 prompt tokens to eval



Prompt 9:
TTFT: 1461.55 ms
TPOT: 305.78 ms/token
E2E Measured: 16750.45 ms
E2E Formula:  16750.34 ms


llama_perf_context_print:        load time =    1567.26 ms
llama_perf_context_print: prompt eval time =    1515.58 ms /    11 tokens (  137.78 ms per token,     7.26 tokens per second)
llama_perf_context_print:        eval time =   15237.10 ms /    49 runs   (  310.96 ms per token,     3.22 tokens per second)
llama_perf_context_print:       total time =   16804.10 ms /    60 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts: 100%|██████████| 10/10 [02:48<00:00, 16.83s/it]


Prompt 10:
TTFT: 1516.97 ms
TPOT: 305.79 ms/token
E2E Measured: 16806.60 ms
E2E Formula:  16806.49 ms

--- Overall Token Latency Stats ---
Average TTFT: 1534.92 ms
Average TPOT: 305.88 ms
P50: 311.65 ms, P95: 314.01 ms, P99: 315.34 ms
{'avg_ttft_ms': 1534.9178075790405, 'avg_tpot_ms': 305.88307428359985, 'p50_ms': 311.651349067688, 'p95_ms': 314.00651931762695, 'p99_ms': 315.3407287597656, 'load_time_s': 11.904460668563843}





In [25]:
results = evaluate_llama_latency(repo_id="sharshar20/llama3.2_3B_instruct-GGUF-v6",filename="fp16-ft-lora.gguf", test_dataset=test_dataset)
print(results)

fp16-ft-lora.gguf:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

llama_model_loader: loaded meta data with 28 key-value pairs and 255 tensors from latency/models--sharshar20--llama3.2_3B_instruct-GGUF-v6/snapshots/6404c56ca2726fd54b86b260c71365a86b1bbd95/fp16-ft-lora.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Merged_Model_Instruct_Lora
llama_model_loader: - kv   3:                         general.size_label str              = 3.2B
llama_model_loader: - kv   4:                          llama.block_count u32              = 28
llama_model_loader: - kv   5:                       llama.context_length u32              = 131072
llama_model_loader: - kv   6:                     llama.embed


Prompt 1:
TTFT: 1674.35 ms
TPOT: 494.41 ms/token
E2E Measured: 26395.14 ms
E2E Formula:  26395.04 ms


llama_perf_context_print:        load time =    1673.15 ms
llama_perf_context_print: prompt eval time =    1541.42 ms /    10 tokens (  154.14 ms per token,     6.49 tokens per second)
llama_perf_context_print:        eval time =   24662.82 ms /    49 runs   (  503.32 ms per token,     1.99 tokens per second)
llama_perf_context_print:       total time =   26255.83 ms /    59 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  20%|██        | 2/10 [00:52<03:30, 26.32s/it]Llama.generate: 2 prefix-match hit, remaining 10 prompt tokens to eval



Prompt 2:
TTFT: 1542.81 ms
TPOT: 494.32 ms/token
E2E Measured: 26258.69 ms
E2E Formula:  26258.59 ms


llama_perf_context_print:        load time =    1673.15 ms
llama_perf_context_print: prompt eval time =    1543.74 ms /    10 tokens (  154.37 ms per token,     6.48 tokens per second)
llama_perf_context_print:        eval time =   24656.61 ms /    49 runs   (  503.20 ms per token,     1.99 tokens per second)
llama_perf_context_print:       total time =   26251.92 ms /    59 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  30%|███       | 3/10 [01:18<03:04, 26.29s/it]Llama.generate: 2 prefix-match hit, remaining 9 prompt tokens to eval



Prompt 3:
TTFT: 1545.12 ms
TPOT: 494.18 ms/token
E2E Measured: 26254.47 ms
E2E Formula:  26254.37 ms


llama_perf_context_print:        load time =    1673.15 ms
llama_perf_context_print: prompt eval time =    1472.87 ms /     9 tokens (  163.65 ms per token,     6.11 tokens per second)
llama_perf_context_print:        eval time =   24659.41 ms /    49 runs   (  503.25 ms per token,     1.99 tokens per second)
llama_perf_context_print:       total time =   26183.80 ms /    58 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  40%|████      | 4/10 [01:45<02:37, 26.25s/it]Llama.generate: 2 prefix-match hit, remaining 13 prompt tokens to eval



Prompt 4:
TTFT: 1474.30 ms
TPOT: 494.26 ms/token
E2E Measured: 26187.32 ms
E2E Formula:  26187.22 ms


llama_perf_context_print:        load time =    1673.15 ms
llama_perf_context_print: prompt eval time =    1744.95 ms /    13 tokens (  134.23 ms per token,     7.45 tokens per second)
llama_perf_context_print:        eval time =   24706.07 ms /    49 runs   (  504.21 ms per token,     1.98 tokens per second)
llama_perf_context_print:       total time =   26502.93 ms /    62 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  50%|█████     | 5/10 [02:11<02:11, 26.34s/it]Llama.generate: 1 prefix-match hit, remaining 14 prompt tokens to eval



Prompt 5:
TTFT: 1746.35 ms
TPOT: 495.18 ms/token
E2E Measured: 26505.39 ms
E2E Formula:  26505.30 ms


llama_perf_context_print:        load time =    1673.15 ms
llama_perf_context_print: prompt eval time =    1817.30 ms /    14 tokens (  129.81 ms per token,     7.70 tokens per second)
llama_perf_context_print:        eval time =   24651.58 ms /    49 runs   (  503.09 ms per token,     1.99 tokens per second)
llama_perf_context_print:       total time =   26520.66 ms /    63 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  60%|██████    | 6/10 [02:38<01:45, 26.40s/it]Llama.generate: 1 prefix-match hit, remaining 13 prompt tokens to eval



Prompt 6:
TTFT: 1818.71 ms
TPOT: 494.10 ms/token
E2E Measured: 26523.56 ms
E2E Formula:  26523.47 ms


llama_perf_context_print:        load time =    1673.15 ms
llama_perf_context_print: prompt eval time =    1769.42 ms /    13 tokens (  136.11 ms per token,     7.35 tokens per second)
llama_perf_context_print:        eval time =   24652.01 ms /    49 runs   (  503.10 ms per token,     1.99 tokens per second)
llama_perf_context_print:       total time =   26473.05 ms /    62 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  70%|███████   | 7/10 [03:04<01:19, 26.43s/it]Llama.generate: 1 prefix-match hit, remaining 8 prompt tokens to eval



Prompt 7:
TTFT: 1770.84 ms
TPOT: 494.09 ms/token
E2E Measured: 26475.57 ms
E2E Formula:  26475.47 ms


llama_perf_context_print:        load time =    1673.15 ms
llama_perf_context_print: prompt eval time =    1422.37 ms /     8 tokens (  177.80 ms per token,     5.62 tokens per second)
llama_perf_context_print:        eval time =   24664.41 ms /    49 runs   (  503.36 ms per token,     1.99 tokens per second)
llama_perf_context_print:       total time =   26138.27 ms /    57 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  80%|████████  | 8/10 [03:30<00:52, 26.34s/it]Llama.generate: 2 prefix-match hit, remaining 10 prompt tokens to eval



Prompt 8:
TTFT: 1423.76 ms
TPOT: 494.34 ms/token
E2E Measured: 26140.75 ms
E2E Formula:  26140.66 ms


llama_perf_context_print:        load time =    1673.15 ms
llama_perf_context_print: prompt eval time =    1544.18 ms /    10 tokens (  154.42 ms per token,     6.48 tokens per second)
llama_perf_context_print:        eval time =   24658.72 ms /    49 runs   (  503.24 ms per token,     1.99 tokens per second)
llama_perf_context_print:       total time =   26254.40 ms /    59 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts:  90%|█████████ | 9/10 [03:57<00:26, 26.31s/it]Llama.generate: 2 prefix-match hit, remaining 11 prompt tokens to eval



Prompt 9:
TTFT: 1545.57 ms
TPOT: 494.23 ms/token
E2E Measured: 26257.18 ms
E2E Formula:  26257.09 ms


llama_perf_context_print:        load time =    1673.15 ms
llama_perf_context_print: prompt eval time =    1595.25 ms /    11 tokens (  145.02 ms per token,     6.90 tokens per second)
llama_perf_context_print:        eval time =   24707.59 ms /    49 runs   (  504.24 ms per token,     1.98 tokens per second)
llama_perf_context_print:       total time =   26354.64 ms /    60 tokens
llama_perf_context_print:    graphs reused =         47
Evaluating prompts: 100%|██████████| 10/10 [04:23<00:00, 26.34s/it]



Prompt 10:
TTFT: 1596.66 ms
TPOT: 495.21 ms/token
E2E Measured: 26357.14 ms
E2E Formula:  26357.05 ms

--- Overall Token Latency Stats ---
Average TTFT: 1613.85 ms
Average TPOT: 494.43 ms
P50: 503.80 ms, P95: 506.05 ms, P99: 507.77 ms
{'avg_ttft_ms': 1613.8480424880981, 'avg_tpot_ms': 494.43152046203613, 'p50_ms': 503.7965774536133, 'p95_ms': 506.05132579803467, 'p99_ms': 507.76520252227783, 'load_time_s': 16.707031726837158}


In [47]:
from llama_cpp import Llama
import json
import time
import numpy as np

# Load your quantized GGUF model
llm = Llama(
    model_path="quantized_models/ft-q4_k_m.gguf", # LoRA INT 4 model
    n_ctx=2048, # context window in tokens (prompt + generated)
    n_threads=8,  # Adjust for CPU threads if CPU inference
    n_gpu_layers=-1,  # -1 = all layers on GPU
    use_mlock=True #asks OS to lock model in RAM to reduce paging
)

# Warm-up
first_example = test_dataset[0]
warmup_prompt = first_example["input_text"]
_ = llm(warmup_prompt, max_tokens=5)
print("Warm-up done.\n")

# Latency measurement
num_runs = 50
ttft_list = [] # in ms
token_latency_list = [] # in ms

for run in range(num_runs):
    example = test_dataset[run % len(test_dataset)]  # Loop through val set
    full_prompt = example["input_text"]

    start_time = time.perf_counter()
    first_token_time = None
    prev_time = None
    token_times = []

    # Stream output tokens
    for idx, output in enumerate(
        llm(full_prompt, max_tokens=50, stream=True)
    ):
        now = time.perf_counter()

        if idx == 0:
            # Time to first token
            first_token_time = now - start_time
            prev_time = now
        else:
            # Time per subsequent token
            token_times.append(now - prev_time)
            prev_time = now

    ttft_list.append(first_token_time * 1000)  # ms
    token_latency_list.extend([t * 1000 for t in token_times])

# Results
avg_ttft = np.mean(ttft_list)
avg_token_latency = np.mean(token_latency_list)
p50_token, p95_token, p99_token = np.percentile(token_latency_list, [50, 95, 99])

print(f"Average Time to First Token: {avg_ttft:.2f} ms")
print(f"Average Time per Output Token: {avg_token_latency:.2f} ms")
print(f"Token Latency - P50: {p50_token:.2f} ms, P95: {p95_token:.2f} ms, P99: {p99_token:.2f} ms")

llama_model_loader: loaded meta data with 28 key-value pairs and 255 tensors from quantized_models/ft-q4_k_m.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Merged_Model_Instruct_Lora
llama_model_loader: - kv   3:                         general.size_label str              = 3.2B
llama_model_loader: - kv   4:                          llama.block_count u32              = 28
llama_model_loader: - kv   5:                       llama.context_length u32              = 131072
llama_model_loader: - kv   6:                     llama.embedding_length u32              = 3072
llama_model_loader: - kv   7:                  llama.feed_

Warm-up done.



llama_perf_context_print:        load time =    1517.04 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =   10982.60 ms /    50 runs   (  219.65 ms per token,     4.55 tokens per second)
llama_perf_context_print:       total time =   11034.67 ms /    51 tokens
llama_perf_context_print:    graphs reused =         49
Llama.generate: 7 prefix-match hit, remaining 6 prompt tokens to eval
llama_perf_context_print:        load time =    1517.04 ms
llama_perf_context_print: prompt eval time =    1153.14 ms /     6 tokens (  192.19 ms per token,     5.20 tokens per second)
llama_perf_context_print:        eval time =   10742.33 ms /    49 runs   (  219.23 ms per token,     4.56 tokens per second)
llama_perf_context_print:       total time =   11947.35 ms /    55 tokens
llama_perf_context_print:    graphs reused =         47
Llama.generate: 7 prefix-match hit, remaining 10

Average Time to First Token: 1182.33 ms
Average Time per Output Token: 215.95 ms
Token Latency - P50: 219.76 ms, P95: 223.84 ms, P99: 232.21 ms


In [44]:
from llama_cpp import Llama
import json
import time
import numpy as np

# Load your quantized GGUF model
llm = Llama(
    model_path="quantized_models/ft-qlora-q4_k_m.gguf", # QLoRA INT 4 model
    n_ctx=2048, # context window in tokens (prompt + generated)
    n_threads=8,  # Adjust for CPU threads if CPU inference
    n_gpu_layers=-1,  # -1 = all layers on GPU
    use_mlock=True #asks OS to lock model in RAM to reduce paging
)

# Warm-up
first_example = test_dataset[0]
warmup_prompt = first_example["input_text"]
_ = llm(warmup_prompt, max_tokens=5)
print("Warm-up done.\n")

# Latency measurement
num_runs = 50
ttft_list = [] # in ms
token_latency_list = [] # in ms

for run in range(num_runs):
    example = test_dataset[run % len(test_dataset)]  # Loop through val set
    full_prompt = example["input_text"]

    start_time = time.perf_counter()
    first_token_time = None
    prev_time = None
    token_times = []

    for idx, output in enumerate(
        llm(full_prompt, max_tokens=50, stream=True)
    ):
        now = time.perf_counter()

        if idx == 0:
            # Time to first token
            first_token_time = now - start_time
            prev_time = now
        else:
            # Time per subsequent token
            token_times.append(now - prev_time)
            prev_time = now

    ttft_list.append(first_token_time * 1000)  # ms
    token_latency_list.extend([t * 1000 for t in token_times])

# Results
avg_ttft = np.mean(ttft_list)
avg_token_latency = np.mean(token_latency_list)
p50_token, p95_token, p99_token = np.percentile(token_latency_list, [50, 95, 99])

print(f"Average Time to First Token: {avg_ttft:.2f} ms")
print(f"Average Time per Output Token: {avg_token_latency:.2f} ms")
print(f"Token Latency - P50: {p50_token:.2f} ms, P95: {p95_token:.2f} ms, P99: {p99_token:.2f} ms")

llama_model_loader: loaded meta data with 28 key-value pairs and 255 tensors from quantized_models/ft-qlora-q4_k_m.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Merged_Model_Instruct_Qlora
llama_model_loader: - kv   3:                         general.size_label str              = 3.2B
llama_model_loader: - kv   4:                          llama.block_count u32              = 28
llama_model_loader: - kv   5:                       llama.context_length u32              = 131072
llama_model_loader: - kv   6:                     llama.embedding_length u32              = 3072
llama_model_loader: - kv   7:                  llam

Warm-up done.



llama_perf_context_print:        load time =    1519.92 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =   10970.69 ms /    50 runs   (  219.41 ms per token,     4.56 tokens per second)
llama_perf_context_print:       total time =   11022.68 ms /    51 tokens
llama_perf_context_print:    graphs reused =         49
Llama.generate: 7 prefix-match hit, remaining 6 prompt tokens to eval
llama_perf_context_print:        load time =    1519.92 ms
llama_perf_context_print: prompt eval time =    1152.94 ms /     6 tokens (  192.16 ms per token,     5.20 tokens per second)
llama_perf_context_print:        eval time =   10729.00 ms /    49 runs   (  218.96 ms per token,     4.57 tokens per second)
llama_perf_context_print:       total time =   11933.40 ms /    55 tokens
llama_perf_context_print:    graphs reused =         47
Llama.generate: 7 prefix-match hit, remaining 10

Average Time to First Token: 1185.94 ms
Average Time per Output Token: 215.91 ms
Token Latency - P50: 219.84 ms, P95: 223.32 ms, P99: 226.33 ms


#### Memory Bandwidth: How fast CPU/GPU can read/write from/to RAM

In [None]:
import platform
import multiprocessing

print("CPU:", platform.processor())
print("Cores:", multiprocessing.cpu_count())

# This tells you your CPU is 64-bit and follows the x86 instruction set (common for Intel and AMD CPUs).
# x86_64 means it can handle 64-bit operations, which allows addressing more memory and generally better performance for modern applications.
# Your CPU has 2 physical cores. Each core can execute instructions independently, so more cores = better parallel performance. With 2 cores, your CPU can do at most 2 independent tasks simultaneously (without hyperthreading).

CPU: x86_64
Cores: 2


In [None]:
# check CPU info
!lscpu

Architecture:             x86_64
  CPU op-mode(s):         32-bit, 64-bit
  Address sizes:          46 bits physical, 48 bits virtual
  Byte Order:             Little Endian
CPU(s):                   2
  On-line CPU(s) list:    0,1
Vendor ID:                GenuineIntel
  Model name:             Intel(R) Xeon(R) CPU @ 2.00GHz
    CPU family:           6
    Model:                85
    Thread(s) per core:   2
    Core(s) per socket:   1
    Socket(s):            1
    Stepping:             3
    BogoMIPS:             4000.36
    Flags:                fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge m
                          ca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht sysc
                          all nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xt
                          opology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq
                           ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt
                           aes xsave avx f16c rdrand hypervisor 

In [None]:
# check RAM
import psutil
ram = psutil.virtual_memory()
print(f"Total RAM: {ram.total / 1e9:.2f} GB")
print(f"Available RAM: {ram.available / 1e9:.2f} GB")

Total RAM: 13.61 GB
Available RAM: 7.60 GB


In [None]:
import numpy as np
import time

size = 100_000_000  # ~100 million floats ~0.8GB
a = np.random.rand(size).astype(np.float32)
b = np.zeros_like(a)

start = time.time()
b[:] = a[:]  # copy operation
end = time.time()

bandwidth = a.nbytes / (end - start) / 1e9  # GB/s
print(f"Estimated memory bandwidth: {bandwidth:.2f} GB/s")

Estimated memory bandwidth: 3.16 GB/s
