In [1]:
import os

# Pin every numeric backend to a single thread. The variables are written to
# the environment before torch is imported, so they are already in place by
# the time the import below runs.
os.environ.update(
    dict.fromkeys(
        (
            "OMP_NUM_THREADS",
            "MKL_NUM_THREADS",
            "OPENBLAS_NUM_THREADS",
            "TORCH_NUM_THREADS",
        ),
        "1",
    )
)

import torch

# Also cap torch's own thread count through its API.
torch.set_num_threads(1)


In [24]:
# Install necessary dependencies
# Uses the %pip magic so packages are installed into the environment of the
# running kernel.
# NOTE(review): no versions are pinned and --upgrade is passed, so every run may
# resolve different releases; consider pinning versions for reproducible results.
# pip's output below asks for a kernel restart before the updated packages are used.
%pip install --quiet --upgrade \
    langchain langchain-community \
    langchain-faiss jq \
    sentence-transformers transformers \
    evaluate bert-score rouge-score nltk absl-py \
    memory-profiler \
    tiktoken \
    faiss-cpu

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:

# Import required modules
from langchain.chat_models import init_chat_model
from langchain_community.document_loaders import JSONLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain.embeddings.base import Embeddings
from langchain_community.vectorstores import FAISS
from typing_extensions import List, TypedDict
from sentence_transformers import SentenceTransformer
from pathlib import Path
import torch
import json
import time
import numpy as np
import pandas as pd
import evaluate
from bert_score import score
from sklearn.metrics.pairwise import cosine_similarity
from memory_profiler import memory_usage
from transformers import logging
import tiktoken
logging.set_verbosity_error()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load and parse your course data
file_path = './data/course_metadata.json'
# Decode explicitly as UTF-8: without an encoding argument read_text() falls
# back to the platform's locale encoding, which can mis-decode or reject
# non-ASCII characters in the JSON file.
data = json.loads(Path(file_path).read_text(encoding="utf-8"))

# The jq expression '.[]' selects each element of the top-level JSON array.
# text_content=False is needed because the selected elements are JSON objects,
# not plain strings (the preview printed below shows serialised JSON).
loader = JSONLoader(
    file_path=file_path,
    jq_schema='.[]',
    text_content=False
)

docs = loader.load()
# Fail with a clear message instead of an IndexError on the preview below.
if not docs:
    raise ValueError(f"No documents were loaded from {file_path}")

# Preview the first record to confirm the loader produced what we expect.
print(docs[0].page_content[:100])
print(docs[0].metadata)

{"credential_area": "Artificial intelligence", "title": "Build Your First Chatbot Using IBM watsonx\
{'source': 'C:\\Project\\data\\course_metadata.json', 'seq_num': 1}


In [4]:
# Break every loaded course document into overlapping chunks for indexing.
# add_start_index=True asks the splitter to record where each chunk starts
# in its parent document.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100, add_start_index=True)
all_splits = text_splitter.split_documents(docs)
print(f"Split course data into {len(all_splits)} sub-documents.")

Split course data into 48 sub-documents.


In [5]:
# Define embedding class using gte-Qwen2-1.5B-instruct (CPU-only)
class QwenEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter around a SentenceTransformer model, pinned to CPU.

    Documents are encoded as-is. Queries are encoded with the model's "query"
    prompt when the loaded model defines one, which is how instruction-tuned
    retrieval models such as gte-Qwen2 expect search queries to be embedded.
    Models that define no "query" prompt are encoded without one.

    Args:
        model_name: Hugging Face model id passed to ``SentenceTransformer``.
    """

    def __init__(self, model_name: str = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"):
        self.device = "cpu"
        # Load directly onto the target device instead of loading on the
        # library's default device and moving the weights afterwards.
        # NOTE: trust_remote_code=True lets the model repository supply Python
        # code that runs locally; only use model names you trust.
        self.model = SentenceTransformer(model_name, trust_remote_code=True, device=self.device)
        # Only request the "query" prompt when this model actually defines it,
        # so the class keeps working with models that ship no prompts.
        available_prompts = getattr(self.model, "prompts", None) or {}
        self._query_encode_kwargs = {"prompt_name": "query"} if "query" in available_prompts else {}

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a batch of document texts; returns one vector (list of floats) per text."""
        return self.model.encode(texts, convert_to_numpy=True, device=self.device).tolist()

    def embed_query(self, text: str) -> List[float]:
        """Embed a single search query; returns one vector as a list of floats."""
        return self.model.encode(
            text,
            convert_to_numpy=True,
            device=self.device,
            **self._query_encode_kwargs,
        ).tolist()


In [14]:
# Build the embedding model, then index every chunk in an in-memory FAISS store.
embeddings = QwenEmbeddings("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
vector_db = FAISS.from_documents(documents=all_splits, embedding=embeddings)

# (Optional) Persist the index to disk.
vector_db.save_local(folder_path="./faiss_index")

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 12.49it/s]


In [6]:
# Load RAG prompt
# `prompt` is later invoked by generate() with the keys "question" and "context".
# NOTE(review): hub.pull presumably fetches "rlm/rag-prompt" over the network on
# every run — confirm, and consider caching it for offline re-runs.
from langchain import hub
prompt = hub.pull("rlm/rag-prompt")



In [7]:
# Define application state and logic
class State(TypedDict):
    """State dictionary passed between the retrieve and generate steps.

    Note: generate() also returns "ttft" and "tps" entries, which
    run_rag_pipeline() merges into the same dictionary even though they are
    not declared here.
    """
    question: str  # the user question being answered
    context: List[Document]  # documents returned by retrieve()
    answer: str  # text produced by generate()
    
# Retrieve top similar documents from Vector Store
def retrieve(state: State, k: int = 5):
    """Look up the documents most similar to the question in the vector store.

    Args:
        state: Pipeline state; only ``state["question"]`` is read.
        k: Number of documents to request from the vector store. Defaults to 5,
            the value that used to be hard-coded here.

    Returns:
        A dict ``{"context": [...]}`` holding the retrieved documents, ready to
        be merged into the pipeline state.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")
    # Reads the module-level `vector_db` built in an earlier cell.
    retrieved_docs = vector_db.similarity_search(state["question"], k=k)
    return {"context": retrieved_docs}

# Generate response using the LLM and measure time to first token
def generate(state: State, model_name: str):
    """Stream an answer from the current LLM and record latency and throughput.

    The retrieved documents in ``state["context"]`` are joined with blank lines
    and passed, together with ``state["question"]``, to the module-level
    ``prompt``. The resulting messages are streamed through the module-level
    ``llm``, which the caller must have set beforehand.

    Args:
        state: Pipeline state; ``"question"`` and ``"context"`` are read.
        model_name: Name forwarded to ``count_tokens`` to choose a tokenizer.

    Returns:
        A dict with:
          - ``"answer"``: the concatenated streamed text,
          - ``"ttft"``: seconds from the start of streaming until the first
            chunk arrived, or ``None`` if the stream produced no chunks,
          - ``"tps"``: tokens in the answer (as counted by ``count_tokens``)
            divided by the total streaming time, or 0 if no time elapsed.
    """
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
    messages = prompt.invoke({"question": state["question"], "context": docs_content})

    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; the wall clock can jump if the system time is adjusted.
    start_time = time.perf_counter()
    pieces = []
    ttft = None

    for chunk in llm.stream(messages):
        # Compare against None explicitly so a measured value of 0.0 is kept
        # rather than being overwritten by a later chunk.
        if ttft is None:
            ttft = time.perf_counter() - start_time  # time to first token
        pieces.append(chunk.content)  # Streamed content pieces

    elapsed = time.perf_counter() - start_time  # Total time taken
    answer = "".join(pieces)

    # Streamed chunks are not guaranteed to be one token each, so the answer is
    # re-tokenized to get the count used for throughput.
    total_tokens = count_tokens(answer, model_name=model_name)
    tokens_per_second = total_tokens / elapsed if elapsed > 0 else 0

    return {"answer": answer, "ttft": ttft, "tps": tokens_per_second}

In [8]:
# RAG QA evaluation: shared metric backends used by the scoring helpers.

# Lexical metrics, loaded in the order bleu, rouge, squad.
# The squad metric returns token-level F1 and exact match.
bleu, rouge, squad = (
    evaluate.load(metric_id) for metric_id in ("bleu", "rouge", "squad")
)

# Sentence encoder behind the embedding-based (semantic) metrics.
semantic_model = SentenceTransformer("all-MiniLM-L6-v2", trust_remote_code=True)

In [None]:
# ✅ Import Evaluation Dataset
# The evaluation loop reads the keys "question", "answer" and "type" from each item.
# The file is opened explicitly as UTF-8: without an encoding argument, open()
# uses the platform's locale encoding, which can mis-decode non-ASCII text.
with open("./data/eval_dataset.json", "r", encoding="utf-8") as f:
    eval_dataset = json.load(f)

# eval_dataset = eval_dataset[:3]  # Limit to 3 for testing


In [None]:
# ✅ Models to Test
# Maps a short label (written to the "Model" column of the results) to the model
# id handed to init_chat_model(..., model_provider="ollama") in the evaluation loop.
# NOTE(review): the label, not the model id, is what reaches count_tokens();
# labels such as "BF16" are presumably unknown to tiktoken, so token counts
# would come from the cl100k_base fallback — confirm.
models_to_test = {
    # Small-sized LLMs
    # "phi4-mini:latest": "phi4-mini:latest",
    # "gemma3:1b": "gemma3:1b",
    # "gemma3:4b": "gemma3:4b",
    # "llama3.2:1b": "llama3.2:1b",
    # "llama3.2:latest": "llama3.2:latest",
    
    # Quantised models
    "BF16": "hf.co/unsloth/gemma-3-4b-it-GGUF:BF16",
    "Q4_0": "hf.co/unsloth/gemma-3-4b-it-GGUF:Q4_0",
    "Q8_0": "hf.co/unsloth/gemma-3-4b-it-GGUF:Q8_0",
    
}

In [11]:

# ✅ Metric Functions
def compute_bert_score(pred, ref):
    """Return the BERTScore F1 of one prediction against one reference.

    Both strings are wrapped in single-element lists for ``score``; only the
    F1 entry of the (precision, recall, F1) result is used, converted to a
    Python number.
    """
    _precision, _recall, f1_values = score([pred], [ref], lang="en", verbose=False)
    first_f1 = f1_values[0]
    return first_f1.item()

def compute_bleu(pred, ref):
    """Return the BLEU score of one prediction against one reference.

    Args:
        pred: Generated answer text.
        ref: Expected answer text.

    Returns:
        The ``"bleu"`` value reported by the loaded BLEU metric, or 0.0 when
        either text is empty or whitespace-only.
    """
    # An empty prediction or reference has no n-grams to match, so score it 0.0
    # up front instead of relying on the metric backend to cope with zero-length
    # input. NOTE(review): the backend is believed to raise ZeroDivisionError in
    # that case — confirm.
    if not pred.strip() or not ref.strip():
        return 0.0
    return bleu.compute(predictions=[pred], references=[ref])["bleu"]

def compute_rouge(pred, ref):
    """Return the ROUGE-L value of one prediction against one reference."""
    rouge_scores = rouge.compute(predictions=[pred], references=[ref])
    return rouge_scores["rougeL"]

def compute_semantic_sim(pred, ref):
    """Return the cosine similarity between the embeddings of ``pred`` and ``ref``.

    Each text is encoded on its own with the shared ``semantic_model``; the
    single entry of the resulting 1x1 similarity matrix is returned.
    """
    pred_vec = semantic_model.encode([pred])
    ref_vec = semantic_model.encode([ref])
    similarity_matrix = cosine_similarity(pred_vec, ref_vec)
    return similarity_matrix[0][0]

def compute_faithfulness(answer: str, context_docs: List[Document]) -> float:
    """Score how close the answer is to the retrieved context in embedding space.

    The page contents of all context documents are joined with single spaces
    into one string. That string and the answer are each encoded with the
    shared ``semantic_model``, and their cosine similarity is returned.
    """
    joined_context = " ".join([doc.page_content for doc in context_docs])
    context_vec = semantic_model.encode([joined_context], convert_to_numpy=True)
    answer_vec = semantic_model.encode([answer], convert_to_numpy=True)
    similarity_matrix = cosine_similarity(answer_vec, context_vec)
    return similarity_matrix[0][0]

def compute_recall_at_5(context_docs: List[Document], ref: str, k: int = 5) -> float:
    """Report whether the reference answer appears verbatim in the top-k documents.

    The match is a case-insensitive substring test of the whitespace-stripped
    reference against each document's ``page_content``.

    Args:
        context_docs: Retrieved documents, in ranked order.
        ref: Expected answer text to look for.
        k: How many leading documents to inspect. Defaults to 5.

    Returns:
        1.0 if any of the first ``k`` documents contains the reference, else
        0.0. An empty or whitespace-only reference always yields 0.0.
    """
    needle = ref.strip().lower()
    # The empty string is a substring of every string, so without this guard a
    # blank reference would count as a hit on any non-empty retrieval.
    if not needle:
        return 0.0
    match_found = any(needle in doc.page_content.lower() for doc in context_docs[:k])
    return 1.0 if match_found else 0.0

def count_tokens(text: str, model_name: str):
    """Return the number of tiktoken tokens in ``text``.

    The encoding registered for ``model_name`` is used when tiktoken knows the
    model; when the lookup raises ``KeyError`` the ``cl100k_base`` encoding is
    used instead.
    """
    try:
        encoder = tiktoken.encoding_for_model(model_name)
    except KeyError:
        encoder = tiktoken.get_encoding("cl100k_base")
    token_ids = encoder.encode(text)
    return len(token_ids)

In [21]:
def run_rag_pipeline(q: str, model_name: str) -> dict:
    """Run retrieval and then generation for one question.

    Starts from a state holding only the question, merges in the output of
    ``retrieve`` and then the output of ``generate`` (which sees the retrieved
    context), and returns the resulting state dictionary.
    """
    pipeline_state: State = dict(question=q, context=[], answer="")

    retrieval_output = retrieve(pipeline_state)
    pipeline_state.update(retrieval_output)

    generation_output = generate(pipeline_state, model_name)
    pipeline_state.update(generation_output)

    return pipeline_state

In [None]:
import pandas as pd
import os
import time

# Number of full passes over every model and question; each pass is saved to
# its own CSV and kept in `run_dfs` for later averaging.
NUM_RUNS = 3
RESULTS_DIR = "./results/quantised"
os.makedirs(RESULTS_DIR, exist_ok=True)

# Marker for measurements that could not be taken. NaN is used instead of a
# numeric sentinel such as -1 or 0 because pandas' mean() skips NaN, so failed
# items no longer distort averaged timing and memory figures.
MISSING = float("nan")

run_dfs = []

for i in range(1, NUM_RUNS + 1):
    print(f"\n🔁 Starting RAG evaluation run {i}/{NUM_RUNS}...\n")

    # Run the main evaluation logic
    results = []

    for model_name, model_id in models_to_test.items():
        print(f"Evaluating model: {model_name}")
        # `llm` is assigned at notebook scope on purpose: generate() reads it
        # as a global when it streams the answer.
        llm = init_chat_model(model_id, model_provider="ollama", stream=True)

        # `counter` is the 1-based position of the item; it is used as the
        # SQuAD example id and in error messages.
        for counter, item in enumerate(eval_dataset, start=1):
            # Named `question_type` rather than `type` so the builtin is not
            # shadowed for the rest of the notebook.
            q, ref, question_type = item["question"], item["answer"], item["type"]
            try:
                start = time.perf_counter()
                # memory_usage runs the pipeline while sampling memory every
                # 0.01 s; retval=True makes it return (samples, pipeline result).
                mem_usage, state = memory_usage((run_rag_pipeline, (q, model_name), {}), retval=True, interval=0.01)
                total_time = time.perf_counter() - start
                ttft = state.get("ttft")
                tps = state.get("tps")
                peak_memory_mb = max(mem_usage)

                answer = state["answer"]
                retrieved_docs = state["context"]

                # Metrics
                bleu_score = compute_bleu(answer, ref)
                rouge_score = compute_rouge(answer, ref)
                prediction = {"id": str(counter), "prediction_text": answer}
                reference = {"id": str(counter), "answers": {"text": [ref], "answer_start": [0]}}
                qa_scores = squad.compute(predictions=[prediction], references=[reference])
                f1_score = qa_scores["f1"]
                bert_score_val = compute_bert_score(answer, ref)
                semantic_sim = compute_semantic_sim(answer, ref)
                faithfulness_score = compute_faithfulness(answer, retrieved_docs)
                recall_score = compute_recall_at_5(retrieved_docs, ref)

                results.append({
                    "Model": model_name,
                    "Question": q,
                    "Question Type": question_type,
                    "Expected Answer": ref,
                    "Generated Answer": answer,
                    "BLEU": bleu_score,
                    "ROUGE-L": rouge_score,
                    "F1": f1_score,
                    "BERTScore (F1)": bert_score_val,
                    "Semantic Sim": semantic_sim,
                    "Recall@5": recall_score,
                    "Faithfulness": faithfulness_score,
                    "Answer Length": len(answer.split()),
                    # generate() reports ttft as None when nothing was streamed;
                    # record that as missing instead of failing in round().
                    "Time to First Token (s)": round(ttft, 3) if ttft is not None else MISSING,
                    "Tokens per Second": round(tps, 3) if tps is not None else MISSING,
                    "Total Time (s)": total_time,
                    "Peak Memory (MB)": round(peak_memory_mb, 2)
                })
            except Exception as e:
                # Best effort: keep the run going and record the failure. The
                # error text goes into the answer column, quality metrics are
                # scored 0, and measurements that were never taken are marked
                # as missing.
                print(f"⚠️ Error on item {counter}: {e}")
                results.append({
                    "Model": model_name,
                    "Question": q,
                    "Question Type": question_type,
                    "Expected Answer": ref,
                    "Generated Answer": str(e),
                    "BLEU": 0, "ROUGE-L": 0, "F1": 0, "BERTScore (F1)": 0,
                    "Semantic Sim": 0, "Recall@5": 0, "Faithfulness": 0,
                    "Answer Length": 0, "Time to First Token (s)": MISSING,
                    "Tokens per Second": MISSING, "Total Time (s)": MISSING, "Peak Memory (MB)": MISSING
                })

    df = pd.DataFrame(results)
    run_path = os.path.join(RESULTS_DIR, f"rag_evaluation_results_{i}.csv")
    df.to_csv(run_path, index=False)
    run_dfs.append(df)
    print(f"✅ Run {i} saved to {run_path}")

# ✅ Average metrics across runs
# Stack the per-run frames, then take the mean of every numeric column for each
# (model, question, question type, expected answer) combination.
merged_df = pd.concat(run_dfs)
avg_df = (
    merged_df
    .groupby(["Model", "Question", "Question Type", "Expected Answer"], as_index=False)
    .mean(numeric_only=True)
)
avg_df.to_csv(os.path.join(RESULTS_DIR, "rag_evaluation_results_avg.csv"), index=False)
print(f"\n✅ Averaged results saved to {RESULTS_DIR}/rag_evaluation_results_avg.csv")

# ✅ Block-by-type summary like rag_model_blocks_2.csv
# For each question type: one row per model with the mean of every metric,
# followed by a blank spacer row so the blocks are visually separated in the CSV.
metrics = [
    "BLEU",
    "ROUGE-L",
    "F1",
    "BERTScore (F1)",
    "Semantic Sim",
    "Recall@5",
    "Faithfulness",
    "Answer Length",
    "Time to First Token (s)",
    "Tokens per Second",
    "Total Time (s)",
    "Peak Memory (MB)",
]

blocks = []
for qtype, df_group in avg_df.groupby("Question Type"):
    block = df_group.groupby("Model")[metrics].mean()
    block.insert(0, "Question Type", qtype)
    spacer = pd.DataFrame([[""] * len(block.columns)], columns=block.columns)  # spacer
    blocks.extend([block, spacer])

final_block_df = pd.concat(blocks, axis=0)
final_block_path = os.path.join(RESULTS_DIR, "rag_model_blocks_avg.csv")
# The index is written because it carries the model name of each row.
final_block_df.to_csv(final_block_path, index=True)
print(f"📊 Block-wise summary saved to {final_block_path}")
