In [12]:

!pip install nltk rouge-score scikit-learn



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting absl-py (from rouge-score)
  Downloading absl_py-2.3.0-py3-none-any.whl.metadata (2.4 kB)
Downloading absl_py-2.3.0-py3-none-any.whl (135 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.7/135.7 kB[0m [31m949.3 kB/s[0m eta [36m0:00:00[0m0:01[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (pyproject.toml) ... [?25ldone
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24987 sha256=4958295f11716bb966e802e991d8d13ff94cb8deef95de1cba18c2a80461b193
  Stored in directory: /home/avinash/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: absl-py, rouge-sc

In [13]:
import nltk

# nltk.word_tokenize (used by the evaluation loop) needs the Punkt sentence
# models. Recent NLTK releases (3.9+) load them from the 'punkt_tab' package
# instead of the pickled 'punkt' package, so fetch both to work on either.
for nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to /home/avinash/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [14]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from sklearn.metrics.pairwise import cosine_similarity
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings

# Step 1: Load the Ramayana PDF (it is chunked in the next cell)
RAMAYANA_PDF_PATH = "data/RAMAYANA.pdf"
loader = PyMuPDFLoader(RAMAYANA_PDF_PATH)
docs = loader.load()

In [15]:
# Split the loaded documents into overlapping chunks.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = text_splitter.split_documents(docs)

# Step 2: Create FAISS VectorStore
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
embedding = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    model_kwargs={"device": "cpu"},  # run the embedding model on CPU
)
vectorstore = FAISS.from_documents(chunks, embedding)

# Retriever over the FAISS index, configured with k=3 in its search kwargs.
TOP_K = 3
retriever = vectorstore.as_retriever(search_kwargs={"k": TOP_K})



In [16]:
# Reference question/answer pairs that the QA chain's predictions are scored against.
ground_truth_data = [
    dict(question=question_text, answer=answer_text)
    for question_text, answer_text in (
        (
            "Who were the parents of Lord Rama?",
            "Lord Rama was born to King Dasharatha and Queen Kausalya.",
        ),
        (
            "What was Kaikeyi's wish to King Dasharatha?",
            "Kaikeyi asked Dasharatha to exile Rama and make Bharata the king.",
        ),
    )
]



In [17]:
# Step 4: Build QA Chain
# temperature=0 is passed to the chat model; the chain returns only the answer
# (source documents are not included in its output).
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0,
)
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=False,
)



In [18]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import nltk
import numpy as np

In [19]:
# Step 5: Scoring Functions
def compute_cosine_similarity(pred, truth):
    """Return the TF-IDF cosine similarity between a prediction and its reference.

    A fresh ``TfidfVectorizer`` is fitted on just the two strings, so the score
    reflects word overlap between them, not meaning.

    Args:
        pred: Predicted answer text (may be empty or ``None``).
        truth: Reference answer text (may be empty or ``None``).

    Returns:
        The cosine similarity as a Python ``float``. ``0.0`` is returned when
        either text is missing/blank or when the two texts contain no tokens
        the vectorizer can use.
    """
    # A blank side cannot match anything; short-circuit before the vectorizer,
    # which raises "empty vocabulary" when it finds no tokens at all.
    if not (pred and pred.strip()) or not (truth and truth.strip()):
        return 0.0

    try:
        vect = TfidfVectorizer().fit([pred, truth])
    except ValueError:
        # Both texts consisted only of tokens the vectorizer discards
        # (e.g. punctuation), leaving an empty vocabulary.
        return 0.0

    vecs = vect.transform([pred, truth])
    return float(cosine_similarity(vecs[0], vecs[1])[0][0])

# Scorer objects shared by every sample: ROUGE-L with stemming enabled and
# BLEU smoothing (method1).
rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
bleu_smooth = SmoothingFunction().method1

# Step 6: Evaluation Loop
# Per-sample metric accumulators, appended to by the loop below.
similarities = []
precisions = []
recalls = []
f1s = []
bleus = []
rouges = []

# Parallel lists of expected / predicted binary outcomes for the accuracy figure.
binary_labels = []
binary_preds = []

THRESHOLD = 0.9  # Define similarity threshold for classification

print("\n📊 Evaluation Report:")
for i, item in enumerate(ground_truth_data, 1):
    q, truth = item["question"], item["answer"]
    # Chain.run is deprecated in recent LangChain releases; invoke takes the
    # chain's input dict and returns a dict whose "result" entry is the answer.
    pred = qa_chain.invoke({"query": q})["result"]

    # Tokenize (lower-cased; punctuation marks are kept as tokens)
    ref_tokens = nltk.word_tokenize(truth.lower())
    pred_tokens = nltk.word_tokenize(pred.lower())

    # Metrics
    sim = compute_cosine_similarity(pred, truth)
    bleu = sentence_bleu([ref_tokens], pred_tokens, smoothing_function=bleu_smooth)
    # RougeScorer.score expects (target, prediction), in that order.
    rouge_l = rouge.score(truth, pred)['rougeL'].fmeasure

    # Overlap of unique tokens, computed exactly: empty sets are guarded
    # explicitly instead of padding the denominators with an epsilon.
    pred_vocab, ref_vocab = set(pred_tokens), set(ref_tokens)
    overlap = len(pred_vocab & ref_vocab)
    precision = overlap / len(pred_vocab) if pred_vocab else 0.0
    recall = overlap / len(ref_vocab) if ref_vocab else 0.0
    f1 = 2 * precision * recall / (precision + recall) if overlap else 0.0

    # Record metrics
    similarities.append(sim)
    bleus.append(bleu)
    rouges.append(rouge_l)
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)
    binary_labels.append(1)  # All ground truth are positive
    binary_preds.append(1 if sim >= THRESHOLD else 0)

    print(f"\n{i}. ❓ {q}")
    print(f"🔹 Ground Truth: {truth}")
    print(f"🔸 Prediction  : {pred}")
    print(f"📐 Cosine Sim  : {sim:.4f}")
    print(f"🟦 BLEU Score  : {bleu:.4f}")
    print(f"🟥 ROUGE-L     : {rouge_l:.4f}")
    print(f"📊 Precision   : {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

# Final Aggregates
# Each average is the mean of the per-sample values collected by the loop above.
print("\n📈 Summary Metrics:")
print(f"🔁 Total Samples    : {len(ground_truth_data)}")
print(f"📐 Avg Cosine Sim   : {np.mean(similarities):.4f}")
print(f"🟦 Avg BLEU Score   : {np.mean(bleus):.4f}")
print(f"🟥 Avg ROUGE-L      : {np.mean(rouges):.4f}")
print(f"📊 Avg Precision    : {np.mean(precisions):.4f}")
print(f"📊 Avg Recall       : {np.mean(recalls):.4f}")
print(f"📊 Avg F1-Score     : {np.mean(f1s):.4f}")
# The label uses ">=" to match the `sim >= THRESHOLD` rule that filled binary_preds.
# Every label is 1, so this is the share of samples that met the threshold.
print(f"✅ Accuracy (Sim>={THRESHOLD}) : {accuracy_score(binary_labels, binary_preds):.4f}")


📊 Evaluation Report:

1. ❓ Who were the parents of Lord Rama?
🔹 Ground Truth: Lord Rama was born to King Dasharatha and Queen Kausalya.
🔸 Prediction  : The parents of Lord Rama were King Dasharatha and Queen Kaushalya.
📐 Cosine Sim  : 0.4039
🟦 BLEU Score  : 0.2620
🟥 ROUGE-L     : 0.5714
📊 Precision   : 0.5833, Recall: 0.6364, F1: 0.6087

2. ❓ What was Kaikeyi's wish to King Dasharatha?
🔹 Ground Truth: Kaikeyi asked Dasharatha to exile Rama and make Bharata the king.
🔸 Prediction  : Kaikeyi's wish to King Dasharatha was to have her son, Bharata, crowned as the king instead of Rama, and for Rama to be exiled to the forest for fourteen years.
📐 Cosine Sim  : 0.4564
🟦 BLEU Score  : 0.0164
🟥 ROUGE-L     : 0.2857
📊 Precision   : 0.3462, Recall: 0.7500, F1: 0.4737

📈 Summary Metrics:
🔁 Total Samples    : 2
📐 Avg Cosine Sim   : 0.4302
🟦 Avg BLEU Score   : 0.1392
🟥 Avg ROUGE-L      : 0.4286
📊 Avg Precision    : 0.4647
📊 Avg Recall       : 0.6932
📊 Avg F1-Score     : 0.5412
✅ Accuracy (Sim>0.9)

In [None]:
# Step 5: Evaluation Metrics
from sklearn.feature_extraction.text import TfidfVectorizer

def compute_similarity(a, b):
    """Return the TF-IDF cosine similarity of two strings, rounded to 4 places.

    The vectorizer is fitted on just the two inputs, so the score measures
    word overlap between them rather than meaning.

    Args:
        a: First text (may be empty or ``None``).
        b: Second text (may be empty or ``None``).

    Returns:
        The rounded similarity as a ``float``; ``0.0`` when either text is
        missing/blank or neither contains a token the vectorizer can use.
    """
    # Blank input would otherwise reach the vectorizer, which raises
    # "empty vocabulary" when no tokens are found.
    if not (a and a.strip()) or not (b and b.strip()):
        return 0.0

    try:
        vect = TfidfVectorizer().fit([a, b])
    except ValueError:
        # Only discarded tokens (e.g. punctuation) were present.
        return 0.0

    vecs = vect.transform([a, b])
    score = cosine_similarity(vecs[0], vecs[1])[0][0]
    return round(float(score), 4)

def compute_similarity_semantic(a, b):
    """Return the embedding-based cosine similarity of two strings, rounded to 4 places.

    Unlike ``compute_similarity`` (TF-IDF word overlap), this embeds both texts
    with the notebook's sentence-embedding model (the module-level
    ``embedding`` object) and compares the resulting vectors, so paraphrases
    and spelling variants are not scored purely on shared words.

    Args:
        a: First text (may be empty or ``None``).
        b: Second text (may be empty or ``None``).

    Returns:
        The rounded cosine similarity as a ``float``; ``0.0`` when either text
        is missing or blank.
    """
    # Nothing meaningful to embed for a blank side.
    if not (a and a.strip()) or not (b and b.strip()):
        return 0.0

    # One batched call yields one vector per input text, in input order.
    vec_a, vec_b = embedding.embed_documents([a, b])
    # cosine_similarity expects 2-D inputs, hence the single-row wrappers.
    score = cosine_similarity([vec_a], [vec_b])[0][0]
    return round(float(score), 4)



In [10]:
total = len(ground_truth_data)
exact_match = 0
similarity_scores = []

print("\n📊 Evaluation Report:")
for i, item in enumerate(ground_truth_data, 1):
    question = item["question"]
    ground_truth = item["answer"]

    # Chain.run is deprecated in recent LangChain releases; invoke takes the
    # chain's input dict and returns a dict whose "result" entry is the answer.
    prediction = qa_chain.invoke({"query": question})["result"]

    similarity = compute_similarity(prediction, ground_truth)
    similarity_scores.append(similarity)

    # A prediction counts as an "exact match" only above 0.95 similarity.
    if similarity > 0.95:
        exact_match += 1

    print(f"\n{i}. ❓ Question: {question}")
    print(f"🔹 Ground Truth: {ground_truth}")
    print(f"🔸 Prediction: {prediction}")
    print(f"✅ Similarity Score: {similarity}")




📊 Evaluation Report:


  prediction = qa_chain.run(question)



1. ❓ Question: Who were the parents of Lord Rama?
🔹 Ground Truth: Lord Rama was born to King Dasharatha and Queen Kausalya.
🔸 Prediction: The parents of Lord Rama were King Dasharatha and Queen Kaushalya.
✅ Similarity Score: 0.4039

2. ❓ Question: What was Kaikeyi's wish to King Dasharatha?
🔹 Ground Truth: Kaikeyi asked Dasharatha to exile Rama and make Bharata the king.
🔸 Prediction: Kaikeyi's wish to King Dasharatha was to have her son, Bharata, crowned as the king instead of Rama, and for Rama to be banished to live in the forest like a hermit for fourteen years.
✅ Similarity Score: 0.4319


In [11]:
# Final Metrics
# Both figures divide by the number of questions; with an empty evaluation set
# they are undefined, so report NaN instead of raising ZeroDivisionError.
if total:
    avg_similarity = sum(similarity_scores) / total
    # Share of questions whose prediction did not clear the exact-match threshold.
    hallucination_rate = round(1 - (exact_match / total), 2)
else:
    avg_similarity = float("nan")
    hallucination_rate = float("nan")

print("\n📈 Summary:")
print(f"🔁 Total Questions: {total}")
print(f"🎯 Exact Matches (Similarity > 0.95): {exact_match}")
print(f"📘 Average Similarity Score: {round(avg_similarity, 4)}")
print(f"⚠️ Hallucination Rate: {hallucination_rate}")


📈 Summary:
🔁 Total Questions: 2
🎯 Exact Matches (Similarity > 0.95): 0
📘 Average Similarity Score: 0.4179
⚠️ Hallucination Rate: 1.0
