In [None]:
!pip install langchain langchain-community pypdf sentence-transformers faiss-cpu nltk

Collecting langchain-community
  Downloading langchain_community-0.3.29-py3-none-any.whl.metadata (2.9 kB)
Collecting pypdf
  Downloading pypdf-6.0.0-py3-none-any.whl.metadata (7.1 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting langchain-core<1.0.0,>=0.3.72 (from langchain)
  Downloading langchain_core-0.3.75-py3-none-any.whl.metadata (5.7 kB)
Collecting requests<3,>=2 (from langchain)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.6.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.6.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.me

In [None]:
import nltk
# Fetch the tokenizer resources; the download log lists both packages under
# tokenizers/, so each one is requested explicitly.
nltk.download("punkt")
nltk.download("punkt_tab")  # kept as the final expression so its result is displayed


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
from google.colab import files
import os

# Make sure the target folder exists before anything is moved into it.
os.makedirs("data", exist_ok=True)

# Ask for files through the Colab upload widget, then move every uploaded
# file from the working directory into ./data.
uploaded = files.upload()
for filename in uploaded:
    destination = os.path.join("data", filename)
    os.rename(filename, destination)

# Show what ended up in the data folder.
print("Files uploaded to ./data:")
print(os.listdir("data"))


Saving CancerQA.csv to CancerQA.csv
Files uploaded to ./data:
['CancerQA.csv']


In [None]:
import glob, json
from typing import List, Dict, Tuple
from langchain_community.document_loaders import PyPDFLoader, TextLoader, UnstructuredHTMLLoader
from langchain.docstore.document import Document
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from nltk.tokenize import sent_tokenize

# Folder that is scanned for source files.
DATA_DIR = "./data"
# Embedding model name handed to HuggingFaceEmbeddings when the index is built.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# Number of sentence hits requested from the store per query.
TOPK_SENTENCE = 20
# Number of neighbouring sentences added before / after each hit.
WINDOW_LEFT = 2
WINDOW_RIGHT = 2

# Load documents
import pandas as pd
from langchain.schema import Document

def load_documents(data_dir):
    """Load question/answer rows from every ``.csv`` file in ``data_dir``.

    Each row becomes one ``Document`` whose ``page_content`` is the value of
    the "Question" column; the "Answer" value and the file name are stored in
    the metadata under "answer" and "source".

    Files are read in sorted name order so the resulting list (and any index
    built from it) is reproducible. Rows where "Question" or "Answer" is
    missing are skipped; previously they were converted to the literal string
    "nan" and indexed like real content.

    Args:
        data_dir: Directory to scan (non-recursive).

    Returns:
        List of ``Document`` objects; empty when no CSV rows are found.

    Raises:
        KeyError: If a CSV file lacks a "Question" or "Answer" column.
    """
    docs = []
    for file in sorted(os.listdir(data_dir)):
        if not file.endswith(".csv"):
            continue
        df = pd.read_csv(os.path.join(data_dir, file))
        # dropna raises KeyError when either column is absent, matching the
        # exception type of the original row["..."] access.
        qa_rows = df.dropna(subset=["Question", "Answer"])
        for question, answer in zip(qa_rows["Question"], qa_rows["Answer"]):
            docs.append(Document(
                page_content=str(question),
                metadata={"answer": str(answer), "source": file}
            ))
    return docs


# Convert to sentence docs
def to_sentence_docs(docs: List[Document]) -> List[Document]:
    """Split every document into one ``Document`` per sentence.

    Each sentence document carries a copy of its parent's metadata plus
    "sent_index" (position of the sentence within the parent) and
    "total_sents" (number of sentences in the parent). Sentence text is
    stripped of surrounding whitespace.
    """
    result = []
    for doc in docs:
        sentences = sent_tokenize(doc.page_content)
        count = len(sentences)
        parent_meta = doc.metadata if doc.metadata else {}
        result.extend(
            Document(
                page_content=sentence.strip(),
                # A fresh dict per sentence so documents never share metadata.
                metadata={**parent_meta, "sent_index": position, "total_sents": count},
            )
            for position, sentence in enumerate(sentences)
        )
    return result

# Build FAISS store
def build_sentence_store(sentence_docs: List[Document]) -> FAISS:
    """Embed ``sentence_docs`` with the model named by EMBEDDING_MODEL and
    return the FAISS store built from them."""
    embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    store = FAISS.from_documents(sentence_docs, embedding_model)
    return store

# Sentence-Window retriever
class SentenceWindowRetriever:
    """Search a sentence-level FAISS store and widen each hit to a window of
    neighbouring sentences taken from the hit's parent document.

    A hit is matched to its parent through metadata. The per-sentence keys
    that sentence documents carry on top of the parent's metadata
    ("sent_index", "total_sents") are ignored for that lookup; without this
    the lookup key never equals a parent key and no window is ever expanded.
    """

    # Metadata keys present only on sentence-level / expanded documents.
    _DERIVED_KEYS = frozenset({"sent_index", "total_sents", "window"})

    def __init__(self, sentence_store: FAISS, full_docs: List[Document]):
        """Keep the sentence index and the un-split parent documents."""
        self.store = sentence_store
        self.full_docs = full_docs

    def retrieve(self, query: str, k: int):
        """Return the store's ``similarity_search_with_score`` result for the
        ``k`` best matches (callers unpack it as (document, score) pairs)."""
        return self.store.similarity_search_with_score(query, k=k)

    @classmethod
    def _parent_key(cls, metadata) -> str:
        """Stable string key for a document's parent-level metadata."""
        base = {k: v for k, v in (metadata or {}).items() if k not in cls._DERIVED_KEYS}
        return json.dumps(base, sort_keys=True)

    def expand_windows(self, hits: List[Document], left: int, right: int):
        """Replace each hit with the text of its surrounding sentence window.

        Args:
            hits: Sentence-level documents (metadata includes "sent_index").
            left: Number of sentences to include before the hit.
            right: Number of sentences to include after the hit.

        Returns:
            De-duplicated list of documents in hit order. An expanded document
            has the joined window as ``page_content`` and a "window" metadata
            entry ``(start, end)`` (end exclusive). A hit whose parent cannot
            be identified is passed through unchanged.
        """
        # Tokenize only parents that at least one hit refers to.
        wanted = {self._parent_key(h.metadata) for h in hits}
        parents: Dict[str, List[List[str]]] = {}
        for d in self.full_docs:
            key = self._parent_key(d.metadata)
            if key in wanted:
                parents.setdefault(key, []).append(sent_tokenize(d.page_content))

        expanded = []
        for hit in hits:
            idx = hit.metadata.get("sent_index", 0)
            candidates = parents.get(self._parent_key(hit.metadata), [])
            # Several parents may share identical metadata; pick the one whose
            # sentence at `idx` really is this hit so the window comes from
            # the right text.
            sents = next(
                (
                    s for s in candidates
                    if 0 <= idx < len(s) and s[idx].strip() == hit.page_content
                ),
                None,
            )
            if sents is None:
                expanded.append(hit)
                continue
            start = max(0, idx - left)
            end = min(len(sents), idx + right + 1)
            window_text = " ".join(sents[start:end]).strip()
            md = dict(hit.metadata)
            md.update({"window": (start, end)})
            expanded.append(Document(page_content=window_text, metadata=md))

        # Drop repeated windows (same source, page and text), keeping the first.
        uniq, seen = [], set()
        for d in expanded:
            key = (d.metadata.get("source"), d.metadata.get("page"), d.page_content)
            if key in seen:
                continue
            seen.add(key)
            uniq.append(d)
        return uniq


In [None]:


# --- Imports ---
import re
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline



# --- Step 1: Build index & retriever ---
full_docs = load_documents(DATA_DIR)
sent_docs = to_sentence_docs(full_docs)
if not sent_docs:
    # Fail early with a readable message when nothing was loaded.
    raise ValueError(f"No question/answer rows found in CSV files under {DATA_DIR!r}")
sentence_store = build_sentence_store(sent_docs)
retriever = SentenceWindowRetriever(sentence_store, full_docs)

# --- Step 2: Query ---
query = "What is (are) Non-Small Cell Lung Cancer ?"
print(query)
hits_with_scores = retriever.retrieve(query, TOPK_SENTENCE)
if not hits_with_scores:
    raise ValueError(f"Retriever returned no hits for query: {query!r}")
hits = [d for d, _score in hits_with_scores]
expanded = retriever.expand_windows(hits, WINDOW_LEFT, WINDOW_RIGHT)

# --- Step 3: Extract answers ---
# Candidate answers parsed from the top 3 expanded windows. They are kept for
# inspection; the answer that is summarized below comes from the best hit's
# "answer" metadata instead.
final_answers = []
for d in expanded[:3]:
    text = d.page_content
    if "\nA:" in text:  # Q/A style dataset
        ans = text.split("\nA:")[1].strip()
    else:  # Plain text format
        ans = text.split("?")[-1].strip()
    final_answers.append(ans)

# Index instead of unpacking into `_`, which IPython uses for the last output.
best_doc = hits_with_scores[0][0]
final_answer = best_doc.metadata.get("answer", best_doc.page_content[:400])

long_answer = final_answer

# --- Step 4: Summarize answer ---
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
# truncation=True clips answers that exceed the model's maximum input length
# instead of letting the call fail on long texts.
summary = summarizer(
    long_answer, max_length=100, min_length=30, do_sample=False, truncation=True
)[0]['summary_text']

# Split into clean, non-empty sentences
sentences = [s.strip() for s in re.split(r'(?<=[.!?]) +', summary) if s.strip()]

print("\n[Final Summarized Answer]")
for s in sentences:
    print(s)

# --- Step 5: Score each sentence against query ---
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
query_emb = embedder.encode(query, convert_to_tensor=True)

print("\n[Final Summarized Answer with Scores]")
if sentences:
    # Encode all sentences in one batch; row 0 holds the query-vs-sentence scores.
    sent_embs = embedder.encode(sentences, convert_to_tensor=True)
    sent_scores = util.cos_sim(query_emb, sent_embs)[0]
    for sent, score in zip(sentences, sent_scores):
        print(f"[Score: {score.item():.4f}] {sent}")


# --- Imports ---
import numpy as np
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline

# --- Load evaluation models ---
# Sentence embedder used by the similarity-based metrics below.
embedder = SentenceTransformer(
    model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"
)
# Zero-shot classifier used by the relevance metric below.
qa_eval_model = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

# --- Functions for metrics ---

def score_faithfulness(answer: str, docs: list[str]) -> float:
    """Cosine similarity between the answer and the retrieved texts.

    All entries of ``docs`` are joined with single spaces into one string;
    that string and ``answer`` are each embedded with the module-level
    ``embedder`` and the similarity of the two embeddings is returned.
    """
    combined_docs = " ".join(docs)
    answer_vec = embedder.encode(answer, convert_to_tensor=True)
    docs_vec = embedder.encode(combined_docs, convert_to_tensor=True)
    similarity = util.cos_sim(answer_vec, docs_vec)
    return similarity.item()


def score_completeness(answer: str, docs: list[str]) -> float:
    """Mean cosine similarity between the answer and each entry of ``docs``.

    Entries that are empty or whitespace-only are ignored. Each remaining
    entry is embedded as a whole (it is not split into sentences) and
    compared with the embedded answer; the average similarity is returned.
    Returns 0.0 when no usable entry remains.
    """
    usable = [text for text in docs if text.strip()]
    if len(usable) == 0:
        return 0.0

    answer_vec = embedder.encode(answer, convert_to_tensor=True)
    doc_vecs = embedder.encode(usable, convert_to_tensor=True)

    similarities = util.cos_sim(answer_vec, doc_vecs).cpu().numpy().flatten()
    mean_similarity = np.mean(similarities)
    return float(mean_similarity)


def score_relevance(answer: str, query: str) -> float:
    """Score how well ``answer`` matches ``query`` with the zero-shot classifier.

    The answer is classified by the module-level ``qa_eval_model`` with the
    query as the only candidate label; the first entry of the returned
    'scores' list is the result.
    """
    classification = qa_eval_model(answer, candidate_labels=[query])
    first_score = classification['scores'][0]
    return float(first_score)


# --- Example usage ---
# Score against the query that actually produced `summary` and `expanded`.
# Re-assigning `query` to a different question here made the relevance score
# compare the answer with a question it was never retrieved for.
eval_query = query
answer = summary  # from your summarizer step
# NOTE(review): page_content holds the question text of each hit, while the
# summarized answer comes from the "answer" metadata — confirm that comparing
# against page_content is the intended grounding for these metrics.
retrieved_docs = [d.page_content for d in expanded[:3]]

faithfulness = score_faithfulness(answer, retrieved_docs)
completeness = score_completeness(answer, retrieved_docs)
relevance = score_relevance(answer, eval_query)

print("\n[Evaluation Scores]")
print(f"Faithfulness: {faithfulness:.4f}")
print(f"Completeness: {completeness:.4f}")
print(f"Relevance: {relevance:.4f}")




What is (are) Non-Small Cell Lung Cancer ?


Device set to use cuda:0



[Final Summarized Answer]
Non-small cell lung cancer is a disease in which malignant (cancer) cells form in the tissues of the lung.
Smoking is the major risk factor for non-smallcell lung cancer.
Current treatments do not cure the cancer.
Clinical trials are taking place to improve treatment options.

[Final Summarized Answer with Scores]
[Score: 0.8942] Non-small cell lung cancer is a disease in which malignant (cancer) cells form in the tissues of the lung.
[Score: 0.6824] Smoking is the major risk factor for non-smallcell lung cancer.
[Score: 0.4241] Current treatments do not cure the cancer.
[Score: 0.1236] Clinical trials are taking place to improve treatment options.


Device set to use cuda:0



[Evaluation Scores]
Faithfulness: 0.6724
Completeness: 0.7539
Relevance: 0.4961
