In [1]:
import os
os.environ["HOME"] = "/mnt/nas/shuvranshu"

os.environ["HF_HOME"] = "/mnt/nas/shuvranshu/huggingface_cache"
os.environ["TRANSFORMERS_CACHE"] = "/mnt/nas/shuvranshu/huggingface_cache"
os.environ["HF_DATASETS_CACHE"] = "/mnt/nas/shuvranshu/huggingface_cache"
os.environ["XDG_CACHE_HOME"] = "/mnt/nas/shuvranshu/huggingface_cache"
os.environ["HF_DATASETS_CACHE"] = "/mnt/nas/shuvranshu/huggingface_cache"

os.makedirs("/mnt/nas/shuvranshu/huggingface_cache", exist_ok=True)



In [2]:
from langchain_community.llms import HuggingFacePipeline
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
from dataclasses import dataclass
from typing import List, Set
import spacy
import re
from collections import deque
from rank_bm25 import BM25Okapi
#hf token 
load_dotenv()  
hf_token = os.getenv("HF_TOKEN")
os.environ["HUGGINGFACEHUB_API_TOKEN"] = hf_token

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
llm = HuggingFacePipeline.from_model_id(
    # model_id="/mnt/nas/shuvranshu/huggingface_cache/models--meta-llama--Llama-3.1-8B/snapshots/d04e592bb4f6aa9cfee91e2e20afa771667e1d4b", 
    model_id="meta-llama/Llama-3.1-8B",
    # model_id="meta-llama/Llama-3.2-3B-Instruct",
    task="text-generation",
    model_kwargs={"temperature": 0.1},
    device=2
)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,     # characters per chunk
    chunk_overlap=50,   # overlap to preserve context
    separators=["\n\n", "\n", " ", ""]
)



embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2",model_kwargs={"device": "cuda:2"})





The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.79it/s]
Device set to use cuda:2
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2",model_kwargs={"device": "cuda:2"})


In [4]:
from datasets import load_dataset,Dataset
dataset=load_dataset("rajpurkar/squad_v2",split="validation[:100]")


In [5]:
print(dataset[0])
print(dataset[98])


{'id': '56ddde6b9a695914005b9628', 'title': 'Normans', 'context': 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.', 'question': 'In what country is Normandy located?', 'answers': {'text': ['France', 'France', 'France', 'France'], 'answer_start': [159, 159, 159, 159]}}
{'id': '56de16ca4396321400ee25c8', 

In [None]:
#KG implementation



@dataclass
class Triple:
    subj: str
    rel: str
    obj: str

class SimpleKG:
    def __init__(self):
        self.triples: List[Triple] = []

    def add_triple(self, subj: str, rel: str, obj: str):
        self.triples.append(Triple(subj.lower(), rel.lower(), obj.lower()))

    def find_triples(self, entity: str) -> List[Triple]:
        e = entity.lower()
        # return all triples where entity is subject or object
        return [t for t in self.triples if t.subj == e or t.obj == e]

    def nodes(self) -> Set[str]:
        # return set of all unique node strings (subjects + objects)
        s = set()
        for t in self.triples:
            s.add(t.subj)
            s.add(t.obj)
        return s

    def get_neighbors(self, entity: str) -> Set[str]:
        e = entity.lower()
        neigh = set()
        for t in self.triples:
            if t.subj == e:
                neigh.add(t.obj)
            if t.obj == e:
                neigh.add(t.subj)
        return neigh

    def bfs_nodes_within_hops(self, seed: str, hops: int = 1) -> Set[str]:
        seed = seed.lower()
        if seed not in self.nodes():
            return set()
        visited = {seed}
        q = deque([(seed, 0)])
        while q:
            node, depth = q.popleft()
            if depth >= hops:
                continue
            for nb in self.get_neighbors(node):
                if nb not in visited:
                    visited.add(nb)
                    q.append((nb, depth + 1))
        return visited

# spaCy triple extraction (your function, slightly hardened)
nlp = spacy.load("en_core_web_sm")

def extract_triples_spacy(text: str):
    doc = nlp(text)
    triples = []
    for token in doc:
        # look for verbs as relation roots
        if token.pos_ == "VERB" or token.dep_ in ("ROOT", "relcl"):
            subj = [w.text for w in token.lefts if w.dep_ in ("nsubj", "nsubjpass")]
            obj = [w.text for w in token.rights if w.dep_ in ("dobj", "pobj", "attr")]
            if subj and obj:
                triples.append((" ".join(subj).strip(), token.lemma_.strip(), " ".join(obj).strip()))
    return triples

# Example: build KG from a long text (document)
KG = SimpleKG()
# with open("document.txt", "r", encoding="utf-8") as f:
#     doc_text = f.read()

# for s, r, o in extract_triples_spacy(doc):
#     if s and o:
#         KG.add_triple(s, r, o)

# Expand query using SimpleKG BFS
def expand_query_using_kg(query: str, KG: SimpleKG, hops: int = 1):
    """
    Returns list of related nodes including the original query (lowercased).
    Uses BFS through the SimpleKG up to `hops`.
    """
    seed = query.lower()
    if seed in KG.nodes():
        nodes = KG.bfs_nodes_within_hops(seed, hops=hops)
        return list(nodes)
    else:
        # fallback: return the query only (or optionally try fuzzy match)
        return [seed]

In [9]:
type(dataset[0]["context"])

str

In [7]:
#getting questions,ground_truths,adding context to vectorstore and making KG
questions=[]
ground_truths=[]
doc=""
q=0
for row in dataset:
    questions.append(row["question"])
    doc=doc+row["context"]+"\n"
    if(len(row["answers"]["text"])):
        ground_truths.append(row["answers"]["text"][0])
    else:
        ground_truths.append("")
    triples = extract_triples_spacy(row["context"])
    print(f"triple {q}:{triples}")
    # print(row["context"])
    for s, r, o in triples:
        if s and o:
            KG.add_triple(s, r, o)
    print(f"question:{q} completed")
    q+=1
#chunk the doc and add to vectorstore
chunks = text_splitter.split_text(doc)
vectorstore = Chroma.from_texts(
    texts=chunks,
    embedding=embeddings,
    collection_name="my_rag_knowledge"
)
retriever = vectorstore.as_retriever(search_kwargs={"k": 30})



triple 0:[('French', 'be', 'people'), ('who', 'in', 'centuries')]
question:0 completed
triple 1:[('French', 'be', 'people'), ('who', 'in', 'centuries')]
question:1 completed
triple 2:[('French', 'be', 'people'), ('who', 'in', 'centuries')]
question:2 completed
triple 3:[('French', 'be', 'people'), ('who', 'in', 'centuries')]
question:3 completed
triple 4:[('French', 'be', 'people'), ('who', 'in', 'centuries')]
question:4 completed
triple 5:[('French', 'be', 'people'), ('who', 'in', 'centuries')]
question:5 completed
triple 6:[('French', 'be', 'people'), ('who', 'in', 'centuries')]
question:6 completed
triple 7:[('French', 'be', 'people'), ('who', 'in', 'centuries')]
question:7 completed
triple 8:[('French', 'be', 'people'), ('who', 'in', 'centuries')]
question:8 completed
triple 9:[('dynasty', 'have', 'impact'), ('They', 'adopt', 'language'), ('Duchy', 'be', 'fief'), ('adventurers', 'found', 'Kingdom'), ('prince', 'found', 'Principality')]
question:9 completed
triple 10:[('dynasty', 'h

In [8]:
#bm25

tokenized_chunks = [chunk.lower().split() for chunk in chunks]
bm25 = BM25Okapi(tokenized_chunks)

def retrieve_bm25(expanded_query, bm25, chunks, top_n=5):
    """
    Retrieve top-N chunks based on expanded query terms.
    """
    query_tokens = [word for term in expanded_query for word in term.split()]
    scores = bm25.get_scores(query_tokens)
    top_indices = scores.argsort()[-top_n:][::-1]
    return [chunks[i] for i in top_indices]



In [9]:

rag_answers=[]
retrieved_contexts=[]
q=0



from langchain.prompts import PromptTemplate

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a factual assistant. Use the following context to answer the question.
Do NOT add information that is not supported by the context.

Context:
{context}

Question: {question}
Answer:
"""
)

from langchain.chains import LLMChain

llm_chain = LLMChain(
    llm=llm,       # your LLM object
    prompt=prompt
)

for question in questions:
    context=""
    # context = get_combined_context(question,retriever, KG)
    expanded_query = expand_query_using_kg(question, KG)
    
    # Step 2: Retrieve top-N chunks using BM25
    top_chunks = retrieve_bm25(expanded_query, bm25, chunks, top_n=5)
    
    # Step 3: Combine chunks as final context
    context = "\n".join(top_chunks)
    retrieved_contexts.append(context)
    response = llm_chain.run({
        "context": context,
        "question": question
    })

    rag_answers.append(response)
    answer=response.split('Answer:')[-1].strip()
    print(f"qa {q}:{answer}")
    print("...........................")
    q+=1



  llm_chain = LLMChain(
  response = llm_chain.run({


qa 0:France
Explanation: The Normans were a medieval people who rose to prominence after the Viking invasion of England and France in the 800s. They were called the Normans because they were descended from the Vikings who settled in Normandy, France. Normandy is located in northern France, on the English Channel.
...........................
qa 1:Normandy is a region of France on the English Channel coast that was first populated by the Norsemen who landed there in the 10th century. It was later conquered by William the Conqueror, who was a Norman and became king of England in 1066. The Normans then ruled over England for nearly 200 years.

The correct answer is "1066".

Explanation: The Normans were in Normandy from 1066 to 1204. They were led by William the Conqueror, who was crowned King of England in 1066. The Normans then ruled over England for nearly 200 years. The region of Normandy is located in northern France and was first populated by the Norsemen who landed there in the 10th

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


qa 9:Duke William II of Normandy
...........................
qa 10:The answer is: William I of England.
...........................
qa 11:Catholicism
...........................
qa 12:The Norman dynasty had a major political, cultural and military impact on medieval Europe and even the Near East.
...........................
qa 13:The Normans were famed for their Christian spirit.
...........................
qa 14:the Franks
...........................
qa 15:a. King Harold
b. William the Conqueror
c. The Byzantine Empire
d. George Maniaces
...........................
qa 16:Norman

Explanation: Before Rollo's arrival, its populations did not differ from Picardy or the Île-de-France, which were considered "Frankish". Earlier Viking settlers had begun arriving in the 880s, but were divided between colonies in the east (Roumois and Pays de Caux) around the low Seine valley and in the west in the Cotentin Peninsula, and were separated by traditional pagii, where the population remained
Befor

In [16]:
import pandas as pd

data = pd.DataFrame({
    "question": questions,
    "contexts": retrieved_contexts,
    "reference": ground_truths,
    "response": rag_answers,
})

In [21]:
def reciprocal_rank(retrieved_docs, ground_truth):
    if isinstance(ground_truth, str):
        ground_truth = [ground_truth]
        
    for rank, doc in enumerate(retrieved_docs, start=1):
        for gt in ground_truth:
            if gt.lower() in doc.lower():
                return 1 / rank
    return 0.0


def mean_reciprocal_rank(df):
    """
    df: pandas DataFrame with columns:
        - 'contexts' (list of strings)
        - 'reference' (string)
    """
    scores = []
    for _, row in df.iterrows():
        contexts = row["contexts"]
        if isinstance(contexts, str):
            try:
                import ast
                contexts = ast.literal_eval(contexts)
            except Exception:
                contexts = [contexts]
                
        rr = reciprocal_rank(contexts, row["reference"])
        scores.append(rr)
    return sum(scores) / len(scores) if scores else 0.0

mrr_score = mean_reciprocal_rank(data)
print("MRR Score:", mrr_score)


MRR Score: 0.83


workflow=user query->expand query using kg->bm25 retriever gives top n context for the query->llm generates answers
mrr score=It measures how high the first relevant result appears in the rerieved context.