In [1]:
from langchain_community.document_loaders import PyMuPDFLoader, DirectoryLoader


DATA_PATH="data/"

loader = DirectoryLoader(
    DATA_PATH,
    glob="*pdf",
    loader_cls=PyMuPDFLoader
)

docs=loader.load()
print(len(docs))
print(docs[0].page_content[:500])

2348
Disclosure to Promote the Right To Information
Whereas the Parliament of India has set out to provide a practical regime of right to 
information for citizens to secure access to information under the control of public authorities, 
in order to promote transparency and accountability in the working of every public authority, 
and whereas the attached publication of the Bureau of Indian Standards is of particular interest 
to the public, particularly disadvantaged communities and those engaged in


In [3]:
from langchain.schema import Document
import re

def clean_text(text):
    text = re.sub(r'\bPage\s*\d+\b', '', text, flags=re.IGNORECASE)
    text = re.sub(r'^\s*\d+\s*\$', '', text, flags=re.MULTILINE)
    text = re.sub(r"(?i)\b(references|bibliography)\b.*", "", text, flags=re.DOTALL)
    text = re.sub(r"\n{2,}", "\n", text)
    text = re.sub(r"\s{2,}", " ", text)
    return text.strip()

def clean_document(docs):
    cleaned_docs = []
    for d in docs:
        cleaned_text = clean_text(d.page_content)
        cleaned_docs.append(
            Document(page_content=cleaned_text, metadata=d.metadata)
        )
    return cleaned_docs

cleaned_docs = clean_document(docs)
print(len(cleaned_docs))
print(cleaned_docs[0].page_content[:500])

2348
Disclosure to Promote the Right To Information
Whereas the Parliament of India has set out to provide a practical regime of right to information for citizens to secure access to information under the control of public authorities, in order to promote transparency and accountability in the working of every public authority, and whereas the attached publication of the Bureau of Indian Standards is of particular interest to the public, particularly disadvantaged communities and those engaged in the


In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=250,
    chunk_overlap=50,
    separators=["\n\n", "\n", " ", ""]
)

all_chunks = []
for doc in cleaned_docs:
    chunks = splitter.split_text(doc.page_content)
    for i, chunk in enumerate(chunks):
        all_chunks.append(
            Document(page_content=chunk, metadata={**doc.metadata, 'chunk': i})
        )

print(len(all_chunks))
print(all_chunks[1].metadata)
print(all_chunks[1].page_content[:500])

8360
{'producer': 'itext-paulo-155 (itextpdf.sf.net-lowagie.com)', 'creator': 'pdftk 1.44 - www.pdftk.com', 'creationdate': '2013-09-05T09:03:05-07:00', 'source': 'data\\1(is code).pdf', 'file_path': 'data\\1(is code).pdf', 'total_pages': 114, 'format': 'PDF 1.4', 'title': 'IS 456 (2000): Plain and Reinforced Concrete - Code of Practice', 'author': 'Bureau of Indian Standards', 'subject': 'Published Under the Right to Information Act', 'keywords': '', 'moddate': '2013-09-05T09:03:05-07:00', 'trapped': '', 'modDate': "D:20130905090305-07'00'", 'creationDate': "D:20130905090305-07'00'", 'page': 0, 'chunk': 1}
Whereas the Parliament of India has set out to provide a practical regime of right to information for citizens to secure access to information under the control of public authorities, in order to promote transparency and accountability in the


In [5]:
from langchain.embeddings import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': 'cpu'}
)

vector = embeddings.embed_query(all_chunks[0].page_content)
print(len(vector))

  embeddings = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


384


In [6]:
from langchain.vectorstores import Chroma

DB_DIR = "chroma_db"

vector_db = Chroma.from_documents(
    documents=all_chunks,
    embedding=embeddings,
    persist_directory=DB_DIR
)

vector_db.persist()
print("done")

done


  vector_db.persist()


In [None]:
from langchain_groq import ChatGroq
from langsmith import traceable
import os
from dotenv import load_dotenv

load_dotenv()
groq_api_key = os.getenv("GROQ_API_KEY")


llm = ChatGroq(
    model="llama3-70b-8192",  
    api_key=groq_api_key
)

retriever = vector_db.as_retriever(search_type="similarity", search_kwargs={'k': 2})



In [26]:
from langchain.prompts import PromptTemplate

template = """
You are an Expert Academic Research Assistant.

Use only the provided context to answer the question. 
Cite sections, pages, or PDF names whenever possible.

If the answer is not in the context, respond: "I don't have enough information."

Context:
{context}

Question:
{question}

Answer in a formal, academic style with step-by-step reasoning.
"""

prompt = PromptTemplate(
    template=template,
    input_variables=["context", "question"]
)

In [27]:
from langchain.chains import RetrievalQA

chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt}
)

In [28]:
query = "explain STONE VENEER,give me in 1 sentance"
result = chain.invoke({"query": query})
print(result["result"])

According to the provided context, stone veneer can be defined as a thin slab of natural stone, typically with a nominal thickness of 31-37 mm (1 1/4 to 1 1/2 inches), affixed to the side of an existing structure, offering a similar look and feel of real stone at a significantly reduced cost (PROS section).
