In [41]:
%%capture --no-stderr
%pip install --quiet -U langchain langchain_community tiktoken langchain-nomic "nomic[local]" langchain-ollama scikit-learn langgraph tavily-python bs4


[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [42]:
!ollama pull deepseek-r1:1.5b

[?25lpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest ⠧ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling manifest ⠏ [?25h[?25l[2K[1Gpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest 
pulling aabd4debf0c8... 100% ▕████████████████▏ 1.1 GB                         
pulling 369ca498f347... 100% ▕████████████████▏  387 B                         
pulling 6e4c38e1172f... 100% ▕████████████████▏ 1.1 KB                         
pulling f4d24e9138dd... 100% ▕████████████████▏  148 B                         
pulling a85fe2a2e58e... 100% ▕██

**Defining the Model *deepseek-r1:1.5b***

In [43]:
from langchain_ollama import ChatOllama

local_llm = "deepseek-r1:1.5b"
llm = ChatOllama(model=local_llm, temperature=0)
llm_json_mode = ChatOllama(model=local_llm, temperature=0, format="json")


**Function for Loading the JSON file and created VectorBase**

In [4]:
import json
from langchain_community.vectorstores import SKLearnVectorStore
from langchain_nomic.embeddings import NomicEmbeddings

# Load the JSON file
with open("vectorstore.json", "r", encoding="utf-8") as f:
    vector_data = json.load(f)

# Reconstruct the vector store
vectorstore = SKLearnVectorStore.from_texts(
    texts=vector_data["documents"],
    embedding=NomicEmbeddings(model="nomic-embed-text-v1.5", inference_mode="local"),
    metadatas=vector_data["metadata"],
)

print("Vector store reloaded from JSON!")


Embedding texts: 100%|██████████| 16/16 [00:13<00:00,  1.20inputs/s]

Vector store reloaded from JSON!





In [7]:
import os
import json
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import SKLearnVectorStore
from langchain_nomic.embeddings import NomicEmbeddings
from langchain.schema import Document  # Import Document class

# Get file list for JSON files
json_files = [f for f in os.listdir() if f.endswith(".json")]
docs_list = []

# Load JSON files
for json_path in json_files:
    with open(json_path, 'r', encoding='utf-8') as f:  # Use UTF-8 encoding
        try:
            data = json.load(f)
            
            # Assuming JSON is a list of page data
            if isinstance(data, list):
                for page in data:
                    title = page.get('title', '')
                    content = page.get('content', '')
                    url = page.get('url', '')
                    timestamp = page.get('timestamp', '')
                    
                    # Create Document object with content and metadata
                    document = Document(
                        page_content=content,
                        metadata={"title": title, "url": url, "timestamp": timestamp}
                    )
                    docs_list.append(document)
        except UnicodeDecodeError as e:
            print(f"Error decoding {json_path}: {e}")

# Step 2: Split documents into smaller chunks
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1024, chunk_overlap=200
)
doc_splits = text_splitter.split_documents(docs_list)

# Step 3: Add to vectorDB using embeddings
vectorstore = SKLearnVectorStore.from_documents(
    documents=doc_splits,
    embedding=NomicEmbeddings(model="nomic-embed-text-v1.5", inference_mode="local"),
)

# Step 4: Create retriever (ensure k does not exceed available documents)
retriever = vectorstore.as_retriever(k=min(2, len(doc_splits)))

print(f"Loaded {len(doc_splits)} document chunks from {len(json_files)} JSON files.")


Embedding texts:   0%|          | 0/3921 [00:00<?, ?inputs/s]

KeyboardInterrupt: 

**Creating Vectorbase from the Scraped Data**

In [8]:
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader, TextLoader  # Load PDFs
from langchain_community.vectorstores import SKLearnVectorStore
from langchain_nomic.embeddings import NomicEmbeddings


pdf_files = [f for f in os.listdir() if f.endswith(".pdf")]
txt_files = [f for f in os.listdir() if f.endswith(".txt")]
docs_list = []

# Load PDFs
# for pdf_path in pdf_files:
#     loader = PyMuPDFLoader(pdf_path)
#     docs_list.extend(loader.load())  # Append loaded documents from PDFs

# Load TXT files
for txt_path in txt_files:
    loader = TextLoader(txt_path)
    docs_list.extend(loader.load())


# # Step 1: Load all PDFs in the current directory
# pdf_files = [f for f in os.listdir() if f.endswith(".pdf")]
# docs_list = []

# for pdf_path in pdf_files:
#     loader = PyMuPDFLoader(pdf_path)
#     docs_list.extend(loader.load())  # Append loaded documents

# Step 2: Split documents
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1024, chunk_overlap=200
)
doc_splits = text_splitter.split_documents(docs_list)

# Step 3: Add to vectorDB
vectorstore = SKLearnVectorStore.from_documents(
    documents=doc_splits,
    embedding=NomicEmbeddings(model="nomic-embed-text-v1.5", inference_mode="local"),
)

# Step 4: Create retriever (ensure k does not exceed available documents)
retriever = vectorstore.as_retriever(k=min(2, len(doc_splits)))

print(f"Loaded {len(doc_splits)} document chunks from {len(pdf_files)} PDFs.")



RuntimeError: Error loading Academic Units Academic Units Acade.txt

In [10]:
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import SKLearnVectorStore
from langchain_nomic.embeddings import NomicEmbeddings
#from langchain.document_loaders import Document


txt_files = [f for f in os.listdir() if f.endswith(".txt")]
docs_list = []

# Load TXT files with custom encoding handling
for txt_path in txt_files:
    try:
        with open(txt_path, 'r', encoding='utf-8', errors='ignore') as file:
            content = file.read()
            docs_list.append(Document(page_content=content, metadata={"source": txt_path}))
    except UnicodeDecodeError:
        print(f"Error loading {txt_path} due to encoding issues.")
    except Exception as e:
        print(f"An unexpected error occurred while loading {txt_path}: {e}")

# Step 2: Split documents
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1024, chunk_overlap=200
)
doc_splits = text_splitter.split_documents(docs_list)

# Step 3: Add to vectorDB
vectorstore = SKLearnVectorStore.from_documents(
    documents=doc_splits,
    embedding=NomicEmbeddings(model="nomic-embed-text-v1.5", inference_mode="local"),
)

# Step 4: Create retriever (ensure k does not exceed available documents)
retriever = vectorstore.as_retriever(k=min(2, len(doc_splits)))

print(f"Loaded {len(doc_splits)} document chunks from {len(txt_files)} TXT files.")


Embedding texts: 100%|██████████| 15/15 [00:28<00:00,  1.91s/inputs]

Loaded 15 document chunks from 2 TXT files.





**Vectorbase created**

In [17]:
retriever = vectorstore.as_retriever(k=10)

*Tavily API integration*

In [6]:
import os
import getpass
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Set API key from environment or prompt user
def _set_env(var: str):
    if not os.environ.get(var):
        os.environ[var] = getpass.getpass(f"Enter value for {var}: ")

# Ensure API key is set
_set_env("TAVILY_API_KEY")

# Set parallelism setting for tokenizers
os.environ["TOKENIZERS_PARALLELISM"] = "true"


In [7]:
### Router
import json
from langchain_core.messages import HumanMessage, SystemMessage

# Prompt
# router_instructions = """You are an expert at routing a user question to a vectorstore or web search.

# The vectorstore contains documents related to agents, prompt engineering, and adversarial attacks.

# Use the vectorstore for questions on these topics. For all else, and especially for current events, use web-search.

# Return JSON with single key, datasource, that is 'websearch' or 'vectorstore' depending on the question."""

router_instructions = """
You are an expert router for directing user queries.

The vectorstore contains documents related to:
- job role
- person qualification required
- skills

Use 'vectorstore' if the query is related to any of these topics.
For everything else (including current events), return 'websearch'.

### IMPORTANT ###
Your response MUST be a **valid JSON object** with exactly one key:
{
  "datasource": "vectorstore"
}
or
{
  "datasource": "websearch"
}

Do NOT include explanations, additional keys, or any other information.
"""


# Test router
test_web_search = llm_json_mode.invoke(
    [SystemMessage(content=router_instructions)]
    + [
        HumanMessage(
            content="What are the required skills for this job role?"
        )
    ]
)
# test_web_search_2 = llm_json_mode.invoke(
#     [SystemMessage(content=router_instructions)]
#     + [HumanMessage(content="What are the models released today for llama3.2?")]
# )
# test_vector_store = llm_json_mode.invoke(
#     [SystemMessage(content=router_instructions)]
#     + [HumanMessage(content="What are the types of agent memory?")]
# )
print(
    json.loads(test_web_search.content)
    # json.loads(test_web_search_2.content),
    # json.loads(test_vector_store.content),
)


{'datasource': 'vectorstore'}


**Assessing the goodness of Retrieved Document**

In [16]:
### Retrieval Grader

# Doc grader instructions
doc_grader_instructions = """You are a grader assessing relevance of a retrieved document to a user question.

If the document contains keyword(s) or semantic meaning related to the question, grade it as relevant."""

# Grader prompt
doc_grader_prompt = """Here is the retrieved document: \n\n {document} \n\n Here is the user question: \n\n {question}. 

This carefully and objectively assess whether the document contains at least some information that is relevant to the question.

Return JSON with single key, binary_score, that is 'yes' or 'no' score to indicate whether the document contains at least some information that is relevant to the question."""

# Test
question = "What is the job description for tescra ACHNET company?"
docs = retriever.invoke(question)
doc_txt = docs[1].page_content
doc_grader_prompt_formatted = doc_grader_prompt.format(
    document=doc_txt, question=question
)
result = llm_json_mode.invoke(
    [SystemMessage(content=doc_grader_instructions)]
    + [HumanMessage(content=doc_grader_prompt_formatted)]
)
json.loads(result.content)

Embedding texts: 100%|██████████| 1/1 [00:00<00:00, 12.00inputs/s]


{'key': 'relevant', 'score': 1}

**Integration and routing of "VectorBase" and "Web Search"**

In [43]:
import requests
from langchain.schema import HumanMessage, SystemMessage
import json


# RAG Prompt
rag_prompt = """You are an assistant for question-answering tasks.

Here is the context to use to answer the question:

{context}

Think carefully about the above context.

Now, review the user question:

{question}

Provide an answer to this question using only the above context.

Use three sentences maximum and keep the answer concise.

Answer:"""

# Function to search using TAVIly API
def tavili_search(query):
    url = "https://api.tavily.com/v1/search"  # Replace with TAVIly's actual search endpoint
    headers = {
        "Authorization": f"Bearer {os.getenv('TAVILY_API_KEY')}",
        "Content-Type": "application/json"
    }
    params = {
        "query": query,
        "num_results": 5  # You can adjust the number of results you want
    }
    
    response = requests.get(url, headers=headers, params=params)
    
    if response.status_code == 200:
        results = response.json()
        return results['data']  # Assuming the response returns a 'data' key containing search results
    else:
        return f"Error: {response.status_code} - {response.text}"


# Function to format retrieved documents
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# Function to check if retrieved docs are relevant
# def is_relevant(docs, question):
#     doc_grader_prompt = """Here is the retrieved document: \n\n {document} \n\n Here is the user question: \n\n {question}. 

#     Carefully and objectively assess whether the document contains at least some information that is relevant to the question.

#     ### IMPORTANT ###
#     Your response MUST be a **valid JSON object** with exactly one key:
#     {{
#       "binary_score": "Yes"
#     }}
#     or
#     {{
#       "binary_score": "No"
#     }}

#     Do NOT include explanations, additional keys, or any other information.
#     """
#     #doc_grader_prompt = """Here is the document retrieved from your search: \n\n{document}\n\nThe user has asked the following question: \n\n{question}\n\nDoes this document contain relevant information to answer the question? Please respond with either 'Yes' or 'No'."""


#     docs_txt = format_docs(docs)
#     doc_grader_prompt_formatted = doc_grader_prompt.format(document=docs_txt, question=question)

#     grading_response = llm_json_mode.invoke(
#         [SystemMessage(content="You are a grader assessing relevance of a retrieved document.")]
#         + [HumanMessage(content=doc_grader_prompt_formatted)]
#     )
#     print(f"Grading response: {grading_response.content}")
#     try:
#         grading_result = json.loads(grading_response.content)
#         return grading_result.get("binary_score", "No") == "Yes"
#     except json.JSONDecodeError:
#         return False  # If the model fails to return valid JSON, assume irrelevance.

def is_relevant(docs, question):
    doc_grader_prompt = """Here is the retrieved document: \n\n {document} \n\n Here is the user question: \n\n {question}. 

    Carefully and objectively assess whether the document contains **specific information** that is directly relevant to the question. 

    ### IMPORTANT ###
    - The document must explicitly mention or provide information related to the question.
    - If the document does not contain any information related to the question, return "No".
    - Your response MUST be a **valid JSON object** with exactly one key:
    {{
      "binary_score": "Yes"
    }}
    or
    {{
      "binary_score": "No"
    }}

    Do NOT include explanations, additional keys, or any other information.
    """

    docs_txt = format_docs(docs)
    doc_grader_prompt_formatted = doc_grader_prompt.format(document=docs_txt, question=question)

    # Debug: Print the formatted prompt
    print("Formatted Prompt:", doc_grader_prompt_formatted)

    grading_response = llm_json_mode.invoke(
        [SystemMessage(content="You are a grader assessing relevance of a retrieved document.")]
        + [HumanMessage(content=doc_grader_prompt_formatted)]
    )

    # Debug: Print the raw response from the model
    print("Raw Grading Response:", grading_response.content)

    try:
        grading_result = json.loads(grading_response.content)
        # Debug: Print the parsed JSON
        print("Parsed JSON:", grading_result)

        # Ensure the key exists and has a valid value
        if "binary_score" in grading_result and grading_result["binary_score"] in ["Yes", "No"]:
            # Additional check: If the document does not contain the question keywords, force "No"
            question_keywords = ["viral kohli", "IIT KGP campus"]
            if not any(keyword.lower() in docs_txt.lower() for keyword in question_keywords):
                print("Question keywords not found in document. Forcing 'No'.")
                return False
            return grading_result["binary_score"] == "Yes"
        else:
            print("Invalid JSON structure or missing 'binary_score' key.")
            return False
    except json.JSONDecodeError as e:
        print(f"JSON Decode Error: {e}")
        return False  # If the model fails to return valid JSON, assume irrelevance.  # If the model fails to return valid JSON, assume irrelevance.

# Main Logic for Routing
question = "what is the qualification requirement for tescra achnet software role?"
docs = retriever.invoke(question)



if is_relevant(docs, question):
    docs_txt = format_docs(docs)
    rag_prompt_formatted = rag_prompt.format(context=docs_txt, question=question)
    generation = llm.invoke([HumanMessage(content=rag_prompt_formatted)])
    answer = generation.content
else:
    print("No relevant context found. Searching the web using TAVIly...")
    tavili_results = tavili_search(question)  # Use TAVIly search
    answer = tavili_results  # Use TAVIly results as fallback

print(answer)


Embedding texts:   0%|          | 0/1 [00:00<?, ?inputs/s]

Embedding texts: 100%|██████████| 1/1 [00:00<00:00,  7.89inputs/s]


Formatted Prompt: Here is the retrieved document: 

 Detailed Job Description 
Job Profile 
• Job Designation: Software Engineer 
• Place of Posting: Bangalore/ Remote 
ACHNET, the world's leading AI-driven marketplace that fosters professional growth and 
collaboration, and is made with love by IITians, also we have been loved by IITians as we have been 
hiring them with the utmost preference in building our employee ecosystems.  
We are seeking skilled and motivated Software Development Engineers (SDEs) to join our agile 
team and contribute to the development of robust and feature-rich web applications. As an SDE at 
ACHNET, you will be a crucial part of our mission to create exceptional user experiences and drive 
innovation in technology. 
Role and Responsibilities:  
As an SDE focusing on web application development, you will be responsible for the entire software 
development lifecycle, from requirements analysis to deployment. You will work collaboratively 
within an agile team

**Function to Generate response only using the context**

In [None]:
### Generate
from langchain.schema import HumanMessage, SystemMessage

# Prompt
rag_prompt = """You are an assistant for question-answering tasks. 

Here is the context to use to answer the question:

{context} 

Think carefully about the above context. 

Now, review the user question:

{question}

Provide an answer to this questions using only the above context. 

Use three sentences maximum and keep the answer concise.

Answer:"""




# Post-processing
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


question = "what is the criteria for getting a branch change?"
docs = retriever.invoke(question)
docs_txt = format_docs(docs)
rag_prompt_formatted = rag_prompt.format(context=docs_txt, question=question)
generation = llm.invoke([HumanMessage(content=rag_prompt_formatted)])
print(generation.content)



NameError: name 'retriever' is not defined

In [12]:
### Hallucination Grader

# Hallucination grader instructions
hallucination_grader_instructions = """

You are a teacher grading a quiz. 

You will be given FACTS and a STUDENT ANSWER. 

Here is the grade criteria to follow:

(1) Ensure the STUDENT ANSWER is grounded in the FACTS. 

(2) Ensure the STUDENT ANSWER does not contain "hallucinated" information outside the scope of the FACTS.

Score:

A score of yes means that the student's answer meets all of the criteria. This is the highest (best) score. 

A score of no means that the student's answer does not meet all of the criteria. This is the lowest possible score you can give.

Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. 

Avoid simply stating the correct answer at the outset."""

# Grader prompt
hallucination_grader_prompt = """FACTS: \n\n {documents} \n\n STUDENT ANSWER: {generation}. 

Return JSON with two two keys, binary_score is 'yes' or 'no' score to indicate whether the STUDENT ANSWER is grounded in the FACTS. And a key, explanation, that contains an explanation of the score."""

# Test using documents and generation from above
hallucination_grader_prompt_formatted = hallucination_grader_prompt.format(
    documents=docs_txt, generation=generation.content
)
result = llm_json_mode.invoke(
    [SystemMessage(content=hallucination_grader_instructions)]
    + [HumanMessage(content=hallucination_grader_prompt_formatted)]
)
json.loads(result.content)

{'score': 'yes',
 'key': 'Ensure the STUDENT ANSWER is grounded in the FACTS.',
 'explanation': "The context clearly states that the job requires a Bachelor's degree with a Master's preferred. The student answer aligns with this requirement, ensuring it meets all criteria."}

In [None]:
# ### Answer Grader

# # Answer grader instructions
# answer_grader_instructions = """You are a teacher grading a quiz. 

# You will be given a QUESTION and a STUDENT ANSWER. 

# Here is the grade criteria to follow:

# (1) The STUDENT ANSWER helps to answer the QUESTION

# Score:

# A score of yes means that the student's answer meets all of the criteria. This is the highest (best) score. 

# The student can receive a score of yes if the answer contains extra information that is not explicitly asked for in the question.

# A score of no means that the student's answer does not meet all of the criteria. This is the lowest possible score you can give.

# Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. 

# Avoid simply stating the correct answer at the outset."""

# # Grader prompt
# answer_grader_prompt = """QUESTION: \n\n {question} \n\n STUDENT ANSWER: {generation}. 

# Return JSON with two two keys, binary_score is 'yes' or 'no' score to indicate whether the STUDENT ANSWER meets the criteria. And a key, explanation, that contains an explanation of the score."""

# # Test
# question = "What are the traits of a good man?"
# answer = "The Llama 3.2 models released today include two vision models: Llama 3.2 11B Vision Instruct and Llama 3.2 90B Vision Instruct, which are available on Azure AI Model Catalog via managed compute. These models are part of Meta's first foray into multimodal AI and rival closed models like Anthropic's Claude 3 Haiku and OpenAI's GPT-4o mini in visual reasoning. They replace the older text-only Llama 3.1 models."

# # Test using question and generation from above
# answer_grader_prompt_formatted = answer_grader_prompt.format(
#     question=question, generation=answer
# )
# result = llm_json_mode.invoke(
#     [SystemMessage(content=answer_grader_instructions)]
#     + [HumanMessage(content=answer_grader_prompt_formatted)]
# )
# json.loads(result.content)

{'binary_score': 'yes', 'explanation': '...'}

**Fucntion to extract and save the embedding in a vectorbase.json file**

In [20]:
import json

# Extract vectors and document metadata
vector_data = {
    "documents": [doc.page_content for doc in doc_splits],
    "metadata": [doc.metadata for doc in doc_splits],
}

# Save as JSON
with open("vectorstore.json", "w", encoding="utf-8") as f:
    json.dump(vector_data, f, ensure_ascii=False, indent=4)

print("Vector store saved as JSON!")


Vector store saved as JSON!


Processing .json and .txt files, created individual functions for each files (flat and nested .json files)

In [31]:
import json
import os

def process_eatery_data(data):
    #food_reviews.json
    """Processes eatery data and returns a list of dictionaries with review text and metadata."""
    processed_eatery_reviews = []
    for eatery in data:
        eatery_name = eatery["eatery_name"]
        rating = eatery["rating"]
        for review in eatery["reviews"]:
            review_text = review["text"]
            sentiment = review.get("sentiment")  # Use get to handle missing sentiment
            aspects = review.get("aspects", [])  # Use get to handle missing aspects, default to empty list.

            processed_eatery_reviews.append({
                "text": review_text,
                "metadata": {
                    "eatery": eatery_name,
                    "rating": rating,
                    "sentiment": sentiment,
                    "aspects": aspects,
                },
            })
    return processed_eatery_reviews

def process_faq_data(data):
    #tsg_faq.json
    """Processes FAQ data and returns a list of dictionaries with answer text and metadata."""
    processed_faqs = []
    for faq in data["faq"]:
        question = faq["question"]
        answer = faq["answer"]
        processed_faqs.append({
            "text": answer,
            "metadata": {
                "question": question
            }
        })
    return processed_faqs

def process_subject_registration_data(data):
    #faq_ug_reg.json
    """Processes Subject Registration FAQs data and returns a list of dictionaries with reply text and metadata."""
    processed_subject_registration_qas = []
    for category, qa_pairs in data["Subject Registration FAQs"].items():
        for qa_pair in qa_pairs:
            processed_subject_registration_qas.append({
                "text": qa_pair["reply"],
                "metadata": {
                    "question": qa_pair["query"],
                    "category": category  # Add category to metadata
                }
            })
    return processed_subject_registration_qas

def process_IITKGP_Faqs_data(data):
    #iitkgp_faq.json
    """Processes IIT KGP FAQs data and returns a list of dictionaries with reply text and metadata."""
    processed_subject_registration_qas = []
    for category, qa_pairs in data["academic_options"].items():
        for qa_pair in qa_pairs:
            processed_subject_registration_qas.append({
                "text": qa_pair["answer"],
                "metadata": {
                    "question": qa_pair["question"],
                    "category": category  # Add category to metadata
                }
            })
    return processed_subject_registration_qas

def process_small_text_files(folder_path):
    """
    Processes all .txt files in a given folder, assuming each contains small texts 
    (3-4 lines max). Returns a list of dictionaries with text and metadata.
    """
    processed_texts = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    lines = f.readlines()
                    # Remove leading/trailing whitespace and filter out empty lines
                    text = "\n".join(line.strip() for line in lines if line.strip()) 
                    if text:  # Only append if the text is not empty
                        processed_texts.append({
                            "text": text,
                            "metadata": {"source": filename}
                        })
            except FileNotFoundError:
                print(f"File not found: {file_path}")
            except Exception as e:
                print(f"Error processing file {file_path}: {e}")
    return processed_texts

processed_data = []

# Load and process eatery data
if os.path.exists("food_review\food_reviews.json"):
    with open("food_reviews.json", "r") as f:
        eatery_data = json.load(f)
    processed_data.extend(process_eatery_data(eatery_data))

# Load and process FAQ data
if os.path.exists("tsg_faqs.json"):
    with open("tsg_faqs.json", "r") as f:
        faq_data = json.load(f)
    processed_data.extend(process_faq_data(faq_data))

# Load and process Subject Registration FAQs data
if os.path.exists("resources\faq_ug_reg.json"):
    with open("faq_ug_reg.json", "r") as f:
        subject_registration_data = json.load(f)
    processed_data.extend(process_subject_registration_data(subject_registration_data))

if os.path.exists("resources\iitkgp_faqs.json"):
    with open("resources\iitkgp_faqs.json", "r", encoding="utf-8") as f:
        iitkgp_fqs_data = json.load(f)
    processed_data.extend(process_IITKGP_Faqs_data(iitkgp_fqs_data))

# Load and process text files (example: factual_text.txt)
if os.path.exists("text_files"):
    processed_data.extend(process_small_text_files("text_files"))

# Now processed_data contains all data in a unified format
# ... chunking, embedding, and indexing ...

print(processed_data[0]) #prints the first entry.

{'text': "From the **third semester onwards**, undergraduate students, including 2-year MSc students, have the opportunity to opt for additional subjects, contingent on the established rules and regulations [1]. Additional subjects are designed to allow students to explore areas of interest and acquire knowledge that may be beneficial for their future careers [1]. To be eligible for additional subjects, a student must maintain a **Cumulative Grade Point Average (CGPA) of 7.50 or greater, without any backlogs** [1]. Students are allowed to register for additional subjects even with **EAA (Essential Academic Activities) as a backlog** [2]. The number of additional credits a student can earn is capped at **33% of the credit requirements for their major** [2]. A student's registration for an additional subject is subject to several conditions: they must satisfy any **pre-requisites** for the course, there should be **no timetable conflicts**, and the **class size** must permit their enrolm

In [24]:
processed_data

[{'text': "From the **third semester onwards**, undergraduate students, including 2-year MSc students, have the opportunity to opt for additional subjects, contingent on the established rules and regulations [1]. Additional subjects are designed to allow students to explore areas of interest and acquire knowledge that may be beneficial for their future careers [1]. To be eligible for additional subjects, a student must maintain a **Cumulative Grade Point Average (CGPA) of 7.50 or greater, without any backlogs** [1]. Students are allowed to register for additional subjects even with **EAA (Essential Academic Activities) as a backlog** [2]. The number of additional credits a student can earn is capped at **33% of the credit requirements for their major** [2]. A student's registration for an additional subject is subject to several conditions: they must satisfy any **pre-requisites** for the course, there should be **no timetable conflicts**, and the **class size** must permit their enrol

Simpler chunking strategy

In [28]:
import nltk  # For sentence tokenization

nltk.download('punkt_tab')  # Download sentence tokenizer data

def topic_and_sentence_chunking(processed_data):
    """Chunks text in processed_data based on topics and sentences."""
    chunked_data = []
    for entry in processed_data:
        text = entry["text"]
        metadata = entry["metadata"].copy()  # Create a copy to avoid modifying the original

        if "source" in metadata and metadata["source"].endswith(".txt"):  # Apply topic chunking to .txt files
            lines = text.split('\n')
            current_topic = None
            current_topic_text = ""

            for line in lines:
                line = line.strip()
                if not line:
                    continue  # Skip empty lines

                if line.endswith(':'):
                    if current_topic:
                        sentences = nltk.sent_tokenize(current_topic_text)
                        for sentence in sentences:
                            if sentence.strip():
                                chunked_data.append({
                                    "text": sentence.strip(),
                                    "metadata": {"topic": current_topic, **metadata} #merge metadata
                                })
                        current_topic_text = ""
                    current_topic = line[:-1].strip()
                else:
                    current_topic_text += " " + line

            if current_topic and current_topic_text.strip():
                sentences = nltk.sent_tokenize(current_topic_text)
                for sentence in sentences:
                    if sentence.strip():
                        chunked_data.append({
                            "text": sentence.strip(),
                            "metadata": {"topic": current_topic, **metadata} #merge metadata
                        })

        elif "answer" in entry["text"] or "question" in metadata: #apply sentence chunking to json files
          sentences = nltk.sent_tokenize(text)
          for sentence in sentences:
            if sentence.strip():
              chunked_data.append({
                "text": sentence.strip(),
                "metadata": metadata
              })

        else: #keep all other entries as they are
            chunked_data.append(entry)

    return chunked_data

# Assuming processed_data is already populated as in your code
processed_chunked_data = topic_and_sentence_chunking(processed_data)

# Example: Print the first few chunked items
for i in range(min(5, len(processed_chunked_data))):
    print(processed_chunked_data[i])

{'text': 'From the **third semester onwards**, undergraduate students, including 2-year MSc students, have the opportunity to opt for additional subjects, contingent on the established rules and regulations [1].', 'metadata': {'question': 'What is an additional subject, and what are the rules regarding an additional subject?'}}
{'text': 'Additional subjects are designed to allow students to explore areas of interest and acquire knowledge that may be beneficial for their future careers [1].', 'metadata': {'question': 'What is an additional subject, and what are the rules regarding an additional subject?'}}
{'text': 'To be eligible for additional subjects, a student must maintain a **Cumulative Grade Point Average (CGPA) of 7.50 or greater, without any backlogs** [1].', 'metadata': {'question': 'What is an additional subject, and what are the rules regarding an additional subject?'}}
{'text': 'Students are allowed to register for additional subjects even with **EAA (Essential Academic Ac

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Hybrid chunking with some improvements

In [None]:
import nltk  # For sentence tokenization

nltk.download('punkt_tab')  # Download sentence tokenizer data

def topic_and_sentence_chunking(processed_data):
    """Chunks text in processed_data based on topics and sentences."""
    chunked_data = []
    for entry in processed_data:
        text = entry["text"]
        metadata = entry["metadata"].copy()  # Create a copy to avoid modifying the original

        if "source" in metadata and metadata["source"].endswith(".txt"):  # Apply topic chunking to .txt files
            lines = text.split('\n')
            current_topic = None
            current_topic_text = ""

            for line in lines:
                line = line.strip()
                if not line:
                    continue  # Skip empty lines

                # Detect topic (lines ending with ':' or ALL CAPS headings)
                if line.endswith(':') or line.isupper():
                    if current_topic:
                        # Split topic text into sentences and create chunks
                        sentences = nltk.sent_tokenize(current_topic_text)
                        for sentence in sentences:
                            if sentence.strip():
                                chunked_data.append({
                                    "text": sentence.strip(),
                                    "metadata": {"topic": current_topic, **metadata}  # Merge metadata
                                })
                        current_topic_text = ""
                    current_topic = line[:-1].strip() if line.endswith(':') else line.strip()
                else:
                    current_topic_text += " " + line

            # Handle the last topic
            if current_topic and current_topic_text.strip():
                sentences = nltk.sent_tokenize(current_topic_text)
                for sentence in sentences:
                    if sentence.strip():
                        chunked_data.append({
                            "text": sentence.strip(),
                            "metadata": {"topic": current_topic, **metadata}  # Merge metadata
                        })

        elif "answer" in entry["text"] or "question" in metadata:  # Apply sentence chunking to Q&A content
            sentences = nltk.sent_tokenize(text)
            for sentence in sentences:
                if sentence.strip():
                    # Add question to metadata if available
                    new_metadata = metadata.copy()
                    if "question" in metadata:
                        new_metadata["question"] = metadata["question"]
                    chunked_data.append({
                        "text": sentence.strip(),
                        "metadata": new_metadata
                    })

        else:  # Keep all other entries as they are
            chunked_data.append(entry)

    return chunked_data

# Example usage
processed_chunked_data = topic_and_sentence_chunking(processed_data)

# Print the first few chunked items
for i in range(min(5, len(processed_chunked_data))):
    print(processed_chunked_data[i])

{'text': 'From the **third semester onwards**, undergraduate students, including 2-year MSc students, have the opportunity to opt for additional subjects, contingent on the established rules and regulations [1].', 'metadata': {'question': 'What is an additional subject, and what are the rules regarding an additional subject?'}}
{'text': 'Additional subjects are designed to allow students to explore areas of interest and acquire knowledge that may be beneficial for their future careers [1].', 'metadata': {'question': 'What is an additional subject, and what are the rules regarding an additional subject?'}}
{'text': 'To be eligible for additional subjects, a student must maintain a **Cumulative Grade Point Average (CGPA) of 7.50 or greater, without any backlogs** [1].', 'metadata': {'question': 'What is an additional subject, and what are the rules regarding an additional subject?'}}
{'text': 'Students are allowed to register for additional subjects even with **EAA (Essential Academic Ac

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Function to embed the processed_chunked_data

In [34]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load the Sentence Transformer model
model = SentenceTransformer('all-mpnet-base-v2') #a good general purpose model.

def embed_chunks(chunked_data):
    """Embeds the text in chunked_data using Sentence Transformers."""
    embedded_data = []
    for entry in chunked_data:
        embedding = model.encode(entry["text"])
        embedded_data.append({
            "embedding": embedding.tolist(),  # Convert numpy array to list for JSON serialization
            "text": entry["text"],
            "metadata": entry["metadata"]
        })
    return embedded_data

# Assuming processed_chunked_data is already populated
embedded_data = embed_chunks(processed_chunked_data)

# Example: Print the first embedded item
print(embedded_data[0])

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


{'embedding': [0.009000413119792938, -0.006995189934968948, -0.02166716754436493, -0.03337167575955391, -0.019558876752853394, 0.03985268622636795, 0.037032417953014374, -0.004523812793195248, 0.028543390333652496, 0.0116368243470788, 0.06629471480846405, -0.0013004661304876208, -0.022470910102128983, 0.021916117519140244, 0.03274707496166229, -0.013073545880615711, 0.03575897961854935, -0.04825406149029732, -0.09341385215520859, -0.02244574762880802, -0.021044088527560234, 0.012914509512484074, -0.052560534328222275, -0.038265928626060486, 0.07312196493148804, -0.006599404849112034, 0.002231653081253171, 0.0025817984715104103, -0.030674153938889503, 0.0015197135508060455, 0.03988062962889671, 0.03167644888162613, 0.020262403413653374, -0.004446287639439106, 2.1647169887728523e-06, -0.06071391701698303, -0.05157018080353737, 0.0063461423851549625, -0.043238576501607895, 0.000779717811383307, 0.023992864415049553, 0.0601343959569931, 0.05596589669585228, 0.006336602382361889, -0.0200903

For deleting the chromadb instance and vectorbase

In [38]:
import chromadb

client = chromadb.PersistentClient(path="./vectorized_db")
client.delete_collection(name="iitkgp_data")
print("Collection 'iitkgp_data' deleted.")

Collection 'iitkgp_data' deleted.


Function for creating and saving vector database

In [39]:
import chromadb
from chromadb.utils import embedding_functions
import uuid #import uuid

# Initialize ChromaDB client with persistence
client = chromadb.PersistentClient(path="./vectorized_db")

# Create Embedding Function
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2")

# Create a collection with persistence
collection = client.create_collection(name="iitkgp_data", embedding_function=sentence_transformer_ef)

# Assuming 'embedded_data' is your list of dictionaries with embeddings, text, and metadata
for entry in embedded_data:
    collection.add(
        embeddings=[entry["embedding"]],
        documents=[entry["text"]],
        metadatas=[entry["metadata"]],
        ids=[str(uuid.uuid4())] #add ids
    )

print("ChromaDB collection created, populated, and saved.")

ChromaDB collection created, populated, and saved.


In [59]:
import chromadb
from chromadb.utils import embedding_functions
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings # using hugging face embeddings to match chromadb

# Initialize ChromaDB client with persistence
client = chromadb.PersistentClient(path="./vectorized_db")

# Create Embedding Function
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2")

# Load the collection
collection = client.get_collection(name="iitkgp_data", embedding_function=sentence_transformer_ef)

# Create LangChain embeddings object (using HuggingFaceEmbeddings to match chromadb embeddings)
embeddings = HuggingFaceEmbeddings(model_name="all-mpnet-base-v2")

# Create LangChain Chroma vectorstore
vectorstore = Chroma(client=client, collection_name="iitkgp_data", embedding_function=embeddings)

# Create retriever
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

In [66]:
from langchain.schema import HumanMessage

# Initialize LLM (replace with your chosen LLM)
#you will need to have your openai key set as an environment variable

# Prompt
rag_prompt = """You are an assistant for question-answering tasks. 

Here is the context to use to answer the question:

{context} 

Think carefully about the above context. 

Now, review the user question:

{question}

Provide an answer to this questions using only the above context. 

Use three sentences maximum and keep the answer concise.

Answer:"""

# Post-processing
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

question = "What is the rating of Cafe coffee day?"
docs = retriever.invoke(question)
docs_txt = format_docs(docs)
rag_prompt_formatted = rag_prompt.format(context=docs_txt, question=question)
generation = llm.invoke([HumanMessage(content=rag_prompt_formatted)])
print(generation.content)

<think>
Okay, so I need to figure out how to determine the rating of Cafe coffee day based on the given context. Let me read through the context again carefully.

The context mentions that Micro-Credit courses can be held during evenings, weekdays, or weekends. It also talks about Micro-Credit courses being one-credit courses lasting 3-14 days and offered during the semester or in available free slots. The grade is indicated under semester 9.

Then there's a section about students admitted to the 4-year B.Tech.(Hons.) program, which states that they must have a minimum CGPA of 7.0 without any backlog at the time of application.

The question is asking for the rating of Cafe coffee day. Hmm, I don't see any mention of cafes or coffee days in this context. The focus seems to be on Micro-Credit courses and B.Tech students' academic requirements.

Wait, maybe there's a typo or something missing. Perhaps it's referring to a specific institution that provides these services, but the provided

In [67]:
def display_retriever_contents(retriever, query):
    """
    Displays the contents that the retriever extracts from the vector database.

    Args:
        retriever: The LangChain retriever object.
        query: The query string to use for retrieval.
    """
    docs = retriever.invoke(query)
    if not docs:
        print("Retriever found no relevant documents.")
        return

    print("Retriever Contents:\n")
    for i, doc in enumerate(docs):
        print(f"Document {i + 1}:")
        print(f"  Page Content: {doc.page_content}")
        print(f"  Metadata: {doc.metadata}")
        print("-" * 40)
question = "What is the rating of Cafe coffee day?"
# Example Usage (assuming 'retriever' and 'question' are already defined)
display_retriever_contents(retriever, question) #use the same question variable from your other code.

Retriever Contents:

Document 1:
  Page Content: They can be held during evenings, weekdays, or weekends.
  Metadata: {'category': 'micro_credits', 'question': 'What are Micro-Credit courses?'}
----------------------------------------
Document 2:
  Page Content: Micro-Credit courses are one-credit courses lasting 3-14 days, offered during the semester or in available free slots.
  Metadata: {'category': 'micro_credits', 'question': 'What are Micro-Credit courses?'}
----------------------------------------
Document 3:
  Page Content: The grade will be indicated in the transcript under semester 9.
  Metadata: {'category': 'semester_away_project_program_sapp', 'question': 'How is the final evaluation conducted for SAPP?'}
----------------------------------------
Document 4:
  Page Content: Students admitted to the 4-year B.Tech.(Hons.)
  Metadata: {'category': 'switchover_dual_degree', 'question': 'What are the options for switching over to Dual Degree programs?'}
------------------------