In [1]:
## Imports
import os
import shutil
from dotenv import load_dotenv

from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import PyPDFLoader, TextLoader, JSONLoader, BSHTMLLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_ollama import OllamaLLM
from langchain.chains import RetrievalQA
from langchain.chains.summarize import load_summarize_chain
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI

In [2]:
# Load .env file variables
# load_dotenv() copies the key/value pairs from a local .env file into the
# process environment so the os.getenv() lookups below can see them.
load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
LANGSMITH_TRACING = os.getenv("LANGSMITH_TRACING")
LANGSMITH_API_KEY = os.getenv("LANGSMITH_API_KEY")
LANGSMITH_PROJECT = os.getenv("LANGSMITH_PROJECT")
# Read the correctly spelled variable first; fall back to the misspelled name
# that earlier versions of this notebook looked up, so existing .env files
# keep working.
LANGSMITH_ENDPOINT = os.getenv("LANGSMITH_ENDPOINT") or os.getenv("LAMGSMITH_ENDPOINT")

# Backward-compatible aliases for the originally misspelled variable names.
LAMGSMITH_API_KEY = LANGSMITH_API_KEY
LAMGSMITH_ENDPOINT = LANGSMITH_ENDPOINT

# The Google key is the only hard requirement checked here.
if not GOOGLE_API_KEY:
    raise ValueError("Please set the GOOGLE_API_KEY in your .env file")

In [None]:
## Configuration
# NOTE(review): in the cells visible here `folder_path` and `collection_name`
# are never read -- the ingestion cell lists the literal "documents" directory
# and names its collections "pdf-collection" / "nonpdf-collection". Confirm
# whether these two settings are stale or should be wired in.
folder_path = "../documents"
# Parent directory of the persisted Chroma stores ("pdf" and "nonpdf" sub-folders).
persist_directory = "../chroma_db"
collection_name = "all_documents"
# Passed to RecursiveCharacterTextSplitter in the ingestion cell.
chunk_size = 600
chunk_overlap = 100

In [4]:
## Reset the persisted vector store so the next ingestion starts from scratch.
## Only run this cell when a rebuild is actually wanted.
# With ignore_errors=True, rmtree quietly does nothing when the directory is
# absent, so no separate existence check is needed.
shutil.rmtree(persist_directory, ignore_errors=True)

In [None]:
## Instructions for dynamic prompting
# Maps a question type to the instruction text that is substituted into the
# {instruction} slot of `base_prompt_template`. The keys must match the labels
# the question classifier is asked to produce ("Factual" / "Interpretive").
instructions = {
    "Factual": """
    Your task is to answer the question based ONLY on the provided context. 
    Extract specific, accurate information directly from the text. 
    If the answer is not found, clearly state that the information is unavailable. 
    Provide a complete and well-formed answer in full sentences.
    """,
    "Interpretive": """
    Your task is to provide a thoughtful interpretive answer by synthesizing the information in the provided context. 
    Draw meaningful connections, explain implications, and summarize broader themes or significance. 
    Make your answer clear, coherent, and complete, even if the information is scattered.
    """
}

## Classification prompt to detect question type using few-shot examples
# The model is told to reply with a single word, "Factual" or "Interpretive";
# those two words are also the keys of the `instructions` dict.
question_classifier_prompt = PromptTemplate(
    input_variables=["query"],
    template="""
You are a classifier that determines whether a user question is Factual or Interpretive.

- A *Factual* question asks for specific information found directly in a document (e.g., names, dates, techniques, facts).
- An *Interpretive* question asks for broader meaning, implications, or synthesis (e.g., summaries, themes, significance).

Respond with only the single word: Factual or Interpretive.

Examples:

Question: What year was Nikola Tesla born?
Classification: Factual

Question: What is this paper mainly about?
Classification: Interpretive

Question: Who are the key figures mentioned in this research?
Classification: Factual

Question: How does this paper relate to broader trends in machine learning?
Classification: Interpretive

Now classify the following question:
Question: {query}
Classification:
"""
)

## Add a topic classifier to detect question topic using few-shot examples
# The first four labels are the same strings `assign_topic` stores in each
# chunk's "topic" metadata; "Other" is the catch-all for out-of-scope questions.
topic_classifier_prompt = PromptTemplate(
    input_variables=["query"],
    template="""
You are a classifier that assigns a topic label to a question. 
Choose the single most relevant topic from this exact list:

- Technology
- People
- Science
- Literature
- Other

Respond with ONLY the topic name (one of the above) and nothing else.

Examples:

Question: What are common techniques used in machine learning?
Topic: Technology

Question: What was Alan Turing's contribution to computer science?
Topic: People

Question: What are some of the impacts of climate change?
Topic: Science

Question: What are some popular books published after 2000?
Topic: Literature

Question: What is a good cheese to pair with red wine?
Topic: Other

Now classify this question:
Question: {query}
Topic:
"""
)

## Base template for passing in instructions, context, and a user question
# {instruction} is filled from the `instructions` dict; {context} and
# {question} are left for the QA chain to fill at query time.
# NOTE(review): the escaped \"\"\" lines are part of the template text, so the
# model receives literal triple-quote lines around the prompt (including one
# after "Answer:"). Presumably intended as delimiters -- confirm.
base_prompt_template = PromptTemplate(
    input_variables=["instruction", "context", "question"],
    template="""
    \"\"\"
    You are an intelligent document assistant.
    {instruction}

    Context:
    {context}

    Question: 
    {question}

    Answer in complete sentences, using ONLY the information given in the context. 
    If the context does not contain the answer, say so clearly.

    Answer:
    \"\"\"
    """
)

In [6]:
## Local Ollama model shared by every chain in this notebook
llm = OllamaLLM(model="gemma3")

## Question-type classifier (Factual vs. Interpretive):
## prompt -> LLM -> plain string
question_classifier_chain = (
    question_classifier_prompt
    | llm
    | StrOutputParser()
)

## Topic classifier (built from the topic classifier prompt above):
## prompt -> LLM -> plain string
topic_classifier_chain = (
    topic_classifier_prompt
    | llm
    | StrOutputParser()
)

In [None]:
## Assign each file/document to a particular topic based on extension
# Extension -> topic label. The labels are the same strings the topic
# classifier prompt asks the model to choose from.
_TOPIC_BY_EXTENSION = {
    ".pdf": "Technology",
    ".txt": "People",
    ".html": "Science",
    ".json": "Literature",
}


def assign_topic(path):
    """Return the topic label for a file, chosen purely from its extension.

    The comparison is case-insensitive so that e.g. "Report.PDF" gets the same
    topic as "report.pdf", matching the case-insensitive ".pdf" check used by
    the ingestion loop.

    Args:
        path (str): File name or path; only its ending is inspected.

    Returns:
        str: "Technology", "People", "Science" or "Literature" for supported
        extensions; otherwise a descriptive "Unknown: ..." string that embeds
        the original path.
    """
    lowered = path.lower()
    for extension, topic in _TOPIC_BY_EXTENSION.items():
        if lowered.endswith(extension):
            return topic
    return f"Unknown: This file type is unsupported {path}, so a topic was not assigned."

## Split each document type into chunks
def load_and_split_file(path, splitter):
    """Load one file with the loader matching its extension and return its chunks.

    PDF, text and HTML documents are loaded and then passed through `splitter`.
    JSON files are turned into one document per entry of the top-level `books`
    array by a jq expression and are returned without further splitting.

    Every returned document carries two metadata fields:
      - "source": the file's base name
      - "topic":  the label returned by `assign_topic` for that name

    Args:
        path (str): Path of the file to load. The extension is matched
            case-insensitively.
        splitter: Object exposing `split_documents(docs)`; unused for JSON.

    Returns:
        list: The chunked (or, for JSON, per-book) documents.

    Raises:
        ValueError: If the extension is not .pdf, .txt, .html or .json.
    """
    # Compare against a lower-cased copy so "Report.PDF" is loaded like
    # "report.pdf"; the ingestion loop already classifies PDFs this way.
    lowered_path = path.lower()

    should_split = True # All documents other than JSON files should be split/chunked
                        # using a recursive character splitter
    if lowered_path.endswith(".pdf"):
        loader = PyPDFLoader(path)
    elif lowered_path.endswith(".txt"):
        loader = TextLoader(path, encoding="utf-8")
    elif lowered_path.endswith(".html"):
        loader = BSHTMLLoader(path, open_encoding="utf-8")
    elif lowered_path.endswith(".json"):
        # Extract all book fields into a single formatted text using a JQ query expression.
        # This is so that specific data from a JSON file can be effectively parsed and extracted,
        # resulting in a clean, readable text representation that is already "chunked".
        jq_schema = (
            '.books[] | "Title: " + .title + '
            '"\\nAuthor: " + .author + '
            '"\\nGenre: " + .genre + '
            '"\\nPublication Year: " + (.publication_year | tostring) + '
            '"\\nDescription: " + .description + '
            '"\\nPositive Review: " + .reviews.positive + '
            '"\\nNegative Review: " + .reviews.negative'
        )

        loader = JSONLoader(
            file_path=path,
            jq_schema=jq_schema,
            text_content=False
        )

        should_split = False

    else:
        raise ValueError(f"Skipping unsupported file type: {path}")

    docs = loader.load()

    source_name = os.path.basename(path)
    # Assign a topic label for filtering or classification, based on filename.
    # The name is lower-cased for the lookup so the topic stays correct for
    # upper-case extensions regardless of how assign_topic compares them.
    topic = assign_topic(source_name.lower())

    for doc in docs:
        doc.metadata["source"] = source_name
        doc.metadata["topic"] = topic # add topic metadata here

    if should_split:
        return splitter.split_documents(docs)
    return docs


In [None]:
## Store chunks from PDF and non-PDF files
pdf_chunks = []
non_pdf_chunks = []

## Text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

## Embedding models:
## Here all PDF files will use Google API embeddings, and all other files will use Hugging Face embeddings
google_embeddings = GoogleGenerativeAIEmbeddings(model = "models/embedding-001", google_api_key = GOOGLE_API_KEY)
hf_embeddings = HuggingFaceEmbeddings(model_name = "sentence-transformers/all-MiniLM-L6-v2")

## Split documents into chunks and add to appropriate list
# NOTE(review): this reads the "documents" folder relative to the working
# directory; the `folder_path` setting from the configuration cell is not used
# here. Confirm which location is intended.
documents_dir = "documents"
# Extensions that load_and_split_file knows how to load.
supported_extensions = (".pdf", ".txt", ".html", ".json")

# sorted() gives a stable ingestion order; os.listdir() order is arbitrary.
for filename in sorted(os.listdir(documents_dir)):
    file_path = os.path.join(documents_dir, filename)

    # Skip sub-directories and stray files (e.g. editor or OS artefacts) instead
    # of letting load_and_split_file abort the whole ingestion with a ValueError.
    if not os.path.isfile(file_path) or not filename.lower().endswith(supported_extensions):
        print(f"Skipping unsupported file type: {file_path}")
        continue

    chunks = load_and_split_file(file_path, text_splitter)

    if filename.lower().endswith(".pdf"):
        pdf_chunks.extend(chunks)
    else:
        non_pdf_chunks.extend(chunks)

## Store PDFs with Google embeddings
vector_store_pdf = Chroma.from_documents(documents = pdf_chunks,
                                         embedding = google_embeddings,
                                         collection_name = "pdf-collection",
                                         persist_directory = os.path.join(persist_directory, "pdf")
                                         )

## Store non-PDFs with Hugging Face embeddings
vector_store_non_pdf = Chroma.from_documents(documents = non_pdf_chunks,
                                             embedding = hf_embeddings,
                                             collection_name = "nonpdf-collection",
                                             persist_directory = os.path.join(persist_directory, "nonpdf")
                                             )

# vector_store.persist() # not needed as of the latest version of chroma_db

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
## Main function using RetrievalQA
# Canonical labels. Their spelling matches the keys of `instructions` and the
# "topic" metadata written at ingestion time, so they can be used directly for
# prompt selection and metadata filtering.
_QUESTION_TYPES = ("Factual", "Interpretive")
_KNOWN_TOPICS = ("Technology", "People", "Science", "Literature", "Other")


def _match_known_label(raw_label, known_labels):
    """Return the entry of `known_labels` named by `raw_label` (case/decoration-insensitive), else None."""
    if not raw_label:
        return None
    # Drop surrounding whitespace plus markdown/punctuation decoration such as
    # "**Factual**" or "People." that LLMs commonly add around one-word answers.
    cleaned = str(raw_label).strip().strip("*_`'\".:;,!?").strip().lower()
    for label in known_labels:
        if cleaned == label.lower():
            return label
    return None


def answer_question_with_retrievalqa(query: str, filename: str = None, topic: str = None,
                                     question_type: str = None, k: int = 3) -> dict:
    """
    Answers a user query using dynamic retrieval and prompting.

    - If `filename` is provided, filters chunks from that specific document.
    - If `topic` is provided, filters chunks by topic metadata.
    - If neither is provided, auto-detects topic using a topic classifier.

    Also classifies the query as 'Factual' or 'Interpretive' to adjust the prompt,
    unless `question_type` is given explicitly.

    Topic and question-type labels (user-supplied or produced by a classifier)
    are matched case-insensitively against the known labels and converted to
    their canonical spelling, so that "technology" or "People." still select
    the right prompt, vector store and metadata filter.

    Args:
        query (str): The user's question.
        filename (str, optional): Name of the file to restrict retrieval to.
        topic (str, optional): Topic label to restrict document retrieval.
        question_type (str, optional): "Factual" or "Interpretive". Anything
            else (or an unrecognized classifier reply) falls back to "Interpretive".
        k (int): Number of top chunks to retrieve.

    Returns either:
        - dict --> result: The LLM-generated answer.
        - dict --> result: a warning message if no relevant documents are found,
          if the topic is "Other", or if the QA chain raises.

    """
    # Classify the question type if not explicitly provided
    if question_type is None:
        try:
            # Classify the question type as either "Factual" or "Interpretive"
            raw_type = question_classifier_chain.invoke({"query": query})
            question_type = _match_known_label(raw_type, _QUESTION_TYPES)
            if question_type is None:
                print(f"Warning: Unrecognized classifier output '{str(raw_type).strip()}'. Defaulting to Interpretive.")
                question_type = "Interpretive"
            else:
                print(f"Question Type Detected: {question_type}")

        except Exception as e:
            print(f"Classifier error: {e}. Defaulting to Interpretive.")
            question_type = "Interpretive" # Fallback option if an exception occurs

    else:
        # Normalize user input
        provided_type = question_type
        question_type = _match_known_label(provided_type, _QUESTION_TYPES)
        if question_type is None:
            print(f"Warning: Unsupported question_type '{provided_type}'. Defaulting to Interpretive.")
            question_type = "Interpretive"
        else:
            print(f"Question Type Provided: {question_type}")

    # Canonicalize a user-supplied topic. Unknown labels are kept (stripped) so
    # that filtering on custom topic metadata still works.
    if topic:
        topic = _match_known_label(topic, _KNOWN_TOPICS) or str(topic).strip()

    # Detect the topic of the query if not explicitly provided
    if not filename and not topic:
        try:
            raw_topic = topic_classifier_chain.invoke({"query": query})
            topic = _match_known_label(raw_topic, _KNOWN_TOPICS) or str(raw_topic).strip()
            print(f"Inferred Topic: {topic}")
        except Exception as e:
            print(f"Topic classifier error: {e}. Skipping topic filter.")

    # Early check for the "Other" topic to short-circuit retrieval, as no documents used in
    # creating the database fall under this category
    if topic and topic.lower() == "other":
        return {"result": "Your question falls under the 'Other' category, which is outside the scope of the available documents."}

    # Build metadata filter for the retriever depending on whether a filename or
    # topic is provided by the user (or inferred by the LLM). A filename takes
    # precedence over a topic.
    filter_by = None
    if filename:
        filter_by = {"source": filename}
    elif topic:
        filter_by = {"topic": topic}

    # Select instruction and apply to prompt
    instruction_text = instructions.get(question_type, instructions["Interpretive"])
    dynamic_prompt = base_prompt_template.partial(instruction=instruction_text)

    # Set up retriever with an optional filter
    search_kwargs = {"k": k}
    if filter_by:
        search_kwargs["filter"] = filter_by

    # Mapping each file extension to respective vector store
    filetype_to_vector_store = {
        ".pdf": vector_store_pdf,       # PDF --> Technology
        ".txt": vector_store_non_pdf,   # TEXT --> People
        ".html": vector_store_non_pdf,  # HTML --> Science
        ".json": vector_store_non_pdf,  # JSON --> Literature
    }

    # Mapping each topic to respective vector store
    topic_to_vector_store = {
        "technology": vector_store_pdf,
        "science": vector_store_non_pdf,
        "people": vector_store_non_pdf,
        "literature": vector_store_non_pdf,
    }

    # Determine vector store based on filename or topic
    if filename:
        ext = os.path.splitext(filename)[1].lower()
        vector_store = filetype_to_vector_store.get(ext, vector_store_non_pdf)  # fallback if unknown ext
    else:
        topic_normalized = topic.lower() if topic else ""
        vector_store = topic_to_vector_store.get(topic_normalized, vector_store_non_pdf)  # fallback

    retriever = vector_store.as_retriever(search_kwargs=search_kwargs)

    # Relevance check: query the retriever once up front so an empty result can
    # be reported without invoking the LLM. The QA chain below queries the
    # retriever again on its own.
    relevant_docs = retriever.invoke(query)

    # If no relevant documents are found in the vector store for some reason,
    # return a message to the user
    if not relevant_docs:
        return {"result": "Sorry, I couldn't find any documents related to your question. "
                          "Please try asking something else or check the document collection "
                          "to see which topics are likely covered."}

    # Build RetrievalQA chain
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=False,
        chain_type_kwargs={"prompt": dynamic_prompt}
    )

    # Execute query
    try:
        result = qa_chain.invoke(query)
        return result
    except Exception as e:
        print(f"Error during QA chain invoke: {e}")
        return {"result": "Sorry, I was unable to process your request due to resource constraints."}
    

## Convenience wrapper: run the retrieval pipeline above and hand back only the answer text
def ask_question(query: str, filename=None, topic=None, question_type=None):
    """Run `answer_question_with_retrievalqa` and return the value stored under "result".

    Args:
        query (str): The user's question.
        filename (optional): Forwarded unchanged as `filename`.
        topic (optional): Forwarded unchanged as `topic`.
        question_type (optional): Forwarded unchanged as `question_type`.

    Returns:
        The "result" entry of the dict returned by the pipeline.
    """
    response = answer_question_with_retrievalqa(
        query,
        filename=filename,
        topic=topic,
        question_type=question_type,
    )
    answer_text = response["result"]
    return answer_text


### Testing Various Queries

In [None]:
# Fully automatic run: no filename, topic or question type is supplied, so the
# pipeline invokes both the question-type and the topic classifier.
query = "What was Alan Turing's biggest contribution to mankind?"

result = ask_question(query)
print(result)

Question Type Detected: Factual
Inferred Topic: People
According to the context, Alan Turing is widely considered to be the father of theoretical computer science, and he formalised the concepts of algorithm and computation with the Turing machine.


In [11]:
# Restrict retrieval to a single file and state the question type explicitly;
# with both given, neither classifier is invoked.
query = "What were the results of the study involving mobile devices?"
file = "mobile_devices.pdf"
question_type = "Factual"

result = ask_question(query, filename = file, question_type=question_type)
print(result)

Question Type Provided: Factual
The picture that emerges from the analysis of the collected data regarding the advanced learners' use of mobile devices for learning English is relatively encouraging, and the majority of the interviewees acknowledged the positive impact of using mobile devices for English study.


In [13]:
# Topic and question type are both supplied by the caller; retrieval is
# filtered on the "topic" metadata and no classifier is invoked.
query = "How do artificial neural networks differ from traditional machine learning models?"
question_type = "Interpretive"
topic = "Technology"

result = ask_question(query, question_type = question_type, topic=topic)
print(result)

Question Type Provided: Interpretive
Artificial neural networks differ from traditional machine learning models in terms of training time and computational expense. Training a deep neural network can take several weeks, requiring significantly more computational power than traditional algorithms which typically take only minutes or hours to train. The amount of computational power needed also depends on the size of the data and the complexity of the network.


In [14]:
# Question type is given; the topic is left for the topic classifier to infer.
query = "Recommend me a great adventure book, and list all of the reviews found along with the author."
question_type = "Interpretive"

result = ask_question(query, question_type = question_type)
print(result)

Question Type Provided: Interpretive
Inferred Topic: Literature
Based on the provided context, I recommend *Life of Pi* by Yann Martel as a great adventure book. The positive review describes it as “inventive storytelling with deep philosophical themes.” Additionally, the negative review notes that the narrative style “may not appeal to everyone.”


In [15]:
# Question type is given; the topic is left for the topic classifier to infer.
# The query is built from adjacent string literals, so the separating space has
# to be written explicitly at the end of the first literal.
query = (
    "Tell me the following things about Nikola Tesla: where and when he was born, "
    "when he died, and his biggest achievement. "
    "Return the output as a bullet point list."
)
question_type = "Factual"

result = ask_question(query, question_type = question_type)
print(result)

Question Type Provided: Factual
Inferred Topic: People
Here’s the information about Nikola Tesla based solely on the provided context:

*   Nikola Tesla was born on 10 July 1856.
*   He died on 7 January 1943.
*   His biggest achievement was his contributions to the design of the modern alternating current (AC) electricity supply system.


In [16]:
# Fully automatic run: both the question type and the topic are classified.
query = "Briefly explain what 1:1 technology is and its impact on students."

result = ask_question(query)
print(result)

Question Type Detected: Interpretive
Inferred Topic: Technology
1:1 technology refers to the phenomenon of providing students with individual devices, like laptops, to enhance their learning. Its impact is that some school districts are adopting it at high rates to help students achieve at higher levels. However, the implementation of 1:1 technology is not yet universal, and it will take many years for all students to have access to it.


In [19]:
# Calls the pipeline function directly (instead of the ask_question helper), so
# `result` is the returned dict and the answer is read from its "result" key.
query = "What are some issues with the recent Congress energy bill?"
file = "climate_change.html"
question_type = "Interpretive"

result = answer_question_with_retrievalqa(query, filename=file, question_type=question_type)
print(result["result"])

Question Type Provided: Interpretive
The recent Congress energy bill has several issues. It fails to address how the country will make up the gap in electricity needs without more solar, wind, and storage, and it doesn’t consider how working families will manage higher energy bills. Furthermore, both versions of the bill eliminate federal support for wind and solar energy, leading to less energy on the grid and threatening jobs and investments across the country.


In [20]:
# Direct pipeline call with an explicit topic; the question type is not passed
# and is therefore classified by the pipeline.
query = "What are some things people have said about J.K. Rowling's books?"
topic = "Literature"

result = answer_question_with_retrievalqa(query, topic=topic)
print(result["result"])

Question Type Detected: Interpretive
Some people have said that J.K. Rowling’s books are exciting, magical, and wonderfully expand the Harry Potter universe. However, some fans felt they were darker and less suitable for younger readers.


In [21]:
# Out-of-scope question: the topic classifier is expected to answer "Other",
# which makes the pipeline return its out-of-scope message without retrieval.
query = "What is the best red wine to pair with a medium-rare steak?"
# Set explicitly so this cell does not depend on a `question_type` value left
# behind by whichever cell happened to run before it.
question_type = "Interpretive"

result = ask_question(query, question_type = question_type)
print(result)

Question Type Provided: Interpretive
Inferred Topic: Other
Your question falls under the 'Other' category, which is outside the scope of the available documents.


In [22]:
# Question type is given; the topic is inferred. Per the recorded output, the
# retrieved context did not contain an answer and the model said so.
query = "Explain what an IT analyst does in 1-2 sentences."
question_type = "Interpretive"

result = ask_question(query, question_type=question_type)
print(result)

Question Type Provided: Interpretive
Inferred Topic: Technology
The context does not provide information about what an IT analyst does.


## Summarization

In [10]:
## Initialize text splitter (adjust chunk size and overlap as needed)
# Summarization-specific sizes get their own names so the ingestion settings
# `chunk_size` / `chunk_overlap` from the configuration cell are not silently
# overwritten (re-running the ingestion cell afterwards would otherwise pick
# up these values).
summary_chunk_size = 800
summary_chunk_overlap = 100

# Rebinds the notebook-level `text_splitter`, which summarize_file reads.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=summary_chunk_size, chunk_overlap=summary_chunk_overlap)

## Prompt template for summarization of a document/file
summarization_prompt = PromptTemplate(
    input_variables=["text_chunk"],
    template="Summarize the following text concisely:\n\n{text_chunk}\n\nSummary:"
)

## Build summarization chain
# StrOutputParser mirrors the classifier chains and guarantees a plain string
# result, which summarize_file relies on when it calls .strip().
summarization_chain = summarization_prompt | llm | StrOutputParser()

def summarize_file(file_path: str) -> str:
    """Summarize a file by summarizing each chunk and then the combined summaries.

    The file is loaded and chunked with `load_and_split_file` using the
    notebook-level `text_splitter`; every non-blank chunk is summarized with
    `summarization_chain`, and the chunk summaries are then condensed into one
    final summary.

    Args:
        file_path (str): Path of the file to summarize.

    Returns:
        str: The final summary with surrounding whitespace removed. An empty
        string if the file produced no non-blank chunks. If there is exactly
        one chunk summary it is returned as-is, without a second LLM pass.
    """
    # Load and split file into chunks using existing function from before
    chunks = load_and_split_file(file_path, text_splitter)

    # Summarize each chunk individually, skipping blank chunks so the model is
    # never asked to summarize nothing.
    summaries = []
    for chunk in chunks:
        if not chunk.page_content.strip():
            continue
        summary = summarization_chain.invoke({"text_chunk": chunk.page_content})
        summaries.append(summary.strip())

    if not summaries:
        return ""
    if len(summaries) == 1:
        # A single chunk summary already is the document summary.
        return summaries[0]

    # Combine all chunk summaries into one final summary
    combined_summary_text = " ".join(summaries)
    final_summary = summarization_chain.invoke({"text_chunk": combined_summary_text})

    return final_summary.strip()


In [11]:
## Sample test using an HTML article in the documents folder
# The path is relative to the notebook's working directory. The call runs one
# LLM request per chunk plus a final combining request, so it can take a while.
file_path = "./documents/climate_change.html"
summary = summarize_file(file_path)


<function summarize_file at 0x0000022628CBC360>


In [12]:
# Show the final summary produced by summarize_file in the previous cell.
print(summary)

Here’s a concise summary of the text:

The current US budget bill fails to address rising electricity demand, relying too heavily on fossil fuels. This, coupled with limited component supplies and proposed cuts to clean energy incentives, threatens higher energy costs, potential blackouts, and hinders the growth of renewable energy sources. **Action is needed to preserve clean energy incentives.**
