In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings

In [2]:
PDF_PATH = r"D:\Projects\debyez\data\raw\TH Delhi 07-12-2025.pdf"

In [3]:
#loading pdf
documents = []
loader  = PyPDFLoader(PDF_PATH)
docs = loader.load()
documents.extend(docs) #if we have multiple pdfs
print(f"Loaded {len(documents)} pages from PDF")

Loaded 20 pages from PDF


In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Split documents into smaller chunks for better retrieval
# chunk_size: Maximum size of each chunk (in characters)
# chunk_overlap: Overlap between chunks to maintain context
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512, 
    chunk_overlap=100
)
splits = text_splitter.split_documents(documents)
print(f"Splitt into {len(splits)} chunks")

Splitt into 546 chunks


In [5]:
#model to do embeddings
#selected the embedding model BAAI/bge-base-en-v1.5
# from sentence_transformers import SentenceTransformer
# model = SentenceTransformer("BAAI/bge-base-en-v1.5")
embeddings = HuggingFaceEmbeddings(model_name = "BAAI/bge-base-en-v1.5")
print("Embedding models loaded")

Embedding models loaded


In [6]:
import weaviate
with weaviate.connect_to_local() as client: 
    print(client.is_ready())
client.close()

True


In [7]:
print(client)

<weaviate.client.WeaviateClient object at 0x0000017127757F90>


In [8]:
from langchain_weaviate import WeaviateVectorStore
client.connect()
db = WeaviateVectorStore.from_documents(splits, embeddings, client=client, index_name="Newspaperchunks")

In [9]:
print(db)

<langchain_weaviate.vectorstores.WeaviateVectorStore object at 0x0000017175C7D9D0>


In [10]:
client.collections.exists("Newspaperchunks")# checking if table exists

True

In [11]:
collections = client.collections.list_all()
for c in collections:
    print(c)# listing all tables in the db

Newspaperchunks


In [12]:
collections = client.collections.get("Newspaperchunks")# handle to the table we can use to do query and modify data

In [13]:
response = collections.aggregate.over_all(total_count=True)
print(response) # number of rows that exist in the db

AggregateReturn(properties={}, total_count=1638)


In [14]:
collections.query.fetch_objects(limit = 10)#like fetching the first 10 rows

QueryReturn(objects=[Object(uuid=_WeaviateUUIDInt('008992fa-7b61-460f-a185-36d1e30c79f1'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'page': 13.0, 'creator': 'pdf-lib (https://github.com/Hopding/pdf-lib)', 'source': 'D:\\Projects\\debyez\\data\\raw\\TH Delhi 07-12-2025.pdf', 'total_pages': 20.0, 'producer': 'pdf-lib (https://github.com/Hopding/pdf-lib); modified using iText¬Æ 5.5.13 ¬©2000-2018 iText Group NV (AGPL-version)', 'moddate': datetime.datetime(2025, 12, 7, 7, 13, 45, tzinfo=datetime.timezone(datetime.timedelta(seconds=19800))), 'page_label': '14', 'text': 'this data oÔ¨Äers little comfort for the future.\nIndia‚Äôs manufacturing Purchasing Managers‚Äô\nIndex (PMI) has fallen to a nine-month low, and\nthe new export orders sub-index has slipped to a\n13-month low, suggesting the worst of the tariÔ¨Ä\npain may be yet to come.\nImports surg

In [15]:
import pandas as pd

# Get the collection we have collections handle
# Fetch rows
response = collections.query.fetch_objects(limit=50)

rows = []

for obj in response.objects:
    row = {"id": obj.uuid, **obj.properties,}        # text, source, page, etc each  propertie becomes a column
    rows.append(row)
# Convert to DataFrame
df = pd.DataFrame(rows)

df


Unnamed: 0,id,page,creator,source,total_pages,producer,moddate,text,page_label,creationdate
0,008992fa-7b61-460f-a185-36d1e30c79f1,13.0,pdf-lib (https://github.com/Hopding/pdf-lib),D:\Projects\debyez\data\raw\TH Delhi 07-12-202...,20.0,pdf-lib (https://github.com/Hopding/pdf-lib); ...,2025-12-07 07:13:45+05:30,this data oÔ¨Äers little comfort for the future....,14,2025-12-06 23:52:45+00:00
1,00dc6fc0-b283-4547-861a-0be370c02605,18.0,pdf-lib (https://github.com/Hopding/pdf-lib),D:\Projects\debyez\data\raw\TH Delhi 07-12-202...,20.0,pdf-lib (https://github.com/Hopding/pdf-lib); ...,2025-12-07 07:13:45+05:30,ravaged by the catastrophe. \n(Text by Meera S...,19,2025-12-06 23:52:45+00:00
2,00ddf441-8e6b-41d6-9fa4-a495c830316e,1.0,pdf-lib (https://github.com/Hopding/pdf-lib),D:\Projects\debyez\data\raw\TH Delhi 07-12-202...,20.0,pdf-lib (https://github.com/Hopding/pdf-lib); ...,2025-12-07 07:13:45+05:30,Road.\nAnother message issued a simi-\nlar thr...,2,2025-12-06 23:52:45+00:00
3,010429b6-5728-45fe-9e48-43bda50a55dd,7.0,pdf-lib (https://github.com/Hopding/pdf-lib),D:\Projects\debyez\data\raw\TH Delhi 07-12-202...,20.0,pdf-lib (https://github.com/Hopding/pdf-lib); ...,2025-12-07 07:13:45+05:30,are a referendum on the\nState government.\nFo...,8,2025-12-06 23:52:45+00:00
4,0123b0d9-d701-44c3-be0e-30524ce8dea0,13.0,pdf-lib (https://github.com/Hopding/pdf-lib),D:\Projects\debyez\data\raw\TH Delhi 07-12-202...,20.0,pdf-lib (https://github.com/Hopding/pdf-lib); ...,2025-12-07 07:13:45+05:30,"Moscow, remained silent on the hugs and\nbonho...",14,2025-12-06 23:52:45+00:00
5,016fbd03-f367-435b-8ade-67ab1ec4421d,8.0,pdf-lib (https://github.com/Hopding/pdf-lib),D:\Projects\debyez\data\raw\TH Delhi 07-12-202...,20.0,pdf-lib (https://github.com/Hopding/pdf-lib); ...,2025-12-07 07:13:45+05:30,incident. ‚ÄúSecurity tight-\nened across the to...,9,2025-12-06 23:52:45+00:00
6,017159a8-c795-403f-806a-836e8d2f18e6,7.0,pdf-lib (https://github.com/Hopding/pdf-lib),D:\Projects\debyez\data\raw\TH Delhi 07-12-202...,20.0,pdf-lib (https://github.com/Hopding/pdf-lib); ...,2025-12-07 07:13:45+05:30,mit aimed at charting a\nroadmap for achieving...,8,2025-12-06 23:52:45+00:00
7,01b4e5c4-6c9e-4edf-9fc9-c42e8d7260ff,13.0,pdf-lib (https://github.com/Hopding/pdf-lib),D:\Projects\debyez\data\raw\TH Delhi 07-12-202...,20.0,pdf-lib (https://github.com/Hopding/pdf-lib); ...,2025-12-07 07:13:45+05:30,people then have time to adjust... to renegoti...,14,2025-12-06 23:52:45+00:00
8,01d5ade1-4bcb-49d5-a5df-298888d4c19c,3.0,pdf-lib (https://github.com/Hopding/pdf-lib),D:\Projects\debyez\data\raw\TH Delhi 07-12-202...,20.0,pdf-lib (https://github.com/Hopding/pdf-lib); ...,2025-12-07 07:13:45+05:30,"ing sought in the SIR, ‚Äù Mri-\ntyunjoy Mallick...",4,2025-12-06 23:52:45+00:00
9,01f048bb-93f1-4173-9461-cb1c33816aa8,14.0,pdf-lib (https://github.com/Hopding/pdf-lib),D:\Projects\debyez\data\raw\TH Delhi 07-12-202...,20.0,pdf-lib (https://github.com/Hopding/pdf-lib); ...,2025-12-07 07:13:45+05:30,"Altman, who previously exulted in his\n/f_irst...",15,2025-12-06 23:52:45+00:00


In [16]:
#retrival phase
query = "Give me news related to Indigo and airline industry?"
query_vector = embeddings.embed_query(query) #embedding the query

In [17]:
# print(query_vector)

In [18]:
response = collections.query.near_vector(near_vector = query_vector, limit = 5, 
                                        return_metadata =["distance", "score"] )

In [19]:
response

<weaviate.collections.classes.internal.GenerativeReturn at 0x171278bc190>

In [20]:
rows = []
for obj in response.objects:
    row = {
        "uuid": obj.uuid, "distance": obj.metadata.distance, **obj.properties #lower the distance the better
    }
    rows.append(row)
df = pd.DataFrame(rows)

df # the responses we got from the db

Unnamed: 0,uuid,distance,page,creator,source,total_pages,producer,moddate,text,page_label,creationdate
0,60410cd8-dadc-4da5-a64e-d043d17b18f0,0.287088,0.0,pdf-lib (https://github.com/Hopding/pdf-lib),D:\Projects\debyez\data\raw\TH Delhi 07-12-202...,20.0,pdf-lib (https://github.com/Hopding/pdf-lib); ...,2025-12-07 07:13:45+05:30,compared with 700 on Fri-\nday. Normalcy is ex...,1,2025-12-06 23:52:45+00:00
1,26cb80dd-c5c8-4abb-89c6-73d4eee4e30c,0.287088,0.0,pdf-lib (https://github.com/Hopding/pdf-lib),D:\Projects\debyez\data\raw\TH Delhi 07-12-202...,20.0,pdf-lib (https://github.com/Hopding/pdf-lib); ...,2025-12-07 07:13:45+05:30,compared with 700 on Fri-\nday. Normalcy is ex...,1,2025-12-06 23:52:45+00:00
2,91f089e0-b53d-4f89-90aa-64a5e349bd5a,0.287088,0.0,pdf-lib (https://github.com/Hopding/pdf-lib),D:\Projects\debyez\data\raw\TH Delhi 07-12-202...,20.0,pdf-lib (https://github.com/Hopding/pdf-lib); ...,2025-12-07 07:13:45+05:30,compared with 700 on Fri-\nday. Normalcy is ex...,1,2025-12-06 23:52:45+00:00
3,fdaaa309-62a4-4b1d-93c9-8feda7f9cc7c,0.292311,0.0,pdf-lib (https://github.com/Hopding/pdf-lib),D:\Projects\debyez\data\raw\TH Delhi 07-12-202...,20.0,pdf-lib (https://github.com/Hopding/pdf-lib); ...,2025-12-07 07:13:45+05:30,Vol. 15 /L50539No. 49\nRegd. DL(ND)-11/6110/20...,1,2025-12-06 23:52:45+00:00
4,e0910367-cebd-4f7f-9552-1df3f2d5d519,0.292311,0.0,pdf-lib (https://github.com/Hopding/pdf-lib),D:\Projects\debyez\data\raw\TH Delhi 07-12-202...,20.0,pdf-lib (https://github.com/Hopding/pdf-lib); ...,2025-12-07 07:13:45+05:30,Vol. 15 /L50539No. 49\nRegd. DL(ND)-11/6110/20...,1,2025-12-06 23:52:45+00:00


In [21]:
# for val in client.collections.list_all():
#     client.collections.delete(val)

In [22]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
#building a query system with history
context_q_sys_prompt = (
    """Given a chat history and the latest user question 
    which might reference context in the chat history, formulate a standalone 
    question which can be understood without the chat history. Do NOT answer the 
    question, just reformulate if required and otherwise return as is."""
)
context_prompt = ChatPromptTemplate.from_messages([
    ("system", context_q_sys_prompt), MessagesPlaceholder("chat_history"), ("human", "{input}")])
#we include the system prompt whcih takes the earlier context and converts the human input into a meaningful question based
#on the earlier questions by user. 

In [23]:
from langchain_classic.chains import create_history_aware_retriever, create_retrieval_chain

In [24]:
from langchain_ollama import OllamaLLM
llm = OllamaLLM(model="mistral", temperature=0,) # initialize mistral as llm

In [25]:
retriever = db.as_retriever() # our retriever

In [26]:
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, context_prompt
)   # running vector retrieval on the rewritten query.

In [27]:
#now we take the retrieved content and generate the answer
sys_prompt = (
    """You are an assistant for question-answering-tasks.
    Use the following peices of retrieved context to answer the question.
    If you don't know the answer, say that you don't know. Keep the answer concise.\n\n {context}"""
)
qa_prompt = ChatPromptTemplate.from_messages([
    ("system", sys_prompt), MessagesPlaceholder("chat_history"), ("human", "{input}")])
    

In [28]:
# Import all required libraries
from langchain_classic.chains import create_history_aware_retriever, create_retrieval_chain
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables.history import RunnableWithMessageHistory

In [29]:
# Dictionary to store chat histories for different sessions
# In Jupyter, this persists for the entire notebook session
store = {}

def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """
    Retrieves or creates a chat history for a given session.
    
    Args:
        session_id: Unique identifier for the conversation session
    
    Returns:
        ChatMessageHistory object containing the conversation history
    """
    if session_id not in store:
        store[session_id] = ChatMessageHistory()
    return store[session_id]

print("Session history management setup complete")

Session history management setup complete


In [30]:

# Create the question-answer chain
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
print("QA chain created")

QA chain created


In [31]:
# Combine retrieval and QA into a single chain
rag_chain = create_retrieval_chain(
    history_aware_retriever, 
    question_answer_chain
)

In [32]:
def ask_question(question, session_id="default_session"):
    """
    Ask a question to the RAG chatbot.
    
    Args:
        question: Your question as a string
        session_id: Session identifier (use same ID to maintain conversation context)
    
    Returns:
        Dictionary containing the answer and other metadata
    """
    # Invoke the conversational RAG chain
    response = conv_rag_chain.invoke(
        {"input": question},
        config={"configurable": {"session_id": session_id}},
    )
    
    # Display the answer
    print(f"\nüìù Question: {question}")
    print(f"üí° Answer: {response['answer']}\n")
    
    # Optionally show the chat history
    session_history = get_session_history(session_id)
    print(f"üìö Total messages in history: {len(session_history.messages)}")
    
    return response

In [33]:
# Wrap the chain with message history management
conv_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",  # Fixed: was "chathistory"
    output_messages_key="answer"
)
print("Complete conversational RAG chain ready")
print("\n" + "="*60)
print("Setup complete! You can now ask questions.")
print("="*60 + "\n")

Complete conversational RAG chain ready

Setup complete! You can now ask questions.



In [34]:
#install ollama and download mistral

In [35]:
# Now you can ask questions! Run this cell multiple times with different questions

#ask_question("What is the name of the newspaper")

#ask_question("Is there any info on the stock market?")

#ask_question("Summarize the editorial for me.")

#ask_question("Any news about the state of Kerala?")

#Checking context
ask_question("What was my last question?")


ResponseError: llama runner process has terminated: CUDA error (status code: 500)

In [None]:
# Run this cell to start a fresh conversation
def reset_conversation(session_id="default_session"):
    """Clear the chat history for a session."""
    if session_id in store:
        del store[session_id]
        print(f"‚úì Conversation reset for session: {session_id}")
    else:
        print(f"No conversation found for session: {session_id}")


reset_conversation()