In [1]:
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment before the
# cells below read them with os.getenv. override=True is passed so that, per
# python-dotenv's documented behaviour, .env values win over variables that are
# already set in the environment.
load_dotenv(override=True)

True

In [2]:
import warnings
# Suppress every warning for the remainder of the kernel session.
# NOTE(review): this blanket filter also hides deprecation and numeric
# (e.g. divide-by-zero) warnings raised by later cells; consider narrowing it
# to the specific categories/modules that are known to be noisy.
warnings.filterwarnings("ignore")

In [3]:
import numpy as np
import pandas as pd

In [4]:
import os
from langchain_openai import AzureChatOpenAI

# Connection settings are read from the environment; os.getenv yields None for
# any variable that is not set, and that None is passed through unchanged.
azure_settings = {
    "azure_endpoint": os.getenv("AZURE_OPENAI_ENDPOINT"),
    "api_key": os.getenv("AZURE_OPENAI_API_KEY"),
    "api_version": os.getenv("OPENAI_API_VERSION"),
    "deployment_name": os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),
}

# Chat model handed to the agent further down.
azure_model = AzureChatOpenAI(**azure_settings)

In [5]:
from langchain_huggingface import HuggingFaceEmbeddings

# Checkpoint used for every embedding in this notebook (vector store and
# ad-hoc query embeddings alike).
EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"

embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

2026-01-12 10:28:33.479212: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
from qdrant_client.models import Distance, VectorParams
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient

# Single name shared by the raw client calls and the LangChain wrapper.
COLLECTION_NAME = "test"

client = QdrantClient(":memory:")

# Embed a throwaway string so the collection's vector size always matches the
# embedding model actually in use.
vector_size = len(embeddings.embed_query("sample text"))

# Create the collection only on first run so re-executing the cell does not fail.
collection_missing = not client.collection_exists(COLLECTION_NAME)
if collection_missing:
    cosine_vectors = VectorParams(size=vector_size, distance=Distance.COSINE)
    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=cosine_vectors,
    )

vector_store = QdrantVectorStore(
    client=client,
    collection_name=COLLECTION_NAME,
    embedding=embeddings,
)

In [7]:
from langchain_community.document_loaders import PyPDFLoader

# Directory scanned for source documents.
PDF_DIR = "PDFs"

# Page-level documents from every PDF found in PDF_DIR.
all_docs = []
# os.listdir returns entries in arbitrary order; sorting keeps the ingestion
# order (and therefore chunk order downstream) reproducible between runs.
for file in sorted(os.listdir(PDF_DIR)):
    # Compare the extension case-insensitively so "Report.PDF" is not skipped.
    if file.lower().endswith(".pdf"):
        loader = PyPDFLoader(os.path.join(PDF_DIR, file))
        docs = loader.load()
        all_docs.extend(docs)

Multiple definitions in dictionary at byte 0xbbee for key /x1098
Multiple definitions in dictionary at byte 0xbbfe for key /x1099
Multiple definitions in dictionary at byte 0xbc0e for key /x1098
Multiple definitions in dictionary at byte 0xbc1e for key /x1099
Multiple definitions in dictionary at byte 0xbc2e for key /x1098
Multiple definitions in dictionary at byte 0xbc3e for key /x1099


In [8]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking parameters, in characters.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

# add_start_index=True is requested so each chunk's metadata carries a
# "start_index" entry alongside the loader's own metadata.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    add_start_index=True,
)

all_splits = text_splitter.split_documents(all_docs)
split_count = len(all_splits)
print(f"Split into {split_count} sub-documents.")

Split into 2312 sub-documents.


In [9]:
# Embed every chunk and store it in the vector store; the returned ids are kept
# so individual entries can be referenced later.
# NOTE(review): re-running this cell without recreating the collection appears
# to add the same chunks again under new ids - confirm before re-executing.
document_ids = vector_store.add_documents(all_splits)

# Show a small sample of ids as a sanity check.
preview_ids = document_ids[:3]
print(preview_ids)

['ffc75a0f3b69473a8f6673d9f7f524eb', 'cac66999420d48c4877ee61a01e67b50', '3d9780b0ef28417ea9dacc5b8cecf0e5']


In [49]:
def score_sheet(user_input, to_print=True, k=1000):
    """
    Rank every source PDF by its best-matching chunk for ``user_input``.

    Must be called with the user input on every query, because the scores are
    specific to that query.

    Args:
        user_input: Query text searched against the module-level ``vector_store``.
        to_print: When True (default), print the resulting table.
        k: Maximum number of chunks requested from the vector store. A source
            whose chunks all fall outside the top ``k`` is absent from the
            result, so raise this if PDFs go missing from the sheet.

    Returns:
        pandas.DataFrame with one row per source, best score first. Column
        ``'1'`` holds the chunk's ``source`` metadata value and column ``'2'``
        the highest score among that source's retrieved chunks, as a float.
        The frame is empty (same columns) when the search returns nothing.
    """
    results = vector_store.similarity_search_with_score(user_input, k=k)

    best_by_source = {}
    for doc, score in results:
        src = doc.metadata["source"]
        # Seed with the first score seen instead of 0 so that negative
        # similarities are reported as-is rather than clamped to zero.
        if src not in best_by_source or score > best_by_source[src]:
            best_by_source[src] = float(score)

    # Build the frame from (source, score) pairs so the score column keeps a
    # numeric dtype; stacking both into a single ndarray would turn every
    # score into a string.
    df = pd.DataFrame(list(best_by_source.items()), columns=['1', '2'])

    # Callers take the leading rows as "the best PDFs", so make the ordering
    # explicit instead of relying on the order the backend returned hits in.
    df = df.sort_values('2', ascending=False, kind="stable").reset_index(drop=True)

    if to_print:
        print(df)
    return df

In [11]:
def cosine_similarity(vec1, vec2):
    """
    Return the cosine similarity between two equal-length 1-D vectors.

    Args:
        vec1: First vector (any sequence accepted by ``np.dot``).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``, or ``0.0`` when either
        vector has zero norm (the ratio is undefined there and would
        otherwise evaluate to NaN).
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # A zero vector has no direction; report "no similarity" instead of 0/0.
    if norm_product == 0:
        return 0.0
    return np.dot(vec1, vec2) / norm_product

In [56]:
def reranking(user_input, k=2):
    """
    Find the single best-matching chunk among the top-``k`` PDFs for a query.

    The PDFs are chosen from the leading rows of ``score_sheet(user_input)``,
    reloaded from disk, re-split, and every chunk is compared with the query
    by cosine similarity.

    Args:
        user_input: Query text.
        k: Number of leading rows of the score sheet whose PDFs are re-read.

    Returns:
        Tuple ``(best_similarity, best_chunk_text)``.

    Raises:
        ValueError: If the selected PDFs yield no text chunks.
    """
    df = score_sheet(user_input)
    top_sources = df.iloc[:k, 0].values

    reranked_docs = []
    for source_path in top_sources:
        reranked_docs.extend(PyPDFLoader(source_path).load())

    # Use a splitter owned by this function so the result does not depend on
    # whatever module-level splitter happens to be defined in the kernel.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, add_start_index=True)
    reranked_splits = splitter.split_documents(reranked_docs)
    if not reranked_splits:
        # Fail with a readable message instead of numpy's empty-reduction error.
        raise ValueError("No text chunks were produced from the top-ranked PDFs.")

    query_vec = embeddings.embed_query(user_input)
    # Embed all chunks in one batched call rather than one model call per chunk.
    chunk_vecs = embeddings.embed_documents([split.page_content for split in reranked_splits])

    similarity_value = np.array([cosine_similarity(query_vec, vec) for vec in chunk_vecs])
    best_index = int(np.argmax(similarity_value))
    return similarity_value[best_index], reranked_splits[best_index].page_content

In [59]:
from langchain.tools import tool
# Memoized tool results, keyed by the stripped, lower-cased query text.
# Re-running this cell empties the cache.
retrieval_cache = {}

@tool(response_format="content_and_artifact")
def retrieve_context(query: str):
    """Retrieve information to help answer a query."""
    # The docstring above is deliberately left as-is: the tool decorator is
    # expected to expose it as the tool description - confirm before rewording.
    #
    # Returns a (content, artifact) pair: the serialized text of the two
    # closest chunks, and the chunk documents themselves.
    cache_key = query.strip().lower()

    if cache_key not in retrieval_cache:
        nearest_docs = vector_store.similarity_search(query, k=2)

        blocks = [
            f"Source: {doc.metadata}\nContent: {doc.page_content}"
            for doc in nearest_docs
        ]
        context_text = "\n\n".join(blocks)

        # Called for its printed per-PDF score table; the return value is unused.
        score_sheet(query)

        retrieval_cache[cache_key] = (context_text, nearest_docs)
        return retrieval_cache[cache_key]

    print("Calling from cache...")
    return retrieval_cache[cache_key]

In [15]:
# retrieve_context is a tool object, not a plain function, so calling it
# directly does not hand back the (content, artifact) tuple. Invoking it with a
# ToolCall-shaped dict returns a ToolMessage that carries both parts.
smoke_test_call = {
    "type": "tool_call",
    "name": "retrieve_context",
    "args": {"query": "What is a dog?"},
    "id": "manual_smoke_test",
}
tool_message = retrieve_context.invoke(smoke_test_call)
serialized, retrieved_docs = tool_message.content, tool_message.artifact

                                                    1                    2
0                            PDFs/Dog - Wikipedia.pdf    0.590761826641792
1                         PDFs/Animal - Wikipedia.pdf  0.38099446675119253
2        PDFs/Artificial intelligence - Wikipedia.pdf   0.2858703229333798
3                        PDFs/Chatbot - Wikipedia.pdf  0.25428517534330786
4               PDFs/Machine learning - Wikipedia.pdf  0.23769084409651214
5                      PDFs/LangChain - Wikipedia.pdf   0.2274744174967486
6   PDFs/Generative artificial intelligence - Wiki...   0.2210317352859998
7                  PDFs/Search engine - Wikipedia.pdf  0.21752973444491774
8           PDFs/Large language model - Wikipedia.pdf   0.2165383670288706
9            PDFs/Supervised learning - Wikipedia.pdf   0.1878379323025521
10                 PDFs/Heat transfer - Wikipedia.pdf  0.15452958853763915
11              PDFs/Search algorithm - Wikipedia.pdf  0.15090143233093228
12  PDFs/Retrieval-augmen

In [46]:
from langchain_core.globals import set_llm_cache
from langchain.agents import create_agent
from langgraph.checkpoint.memory import InMemorySaver
from langchain_core.caches import InMemoryCache

# Cache LLM responses in memory for the lifetime of the kernel.
set_llm_cache(InMemoryCache())

tools = [retrieve_context]

# Adjacent string literals are concatenated verbatim, so the first fragment
# ends with a space to keep the two sentences from running together.
prompt = (
    "You have access to a tool that retrieves context from PDF Documents. "
    "Use the tool to help answer user queries. Mention to the user when you are using cached data."
)

agent = create_agent(azure_model, tools, system_prompt=prompt)

In [47]:
# Interactive chat loop.
#
# The agent is invoked statelessly here, so the running message list of the
# current conversation is kept in `history` and re-sent on every turn; typing
# 'n' starts a fresh conversation by clearing it. (A checkpointer has to be
# attached when the agent is built and addressed through a thread id in the
# run config - passing one to stream() per call does not give the agent memory.)
history = []
while True:
    user_input = input("Enter Prompt (type 'q' to quit, type 'n' for new conversation): ").strip()

    # Ignore blank submissions instead of sending an empty message to the model.
    if not user_input:
        continue

    command = user_input.lower()

    if command in ("q", "quit"):
        print("Exiting PDF analyst.")
        break

    if command in ("n", "new"):
        print("What can I do for you?")
        history = []
        continue

    final_state = None
    for event in agent.stream(
        {"messages": history + [{"role": "user", "content": user_input}]},
        stream_mode="values",
    ):
        # With stream_mode="values" each event is the full state so far;
        # print only the newest message.
        event["messages"][-1].pretty_print()
        final_state = event

    # Carry the complete exchange (user, tool and assistant messages) forward.
    if final_state is not None:
        history = list(final_state["messages"])

Enter Prompt (type 'q' to quit, type 'n' for new conversation):  what is net zero?



what is net zero?

"Net zero" refers to balancing the amount of greenhouse gases emitted into the atmosphere with an equivalent amount of emissions removed or offset, resulting in a net zero increase in atmospheric greenhouse gases. The goal is to reduce carbon emissions to as close to zero as possible and offset any remaining emissions through measures like carbon capture or reforestation. This concept is central to efforts to combat climate change by limiting global temperature rise. 

Would you like more detailed information or specific examples?


Enter Prompt (type 'q' to quit, type 'n' for new conversation):  what is langchain?



what is langchain?
Tool Calls:
  retrieve_context (call_4H8bQQ5bCM4klN9s3PJweJss)
 Call ID: call_4H8bQQ5bCM4klN9s3PJweJss
  Args:
    query: LangChain
                                                    1                    2
0                      PDFs/LangChain - Wikipedia.pdf   0.7271866299076896
1           PDFs/Large language model - Wikipedia.pdf   0.4358545416032088
2   PDFs/Generative artificial intelligence - Wiki...  0.36982122712988863
3               PDFs/Machine learning - Wikipedia.pdf   0.3412312466703556
4                        PDFs/Chatbot - Wikipedia.pdf  0.30900071473973856
5        PDFs/Artificial intelligence - Wikipedia.pdf  0.30751075532570876
6   PDFs/Retrieval-augmented generation - Wikipedi...   0.2959433762944227
7                  PDFs/Search engine - Wikipedia.pdf  0.28413981862948656
8                         PDFs/Animal - Wikipedia.pdf  0.24081434495642673
9                            PDFs/Dog - Wikipedia.pdf  0.23216899066468424
10               PDFs/A

Enter Prompt (type 'q' to quit, type 'n' for new conversation):  what is langchain?



what is langchain?
Tool Calls:
  retrieve_context (call_ULzbw1Kehyi1q9y3tE6aYYJB)
 Call ID: call_ULzbw1Kehyi1q9y3tE6aYYJB
  Args:
    query: langchain
Calling from cache...
Name: retrieve_context

Source: {'producer': 'cairo 1.18.0 (https://cairographics.org)', 'creator': 'Mozilla Firefox 146.0.1', 'creationdate': '2026-01-09T10:52:21+05:30', 'source': 'PDFs/LangChain - Wikipedia.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1', 'start_index': 0, '_id': '5fb91b7b39574b39b83de9ff99f179cd', '_collection_name': 'test'}
Content: LangChain
DeveloperHarrison Chase
Initial releaseOctober 2022
Stable release0.1.16 [1] / 11 April 2024
Repository github.com/langchain-ai/
langchain (https://githu
b.com/langchain-ai/lang
chain)
Written in Python and JavaScript
Type Software framework for
large language model
application development
License MIT License
Website LangChain.com (https://l
angchain.com/)
LangChain
Free and open-
source software
portal
LangChain is  a  software  framework  that  hel

Enter Prompt (type 'q' to quit, type 'n' for new conversation):  q


Exiting PDF analyst.
