In [1]:
from dotenv import load_dotenv
load_dotenv(override=True)

True

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import numpy as np
import pandas as pd

In [4]:
import os
from langchain_openai import AzureChatOpenAI

azure_model = AzureChatOpenAI(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version=os.getenv("OPENAI_API_VERSION"),
    deployment_name=os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),
)

In [5]:
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

2026-01-12 12:29:31.374591: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
from qdrant_client.models import Distance, VectorParams
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient

client = QdrantClient(":memory:")
vector_size = len(embeddings.embed_query("sample text"))

if not client.collection_exists("test"):
    client.create_collection(
        collection_name="test",
        vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE)
    )
vector_store = QdrantVectorStore(
    client=client,
    collection_name="test",
    embedding=embeddings,
)

In [7]:
from langchain_community.document_loaders import PyPDFLoader

all_docs = []
for file in os.listdir("PDFs"):
    if file.endswith(".pdf"):
        loader = PyPDFLoader(os.path.join("PDFs", file))
        docs = loader.load()
        all_docs.extend(docs)

Multiple definitions in dictionary at byte 0xbbee for key /x1098
Multiple definitions in dictionary at byte 0xbbfe for key /x1099
Multiple definitions in dictionary at byte 0xbc0e for key /x1098
Multiple definitions in dictionary at byte 0xbc1e for key /x1099
Multiple definitions in dictionary at byte 0xbc2e for key /x1098
Multiple definitions in dictionary at byte 0xbc3e for key /x1099


In [8]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,
)

all_splits = text_splitter.split_documents(all_docs)
print(f"Split into {len(all_splits)} sub-documents.")

Split into 2312 sub-documents.


In [9]:
document_ids = vector_store.add_documents(documents=all_splits)
print(document_ids[:3])

['3163909f315e48c0a4022d15c0e2c31a', 'f125d5189ba6414691bbc47720dd497c', 'd39a72413df3417984ab20aa5006341a']


In [10]:
def score_sheet (user_input, to_print=True):
    """
    Must be called alongside user input every time. Returns scores per pdf.
    """
    results = vector_store.similarity_search_with_score(user_input, k=1000)
    
    pdf_best_scores = {}
    for doc, score in results:
        src = doc.metadata["source"]
        pdf_best_scores[src] = max(pdf_best_scores.get(src, 0), score)
    
    files = np.array(list(pdf_best_scores.keys()))
    scores = np.array(list(pdf_best_scores.values()))
    df = pd.DataFrame(np.concatenate((files.reshape(-1,1),scores.reshape(-1,1)), axis=1), columns=['1','2'])

    if to_print:
        print(df)
    return df

In [11]:
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

In [12]:
def reranking(user_input, k=3):
    df = score_sheet(user_input, to_print=False)
    df_rerank = df.iloc[:k, 0].values

    reranked_docs = []
    for doc_path in df_rerank:
        loader = PyPDFLoader(doc_path)
        docs = loader.load()
        reranked_docs.extend(docs)

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, add_start_index=True)
    reranked_splits = text_splitter.split_documents(reranked_docs)

    query_vec = embeddings.embed_query(user_input)
    similarity_values = []
    for chunk in reranked_splits:
        vec = embeddings.embed_query(chunk.page_content)
        similarity_values.append(cosine_similarity(query_vec, vec))

    similarity_values = np.sort(np.array(similarity_values))

    best_docs = []
    best_idx = np.array(similarity_values[:k], dtype=int)
    for idx in best_idx:
        best_docs.append(reranked_splits[idx])
    return best_docs

reranking("What is a dog?")

In [16]:
query = "this is a dog?"
query1 = "is this a dog"
query_embedding = embeddings.embed_query(query)
query_embedding1 = embeddings.embed_query(query1)

cosine_similarity(query_embedding, query_embedding1)

0.9066656164931488

In [17]:
query = "what is langchain"
query1 = "what do you mean by langchain"
query_embedding = embeddings.embed_query(query)
query_embedding1 = embeddings.embed_query(query1)

cosine_similarity(query_embedding, query_embedding1)

0.9385016826056514

In [45]:
from langchain.tools import tool
retrieval_cache = {}

@tool(response_format="content_and_artifact")
def retrieve_context(query: str):
    """Retrieve information to help answer a query."""

    query_embedding = embeddings.embed_query(query)
    for entry in retrieval_cache:
        query_matching = cosine_similarity(query_embedding, embeddings.embed_query(entry))
        if query_matching > 0.87:
            print("Information found in cache...")
            return retrieval_cache[entry]
    
    retrieved_docs = vector_store.similarity_search(query, k=2)
    df1 = score_sheet(query, to_print=False)

    if float(df1.iloc[0,1]) < 0.4:
        print("Reranking ...")
        retrieved_docs = reranking(query)
        
    serialized = "\n\n".join(
        f"Source: {doc.metadata}\nContent: {doc.page_content}"
        for doc in retrieved_docs
    )

    score_sheet(query)
    
    result = (serialized, retrieved_docs)
    retrieval_cache[query] = result
    return result

In [47]:
from langchain_core.globals import set_llm_cache
from langchain.agents import create_agent
from langgraph.checkpoint.memory import InMemorySaver
from langchain_core.caches import InMemoryCache

set_llm_cache(InMemoryCache())

tools = [retrieve_context]
prompt = (
    "You have access to a tool that retrieves context from PDF Documents."
    "Use the tool to help answer user queries. Mention to the user when you are using cached data."
)

agent = create_agent(azure_model, tools, system_prompt=prompt)

In [50]:
i='1'
while True:
    user_input = input("Enter Prompt (type 'q' to quit, type 'n' for new conversation): ").strip()
    
    if user_input.lower() in ["q", "quit"]:
        print("Exiting PDF analyst.")
        break

    if user_input.lower() in ["n", "new"]:
        print("What can I do for you?")
        i = str(int(i) + 1)
        continue

    steps = []
    
    for event in agent.stream(
    {"messages": [{"role": "user", "content": user_input}]},
    checkpointer=InMemorySaver(),
    stream_mode="values",
    ):
        event["messages"][-1].pretty_print()

Enter Prompt (type 'q' to quit, type 'n' for new conversation):  what is a cat?



what is a cat?
Tool Calls:
  retrieve_context (call_JQjVptJ2Vh5MOxsJDFXwuwDt)
 Call ID: call_JQjVptJ2Vh5MOxsJDFXwuwDt
  Args:
    query: what is a cat
Reranking ...
                                                    1                    2
0                            PDFs/Dog - Wikipedia.pdf   0.3079954348898596
1                         PDFs/Animal - Wikipedia.pdf   0.2896269783008171
2               PDFs/Machine learning - Wikipedia.pdf  0.23762258063574188
3        PDFs/Artificial intelligence - Wikipedia.pdf  0.22478535907707192
4                        PDFs/Chatbot - Wikipedia.pdf  0.18483502718377834
5                  PDFs/Heat transfer - Wikipedia.pdf  0.18462805948811495
6           PDFs/Large language model - Wikipedia.pdf   0.1811874801589316
7            PDFs/Supervised learning - Wikipedia.pdf  0.17601312907030672
8   PDFs/Generative artificial intelligence - Wiki...  0.17231176686946537
9             PDFs/Net-zero emissions - Wikipedia.pdf  0.15726544596524525
10       

Enter Prompt (type 'q' to quit, type 'n' for new conversation):  how can i use langchain to help in sustainability  research?



how can i use langchain to help in sustainability  research?

Using LangChain to support sustainability research can be highly effective because it allows for the integration of large language models (LLMs) with various data sources, automation, and customization. Here are some ways you can leverage LangChain for sustainability research:

1. Literature Review Automation:
   - Automate the summarization of research papers, reports, and articles related to sustainability.
   - Use LangChain to extract key themes, trends, and data from large documents.
2. Data Collection and Integration:
   - Build pipelines to gather data from APIs, websites, and databases relevant to environmental metrics, carbon footprints, or renewable energy statistics.
   - Integrate this data with natural language understanding for analysis.
3. Knowledge Base Creation:
   - Develop an accessible knowledge base with structured and unstructured sustainability data.
   - Enable querying natural language questions abo

Enter Prompt (type 'q' to quit, type 'n' for new conversation):  tell me using ources



tell me using ources
Tool Calls:
  retrieve_context (call_4Z8XBqPcViNL3Vm88t4sHyXk)
 Call ID: call_4Z8XBqPcViNL3Vm88t4sHyXk
  Args:
    query: the sources from which the information in the document was derived
  retrieve_context (call_vk4kfG4Uw7JBneYzb9x6CpcN)
 Call ID: call_vk4kfG4Uw7JBneYzb9x6CpcN
  Args:
    query: the references or sources cited in the document
                                                    1                    2
0   PDFs/Intergovernmental Panel on Climate Change...   0.4359803739871354
1   PDFs/Retrieval-augmented generation - Wikipedi...  0.38541706742051995
2                      PDFs/LangChain - Wikipedia.pdf   0.3267899869799341
3                PDFs/Albert Einstein - Wikipedia.pdf  0.32653487791550284
4                  PDFs/Search engine - Wikipedia.pdf   0.3243081217428323
5              PDFs/Carbon accounting - Wikipedia.pdf    0.319076932721544
6                  PDFs/Heat transfer - Wikipedia.pdf  0.31674080589894615
7                         PDFs/

Enter Prompt (type 'q' to quit, type 'n' for new conversation):  q


Exiting PDF analyst.


In [49]:
retrieval_cache.keys()

dict_keys(['LangChain'])