In [1]:
import os
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment; the True
# displayed below is the call's return value.
# NOTE(review): presumably this supplies OPENAI_API_KEY for the OpenAI clients
# created later in the notebook — confirm.
load_dotenv()

True

In [2]:
from langchain import hub
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [3]:
from langchain_openai import ChatOpenAI

# Chat model used as the answer-generation stage of the RAG chain built later.
# Only the model name is set here; every other option is left at the library default.
llm = ChatOpenAI(model="gpt-3.5-turbo-0125")

In [4]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
# Point the loader at the PDF folder; the path is relative, so it resolves
# against the notebook's working directory.
loader = PyPDFDirectoryLoader("../documents/")
# Load the documents. The chunk metadata shown further down carries a 'page'
# key, so each entry appears to correspond to one PDF page — TODO confirm.
docs = loader.load()

In [5]:
# Sanity check: character count of the first loaded document's text.
len(docs[0].page_content)

103

In [6]:
# Preview the third loaded document's text, truncated to at most 5000 characters.
print(docs[2].page_content[:5000])

CONTENTS 
PART-A 
 Page No.  
 Introduction 1 
 Achievements since 2014: Leaving no one behind 2 
 Vision for Amrit Kaal  – an empowered and inclusive economy 3 
 Priorities of this Budget 5 
i. Inclusive Development  
ii. Reaching the Last Mile 
iii. Infrastructure and Investment 
iv. Unleashing the Potential 
v. Green Growth 
vi. Youth Power  
vii. Financial Sector  
 
 
 
 
 
 
 
 
 Fiscal Management 24 
PART B  
  
Indirect Taxes  27 
 Green Mobility  
 Electronics   
 Electrical   
 Chemicals and Petrochemicals   
 Marine products  
 Lab Grown Diamonds  
 Precious Metals  
 Metals  
 Compounded Rubber  
 Cigarettes  
  
Direct Taxes  30 
 MSMEs and Professionals   
 Cooperation  
 Start-Ups  
 Appeals  
 Better targeting of tax concessions  
 Rationalisation  
 Others  
 Personal Income Tax  
  
Annexures  35 
 Annexure to Part B of the Budget Speech 2023-24 
i. Amendments relating to Direct Taxes 
ii. Amendments relating to Indirect Taxes  
 


In [7]:
# Splitter configured for chunks of size 1000 with an overlap of 200 between
# neighbouring chunks, so text cut at a boundary still appears whole in one chunk.
# NOTE(review): sizes are presumably measured in characters (the splitter's
# default length function) — confirm.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

In [8]:
# Break the loaded documents into smaller chunks; these chunks are what gets
# embedded and indexed in the vector store.
splits = text_splitter.split_documents(docs)

In [9]:
# Total number of chunks produced by the splitter.
len(splits)

122

In [10]:
# Character count of the first chunk's text.
len(splits[0].page_content)

103

In [11]:
# Metadata carried by one chunk; per the output it records the source file path
# and a page number.
splits[10].metadata

{'source': '..\\documents\\Budget_Speech.pdf', 'page': 7}

In [12]:
# Embed every chunk with OpenAI embeddings and index the vectors in Chroma.
#
# Explicit, position-based ids make this cell safe to re-run: when no ids are
# given, fresh random ids are generated on every call, and because the in-memory
# collection lives as long as the kernel, each re-run appended another copy of
# every chunk (the retriever then returns duplicate hits). With stable ids the
# insert acts as an upsert and simply overwrites the existing entries.
#
# If the chunking itself changes (different number of chunks), call
# vectorstore.delete_collection() before re-running so no stale chunks remain.
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(),
    ids=[f"chunk-{i}" for i in range(len(splits))],
)

In [13]:
# Expose the vector store as a retriever: plain similarity search, asking for
# the 6 closest chunks per query.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

In [14]:
# Exercise the retriever on its own before wiring it into the chain.
# NOTE(review): the query says "Budget 2024" while the loaded speech is titled
# "Budget 2023-2024" — confirm the wording is intended.
retrieved_docs = retriever.invoke("What is Amrit kaal in Budget 2024?")

In [15]:
# Number of chunks returned; expected to equal the k=6 configured on the retriever.
len(retrieved_docs)

6

In [16]:
# Show the text of the first returned chunk to judge retrieval quality by eye.
print(retrieved_docs[0].page_content)

Budget 2023-2024 
 
Speech of  
Nirmala Sitharaman 
Minister of Finance 
February 1, 2023 
Hon’ble Speaker,  
 I present the Budget for 2023-24. This is the first Budget in Amrit 
Kaal . 
Introduction 
1. This Budget hopes to build on the foundation laid in the previous 
Budget, and the blueprint drawn for India@100. We envision a prosperous 
and inclusive India, in which the fruits of development reach all regions and 
citizens, especially our youth, women, farmers, OBCs, Scheduled Castes and 
Scheduled Tribes.  
2. In the 75th year of our Independence, the world has recognised the 
Indian economy as a ‘bright star’. Our current year’s economic growth is 
estimated to be at 7 per cent. It is notable that this is the highest among all 
the major economies. This is in spite of the massive slowdown globally 
caused by Covid-19 and a war. The Indian economy is therefore on the right 
track, and despite a time of challenges, heading towards a bright future.


In [17]:
from langchain_core.prompts import PromptTemplate

# Prompt text for the answering step. {context} and {question} are the two
# placeholders filled in at run time. The instructions tell the model to admit
# when it does not know, to answer in at most three sentences, and to end with
# a fixed sign-off.
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
Always say "thanks for asking!" at the end of the answer.

{context}

Question: {question}

Helpful Answer:"""
# Wrap the raw string in a PromptTemplate so it can be used as a stage in the chain.
custom_rag_prompt = PromptTemplate.from_template(template)

In [None]:
# Alternative: pull the ready-made RAG prompt from the LangChain prompt hub
# instead of using the custom template.

#prompt = hub.pull("rlm/rag-prompt")

In [18]:
def format_docs(docs):
    """Merge documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` of every document, in iteration order, separated
        by a blank line. An empty iterable yields an empty string.
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)

In [19]:
# Assemble the RAG pipeline; each `|` feeds one stage's output into the next.
rag_chain = (
    # The incoming question goes two ways: through the retriever and
    # format_docs to build the context text, and unchanged as the question.
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    # Fill the {context} and {question} placeholders of the custom template.
    | custom_rag_prompt
    # Generate the answer with the chat model.
    | llm
    # Reduce the model's output to a plain string.
    | StrOutputParser()
)

In [20]:
# Run the full chain on the question and print the answer piece by piece as it
# is produced, instead of waiting for the complete response.
for chunk in rag_chain.stream("What is Amrit kaal in Budget 2024?"):
    # end="" keeps the pieces on one line; flush=True shows each one immediately.
    print(chunk, end="", flush=True)

Amrit Kaal in Budget 2024 refers to a vision for an empowered and inclusive economy with a focus on technology-driven and knowledge-based growth. This includes strong public finances, a robust financial sector, and initiatives like Jan Bhagidari through Sabka Saath Sabka Prayas. Thanks for asking!

In [None]:
# Clean-up step: drop the Chroma collection backing the vector store.
# NOTE(review): presumably the retriever and chain cannot be used against this
# store afterwards, so run this last — confirm.
vectorstore.delete_collection()