In [1]:
!pip install PyPDF2
!pip install faiss-cpu
!pip install tiktoken

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (3.6 kB)
Downloading faiss_cpu-1.8.0-cp310-cp310-macosx_11_0_arm64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.8.0


## The Game plan 


<img src="https://dl.dropboxusercontent.com/s/gxij5593tyzrvsg/Screenshot%202023-04-26%20at%203.06.50%20PM.png" alt="vectorstore">


<img src="https://dl.dropboxusercontent.com/s/v1yfuem0i60bd88/Screenshot%202023-04-26%20at%203.52.12%20PM.png" alt="retriever chain">


In [12]:
from PyPDF2 import PdfReader
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

In [4]:
from dotenv import load_dotenv
import os

# load_dotenv() (python-dotenv) populates the process environment; the key is
# then looked up there and is None when OPENAI_API_KEY is not set.
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

In [5]:
# Open the budget speech PDF (path is relative to the working directory).
pdfreader = PdfReader('budget_speech.pdf')

In [6]:
from typing_extensions import Concatenate
# Gather the extracted text of each page and join once at the end.
# Pages whose extract_text() result is empty/None are skipped; pages are
# concatenated without any separator.
page_texts = []
for page in pdfreader.pages:
    content = page.extract_text()
    if content:
        page_texts.append(content)
raw_text = ''.join(page_texts)

In [7]:
# Preview the first 1000 characters of the extracted text.
raw_text[:1000]

'GOVERNMENT OF INDIA\nINTERIM BUDGET 2024-2025\nSPEECH\nOF\nNIRMALA SITHARAMAN\nMINISTER OF FINANCE\nFebruary 1,  2024 \nCONTENTS  \n \nPART – A \n Page No.  \nIntroduction  1 \nInclusive Development and Growth  2 \nSocial Justice   3  \nExemplary  Track Record of Governance,  \nDevelopment and Performance (GDP)  7 \nEconomic Management  8 \nGlobal Context  9 \nVision for ‘Viksit Bharat’  10 \nStrategy for  ‘Amrit Kaal’  11 \nInfrastructure Development  17 \nAmrit Kaal as Kartavya Kaal  22 \nRevised Estimates 2023 -24 23 \nBudget Estimates 2024 -25 23 \nPART – B \nDirect taxes  25 \nIndirect Taxes   26 \nEconomy – Then and Now  28 \n  \n  1 \n Interim Budget 2024 -2025  \nSpeech of  \nNirmala Sitharaman  \nMinister of Finance  \nFebruary 1, 2024  \nHon’ble Speaker,  \n I present the Interim Budget for 2024 -25.  \nIntroduction  \n1. The Indian  economy  has witnessed profound positive \ntransformation in the last ten years. The people of India are \nlooking ahead to the future with hop

In [8]:
# Split the raw text into overlapping chunks so each piece stays small enough
# to embed and to fit into a prompt. Lengths are measured with len(), i.e. in
# characters rather than tokens.
text_splitter = CharacterTextSplitter(
    separator="\n",       # split on newlines
    chunk_size=800,       # target maximum chunk length
    chunk_overlap=200,    # overlap carried between neighbouring chunks
    length_function=len,
)
texts = text_splitter.split_text(raw_text)
# Show the first chunk as a sanity check.
texts[0]

'GOVERNMENT OF INDIA\nINTERIM BUDGET 2024-2025\nSPEECH\nOF\nNIRMALA SITHARAMAN\nMINISTER OF FINANCE\nFebruary 1,  2024 \nCONTENTS  \n \nPART – A \n Page No.  \nIntroduction  1 \nInclusive Development and Growth  2 \nSocial Justice   3  \nExemplary  Track Record of Governance,  \nDevelopment and Performance (GDP)  7 \nEconomic Management  8 \nGlobal Context  9 \nVision for ‘Viksit Bharat’  10 \nStrategy for  ‘Amrit Kaal’  11 \nInfrastructure Development  17 \nAmrit Kaal as Kartavya Kaal  22 \nRevised Estimates 2023 -24 23 \nBudget Estimates 2024 -25 23 \nPART – B \nDirect taxes  25 \nIndirect Taxes   26 \nEconomy – Then and Now  28 \n  \n  1 \n Interim Budget 2024 -2025  \nSpeech of  \nNirmala Sitharaman  \nMinister of Finance  \nFebruary 1, 2024  \nHon’ble Speaker,  \n I present the Interim Budget for 2024 -25.'

In [9]:
# Character length of the first chunk.
len(texts[0])

785

In [10]:
# Number of chunks produced by the splitter.
len(texts)

61

In [13]:
# Create the OpenAI embeddings client, passing the API key explicitly.
# NOTE(review): this only constructs the client; embedding requests are
# presumably issued later, when the texts are indexed — confirm.
embeddings = OpenAIEmbeddings(api_key=openai_api_key)

In [14]:
# Build a FAISS vector store from the text chunks using the embeddings client.
document_search = FAISS.from_texts(texts, embeddings)

In [16]:
# Display the vector store object's repr.
document_search

<langchain_community.vectorstores.faiss.FAISS at 0x12954e950>

In [17]:
from langchain.chains.question_answering import load_qa_chain
from langchain_openai import ChatOpenAI

In [37]:
# Chat model used to answer questions: temperature 0.0 and a pinned model snapshot.
llm = ChatOpenAI(api_key=openai_api_key, temperature=0.0, model="gpt-4-turbo-2024-04-09")

In [38]:
# Question-answering chain of type "stuff": all supplied documents go into one prompt.
chain = load_qa_chain(llm=llm, chain_type="stuff")

Retriever docs: https://python.langchain.com/docs/modules/data_connection/retrievers/vectorstore/

In [25]:
# Expose the vector store as a retriever with its default search settings.
retriever = document_search.as_retriever()

In [26]:
query = "What is the vision of the budget ?"
# Retrievers implement the Runnable interface; invoke() replaces the
# deprecated get_relevant_documents() and returns the matching documents.
docs = retriever.invoke(query)

In [27]:
# Inspect the documents returned by the retriever for the query above.
docs

[Document(page_content='1 \n Interim Budget 2024 -2025  \nSpeech of  \nNirmala Sitharaman  \nMinister of Finance  \nFebruary 1, 2024  \nHon’ble Speaker,  \n I present the Interim Budget for 2024 -25.  \nIntroduction  \n1. The Indian  economy  has witnessed profound positive \ntransformation in the last ten years. The people of India are \nlooking ahead to the future with hope and optimism.  \n2. With the blessings of the people, when our Government \nunder the visionary and dynamic leadership of Hon’ble Prime \nMinister Shri Narendra Modi assumed office in 2014, the country \nwas facing enormous challenges. With ‘Sabka Saath, Sabka \nVikas’  as its ‘mantra’ , the Government overcame those \nchallenges in right earnest. Structural reforms were undertaken. \nPro-people programmes were formulated and implemented'),
 Document(page_content='resolutions, as the country opens up immense possibilities and \nopportunities” . It is our ‘Kartavya Kaal’.  \n78. Every challenge of the pre-2014  era

In [32]:
# Run the QA chain over the retrieved documents and show only the answer text.
chain.invoke({"input_documents": docs, "question": query}, return_only_outputs=True)["output_text"]

"The vision of the budget presented by Nirmala Sitharaman is to achieve 'Viksit Bharat' through inclusive development, growth, social justice, exemplary governance, and economic management. The budget aims to continue the positive transformation of the Indian economy and create opportunities for the future with hope and optimism."

In [33]:
# Ask about the agriculture target using the retriever with default settings.
query = "How much the agriculture target will be increased to and what the focus will be ?"
# invoke() replaces the deprecated get_relevant_documents().
docs = retriever.invoke(query)
chain.invoke({"input_documents": docs, "question": query}, return_only_outputs=True)["output_text"]

"The agriculture target will be increased to double the exports to ₹1 lakh crore and generate 55 lakh employment opportunities in the near future. The focus will be on enhancing aquaculture productivity, boosting farmers' income, reducing post-harvest losses, improving productivity, and increasing incomes in the agriculture sector."

In [40]:
query = "What is 'Lakhpati Didi' ?"
# Retrieve the 5 most similar chunks for this question.
retriever = document_search.as_retriever(search_kwargs={"k": 5})
# invoke() replaces the deprecated get_relevant_documents().
docs = retriever.invoke(query)
chain.invoke({"input_documents": docs, "question": query}, return_only_outputs=True)["output_text"]

"'Lakhpati Didi' refers to women who have become economically empowered and self-reliant, achieving a significant level of financial success through their involvement in Self-Help Groups (SHGs). These women have been able to transform their socio-economic conditions, with nearly one crore women already reaching this status of 'Lakhpati Didi', which indicates their financial achievement. The program aims to expand this success, increasing the target for women achieving this status."

In [41]:
# Same agriculture question as before, now with 5 retrieved chunks, to compare answers.
query = "How much the agriculture target will be increased to and what the focus will be ?"
retriever = document_search.as_retriever(search_kwargs={"k": 5})
# invoke() replaces the deprecated get_relevant_documents().
docs = retriever.invoke(query)
chain.invoke({"input_documents": docs, "question": query}, return_only_outputs=True)["output_text"]

'The provided text does not specify the exact target increase for agriculture. However, it mentions that the sector is poised for inclusive, balanced, higher growth and productivity, facilitated by farmer-centric policies, income support, coverage of risks through price and insurance support, promotion of technologies and innovations through startups, and other measures. The focus will be on reducing post-harvest losses, improving productivity and incomes, and promoting private and public investment in post-harvest activities including aggregation, modern storage, efficient supply chains, primary and secondary processing, and marketing and branding.'

In [43]:
from IPython.display import display, Markdown
query = "List down some key points summarising Amrit Kaal."
retriever = document_search.as_retriever(search_kwargs={"k": 5})
# invoke() replaces the deprecated get_relevant_documents().
docs = retriever.invoke(query)
response = chain.invoke({"input_documents": docs, "question": query}, return_only_outputs=True)["output_text"]
# Render the answer as Markdown so lists and bold text display properly.
display(Markdown(response))

Here are some key points summarizing Amrit Kaal:

1. **Economic Growth and Expansion**: The government is committed to fostering high economic growth and creating conditions for people to realize their aspirations.

2. **Inclusive and Sustainable Development**: Economic policies will be adopted that facilitate inclusive and sustainable development, improving productivity and creating opportunities for all.

3. **Micro, Small and Medium Enterprises (MSME) Support**: Ensuring timely and adequate finances, relevant technologies, and appropriate training for MSMEs to grow and compete globally.

4. **Reform, Perform, and Transform**: The government will undertake next-generation reforms and build consensus with states and stakeholders for effective implementation.

5. **Inclusive Development Programs**: Development programs over the last ten years have targeted every household and individual, emphasizing initiatives like 'housing for all', 'har ghar jal', and universal electricity access.

6. **Demography, Democracy, and Diversity**: Leveraging India's demographic advantage, democratic values, and diverse culture to fulfill the aspirations of every Indian.

7. **Creation of Opportunities**: The government emphasizes that there are immense possibilities and opportunities for growth, with the potential to create even more opportunities as needed.

8. **Vision for Developed India by 2047**: The government aims to realize the dream of a developed India by the year 2047, marking a century of independence.

These points highlight the government's strategic focus and policy direction during the period termed as Amrit Kaal, aiming for comprehensive national development.

In [44]:
query = "Mention the top 10 key points summarising this budget."
# A broad summary question: retrieve 20 chunks instead of 5 for wider coverage.
retriever = document_search.as_retriever(search_kwargs={"k": 20})
# invoke() replaces the deprecated get_relevant_documents().
docs = retriever.invoke(query)
response = chain.invoke({"input_documents": docs, "question": query}, return_only_outputs=True)["output_text"]
display(Markdown(response))

1. **Economic Growth and Stability**: The budget highlights a decade of economic transformation under the government's leadership, emphasizing macro-economic stability, robust investments, and increased average real income by fifty percent.

2. **Infrastructure Development**: There has been a significant focus on building all forms of infrastructure—physical, digital, and social—in record time, which has been instrumental in economic growth and formalization.

3. **Tax Reforms**: Implementation of Goods and Services Tax (GST) has unified the tax structure across the country, deepening and widening the tax base. Direct tax collections have more than tripled over the last ten years.

4. **Financial Sector Strengthening**: Efforts have been made to make savings, credit, and investments more efficient, which has helped in the economic empowerment of the population.

5. **Social Justice and Inclusion**: Programs like housing, water, electricity, and cooking gas for all have been implemented, along with free ration for 80 crore people, significantly enhancing rural incomes and reducing poverty.

6. **Digital and Technological Advancements**: The budget emphasizes the role of digital public infrastructure as a new factor of production, with significant investments in technology for both civilian and defense purposes.

7. **Fiscal Management**: The budget outlines a path for fiscal consolidation with a target to reduce the fiscal deficit to below 4.5 percent by 2025-26. The revised fiscal deficit for 2024-25 is estimated at 5.1 percent of GDP.

8. **Support for MSMEs**: The government plans to ensure timely and adequate finances, relevant technologies, and appropriate training for MSMEs to help them grow and compete globally.

9. **Tourism and Cultural Promotion**: There is a focus on developing tourism, including spiritual tourism, by enhancing infrastructure and services at iconic tourist centers and branding them globally.

10. **Future Vision and Investments**: The budget sets a vision for 'Viksit Bharat' and outlines investments in next-generation reforms, including a significant focus on sustainable and inclusive growth, with special attention to empowering the youth and leveraging technology for national development.

# Newest way as of 2024:

Ref: https://python.langchain.com/docs/modules/data_connection/document_loaders/pdf/

In [47]:
!pip install pypdf

Collecting pypdf
  Downloading pypdf-4.2.0-py3-none-any.whl.metadata (7.4 kB)
Downloading pypdf-4.2.0-py3-none-any.whl (290 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.4/290.4 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m00:01[0m:00:01[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-4.2.0


In [48]:
from langchain_community.document_loaders import PyPDFLoader

# Load the book PDF through LangChain's PyPDFLoader as a list of Document objects.
# NOTE(review): load_and_split() presumably applies a default text splitter, so
# entries in `pages` may not map one-to-one to PDF pages — confirm.
loader = PyPDFLoader("impromptu-rh.pdf")
pages = loader.load_and_split()

In [49]:
# Inspect the first loaded Document (content plus source/page metadata).
pages[0]

Document(page_content='Impromptu\nAmplifying Our Humanity \nThrough AI\nBy Reid Hoffman  \nwith GPT-4', metadata={'source': 'impromptu-rh.pdf', 'page': 1})

In [50]:
# Number of Document objects returned by load_and_split().
len(pages)

239

In [51]:
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

In [52]:
# Build a FAISS index over the loaded documents. The API key is passed
# explicitly, consistent with how the embeddings client is created earlier
# in this notebook, instead of relying on an implicit environment lookup.
faiss_index = FAISS.from_documents(pages, OpenAIEmbeddings(api_key=openai_api_key))
# Direct similarity search: fetch the 2 closest documents to the question.
docs = faiss_index.similarity_search("who are the authors of the book?", k=2)
for doc in docs:
    # Print the page number from the metadata and the first 300 characters.
    print(f"{doc.metadata['page']}:", doc.page_content[:300])

191: 185Public Intellectuals
* * *
Reid: Write an interview between [contemporary Italian 
philosopher] Donatella di Cesare and [contemporary 
Nobel laureate Japanese-born English writer] Kazuo 
Ishiguro, focused on di Cesare’s arguments for the public 
role of philosophy and Ishiguro’s subtle, savage sa
2: Impromptu: AmplIfyIng our HumAnIty tHrougH AI  
by Reid Hoffman with GPT-4
ISBNs:  979-8-9878319-1-5 Trade Paperback  
979-8-9878319-2-2 Hardcover 
979-8-9878319-0-8 Ebook
Copyright 2023 Dallepedia LLC
Published by Dallepedia LLC. All rights reserved. No portion 
of this work may be reproduced in an


In [53]:
# Chat model for the book Q&A section.
# NOTE(review): identical to the `llm` defined earlier in this notebook.
llm = ChatOpenAI(api_key=openai_api_key, temperature=0.0, model="gpt-4-turbo-2024-04-09")

In [54]:
# "stuff" QA chain: every supplied document is placed into a single prompt at once.
chain = load_qa_chain(llm=llm, chain_type="stuff") # we are going to stuff all the docs in at once

In [55]:
query = "Who are the authors of the book ?"
# Retrieve the 5 chunks of the book most similar to the question.
retriever = faiss_index.as_retriever(search_kwargs={"k": 5})
# invoke() replaces the deprecated get_relevant_documents().
docs = retriever.invoke(query)
response = chain.invoke({"input_documents": docs, "question": query}, return_only_outputs=True)["output_text"]
display(Markdown(response))

The book "Impromptu: Amplifying our Humanity through AI" is authored by Reid Hoffman with GPT-4.

In [60]:
# Map-reduce QA chain: documents are processed separately and the partial
# answers are then combined, which keeps each prompt's token count bounded.
# Feeding it every entry of `pages` means at least one LLM call per chunk and
# had to be interrupted in the recorded run, so the input is limited to the
# 20 chunks most similar to the query.
chain2 = load_qa_chain(llm=llm, chain_type="map_reduce", return_intermediate_steps=True)
query = "Summairse the book in 5 points."
retriever = faiss_index.as_retriever(search_kwargs={"k": 20})
# invoke() replaces the deprecated get_relevant_documents().
docs = retriever.invoke(query)
response = chain2.invoke({"input_documents": docs, "question": query}, return_only_outputs=True)["output_text"]
display(Markdown(response))

KeyboardInterrupt: 