In [2]:
from langchain_community.document_loaders import UnstructuredURLLoader
urls = ['https://www.livemint.com/economy/budget-2024-key-highlights-live-updates-nirmala-sitharaman-infrastructure-defence-income-tax-modi-budget-23-july-11721654502862.html',
        'https://cleartax.in/s/budget-2024-highlights',
        'https://www.hindustantimes.com/budget',
        'https://economictimes.indiatimes.com/news/economy/policy/budget-2024-highlights-india-nirmala-sitharaman-capex-fiscal-deficit-tax-slab-key-announcement-in-union-budget-2024-25/articleshow/111942707.cms?from=mdr']
loader = UnstructuredURLLoader(urls=urls)
data = loader.load()

In [3]:

len(data)

4

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# split data
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
docs = text_splitter.split_documents(data)


print("Total number of documents: ",len(docs))

Total number of documents:  176


In [None]:
#docs[7]

In [7]:
# Embedding models: https://python.langchain.com/v0.1/docs/integrations/text_embedding/
# Let's load the Hugging Face Embedding class.  sentence_transformers
from langchain_community.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings()

vector = embeddings.embed_query("hello, world!")
vector[:5]
#vector

  embeddings = HuggingFaceEmbeddings()
  embeddings = HuggingFaceEmbeddings()
  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


[0.034922655671834946,
 0.018830018118023872,
 -0.017854738980531693,
 0.0001388440141454339,
 0.0740736871957779]

In [8]:
from langchain_chroma import Chroma
vectorstore = Chroma.from_documents(documents=docs, embedding=HuggingFaceEmbeddings())

  vectorstore = Chroma.from_documents(documents=docs, embedding=HuggingFaceEmbeddings())


In [9]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})

retrieved_docs = retriever.invoke("Budget highlights")

In [10]:
len(retrieved_docs)

3

In [11]:

print(retrieved_docs[2].page_content)

➤ Budget 2024 Highlights: The eligibility limit for Mudra loans will be increased to Rs 20 lakh from Rs 10 lakh for those who have successfully repaid loans under the TARUN category. ➤ 2024 Budget Highlights: To help MSMEs unlock working capital by converting trade receivables into cash, the turnover threshold for mandatory onboarding on the TReDS platform will be reduced from Rs 500 crore to Rs 250 crore. This change is expected to include 22 additional CPSEs and 7,000 more companies on the platform. ➤ India Budget 2024 Highlights: New credit guarantee schemes for MSMEs will be introduced, providing a guarantee cover of Rs 100 crore for larger loan amounts. ➤ Union Budget 2024 Highlights: SIDBI plans to open 24 new branches over the next three years to broaden its reach. ➤ Budget 2024 Highlights: Financial support will be provided for establishing 50 multi-product food irradiation units and 100 food quality and safety testing labs. ➤ 2024 Budget Highlights: E-Commerce Export Hubs


In [13]:
from langchain_huggingface import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import pipeline
from langchain_core.output_parsers import StrOutputParser
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


#model_id = "meta-llama/Meta-Llama-3-8B"
model_id = "tiiuae/falcon-7b"

text_generation_pipeline = pipeline(
    "text-generation", model=model_id, model_kwargs={"torch_dtype": torch.bfloat16}, max_new_tokens=400, device=0)

llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

prompt_template = """
<|system|>
Answer the question based on your knowledge. Use the following context to help:

{context}

</s>
<|user|>
{question}
</s>
<|assistant|>

 """

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

llm_chain = prompt | llm | StrOutputParser()

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading shards: 100%|██████████| 2/2 [16:40<00:00, 500.45s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.48it/s]
Device set to use cpu


In [14]:
from langchain_core.runnables import RunnablePassthrough

rag_chain = {"context": retriever, "question": RunnablePassthrough()} | llm_chain

In [15]:
question = "2024 Budget Highlights"

In [16]:

rag_chain.invoke(question)

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


'\n<|system|>\nAnswer the question based on your knowledge. Use the following context to help:\n\n[Document(id=\'b5d8f060-b2d4-4f96-85a0-c777fd84b4bc\', metadata={\'source\': \'https://economictimes.indiatimes.com/news/economy/policy/budget-2024-highlights-india-nirmala-sitharaman-capex-fiscal-deficit-tax-slab-key-announcement-in-union-budget-2024-25/articleshow/111942707.cms?from=mdr\'}, page_content="2024 Budget Announcement for Space Industry\\n\\n➤ Budget 2024 Highlights: A venture capital fund of Rs 1,000 crore will be set up. ➤ 2024 India Budget Highlights: Land records in urban areas will be digitized with GIS mapping. An IT based system for property record administration, updating, and tax administration will be established. ➤ Union Budget 2024 Highlights: Sitharaman said that the Budget is focused on poor, women, youth and farmer. ➤ India Budget 2024 Highlights: Global uncertainty poses a downside for growth. Despite these headwinds India\'s growth continue to be higher. ➤ 202

In [17]:

# Making the response readable
response = rag_chain.invoke(question)
response = response.replace("</s>", "").strip()
print("Response:", response)

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Response: <|system|>
Answer the question based on your knowledge. Use the following context to help:

[Document(id='b5d8f060-b2d4-4f96-85a0-c777fd84b4bc', metadata={'source': 'https://economictimes.indiatimes.com/news/economy/policy/budget-2024-highlights-india-nirmala-sitharaman-capex-fiscal-deficit-tax-slab-key-announcement-in-union-budget-2024-25/articleshow/111942707.cms?from=mdr'}, page_content="2024 Budget Announcement for Space Industry\n\n➤ Budget 2024 Highlights: A venture capital fund of Rs 1,000 crore will be set up. ➤ 2024 India Budget Highlights: Land records in urban areas will be digitized with GIS mapping. An IT based system for property record administration, updating, and tax administration will be established. ➤ Union Budget 2024 Highlights: Sitharaman said that the Budget is focused on poor, women, youth and farmer. ➤ India Budget 2024 Highlights: Global uncertainty poses a downside for growth. Despite these headwinds India's growth continue to be higher. ➤ 2024 Uni

In [18]:
from langchain_huggingface import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import pipeline
from langchain_core.output_parsers import StrOutputParser
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


model_name = "EleutherAI/gpt-neo-1.3B"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.1,
    do_sample=True,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=400,
)
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cpu


In [19]:
prompt_template = """
<|system|>
Answer the question based on your knowledge. Use the following context to help:

{context}

</s>
<|user|>
{question}
</s>
<|assistant|>

 """

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

In [20]:
llm_chain = prompt | llm | StrOutputParser()

In [21]:
from langchain_core.runnables import RunnablePassthrough
rag_chain = {"context": retriever, "question": RunnablePassthrough()} | llm_chain

In [22]:
question = "What is the Union Budget?"

response = rag_chain.invoke(question)

# Making the response readable
response = response.replace("</s>", "").strip()
print("Response:", response)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Response: <|system|>
Answer the question based on your knowledge. Use the following context to help:

[Document(id='815eaaf6-546d-4b9b-bc92-efdb63144c7a', metadata={'source': 'https://economictimes.indiatimes.com/news/economy/policy/budget-2024-highlights-india-nirmala-sitharaman-capex-fiscal-deficit-tax-slab-key-announcement-in-union-budget-2024-25/articleshow/111942707.cms?from=mdr'}, page_content="Union Budget 2024: Key announcements for different industries\n\nRs 3 -7 lakh 5 per cent\n\nRs 7-10 lakh 10 per cent Rs 10-12 lakh 15 per cent\n\nRs 12-15 lakh 20 per cent Above Rs 15 lakh 30 per cent\n\n➤ 2024 Budget Highlights: The Standard Deduction under the New Tax Regime increased from Rs 30,000 to Rs 75,000, saving Rs 17,500. The family pension deduction for pensioners rose from Rs 15,000 to Rs 25,000.\n\nYou Might Also Like:\n\nYou Might Also Like thumb-111943807\n\nBudget 2o24: What's cheaper and what's costlier? Here's the list\n\nBudget 2024 Announcements for STT, Short-term cap