# ***Import Libraries***

In [63]:
import os
import openai
import langchain
import pinecone 
import PyPDF2
import uuid
import json
from typing import List, Dict
from tqdm.autonotebook import tqdm
from pinecone import Pinecone
from pinecone import ServerlessSpec
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from llama_index import StorageContext # Container for the docstore, index store, vector stores and graph store used by llama_index.
from llama_index.vector_stores.pinecone import PineconeVectorStore
from transformers import T5Tokenizer, T5ForConditionalGeneration
from sentence_transformers import SentenceTransformer


In [2]:
from dotenv import load_dotenv # To load env variables
load_dotenv()

True

# ***Read the document***

In [3]:
# Reading a PDF file and extracting text from each page.

def read_pdf(file_path: str) -> List[Dict]:
    """
    Read a PDF file and extract the text of every non-empty page.

    Args:
        file_path (str): Path to the PDF file.

    Returns:
        List[Dict]: One dictionary per page that yielded text, with keys
        'page' (1-based page number, int) and 'content' (extracted text, str).
        Pages whose extracted text is missing or whitespace-only are skipped.
        If the file cannot be opened or parsed, the error is printed and the
        pages collected so far (possibly none) are returned.
    """
    documents = []

    try:
        # Open the PDF file in binary mode; the context manager closes it.
        with open(file_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            # start=1 yields the human-facing, 1-based page number directly.
            for page_number, page in enumerate(reader.pages, start=1):
                # Treat a missing extraction result as empty text so that a
                # text-less page is simply skipped instead of raising
                # AttributeError on .strip() and aborting the remaining pages.
                text = page.extract_text() or ""

                # Skip empty pages
                if text.strip():
                    documents.append({"page": page_number, "content": text})
    except Exception as e:
        # Deliberate best-effort: report the failure and return what was read.
        print(f"Error reading PDF: {e}")

    return documents


In [4]:
# NOTE(review): absolute, machine-specific Windows path — the notebook only runs
# where this exact file exists; consider a path relative to a configurable data directory.
pdf_path = r"D:\Projects\LLM_Project\documents\budget_speech.pdf"
# Extract the text of every non-empty page.
documents = read_pdf(pdf_path)
# Bare last expression: shows the full list of page dictionaries as the cell output.
documents

[{'page': 1,
  'content': 'GOVERNMENT OF INDIA\nBUDGET 2023-2024\nSPEECH\nOF\nNIRMALA SITHARAMAN\nMINISTER OF FINANCE\nFebruary 1,  2023'},
 {'page': 3,
  'content': 'CONTENTS \nPART-A \n Page No.  \n\uf0b7 Introduction 1 \n\uf0b7 Achievements since 2014: Leaving no one behind 2 \n\uf0b7 Vision for Amrit Kaal  – an empowered and inclusive economy 3 \n\uf0b7 Priorities of this Budget 5 \ni. Inclusive Development  \nii. Reaching the Last Mile \niii. Infrastructure and Investment \niv. Unleashing the Potential \nv. Green Growth \nvi. Youth Power  \nvii. Financial Sector  \n \n \n \n \n \n \n \n \n\uf0b7 Fiscal Management 24 \nPART B  \n  \nIndirect Taxes  27 \n\uf0b7 Green Mobility  \n\uf0b7 Electronics   \n\uf0b7 Electrical   \n\uf0b7 Chemicals and Petrochemicals   \n\uf0b7 Marine products  \n\uf0b7 Lab Grown Diamonds  \n\uf0b7 Precious Metals  \n\uf0b7 Metals  \n\uf0b7 Compounded Rubber  \n\uf0b7 Cigarettes  \n  \nDirect Taxes  30 \n\uf0b7 MSMEs and Professionals   \n\uf0b7 Cooperation 

In [5]:
# Number of non-empty pages returned by read_pdf.
len(documents)

56

# ***Split the document into chunks***

In [6]:
# Preprocess the extracted documents by cleaning and splitting them into chunks.

def preprocess_documents(documents: List[Dict[str, str]], chunk_size: int = 800) -> List[Dict[str, str]]:
    """
    Clean the extracted page texts and split them into fixed-size character chunks.

    Args:
        documents (List[Dict[str, str]]): List of extracted documents with
            'page' and 'content' keys.
        chunk_size (int): Maximum number of characters per chunk. Defaults to
            800, the value this function previously hard-coded.

    Returns:
        List[Dict[str, str]]: One dictionary per chunk with keys 'page' (copied
        from the source document) and 'chunk' (the chunk text), in page order.
        A document whose cleaned content is empty produces no chunks.

    Raises:
        ValueError: If chunk_size is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    processed_docs = []

    for doc in documents:
        # Flatten line breaks, trim the ends, then replace the private-use
        # bullet glyph (U+F0B7) found in the extracted text with a plain dash.
        content = doc['content'].replace("\n", " ").strip()
        content = content.replace('\uf0b7', '-')

        # Slice by character offset; a chunk boundary may fall inside a word.
        processed_docs.extend(
            {"page": doc['page'], "chunk": content[start:start + chunk_size]}
            for start in range(0, len(content), chunk_size)
        )

    return processed_docs

In [7]:
# Clean and chunk every extracted page.
processed_documents = preprocess_documents(documents)
# Bare last expression: shows the list of {'page', 'chunk'} dictionaries.
processed_documents

[{'page': 1,
  'chunk': 'GOVERNMENT OF INDIA BUDGET 2023-2024 SPEECH OF NIRMALA SITHARAMAN MINISTER OF FINANCE February 1,  2023'},
 {'page': 3,
  'chunk': 'CONTENTS  PART-A   Page No.   - Introduction 1  - Achievements since 2014: Leaving no one behind 2  - Vision for Amrit Kaal  – an empowered and inclusive economy 3  - Priorities of this Budget 5  i. Inclusive Development   ii. Reaching the Last Mile  iii. Infrastructure and Investment  iv. Unleashing the Potential  v. Green Growth  vi. Youth Power   vii. Financial Sector                   - Fiscal Management 24  PART B      Indirect Taxes  27  - Green Mobility   - Electronics    - Electrical    - Chemicals and Petrochemicals    - Marine products   - Lab Grown Diamonds   - Precious Metals   - Metals   - Compounded Rubber   - Cigarettes      Direct Taxes  30  - MSMEs and Professionals    - Cooperation   - Start-Ups   - Appeals   - Better targeting of tax concessions   - Rationalisation   - O'},
 {'page': 3,
  'chunk': 'thers   - Pers

In [8]:
# Print every chunk under a header line carrying its source page number.
for doc in processed_documents:
    print(f"Page {doc['page']}:\n{doc['chunk']}\n")

Page 1:
GOVERNMENT OF INDIA BUDGET 2023-2024 SPEECH OF NIRMALA SITHARAMAN MINISTER OF FINANCE February 1,  2023

Page 3:
CONTENTS  PART-A   Page No.   - Introduction 1  - Achievements since 2014: Leaving no one behind 2  - Vision for Amrit Kaal  – an empowered and inclusive economy 3  - Priorities of this Budget 5  i. Inclusive Development   ii. Reaching the Last Mile  iii. Infrastructure and Investment  iv. Unleashing the Potential  v. Green Growth  vi. Youth Power   vii. Financial Sector                   - Fiscal Management 24  PART B      Indirect Taxes  27  - Green Mobility   - Electronics    - Electrical    - Chemicals and Petrochemicals    - Marine products   - Lab Grown Diamonds   - Precious Metals   - Metals   - Compounded Rubber   - Cigarettes      Direct Taxes  30  - MSMEs and Professionals    - Cooperation   - Start-Ups   - Appeals   - Better targeting of tax concessions   - Rationalisation   - O

Page 3:
thers   - Personal Income Tax      Annexures  35  - Annexure to Part 

In [9]:
# Total number of chunks across all pages.
len(processed_documents)

138

# ***Import embedding model***

In [10]:
# NOTE(review): this import sits outside the notebook's top import cell; consider moving it there.
import ollama
# Name of the Ollama model; passed to OllamaEmbeddings as the embedding model.
model_id = "llama3.1"

In [11]:
# API_KEY must be supplied through the process environment or the .env file
# picked up by load_dotenv() earlier in the notebook; credentials are not
# hardcoded here so they cannot leak through the notebook file or its history.
# NOTE(review): the key that used to be embedded in this cell should be treated
# as exposed and revoked/rotated.
if not os.environ.get("API_KEY"):
    print("Warning: API_KEY is not set; add it to your .env file or environment.")

# ***Embedding Model***

*Using Ollama as the embedding model*

In [62]:
# Embedding client configured with a local Ollama endpoint and the model named by model_id.
ollama_emb = OllamaEmbeddings(base_url="http://localhost:11434", model=model_id)

# Bare last expression: shows the client's configuration.
ollama_emb

OllamaEmbeddings(base_url='http://localhost:11434', model='llama3.1', embed_instruction='passage: ', query_instruction='query: ', mirostat=None, mirostat_eta=None, mirostat_tau=None, num_ctx=None, num_gpu=None, num_thread=None, repeat_last_n=None, repeat_penalty=None, temperature=None, stop=None, tfs_z=None, top_k=None, top_p=None, show_progress=False, headers=None, model_kwargs=None)

In [13]:
# Sanity check: embed one short query and print the raw vector.
vectors = ollama_emb.embed_query("How are you?")
print(vectors)

[-1.4375293254852295, -0.5596907138824463, 1.9388209581375122, 1.57071053981781, -4.120678901672363, 0.32121336460113525, 0.32550865411758423, 2.035217523574829, 1.6733437776565552, -0.21447908878326416, -0.5843884944915771, 0.15997739136219025, 0.969813883304596, 0.6356322765350342, -0.5384480953216553, 2.353585720062256, -2.1874399185180664, 0.5958369374275208, 1.3757953643798828, -0.7161111235618591, -1.0047121047973633, -1.7979882955551147, 1.7328181266784668, 2.826014518737793, -3.2442092895507812, -1.7971749305725098, 1.1500827074050903, -0.8177632689476013, 1.7962141036987305, -2.5549933910369873, 0.5484418869018555, -0.5005476474761963, -1.0504437685012817, 1.403569221496582, 3.346334457397461, -2.123626947402954, 1.0403445959091187, -0.3464624285697937, 4.667552947998047, 0.7628341317176819, 2.0868022441864014, -2.7894418239593506, 2.425028085708618, -0.7125775218009949, 0.7621728777885437, -0.8760381937026978, 3.114467144012451, -0.8059695363044739, 2.737823963165283, -2.3137

In [14]:
# Length of the embedding vector; compare with the 'dimension' of the Pinecone index listed below.
len(vectors)

4096

# ***Create Vector database***

In [15]:
# Read the Pinecone key from the environment (or the .env file picked up by
# load_dotenv() earlier in the notebook) instead of embedding it in the cell.
# NOTE(review): the key that used to be hardcoded here should be treated as
# exposed and revoked/rotated.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not pinecone_api_key:
    raise RuntimeError("PINECONE_API_KEY is not set; add it to your .env file or environment.")

pc = Pinecone(api_key=pinecone_api_key)
# Handle to the existing index, addressed by name and host.
index = pc.Index("langchainvector", host="https://langchainvector-depfgyt.svc.aped-4627-b74a.pinecone.io")

# Use the index in PineconeVectorStore
vector_store = PineconeVectorStore(index)

In [16]:
# Print the indexes visible to this client (name, dimension, metric, host, status).
print(pc.list_indexes())

{'indexes': [{'deletion_protection': 'disabled',
              'dimension': 4096,
              'host': 'langchainvector-depfgyt.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'langchainvector',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'},
              'tags': None,
              'vector_type': 'dense'}]}


In [17]:
# Bare expression: shows the vector store's configuration.
vector_store

PineconeVectorStore(stores_text=True, is_embedding_query=True, flat_metadata=False, api_key=None, index_name=None, environment=None, namespace=None, insert_kwargs={}, add_sparse_vector=False, text_key='text', batch_size=100, remove_text_from_metadata=False)

In [18]:
# Build a llama_index StorageContext whose vector store is the Pinecone-backed one.
# NOTE(review): storage_context is not referenced by the later cells visible here — confirm it is still needed.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [19]:
# Bare expression: shows the stores held by the storage context.
storage_context

StorageContext(docstore=<llama_index.storage.docstore.simple_docstore.SimpleDocumentStore object at 0x000001DDBDAEB5C0>, index_store=<llama_index.storage.index_store.simple_index_store.SimpleIndexStore object at 0x000001DDBDA73080>, vector_stores={'default': PineconeVectorStore(stores_text=True, is_embedding_query=True, flat_metadata=False, api_key=None, index_name=None, environment=None, namespace=None, insert_kwargs={}, add_sparse_vector=False, text_key='text', batch_size=100, remove_text_from_metadata=False), 'image': <llama_index.vector_stores.simple.SimpleVectorStore object at 0x000001DDBD8687A0>}, graph_store=<llama_index.graph_stores.simple.SimpleGraphStore object at 0x000001DDBD8691C0>)

# ***Preprocess the chunked documents for embedding and storage***

In [20]:
def preprocess_documents_for_pinecone(documents: List[Dict], save_path: str):
    """
    Tag every chunk with a unique ID and a metadata dict, then persist the result as JSON.

    Args:
        documents (List[Dict]): Chunk dictionaries carrying 'page' and 'chunk' keys.
        save_path (str): File the tagged chunks are written to (overwritten if present).

    Returns:
        List[Dict]: One dictionary per input chunk with keys 'id', 'page',
        'chunk' and 'metadata' ({'page': <page>}), in input order.
    """
    def _tag(entry):
        # The ID joins the page number with a random UUID, so it differs on every call.
        page = entry["page"]
        return {
            "id": f"page_{page}_{uuid.uuid4()}",
            "page": page,
            "chunk": entry["chunk"],
            "metadata": {"page": page},
        }

    processed_docs = [_tag(entry) for entry in documents]

    # Write the tagged chunks to save_path as a single JSON array.
    with open(save_path, "w") as out_file:
        json.dump(processed_docs, out_file)

    print(f"Processed documents saved to {save_path}")
    return processed_docs

In [21]:
# File the ID-tagged chunks are written to (relative to the working directory).
save_path = "final_doc.json"

In [22]:
# Assign IDs and metadata to the chunks and persist them to save_path.
processed_documents_final = preprocess_documents_for_pinecone(processed_documents, save_path) 

Processed documents saved to final_doc.json


In [23]:
# Load the saved preprocessed documents
def load_preprocessed_documents(save_path: str):
    """
    Read back a JSON file of preprocessed documents.

    Args:
        save_path (str): Path of the JSON file to read.

    Returns:
        The object decoded from the file's JSON content; its length is
        reported on stdout together with the path.
    """
    with open(save_path, "r") as source:
        loaded = json.loads(source.read())

    count = len(loaded)
    print(f"Loaded {count} preprocessed documents from {save_path}")
    return loaded

# Example usage
# NOTE(review): this reads "final_document.json", whereas the cells above write
# the processed chunks to save_path ("final_doc.json"), and nothing in the
# visible cells creates "final_document.json". Confirm which file is intended;
# because chunk IDs are random UUIDs, the two files carry different vector IDs.
final_doc = load_preprocessed_documents("final_document.json")

Loaded 138 preprocessed documents from final_document.json


In [24]:
# Bare expression: shows the loaded, ID-tagged chunks.
final_doc

[{'id': 'page_1_83d299c3-bc3c-4c78-be4e-22566860f4c3',
  'page': 1,
  'chunk': 'GOVERNMENT OF INDIA BUDGET 2023-2024 SPEECH OF NIRMALA SITHARAMAN MINISTER OF FINANCE February 1,  2023',
  'metadata': {'page': 1}},
 {'id': 'page_3_8b877852-a51d-4eab-87ba-bd82085a40a7',
  'page': 3,
  'chunk': 'CONTENTS  PART-A   Page No.   - Introduction 1  - Achievements since 2014: Leaving no one behind 2  - Vision for Amrit Kaal  – an empowered and inclusive economy 3  - Priorities of this Budget 5  i. Inclusive Development   ii. Reaching the Last Mile  iii. Infrastructure and Investment  iv. Unleashing the Potential  v. Green Growth  vi. Youth Power   vii. Financial Sector                   - Fiscal Management 24  PART B      Indirect Taxes  27  - Green Mobility   - Electronics    - Electrical    - Chemicals and Petrochemicals    - Marine products   - Lab Grown Diamonds   - Precious Metals   - Metals   - Compounded Rubber   - Cigarettes      Direct Taxes  30  - MSMEs and Professionals    - Cooperati

In [25]:
# Number of chunks loaded from the JSON file.
len(final_doc)

138

# ***Embed and Store vectors in Pinecone vector database***

In [26]:
# Generate embeddings and store in Pinecone.
# One fetch and (for new IDs) one embedding call plus one upsert per chunk.
for document in final_doc:
    vector_id = document["id"]
    chunk_content = document["chunk"]

    # Skip chunks whose ID is already stored so a re-run does not re-embed
    # them; only the ID is checked, stored metadata is not compared.
    existing_vector = index.fetch(ids=[vector_id])
    if vector_id in existing_vector.get("vectors", {}):
        continue

    # Get embedding for the content
    embedding = ollama_emb.embed_query(chunk_content)  # Adjust based on your embedding model's API

    # Build a fresh metadata dict instead of writing into document["metadata"]:
    # the entries of final_doc stay exactly as loaded, so the cell is idempotent.
    # The chunk text is stored in metadata under the 'chunk' key.
    metadata = {**document.get("metadata", {}), "chunk": chunk_content}

    # Upsert the document and its embedding into Pinecone
    index.upsert([(vector_id, embedding, metadata)])

# ***Load FLAN-T5 Model***

In [29]:
# Load the FLAN-T5 model and tokenizer
# model_name is the identifier handed to both from_pretrained calls.
model_name = "google/flan-t5-large"
# NOTE(review): legacy=False presumably selects the tokenizer's newer (non-legacy) behavior — confirm against the transformers docs.
tokenizer = T5Tokenizer.from_pretrained(model_name, legacy=False)
model = T5ForConditionalGeneration.from_pretrained(model_name)

print("FLAN-T5 model and tokenizer loaded successfully!")

FLAN-T5 model and tokenizer loaded successfully!


# ***Retrieve relevant chunks of the query from pinecone database***

In [43]:
# Alias the Ollama embedding client for query encoding, so queries are embedded
# by the same client that embedded the stored chunks (nothing new is loaded here).
embedding_model = ollama_emb

# Function to retrieve top-k relevant chunks from Pinecone
def retrieve_chunks_from_pinecone(query, top_k=5):
    """
    Return the chunk texts most similar to `query`, best match first.

    The query is embedded with the module-level `embedding_model` and looked up
    in the module-level Pinecone `index`.

    Args:
        query: Text to search for.
        top_k: Number of matches requested from the index (default 5).

    Returns:
        list: The 'chunk' metadata value of every match, ordered by descending
        score; a match without that key contributes an empty string. Returns []
        (after printing a notice) when the index reports no matches.
    """
    # Turn the query text into a vector and ask the index for its nearest neighbours.
    query_vector = embedding_model.embed_query(query)
    response = index.query(
        vector=query_vector,
        top_k=top_k,
        include_metadata=True,  # the chunk text is read from each match's metadata
    )

    matches = response["matches"]
    if not matches:
        print("No matches found in the vector database.")
        return []

    # Pair each chunk text with its score (0 when the match carries none).
    scored = [
        (hit["metadata"].get("chunk", ""), hit.get("score", 0))
        for hit in matches
    ]

    # Stable descending sort: matches with equal scores keep the index's order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return [text for text, _score in scored]


# ***Generate answer for the query using FLAN-T5 Model***

In [59]:
def generate_answer(query, retrieved_chunks):
    """
    Generate an answer to `query` from the retrieved chunks using the FLAN-T5 model.

    Relies on the module-level `tokenizer` and `model`.

    Args:
        query: The user's question.
        retrieved_chunks: Chunk texts used as context; joined with single
            spaces in the order given.

    Returns:
        str: The decoded first sequence returned by `model.generate`, with
        special tokens removed.

    Notes:
        The prompt is truncated to 512 tokens, so context beyond that limit is
        dropped before generation.
    """
    # Combine the query and retrieved chunks into a single prompt
    context = " ".join(retrieved_chunks)
    input_text = f"question: {query} context: {context}"

    # Tokenize the prompt, truncating it to the 512-token input budget
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )

    # Generate the answer. The attention mask produced by the tokenizer is
    # passed explicitly so generation never has to infer which positions are
    # padding.
    output = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=150,  # Upper bound on the generated length, in tokens
        num_beams=5,  # Beam search over 5 candidate sequences
        early_stopping=True,  # Stop once enough finished beam candidates exist
        length_penalty=1.0,  # Length penalty applied to beam scores
    )

    # Decode the first returned sequence
    answer = tokenizer.decode(output[0], skip_special_tokens=True)
    return answer


In [61]:
# End-to-end example: retrieve the five closest chunks, then generate an answer from them.
query = "What are the key highlights of Budget 2023-2024?"
retrieved_chunks = retrieve_chunks_from_pinecone(query, top_k=5)
if retrieved_chunks:
    answer = generate_answer(query, retrieved_chunks)
    print("Answer:", answer)
else:
    # retrieve_chunks_from_pinecone returns [] when the index reports no matches.
    print("No relevant chunks found for the query.")


Answer: Capital investment outlay is being increased steeply for the third year in a row by 33 per cent to  10 lakh crore, which would be 3.3 per cent of GDP. This will be almost three times the outlay in 2019-20.
