In [23]:
# Import necessary libraries
import os
import time
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec

In [24]:
# Load environment variables
load_dotenv()

# Configure with explicit API key - don't rely on environment variables
PINECONE_API_KEY = "pcsk_RwHoN_ArFYtHrRoYrHgmHchWofZJShCjkigHbAQpicr5Xwd4GJdqW9PGyxbkVuUaVjZu4"
INDEX_NAME = "langchainvector"
MODEL_NAME = "all-MiniLM-L6-v2"

In [25]:
# Function to read PDF documents
def read_documents(directory):
    print(f"Loading documents from {directory}...")
    loader = PyPDFDirectoryLoader(directory)
    documents = loader.load()
    print(f"Loaded {len(documents)} documents")
    return documents

In [26]:
# Function to chunk documents
def chunk_documents(documents, chunk_size=800, chunk_overlap=50):
    print("Chunking documents...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Created {len(chunks)} chunks")
    return chunks

In [27]:
# Initialize Pinecone
def setup_pinecone():
    print(f"Initializing Pinecone with API key: {PINECONE_API_KEY[:5]}...")
    pc = Pinecone(api_key=PINECONE_API_KEY)
    
    # Check if index exists
    indexes = pc.list_indexes().names()
    
    if INDEX_NAME in indexes:
        print(f"Deleting existing index '{INDEX_NAME}'...")
        pc.delete_index(INDEX_NAME)
        time.sleep(10)  # Wait longer for deletion to complete
    
    print(f"Creating index '{INDEX_NAME}' with dimension 384...")
    pc.create_index(
        name=INDEX_NAME,
        dimension=384,  # Dimension for all-MiniLM-L6-v2
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
    
    # Wait for index to initialize
    print("Waiting for index to initialize...")
    time.sleep(10)
    
    return pc

In [29]:
# Upload documents to Pinecone using direct method
def upload_to_pinecone(chunks, embedding_model):
    print("Preparing to upload documents to Pinecone...")
    
    # Initialize Pinecone
    pc = setup_pinecone()
    index = pc.Index(INDEX_NAME)
    
    # Generate embeddings for chunks
    print("Generating embeddings for chunks...")
    vectors_to_upsert = []
    
    for i, chunk in enumerate(chunks):
        # Get the embedding for the chunk
        try:
            embedding = embedding_model.embed_documents([chunk.page_content])[0]
            
            # Create a vector with metadata
            vector = {
                "id": f"doc_{i}",
                "values": embedding,
                "metadata": {
                    "text": chunk.page_content,
                    "source": chunk.metadata.get("source", "unknown")
                }
            }
            
            vectors_to_upsert.append(vector)
            
            # Print progress
            if (i + 1) % 20 == 0 or i == len(chunks) - 1:
                print(f"Processed {i + 1}/{len(chunks)} chunks")
                
        except Exception as e:
            print(f"Error embedding chunk {i}: {e}")
         # Upload vectors in batches
    batch_size = 100
    for i in range(0, len(vectors_to_upsert), batch_size):
        batch = vectors_to_upsert[i:i + batch_size]
        try:
            index.upsert(vectors=batch)
            print(f"Uploaded batch {i//batch_size + 1}/{(len(vectors_to_upsert)-1)//batch_size + 1}")
        except Exception as e:
            print(f"Error uploading batch {i//batch_size + 1}: {e}")
    
    # Verify upload
    stats = index.describe_index_stats()
    print(f"Index stats after upload: {stats}")
    print(f"Total vectors in index: {stats.get('total_vector_count', 'unknown')}")
    
    return index

In [30]:
# Query function
def query_pinecone(index, embedding_model, query_text, top_k=3):
    print(f"Querying Pinecone with: '{query_text}'")
    
    # Generate query embedding
    query_embedding = embedding_model.embed_query(query_text)
    
    # Query Pinecone
    results = index.query(
        vector=query_embedding,
        top_k=top_k,
        include_metadata=True
    )
    
    return results

In [31]:
# Main execution
if __name__ == "__main__":
    # Step 1: Load documents
    documents = read_documents('documents/')
    
    # Step 2: Chunk documents
    chunks = chunk_documents(documents)
    
    # Step 3: Initialize embedding model
    print(f"Initializing embedding model: {MODEL_NAME}")
    embedding_model = HuggingFaceEmbeddings(model_name=MODEL_NAME)
    
    # Step 4: Upload to Pinecone
    index = upload_to_pinecone(chunks, embedding_model)
    
    # Step 5: Query
    query_text = "what budget is being talked about?"
    results = query_pinecone(index, embedding_model, query_text)
    
    # Display results
    print("\nQuery Results:")
    if results and results.get('matches'):
        for i, match in enumerate(results['matches']):
            print(f"\nResult {i+1} (Score: {match['score']:.4f}):")
            print(f"Content: {match['metadata']['text'][:150]}...")
            print(f"Source: {match['metadata'].get('source', 'Unknown')}")
    else:
        print("No results found for the query.")

Loading documents from documents/...
Loaded 58 documents
Chunking documents...
Created 140 chunks
Initializing embedding model: all-MiniLM-L6-v2
Preparing to upload documents to Pinecone...
Initializing Pinecone with API key: pcsk_...
Deleting existing index 'langchainvector'...
Creating index 'langchainvector' with dimension 384...
Waiting for index to initialize...
Generating embeddings for chunks...
Processed 20/140 chunks
Processed 40/140 chunks
Processed 60/140 chunks
Processed 80/140 chunks
Processed 100/140 chunks
Processed 120/140 chunks
Processed 140/140 chunks
Uploaded batch 1/2
Uploaded batch 2/2
Index stats after upload: {'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}
Total vectors in index: 0
Querying Pinecone with: 'what budget is being talked about?'

Query Results:
No results found for the query.


In [33]:
query_text = "Indirect taxes on?"
results = query_pinecone(index, embedding_model, query_text)

# Display results
print("\nQuery Results:")
if results and results.get('matches'):
    for i, match in enumerate(results['matches']):
        print(f"\nResult {i+1} (Score: {match['score']:.4f}):")
        print(f"Content: {match['metadata']['text'][:150]}...")
        print(f"Source: {match['metadata'].get('source', 'Unknown')}")
else:
    print("No results found for the query.")

Querying Pinecone with: 'Indirect taxes on?'

Query Results:

Result 1 (Score: 0.6875):
Content:  Rationalisation  
 Others  
 Personal Income Tax  
  
Annexures 35 
 Annexure to Part B of the Budget Speech 2023-24 
i. Amendments relating to D...
Source: documents/budget_speech.pdf

Result 2 (Score: 0.5614):
Content: 27 
 
 
 
PART B 
Indirect Taxes 
118. My indirect tax proposals aim to promote exports, boost domestic 
manufacturing, enhance domestic value additio...
Source: documents/budget_speech.pdf

Result 3 (Score: 0.5461):
Content: 35 
 
 
 
Annexure to Part B of the Budget Speech 2023-24 
Amendments relating to Direct Taxes 
A. PROVIDING TAX RELIEF UNDER NEW PERSONAL TAX REGIME ...
Source: documents/budget_speech.pdf
