In [None]:
from dotenv import load_dotenv # For loading environment variables
import os # For accessing environment variables
import openai # For interacting with OpenAI's API
import langchain # LangChain for chaining LLMs with vector stores and other integrations
import tiktoken # For text tokenization and counting, especially with OpenAI embeddings
# For PDF parsing (PyMuPDF)
import fitz  # PyMuPDF is imported as fitz
from pinecone import Pinecone, ServerlessSpec # For Pinecone vector database (newer Pinecone client usage)
## If building a web server or API (optional)
import requests
# import fastapi
# import uvicorn

  from tqdm.autonotebook import tqdm


In [4]:
load_dotenv(dotenv_path='./.env')

True

In [None]:
#checking env 1
load_dotenv(override=True) # Overwrite existing environment variables

True

In [None]:
#checking env 2                                                   
load_dotenv()  # Ensure .env is in this directory
print(os.getcwd())  # Check current directory
print("OPENAI_API_KEY:", os.getenv("OPENAI_API_KEY"))
print("PINECONE_API_KEY:", os.getenv("PINECONE_API_KEY"))
print("PINECONE_ENV:", os.getenv("PINECONE_ENV"))
print("PINECONE_INDEX_NAME:", os.getenv("PINECONE_INDEX_NAME"))
print("PINECONE_PROJECT_NAME:", os.getenv("PINECONE_PROJECT_NAME"))


c:\Users\Pritam\Desktop\ML projects\LLM Q
OPENAI_API_KEY: sk-proj-**************************************
PINECONE_API_KEY: pcsk_2******************************
PINECONE_ENV: us-east-1-aws
PINECONE_INDEX_NAME: my-index
PINECONE_PROJECT_NAME: insightxpc


In [None]:
from pinecone import Pinecone, ServerlessSpec

load_dotenv()

pc = Pinecone(
    api_key=os.getenv("PINECONE_API_KEY"),
    project_name=os.getenv("PINECONE_PROJECT_NAME")  # Add if needed
)

indexes = pc.list_indexes().indexes
existing_index_names = [idx.name for idx in indexes]

if 'my-index' not in existing_index_names:
    pc.create_index(
        name='my-index',
        dimension=1536,
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

In [None]:
#Simple_chunking
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """Extracts text from a PDF file."""
    text = ""
    with fitz.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf):
            text += page.get_text()
    return text

# Extract text 
pdf_path = "budget_speech.pdf" 
pdf_text = extract_text_from_pdf(pdf_path)

print(f"Extracted {len(pdf_text)} characters from the PDF.")


Extracted 95832 characters from the PDF.


In [None]:
#Extracting Text and Metadata
import fitz  # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter

def extract_text_with_metadata(pdf_path):
    """Extract text from a PDF along with page-level metadata."""
    chunks = []
    metadata_list = []

    with fitz.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf, start=1):
            page_text = page.get_text()

            # Chunk the page text
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000,  # Chunk size in characters
                chunk_overlap=200,  # Overlap for context preservation
                separators=["\n\n", "\n", " "]  # Split at logical boundaries
            )
            page_chunks = text_splitter.split_text(page_text)

            # Adding chunk along with metadata
            for i, chunk in enumerate(page_chunks):
                chunks.append(chunk)
                metadata_list.append({
                    "chunk": i,
                    "source": "budget_speech.pdf",
                    "page": page_num
                })

    return chunks, metadata_list

# Path to PDF
pdf_path = "budget_speech.pdf"

# Extract text chunks and metadata from the target PDF
chunks, metadata_list = extract_text_with_metadata(pdf_path)

print(f"Number of chunks created: {len(chunks)}")
print("Sample metadata for the first chunk:", metadata_list[5])


Number of chunks created: 138
Sample metadata for the first chunk: {'chunk': 1, 'source': 'budget_speech.pdf', 'page': 6}


In [None]:
# checking 4 indexes in pinecone 
print(pc.list_indexes().indexes)

[{
    "name": "my-index",
    "dimension": 1536,
    "metric": "cosine",
    "host": "my-index-dczve5j.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "deletion_protection": "disabled"
}, {
    "name": "quickstart",
    "dimension": 1536,
    "metric": "cosine",
    "host": "quickstart-dczve5j.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "deletion_protection": "disabled"
}]


In [None]:
from langchain.embeddings import OpenAIEmbeddings # For OpenAI embedding
embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))
chunk_embeddings = embeddings.embed_documents(chunks) # Embedding the chunks

  embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))


In [12]:
import os
from pinecone import Pinecone, ServerlessSpec

# Load environment variables
from dotenv import load_dotenv
load_dotenv()

# Retrieve environment variables
api_key = os.getenv("PINECONE_API_KEY")
environment = os.getenv("PINECONE_ENV") or "us-east-1-aws"
index_name = os.getenv("PINECONE_INDEX_NAME") or "my_index"

# Initialize Pinecone instance
pc = Pinecone(
    api_key=api_key
)


In [13]:
# List existing indexes
existing_indexes = pc.list_indexes().indexes
existing_index_names = [idx.name for idx in existing_indexes]

In [None]:
import os
from dotenv import load_dotenv
from pinecone import Pinecone

load_dotenv()

# Print out environment variables to verify
print("PINECONE_API_KEY:", os.getenv("PINECONE_API_KEY"))
print("PINECONE_PROJECT_NAME:", os.getenv("PINECONE_PROJECT_NAME"))

# Updated Pinecone initialization
try:
    pc = Pinecone(
        api_key=os.getenv("PINECONE_API_KEY")
        # Remove project_name if not explicitly required
    )
    
    # Verify index creation or connection
    indexes = pc.list_indexes()
    print("Existing indexes:", indexes)
except Exception as e:
    print(f"Error initializing Pinecone: {e}")

  from tqdm.autonotebook import tqdm


PINECONE_API_KEY: pcsk_2******************************
PINECONE_PROJECT_NAME: insightxpc
Error initializing Pinecone: (401)
Reason: Unauthorized
HTTP response headers: HTTPHeaderDict({'x-pinecone-api-version': '2024-07', 'X-Cloud-Trace-Context': '5c9e28844c8af896b347e1e2963880cd', 'Date': 'Sat, 07 Dec 2024 12:08:56 GMT', 'Content-Type': 'text/html', 'Server': 'Google Frontend', 'Content-Length': '15', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: Invalid API Key



In [None]:
PINECONE_API_KEY= "*********************"

In [None]:
from pinecone import Pinecone, ServerlessSpec
import os

pc = Pinecone(
    api_key=os.getenv("PINECONE_API_KEY"),
    project_name=os.getenv("PINECONE_PROJECT_NAME")
)

# List all indexes to check if "quickstart" exists
print(pc.list_indexes().names())  # If "quickstart" isn't listed, create it.

# Create the index if it doesn't exist
if "quickstart" not in pc.list_indexes().names():
    pc.create_index(
        name="quickstart",
        dimension=1536,  # the dimension of embeddings 
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

# accessing the index
index = pc.Index("quickstart")
print("Index 'quickstart' accessed successfully!")


['my-index', 'quickstart']
Index 'quickstart' accessed successfully!


In [None]:
# Upsert chunks to Pinecone index
for i, (chunk, embedding, metadata) in enumerate(zip(chunks, chunk_embeddings, metadata_list)):
    unique_id = f"chunk_{i}"
    try:
        index.upsert(
            vectors=[(unique_id, embedding, {"text": chunk, **metadata})]
        )
    except Exception as e:
        print(f"Error upserting chunk {i}: {e}")

print(f"Upserted {len(chunks)} chunks to Pinecone index")

Upserted 138 chunks to Pinecone index


In [28]:
from langchain.embeddings import OpenAIEmbeddings
import openai

# Initialize embeddings
embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))

def retrieve_relevant_chunks(query, top_k=3):
    # Generate query embedding
    query_embedding = embeddings.embed_query(query)
    
    # Search Pinecone index
    results = index.query(
        vector=query_embedding, 
        top_k=top_k, 
        include_metadata=True
    )
    
    return results['matches']

def generate_answer(query, relevant_chunks):
    # Combine retrieved chunks into context
    context = "\n\n".join([
        chunk['metadata'].get('text', '') for chunk in relevant_chunks
    ])
    
    # Prepare prompt
    prompt = f"""
    Context: {context}
    
    Question: {query}
    
    Based on the provided context from the budget speech, provide a comprehensive and precise answer.
    If the information is not in the context, state that you cannot find relevant information.
    """
    
    # Generate response using OpenAI
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that answers questions based on given context."},
            {"role": "user", "content": prompt}
        ]
    )
    
    return response.choices[0].message.content

def answer_question(query):
    # Retrieve relevant chunks
    relevant_chunks = retrieve_relevant_chunks(query)
    
    # Generate answer
    answer = generate_answer(query, relevant_chunks)
    
    return answer

In [None]:
def main():
    print("Budget Speech Q&A Assistant")
    print("Ask questions about the budget speech. Type 'exit' to quit.")
    
    while True:
        # Get user input
        query = input("\nYour question: ")
        
        # Check if user wants to exit
        if query.lower() == 'exit':
            print("Thank you for using the Budget Speech Assistant!")
            break
        
        try:
            # Get and print the answer
            response = answer_question(query)
            print("\nAnswer:", response)
        
        except Exception as e:
            print(f"Error processing your query: {e}")

# Run the main interaction
if __name__ == "__main__":
    main()

Budget Speech Q&A Assistant
Ask questions about the budget speech. Type 'exit' to quit.
Error processing your query: name 'answer_question' is not defined
