In [19]:
import os
from dotenv import load_dotenv
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer, util
from pinecone import Pinecone, ServerlessSpec

load_dotenv()

# Vector size of the index. The sentence-transformers model used in this
# notebook ("all-MiniLM-L6-v2") produces 384-dimensional embeddings, and the
# index dimension has to match the vectors that are upserted into it.
EMBEDDING_DIMENSION = 384

# Load environment variables (populated from the .env file by load_dotenv()).
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")

# Fail fast with a readable message instead of passing None into the client
# and index calls below.
_missing_settings = [
    setting_name
    for setting_name, setting_value in (
        ("PINECONE_API_KEY", PINECONE_API_KEY),
        ("PINECONE_INDEX_NAME", PINECONE_INDEX_NAME),
    )
    if not setting_value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )

# Initialize Pinecone client
pc = Pinecone(api_key=PINECONE_API_KEY)
env_spec = ServerlessSpec(cloud="aws", region="us-east-1")

# Create the index only if it does not exist yet, so re-running this cell is safe.
if not pc.has_index(PINECONE_INDEX_NAME):
    pc.create_index(
        name=PINECONE_INDEX_NAME,
        dimension=EMBEDDING_DIMENSION,
        metric="cosine",
        spec=env_spec,
    )
index = pc.Index(PINECONE_INDEX_NAME)
print(index)


<pinecone.db_data.index.Index object at 0x00000150886F0850>


In [None]:
# Main workflow: extract text from the PDF, split it into semantic chunks,
# and embed each chunk.
import os
from langchain_experimental.text_splitter import SemanticChunker
from langchain_google_genai import GoogleGenerativeAIEmbeddings


# Raises KeyError right away if the key is not configured.
GOOGLE_API_KEY = os.environ["GOOGLE_API_KEY"]

# Embedding client used only to find semantic breakpoints while chunking.
# It is bound to its own name because the name `embeddings` is rebound to the
# chunk vectors later in this cell; reading the global `embeddings` from inside
# semantic_chunking() would then hand SemanticChunker a numpy array.
chunking_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
embeddings = chunking_embeddings  # backward-compatible alias

def semantic_chunking(text):
    """Split ``text`` into semantically coherent chunks.

    Args:
        text: The full document text to split.

    Returns:
        The list of Document objects produced by
        ``SemanticChunker.create_documents``; each chunk's text is available
        as ``.page_content``.
    """
    text_splitter = SemanticChunker(
        chunking_embeddings, breakpoint_threshold_type="percentile"
    )

    chunks = text_splitter.create_documents([text])

    print("The chunks are: ",chunks)
    return chunks

def extract_text_from_pdf(pdf_path):
    """Return the text of a PDF as one whitespace-normalized string.

    The text of every page is joined with a space; every run of whitespace
    (including newlines) is then collapsed to a single space and leading and
    trailing whitespace is removed.

    Args:
        pdf_path: Path of the PDF file, passed to ``PdfReader``.

    Returns:
        The cleaned text; an empty string when no page yields any text.
    """
    reader = PdfReader(pdf_path)
    # extract_text() may yield nothing for a page; treat that as empty text.
    page_texts = [page.extract_text() or "" for page in reader.pages]
    # split() with no arguments splits on any whitespace run and drops
    # leading/trailing whitespace, so a single split/join does the whole
    # normalization without a regex.
    return " ".join(" ".join(page_texts).split())

# Document to ingest.
pdf_path = "documents/Story1.pdf"  # Replace with your PDF path
text = extract_text_from_pdf(pdf_path)

# Sentence-transformer model that produces the vectors stored in Pinecone.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Split the text into Document chunks, then pull the raw string out of each
# one, since model.encode() is given plain text rather than Document objects.
chunks = semantic_chunking(text)
chunk_texts = [document.page_content for document in chunks]

# One embedding per chunk, in chunk order. Note that this rebinds the
# module-level name `embeddings` to the array of chunk vectors.
embeddings = model.encode(
    chunk_texts,
    convert_to_numpy=True,
)

The chunks are:  [Document(metadata={}, page_content='In a quaint, cobblestone village nestled beside the Whispering Woods, lived a young cartographer named Elara. Her days were spent meticulously illustrating maps, each line and curve a testament to her dedication. But Elara\'s maps weren\'t merely geographical representations; they were imbued with a subtle magic, a faint shimmer that hinted at the hidden stories within the land. One blustery autumn evening, a lone traveler, cloaked and weary, stumbled into Elara’s shop. He introduced himself as Kael, a historian on a lifelong quest for the lost "Chronicle of Aethel." This ancient tome, rumored to hold the forgotten history of the world, was believed to be hidden within the treacherous peaks of the Dragon\'s Tooth mountains. Kael, having exhausted all conventional methods, had heard whispers of Elara\'s unique abilities and sought her help. Intrigued by the legend and the adventurer\'s earnest plea, Elara agreed. She began poring ove

In [21]:
# Build the upsert payload for Pinecone: one dict per chunk holding an id,
# the vector values, and metadata carrying the chunk text.

# zip() would silently drop trailing items if the two sequences ever got out
# of step (e.g. after re-running only one of the producing cells).
if len(embeddings) != len(chunks):
    raise ValueError(
        f"Got {len(embeddings)} embeddings for {len(chunks)} chunks; "
        "re-run the chunking/embedding cell."
    )

# Take the source label from the file that was actually processed rather than
# a hard-coded name, so it stays correct when pdf_path changes.
source_name = os.path.basename(pdf_path)

pinecone_vectors = [
    {
        "id": f"chunk_{i}",
        "values": embedding.tolist(),  # Convert numpy array to list
        "metadata": {
            "chunk_text": chunk.page_content,  # Document text of this chunk
            "chunk_id": i,
            "source": source_name,
        },
    }
    for i, (embedding, chunk) in enumerate(zip(embeddings, chunks))
]

In [22]:
# Upsert to Pinecone in batches so that a large document is not sent as one
# oversized request.
UPSERT_BATCH_SIZE = 100

if pinecone_vectors:
    index.upsert(vectors=pinecone_vectors, batch_size=UPSERT_BATCH_SIZE)
    print(f"Upserted {len(pinecone_vectors)} chunks to Pinecone index '{PINECONE_INDEX_NAME}'")
else:
    # Nothing was produced by the chunking step; skip the API call entirely.
    print(f"No chunks to upsert to Pinecone index '{PINECONE_INDEX_NAME}'")

Upserted 3 chunks to Pinecone index 'pinecone-demo'


In [None]:
#"What is the story about Elara and the maps?"
query_text = input()

# Embed the question with the same model that embedded the chunks, so the
# query vector and the stored vectors are comparable.
query_embedding = model.encode(query_text, convert_to_numpy=True)

# Minimum similarity score a match needs in order to be used as context.
# The filter is applied here, client-side: `score_threshold` is not a
# documented argument of Index.query, so passing it there does not filter.
# NOTE(review): 0.7 is the value from the original call — tune for this model.
SCORE_THRESHOLD = 0.7

results = index.query(
    vector=query_embedding.tolist(),
    top_k=20,
    include_metadata=True,
)

relevant_matches = [
    match for match in results["matches"] if match["score"] >= SCORE_THRESHOLD
]

# Give the LLM only the chunk texts, not the whole response object
# (ids, scores, and other fields are noise in the prompt).
context = "\n\n".join(
    match["metadata"]["chunk_text"] for match in relevant_matches
)

from langchain.schema import HumanMessage, SystemMessage
from langchain_google_genai import ChatGoogleGenerativeAI

system_prompt = f"""You are a helpful assistant. Use the following context to answer the user's question.
    If the context doesn't contain enough information to answer the question, say so.
    
    Context:
    {context}
"""

messages = [
    SystemMessage(content=system_prompt),
    HumanMessage(content=query_text),
]

llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0.7,
    google_api_key=GOOGLE_API_KEY,  # Read from the environment earlier in the notebook
)


response = llm.invoke(messages)

print("question: ", query_text)
print("**"*20)
# Guard against an empty result set instead of indexing matches[0] blindly.
if relevant_matches:
    print("context: ", context)
else:
    print("context: ", f"<no matches with score >= {SCORE_THRESHOLD}>")
print("**"*20)
print("response: ", response.content)





question:  What clue did the first riddle provide, and how did Elara interpret it?
****************************************
context:  In a quaint, cobblestone village nestled beside the Whispering Woods, lived a young cartographer named Elara. Her days were spent meticulously illustrating maps, each line and curve a testament to her dedication. But Elara's maps weren't merely geographical representations; they were imbued with a subtle magic, a faint shimmer that hinted at the hidden stories within the land. One blustery autumn evening, a lone traveler, cloaked and weary, stumbled into Elara’s shop. He introduced himself as Kael, a historian on a lifelong quest for the lost "Chronicle of Aethel." This ancient tome, rumored to hold the forgotten history of the world, was believed to be hidden within the treacherous peaks of the Dragon's Tooth mountains. Kael, having exhausted all conventional methods, had heard whispers of Elara's unique abilities and sought her help. Intrigued by the l