In [3]:
# -------------------------
# 1. Import Libraries
# -------------------------
import os
import langchain
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone as LangChainPinecone
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from pinecone import Pinecone, ServerlessSpec


In [4]:

# -------------------------
# 2. Load Environment Keys
# -------------------------
from dotenv import load_dotenv

# SECURITY: credentials must never be written into the notebook. Any key that
# was ever hardcoded in this cell should be treated as leaked and rotated.
# load_dotenv() copies entries from a local .env file (if present) into
# os.environ; the keys are then read from the environment only.
load_dotenv()

# Fail early with a clear message instead of a late authentication error.
# GOOGLE_API_KEY is not used in this cell; it is checked here because the
# Gemini clients created later are not given a key explicitly and presumably
# read it from the environment.
_missing_keys = [
    key_name
    for key_name in ("GOOGLE_API_KEY", "PINECONE_API_KEY")
    if not os.environ.get(key_name)
]
if _missing_keys:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_keys)
        + ". Define them in a .env file or export them before starting the kernel."
    )

# Pinecone client used by the index-setup cell.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

In [5]:
import os

In [6]:

# -------------------------
# 3. Read PDF Documents
# -------------------------
def read_doc(directory):
    """Load the PDF files found under ``directory``.

    Args:
        directory: Folder path handed to ``PyPDFDirectoryLoader``.

    Returns:
        Whatever ``PyPDFDirectoryLoader(directory).load()`` produces
        (presumably a list of LangChain ``Document`` objects, one per PDF
        page -- TODO confirm).
    """
    return PyPDFDirectoryLoader(directory).load()

In [7]:
# Load every PDF under documents/ (relative to the notebook's working directory).
doc = read_doc("documents/")
# Bare last expression so the cell displays how many items were loaded.
len(doc)

58

In [8]:

# -------------------------
# 4. Chunk Documents
# -------------------------
def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    """Split documents into smaller chunks.

    Args:
        docs: Documents to split; passed unchanged to ``split_documents``.
        chunk_size: ``chunk_size`` setting of the splitter (default 800).
        chunk_overlap: ``chunk_overlap`` setting of the splitter (default 50).

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents(docs)``.
    """
    splitter_settings = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    splitter = RecursiveCharacterTextSplitter(**splitter_settings)
    chunks = splitter.split_documents(docs)
    return chunks

In [9]:
# Split the loaded documents using chunk_data's default chunk_size/chunk_overlap.
documents = chunk_data(doc)
# Bare last expression so the cell displays the number of chunks produced.
len(documents)

140

In [10]:
# -------------------------
# 5. Gemini Embeddings
# -------------------------
# Embedding client. No API key is passed here, so it presumably picks up
# GOOGLE_API_KEY from the environment -- TODO confirm.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
# Embed one sample query to check the vector length; the index-setup cell
# hardcodes the same dimension (768).
vectors = embeddings.embed_query("How are you?")
print("Embedding length:", len(vectors))  # should be 768

Embedding length: 768


In [11]:
# -------------------------
# 6. Pinecone Setup (v3 style)
# -------------------------
# Name shared by the index creation below and the vector store built later.
index_name = "langchainvector"

# Create the index only when it is not already listed, so re-running this
# cell does not attempt to create it a second time.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=768,   # Gemini embeddings size
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1") # pick valid region
    )

# Handle to the index obtained through the Pinecone client.
index = pc.Index(index_name)

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# -------------------------
# 7. Build Vector Store
# -------------------------
import hashlib

from langchain_pinecone import PineconeVectorStore


def _chunk_id(chunk):
    """Return a stable id for ``chunk`` derived from its source, page and text.

    The same chunk always maps to the same id, so storing it again overwrites
    the existing vector instead of adding a duplicate.
    """
    fingerprint = "|".join(
        (
            str(chunk.metadata.get("source", "")),
            str(chunk.metadata.get("page", "")),
            chunk.page_content,
        )
    )
    return hashlib.sha256(fingerprint.encode("utf-8")).hexdigest()


# Explicit, content-derived ids make this cell idempotent. Without them each
# run stores every chunk again under newly generated ids, and the duplicates
# then crowd the top-k results of similarity_search.
vectorstore = PineconeVectorStore.from_documents(
    documents=documents,
    embedding=embeddings,
    index_name=index_name,
    ids=[_chunk_id(chunk) for chunk in documents],
)



In [35]:
# -------------------------
# 8. Retrieve Similar Docs
# -------------------------
def retrieve_query(query, k=2):
    """Look up the stored chunks most similar to ``query``.

    Args:
        query: Text forwarded to ``vectorstore.similarity_search``.
        k: Number of results requested from the vector store (default 2).

    Returns:
        The result of ``vectorstore.similarity_search(query, k=k)``.
    """
    matches = vectorstore.similarity_search(query, k=k)
    return matches




In [36]:
# -------------------------
# 9. QA Chain with Gemini
# -------------------------
from langchain.chains.question_answering import load_qa_chain

# Gemini chat model that writes the answers.
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0.5)
# "stuff" chain type: presumably places all supplied documents into a single
# prompt -- TODO confirm.
# NOTE(review): load_qa_chain is reported as deprecated in recent LangChain
# releases -- confirm against the installed version.
chain = load_qa_chain(llm, chain_type="stuff")

In [40]:
# -------------------------
# 10. Ask Question
# -------------------------
def retrieve_answers(query):
    """Answer ``query`` from the chunks retrieved for it.

    Args:
        query: Question text; used for retrieval and as the chain's question.

    Returns:
        The chain's ``output_text`` value for the retrieved documents.
    """
    doc_search = retrieve_query(query)
    print("Matched docs:", doc_search[:1])  # preview first doc
    # Chain.run() is deprecated; invoke() takes the same inputs as a dict and
    # returns a dict whose "output_text" entry is the string run() returned.
    result = chain.invoke({"input_documents": doc_search, "question": query})
    return result["output_text"]

In [43]:

# Sample question sent through retrieval and the QA chain.
our_query = "What is Green growth?"
answer = retrieve_answers(our_query)
print("\nGemini Answer:", answer)


Matched docs: [Document(id='1f4dd2b7-816b-4ee1-9ac7-641d677f0172', metadata={'creationdate': '2023-02-01T05:28:04+05:30', 'creator': 'Adobe Acrobat Pro 10.1.16', 'moddate': '2023-02-01T08:28:21+05:30', 'page': 31.0, 'page_label': '32', 'producer': 'Adobe Acrobat Pro 10.1.16', 'source': 'documents\\budget_speech (1).pdf', 'title': '', 'total_pages': 58.0}, page_content='contributing about three-fourths of the global turnover by value. With the \ndepletion in deposits of natural diamonds, the industry is moving towards \nLab Grown Diamonds (LGDs) and it holds huge promise. To seize this')]

Gemini Answer: This document does not contain any information about "Green growth".
