In [None]:
import os, time, hashlib
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone
from dotenv import load_dotenv

# Pull settings from a local .env file into the process environment before
# anything reads it (PINECONE_API_KEY is looked up via os.environ further down).
load_dotenv()

# Per-PDF metadata, keyed by the exact filename found in the documents folder.
# load_all_pdfs() looks a file up here by name and merges the matching entry
# into every document it loads; a file without an entry gets no extra tags.
#
# NOTE(review): the recorded run output lists "onestopcenter.pdf", which has no
# key below, while "tamilnadu_onestopcenter1.pdf" does - confirm the filenames.

# Location fields shared by every entry.
_CHENNAI_LOCATION = {
    "country": "India",
    "state": "Tamil Nadu",
    "district": "Chennai",
}

PDF_METADATA = {
    "tamilnadu_onestopcenter1.pdf": dict(
        _CHENNAI_LOCATION,
        region="Guindy",
        service_category="OneStopCentre",
        service_name="OSC_Guindy_1",
    ),
    "Data_merged.pdf": dict(
        _CHENNAI_LOCATION,
        region="Teynampet",
        service_category="Merged Data",
        service_name="MergeData",
    ),
    "TamilnaduMergedData.pdf": dict(
        _CHENNAI_LOCATION,
        region="HellowWorld",
        service_category="Tamil",
        service_name="Tamil",
    ),
}

#pdf load 
def load_all_pdfs(directory):
    """Load every PDF in ``directory`` and tag the resulting documents.

    Entries are matched case-insensitively on the ``.pdf`` extension and
    processed in sorted filename order, because ``os.listdir`` gives no
    ordering guarantee and downstream chunk numbering should be reproducible.
    Sub-directories are not descended into, and anything that is not a regular
    file is skipped.

    Every document produced for a file receives the matching ``PDF_METADATA``
    entry (looked up by exact filename) merged into its metadata, plus a
    ``source_file`` key holding the bare filename. A file with no entry is
    still loaded, but a warning is printed so the gap is visible.

    Args:
        directory: Path of the folder to scan.

    Returns:
        list: All loaded documents, grouped by file in sorted filename order.
    """
    all_pages = []
    for filename in sorted(os.listdir(directory)):
        if not filename.lower().endswith(".pdf"):
            continue

        file_path = os.path.join(directory, filename)
        if not os.path.isfile(file_path):
            # e.g. a folder that happens to be named "something.pdf"
            continue

        print(f"Loading: {filename}")

        pdf_meta = PDF_METADATA.get(filename)
        if pdf_meta is None:
            # Previously this was silent, so a renamed PDF lost all its tags
            # (and could never match a metadata filter) without any hint.
            print(f"  Warning: no PDF_METADATA entry for {filename}; "
                  "only source_file will be attached")
            pdf_meta = {}

        # NOTE(review): load_and_split() runs LangChain's default splitter, so
        # the items may not map 1:1 to PDF pages - confirm whether load() is
        # what is wanted here, given the notebook re-splits them afterwards.
        pages = PyPDFLoader(file_path).load_and_split()

        for p in pages:
            p.metadata.update(pdf_meta)
            p.metadata["source_file"] = filename

        all_pages.extend(pages)
    return all_pages


# Ingest pipeline: load PDFs -> split into chunks -> tag chunks -> upsert.

pages = load_all_pdfs("documents/")
print("Total pages loaded:", len(pages))


#chunk pages
splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=50
)

chunks = splitter.split_documents(pages)
print("Chunks created:", len(chunks))


#chunk metadata
ingest_date = time.strftime("%Y-%m-%d")

# Number chunks per source file rather than across the whole corpus, so adding
# or reordering PDFs does not shift the chunk_id of every other file's chunks.
chunks_seen_per_file = {}
for c in chunks:
    source_file = c.metadata["source_file"]
    i = chunks_seen_per_file.get(source_file, 0)
    chunks_seen_per_file[source_file] = i + 1

    # Short fingerprint of the chunk's opening text, kept for readability.
    h = hashlib.sha1(c.page_content[:120].encode()).hexdigest()[:12]
    c.metadata["chunk_id"] = f"{source_file}_chunk_{i}_{h}"
    c.metadata["ingest_date"] = ingest_date

# Deterministic vector ids derived from chunk_id. Without explicit ids each run
# stores the chunks under fresh random ids, so re-running this cell duplicates
# every vector in the index. Hashing keeps the id plain hex whatever characters
# the filename contains.
vector_ids = [
    hashlib.sha1(c.metadata["chunk_id"].encode()).hexdigest() for c in chunks
]


#upserting
# os.environ[...] raises KeyError right here if the key is missing.
# NOTE(review): `pc` is not used again in this cell; the vector store presumably
# picks the key up from the environment itself - confirm before removing.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

vectorstore = PineconeVectorStore.from_documents(
    chunks,
    embedding=embeddings,
    index_name="pinecone-chatbot",
    ids=vector_ids,
)

# Show only a small sample: printing every chunk floods the notebook output
# and bloats the saved file.
PREVIEW_CHUNKS = 3
for i, chunk in enumerate(chunks[:PREVIEW_CHUNKS]):
    print("=" * 80)
    print(f"CHUNK {i}")
    print("- Text:")
    print(chunk.page_content)
    print("- Metadata:")
    print(chunk.metadata)
    print("=" * 80)

if len(chunks) > PREVIEW_CHUNKS:
    print(f"... {len(chunks) - PREVIEW_CHUNKS} more chunks not shown")


print("All PDFs ingested into Pinecone successfully!")


  from .autonotebook import tqdm as notebook_tqdm


Loading: Data_merged.pdf
Loading: onestopcenter.pdf
Loading: TamilnaduMergedData.pdf
Total pages loaded: 451
Chunks created: 1294


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


CHUNK 0
- Text:
SEXUAL ABUSE 
 
What is Sexual Coercion? 
Sexual coercion is when someone is pressured, manipulated, or threatened into saying yes to 
sex. It’s not always violent, but it still counts as abuse. It can happen to anyone—men or 
women, in any kind of relationship, even in marriage. The person may agree out of pressure, 
but often feels uncomfortable or violated afterward. 
 
Examples of Sexual Coercion 
 
 
● When the other partner is constantly asking for sex  
● Saying things like, “I thought you love me…won’t you help me finish” 
● When a partner tells you that it ’s your responsibility to have sex with them 
● If they tell you that they will leave or break up with you if you don ’t have sex with 
them 
● Saying that they will go looking for sex elsewhere if you don ’t consent
- Metadata:
{'producer': 'iLovePDF', 'creator': 'PyPDF', 'creationdate': '', 'moddate': '2025-07-01T04:38:14+00:00', 'source': 'documents/Data_merged.pdf', 'total_pages': 396, 'page': 0, 'page_la

In [3]:
# Unfiltered semantic search: fetch the five closest chunks for the question
# and print a short preview of each hit together with its metadata.
query = "What services are offered in One Stop Centres in Tamil Nadu?"

results = vectorstore.similarity_search(query, k=5)

separator = "=" * 80
for hit in results:
    print(separator)
    print("TEXT:", hit.page_content[:300])
    print("METADATA:", hit.metadata)


TEXT: One  Stop  Centres(OSC)  Contact  Details    1  Bangalore  Urban   One  Stop  Center,  BMTC  building,  2nd  floor,  Majestic,  Bengaluru  urban-560024   Amitha  Athresh  HR  9164069641  Harshitha  BR  9449403616   080-26538977   oscbengaluruurban@karnataka.gov.in  2  Bangalore  Rura l  One  Stop  C
METADATA: {'chunk_id': 'Data_merged.pdf_chunk_77_822a72fc2578', 'country': 'India', 'creationdate': '', 'creator': 'PyPDF', 'district': 'Chennai', 'ingest_date': '2025-12-09', 'moddate': '2025-07-01T04:38:14+00:00', 'page': 33.0, 'page_label': '34', 'producer': 'iLovePDF', 'region': 'Teynampet', 'service_category': 'Merged Data', 'service_name': 'MergeData', 'source': 'documents/Data_merged.pdf', 'source_file': 'Data_merged.pdf', 'state': 'Tamil Nadu', 'total_pages': 396.0}
TEXT: Centre for Action and Rural Education (CARE), 
 
No:6, Kambar Street, Teacher’s Colony, 
 
Erode - 638 011. 
 
7 
 
Kancheepuram 
 
People Awareness Social Welfare Trust, 
 
No:Bharathiar Street, Extension, 


In [5]:
# Same question as the previous search cell (reuses `query`), restricted to
# chunks whose "region" metadata equals "Guindy".
region_filter = {"region": "Guindy"}
results = vectorstore.similarity_search(query, k=5, filter=region_filter)
print(results)

[Document(id='5c9fc886-e028-442b-8222-a8fee2be867f', metadata={'chunk_id': 'myfile_chunk_0_c056e697e332', 'country': 'India', 'creationdate': '', 'creator': 'PyPDF', 'district': 'Chennai', 'ingest_date': '2025-12-09', 'page': 0.0, 'page_label': '1', 'producer': 'Skia/PDF m144 Google Docs Renderer', 'region': 'Guindy', 'service_category': 'OneStopCentre', 'service_name': 'OneStopCentreNo1', 'source': 'documents/onestopcenter.pdf', 'source_file': 'myfile.pdf', 'state': 'Tamil Nadu', 'title': 'ONE STOP CENTRE', 'total_pages': 3.0}, page_content='What  is  a  One  Stop  Centre?  One  Stop  Centres  (OSCs)  are  intended  to  support  women  affected  by  violence,  in  private  \nand\n \npublic\n \nspaces,\n \nwithin\n \nthe\n \nfamily,\n \ncommunity\n \nand\n \nat\n \nthe\n \nworkplace.\n \nWomen\n \nfacing\n \nphysical,\n \nsexual,\n \nemotional,\n \npsychological\n \nand\n \neconomic\n \nabuse,\n \nirrespective\n \nof\n \nage,\n \nclass,\n \ncaste,\n \neducation\n \nstatus,\n \nmarital\