In [6]:
import os
from pinecone import Pinecone

custom_metadata = {
    "country": "India",
    "state": "Tamil Nadu",
    "district": "Chennai",
    "region": "Guindy",
    "service_category": "OneStopCentre",
    "service_name": "OneStopCentreNo1",
    "source_file": "myfile.pdf"
}

from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader("documents/onestopcenter.pdf")
pages = loader.load_and_split()

#attach metadata to each page
for p in pages:
    p.metadata.update(custom_metadata)

#Chunk
from langchain_text_splitters import RecursiveCharacterTextSplitter
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=50)
chunks = splitter.split_documents(pages)








#Adding chunk level metadata
import time, hashlib

ingest_date = time.strftime("%Y-%m-%d")
for i, c in enumerate(chunks):
    h = hashlib.sha1(c.page_content[:100].encode()).hexdigest()[:12]
    c.metadata["chunk_id"] = f"myfile_chunk_{i}_{h}"
    c.metadata["ingest_date"] = ingest_date

#Upsert to Pinecone
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
import pinecone

pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

vectorstore = PineconeVectorStore.from_documents(
    chunks,
    embedding=embeddings,
    index_name="pinecone-chatbot"
)


for i, chunk in enumerate(chunks):
    print("=" * 80)
    print(f"CHUNK {i}")
    print("- Text:")
    print(chunk.page_content)
    print("- Metadata:")
    print(chunk.metadata)
    print("=" * 80)


print("Done!")


CHUNK 0
- Text:
What  is  a  One  Stop  Centre?  One  Stop  Centres  (OSCs)  are  intended  to  support  women  affected  by  violence,  in  private  
and
 
public
 
spaces,
 
within
 
the
 
family,
 
community
 
and
 
at
 
the
 
workplace.
 
Women
 
facing
 
physical,
 
sexual,
 
emotional,
 
psychological
 
and
 
economic
 
abuse,
 
irrespective
 
of
 
age,
 
class,
 
caste,
 
education
 
status,
 
marital
 
status,
 
race
 
and
 
culture
 
will
 
be
 
facilitated
 
with
 
support
 
and
 
redressal.
 
It
 
is
 
integrated
 
with
 
women
 
Help
 
line
 
number
 
181
- Metadata:
{'producer': 'Skia/PDF m144 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'ONE STOP CENTRE', 'source': 'documents/onestopcenter.pdf', 'total_pages': 3, 'page': 0, 'page_label': '1', 'country': 'India', 'state': 'Tamil Nadu', 'district': 'Chennai', 'region': 'Guindy', 'service_category': 'OneStopCentre', 'service_name': 'OneStopCentreNo1', 'source_file': 'myfile.pdf', 'chunk_id': 'myfil

In [None]:
from langchain_pinecone import PineconeVectorStore
from langchain_community.embeddings import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

vectorstore = PineconeVectorStore.from_existing_index(
    index_name="pinecone-chatbot",
    embedding=embeddings
)

retriever = vectorstore.as_retriever(
    search_kwargs={"k": 5}    
)


In [8]:
from langchain_community.llms import Ollama

llm = Ollama(model="llama3")  


  llm = Ollama(model="llama3")   # or "mistral", "qwen2", etc.


In [13]:
#Retrieval Only Test1
query = "What are the service given in one stop centre?"

results = vectorstore.similarity_search(query, k=3)

for i, doc in enumerate(results):
    print("="*80)
    print(f"RESULT {i+1}")
    print("- TEXT:\n", doc.page_content[:500], "...")
    print("- METADATA:", doc.metadata)


RESULT 1
- TEXT:
 What  is  a  One  Stop  Centre?  One  Stop  Centres  (OSCs)  are  intended  to  support  women  affected  by  violence,  in  private  
and
 
public
 
spaces,
 
within
 
the
 
family,
 
community
 
and
 
at
 
the
 
workplace.
 
Women
 
facing
 
physical,
 
sexual,
 
emotional,
 
psychological
 
and
 
economic
 
abuse,
 
irrespective
 
of
 
age,
 
class,
 
caste,
 
education
 
status,
 
marital
 
status,
 
race
 
and
 
culture
 
will
 
be
 
facilitated
 
with
 
support
 
and
 
redressal.
 
It
 
is ...
- METADATA: {'creationdate': '', 'creator': 'PyPDF', 'page': 0.0, 'page_label': '1', 'producer': 'Skia/PDF m144 Google Docs Renderer', 'source': 'documents/onestopcenter.pdf', 'title': 'ONE STOP CENTRE', 'total_pages': 3.0}
RESULT 2
- TEXT:
 What  is  a  One  Stop  Centre?  One  Stop  Centres  (OSCs)  are  intended  to  support  women  affected  by  violence,  in  private  
and
 
public
 
spaces,
 
within
 
the
 
family,
 
community
 
and
 
at
 
the
 
workplace.
 
Women
 
