In [150]:
# Requires faiss-cpu. Uncomment to install it into the kernel's environment:
# %pip install faiss-cpu


In [151]:
from langchain.document_loaders import PyMuPDFLoader
# Read the PDF; the loader returns a list of documents, each carrying
# `page_content` (text) and `metadata` (a dict).
loader = PyMuPDFLoader("data/RAMAYANA.pdf")
docs = loader.load()

# Preview every loaded document: the first 100 characters of its text,
# then its metadata dict (keys such as 'source', 'page', 'total_pages').
for page in docs:
    print(page.page_content[:100], page.metadata, sep="\n")

SRI RAMA JAYAM 
RAMAYANA FOR CHILDREN 
Compiled by  
 
Visalakshi Gopalan 
14-Apr-13 
 
 
 
For chil
{'producer': 'Microsoft® Word 2010', 'creator': 'Microsoft® Word 2010', 'creationdate': '2013-04-14T19:39:50-07:00', 'source': 'data/RAMAYANA.pdf', 'file_path': 'data/RAMAYANA.pdf', 'total_pages': 45, 'format': 'PDF 1.5', 'title': 'RAMAYANA FOR CHILDREN', 'author': 'Sony', 'subject': 'Compiled by', 'keywords': '', 'moddate': '2013-04-14T19:39:50-07:00', 'trapped': '', 'modDate': "D:20130414193950-07'00'", 'creationDate': "D:20130414193950-07'00'", 'page': 0}
1 
 
Contents 
 
1 RAMAYANA FOR CHILDREN ...........................................................
{'producer': 'Microsoft® Word 2010', 'creator': 'Microsoft® Word 2010', 'creationdate': '2013-04-14T19:39:50-07:00', 'source': 'data/RAMAYANA.pdf', 'file_path': 'data/RAMAYANA.pdf', 'total_pages': 45, 'format': 'PDF 1.5', 'title': 'RAMAYANA FOR CHILDREN', 'author': 'Sony', 'subject': 'Compiled by', 'keywords': '', 'moddate': '2013-04

In [152]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Cut the loaded documents into small overlapping chunks for embedding.
# Separators are tried in the order given: paragraph break, line break,
# sentence end, then single space.
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " "],
    chunk_overlap=50,
    chunk_size=300,
)
chunks = splitter.split_documents(docs)

In [153]:
# Horizontal rule printed after the header and after each previewed chunk.
_rule = "------------------------------------------"

print(f"Total chunks created: {len(chunks)}", _rule, sep="\n")
for idx, piece in enumerate(chunks[:5]):
    # Only the first five chunks, and at most 200 characters of each.
    print(f"Chunk {idx}: {piece.page_content[:200]}", _rule, sep="\n")

Total chunks created: 356
------------------------------------------
Chunk 0: SRI RAMA JAYAM 
RAMAYANA FOR CHILDREN 
Compiled by  
 
Visalakshi Gopalan 
14-Apr-13 
 
 
 
For children’s reading
------------------------------------------
Chunk 1: 1 
 
Contents 
 
1 RAMAYANA FOR CHILDREN ............................................................................................... 2 
1.1 THE BIRTH OF RAMA ......................................
------------------------------------------
Chunk 2: 1.2 The Valiant Princes ........................................................................................................ 3 
1.3 SITA'S SWAYAMVAR ...............................................
------------------------------------------
Chunk 3: 1.4 KAIKEYI AND HER WISHES ....................................................................................... 7 
1.5 The demons in the forests ....................................................
------------------------------------------
Chunk

In [154]:
from langchain.embeddings import HuggingFaceEmbeddings

# Embedding model shared by index construction and by every query below.
embedding_model = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
    model_kwargs={"device": "cpu"}  # run the model on CPU (not GPU); change the device string to use an accelerator
)


In [155]:
import os
from langchain.vectorstores import FAISS

# Directory in which the FAISS index is persisted between runs.
index_path = "Faiss_db"

# Reuse the persisted index when the directory exists; otherwise embed every
# chunk, build a fresh index, and save it.
# NOTE(review): later cells save back to this same path after adding and
# deleting documents, so on a re-run the loaded store may no longer match
# `chunks` — delete the directory to rebuild from scratch.
if os.path.exists(index_path):
    # allow_dangerous_deserialization=True is an explicit opt-in; presumably the
    # on-disk format is pickle-based, so only load index directories you trust.
    vector_store = FAISS.load_local(index_path, embedding_model, allow_dangerous_deserialization=True)
    print("✅ Loaded existing vector store.")
else:
    vector_store = FAISS.from_documents(chunks, embedding_model)  # build the index from all chunks
    vector_store.save_local(index_path)
    print("✅ Vector store created and persisted.")

✅ Vector store created and persisted.


In [156]:
# !pip install faiss-cpu


In [157]:
# Embed the question once, then fetch the three nearest entries by vector.
query = "Where did Rama go to rescue Sita?"
query_vector = embedding_model.embed_query(query)
retrieved_docs = vector_store.similarity_search_by_vector(query_vector, k=3)

# Print each hit numbered from 1, truncated to its first 500 characters.
for rank, hit in enumerate(retrieved_docs, start=1):
    preview = hit.page_content[:500]
    print(f"\n--- Result {rank} ---\n{preview}...")



--- Result 1 ---
was to leave Sita insisted that he go in search of Rama. 
But this did not console Rama in any way. He roamed through the 
forests like a madman, calling out her name so often, talking to 
plants and animals about Sita. All their efforts to trace Sita were in...

--- Result 2 ---
to go in search of Rama. Before he went, he warned Sita to be 
careful about strangers. He also threw a line just outside the door of 
the hurt with his arrow and told her “Respected sister-in-law! Please 
do not across this line on any event. I will soon be back with Rama. 
May God protect you?”...

--- Result 3 ---
Rama live in a forest eating simple food and sleeping on a rough –
straw mat? But Rama would not hear of any such argument and so, 
with a heavy heart, she bid farewell to her dearest son. 
After taking leave of his mother, Rama went to talk to Sita. As soon...


## Updating and Deleting Vectors
FAISS supports adding and removing vectors in place, but only in memory (RAM); call `save_local` afterwards to persist the change to disk.

In [158]:
import hashlib
from sklearn.metrics.pairwise import cosine_similarity

# Load a second PDF whose documents will be added to the existing vector store.
# NOTE(review): these documents are passed on exactly as loaded (they are not
# run through `splitter`), unlike the Ramayana chunks — confirm this is intended.
new_loader = PyMuPDFLoader("data/story_book.pdf")
new_docs = new_loader.load()

def get_hash(text):
    """Return the MD5 hex digest of ``text`` with surrounding whitespace stripped.

    The digest is a content fingerprint for duplicate detection only; it is
    not used for anything security-sensitive.
    """
    return hashlib.md5(text.strip().encode()).hexdigest()


def _cosine_similarity(vec_a, vec_b):
    """Return the cosine similarity of two 1-D vectors (0.0 if either has zero norm)."""
    import numpy as np  # local import: numpy is only imported in a later cell

    a = np.asarray(vec_a, dtype="float64")
    b = np.asarray(vec_b, dtype="float64")
    denom = float(np.linalg.norm(a) * np.linalg.norm(b))
    return float(a @ b) / denom if denom else 0.0


def auto_add_documents(vectorstore, documents, embedding_model, similarity_threshold=0.99, save_path=None):
    """Add ``documents`` to ``vectorstore``, skipping duplicates, then persist it.

    A document is skipped when its stripped text is byte-identical to an
    earlier document in the same batch, or when its embedding's cosine
    similarity to the nearest stored document is at or above
    ``similarity_threshold``.

    Args:
        vectorstore: Store exposing ``similarity_search_by_vector``,
            ``add_documents`` and ``save_local``.
        documents: Iterable of documents with ``page_content`` and a mutable
            ``metadata`` dict. Each one gets ``metadata['hash']`` set, whether
            or not it is added.
        embedding_model: Object exposing ``embed_query(text)``.
        similarity_threshold: Cosine similarity at or above which a document
            counts as a duplicate of stored content.
        save_path: Directory to persist to. ``None`` falls back to the
            notebook-level ``index_path``.

    Returns:
        None. Progress is reported with ``print``.
    """
    accepted = []
    seen_hashes = set()

    for doc in documents:
        content = doc.page_content.strip()
        content_hash = get_hash(content)
        doc.metadata['hash'] = content_hash  # Optional, for metadata tracking

        # Exact repeats inside this batch cannot be caught by the store lookup
        # below, because accepted documents are only added after the loop.
        if content_hash in seen_hashes:
            print("Duplicate detected within batch (identical content). Skipping.")
            continue
        seen_hashes.add(content_hash)

        # Embed the new text once and reuse the vector for both the lookup
        # and the similarity score.
        new_embedding = embedding_model.embed_query(content)
        similar_docs = vectorstore.similarity_search_by_vector(new_embedding, k=1)
        if similar_docs:
            existing_embedding = embedding_model.embed_query(similar_docs[0].page_content)
            sim_score = _cosine_similarity(new_embedding, existing_embedding)

            if sim_score >= similarity_threshold:
                print(f"Duplicate detected (similarity: {sim_score:.2f}). Skipping.")
                continue

        accepted.append(doc)

    if accepted:
        vectorstore.add_documents(accepted)
        print(f"{len(accepted)} new documents added.")
        # Persist the store that was actually modified (the parameter), not
        # whatever the notebook-level `vector_store` happens to be bound to.
        vectorstore.save_local(index_path if save_path is None else save_path)
    else:
        print("No new documents to add.")

# Add the story-book documents to the store, skipping near-duplicates of stored content.
auto_add_documents(vector_store, new_docs, embedding_model)

16 new documents added.


In [159]:
# Ask about the newly added story and fetch the three nearest entries.
query = "who is Snow White?"
query_vector = embedding_model.embed_query(query)
retrieved_docs = vector_store.similarity_search_by_vector(query_vector, k=3)

# Print each hit numbered from 1, truncated to its first 500 characters.
for rank, hit in enumerate(retrieved_docs, start=1):
    preview = hit.page_content[:500]
    print(f"\n--- Result {rank} ---\n{preview}...")


--- Result 1 ---
18
19
The mirror answered:
“My Queen, you are beautiful, it’s true!
But Snow White is more beautiful than you.”
The Queen was furious! She planned a horrible crime.
She called a huntsman and commanded him:
“Take Snow White deep into the forest… and kill her!”
“But… Your Majesty! The beloved Princess!” he said.
“Kill Snow White! And take out her heart,” the Queen 
commanded again.
Part Two
The huntsman took Snow White deep into the woods.
“Forgive me, my Princess… I have orders from the Queen 
to...

--- Result 2 ---
15
14
Snow White
and
the Seven Dwarfs...

--- Result 3 ---
20
21
A little table was set for dinner: seven little chairs, seven 
little bowls, seven little spoons, seven little knives, seven 
little forks, and seven little mugs.
Snow White was hungry and thirsty. She thought:
“I’ll have a little bread from each bowl. And I’ll drink a 
little wine from each mug.”
Then, feeling so tired, she lay down on one of the seven 
little beds. And she fell asleep. 
In 

In [160]:
import numpy as np

def delete_document_by_content(vectorstore, content_to_delete, embedding_model, max_distance=None, save_path=None):
    """Delete the stored document nearest to ``content_to_delete`` and persist the store.

    The text is embedded and the single nearest vector in the FAISS index is
    looked up. Its docstore id is resolved through
    ``vectorstore.index_to_docstore_id``, and the entry is removed in place
    with ``vectorstore.delete`` rather than by re-embedding every remaining
    document.

    Args:
        vectorstore: LangChain FAISS store exposing ``index``,
            ``index_to_docstore_id``, ``docstore``, ``delete`` and ``save_local``.
        content_to_delete: Free text; the document closest to it is the
            deletion candidate.
        embedding_model: Object exposing ``embed_query(text)``.
        max_distance: Optional safety bound. When given, nothing is deleted if
            the nearest document is farther than this. ``None`` (default)
            always deletes the nearest document.
        save_path: Directory to persist to. ``None`` falls back to the
            notebook-level ``index_path``.

    Returns:
        The same ``vectorstore`` object on every path, so
        ``store = delete_document_by_content(store, ...)`` never rebinds the
        store to ``None``.
    """
    # NOTE(review): assumes the store was built with default settings, i.e.
    # raw (un-normalised) query vectors are what the index expects.
    query_vector = np.array([embedding_model.embed_query(content_to_delete)], dtype='float32')
    distances, indices = vectorstore.index.search(query_vector, k=1)
    position = int(indices[0][0])
    distance = float(distances[0][0])

    # FAISS reports -1 when it has no neighbour to return (e.g. empty index).
    if position < 0:
        print("No matching document found.")
        return vectorstore

    doc_id = vectorstore.index_to_docstore_id.get(position)
    if doc_id is None:
        print("Matching doc_id not found.")
        return vectorstore

    if max_distance is not None and distance > max_distance:
        print(f"Nearest document is too far (distance: {distance}). Nothing deleted.")
        return vectorstore

    stored = vectorstore.docstore.search(doc_id)
    # A docstore miss may come back as a plain message string, hence getattr.
    match_text = getattr(stored, "page_content", "").strip()

    print("Found match")
    print(f"Document to delete: {match_text[:100]}...")
    print(f"Document ID: {doc_id}")
    print(f"Distance: {distance}")

    # Removes the vector, the docstore entry and the id mapping together.
    vectorstore.delete([doc_id])
    vectorstore.save_local(index_path if save_path is None else save_path)
    print("Document deleted.")
    return vectorstore


# The text below is a free-form probe, not an exact stored passage: the helper
# targets the single nearest stored document. `vector_store` is rebound to the
# helper's return value so later cells query the updated store.
vector_store = delete_document_by_content(vector_store, "Snow White is a princess who lived in a castle.", embedding_model)

Found match
Document to delete: 18
19
The mirror answered:
“My Queen, you are beautiful, it’s true!
But Snow White is more beautiful...
Document ID: 36acb84f-0225-461e-880e-e3f6e5d4d105
Distance: 0.8396064043045044
Document deleted.


In [161]:
# Repeat the earlier question to confirm the deleted document is gone.
query = "who is Snow White?"
query_vector = embedding_model.embed_query(query)
retrieved_docs = vector_store.similarity_search_by_vector(query_vector, k=3)

# Print each hit numbered from 1, truncated to its first 500 characters.
for rank, hit in enumerate(retrieved_docs, start=1):
    preview = hit.page_content[:500]
    print(f"\n--- Result {rank} ---\n{preview}...")


--- Result 1 ---
15
14
Snow White
and
the Seven Dwarfs...

--- Result 2 ---
20
21
A little table was set for dinner: seven little chairs, seven 
little bowls, seven little spoons, seven little knives, seven 
little forks, and seven little mugs.
Snow White was hungry and thirsty. She thought:
“I’ll have a little bread from each bowl. And I’ll drink a 
little wine from each mug.”
Then, feeling so tired, she lay down on one of the seven 
little beds. And she fell asleep. 
In the morning, she woke up and saw the seven masters 
of the house looking at her. They were seven dw...

--- Result 3 ---
16
17
Part One
Once upon a time there was a little princess…
She was called Snow White because she was white as 
snow. Her lips were red as blood. And her hair was black 
as night. She was a beautiful girl, sweet and gentle.
She lived in a palace with her father and her stepmother: 
the King and the Queen.
The Queen was a beautiful woman. But she was wicked 
and vain.
Every day, the Queen looked in