# Import Required Packages

In [2]:
import os
import getpass
import PyPDF2
import certifi
from typing import List, Dict
from dotenv import load_dotenv
from uuid import uuid4

# Display and progress-bar utilities
from IPython.display import display, Markdown
from tqdm import tqdm

# langchain core packages
from langchain_core.documents import Document
from langchain_core.prompts import (PromptTemplate, 
                                    ChatPromptTemplate)


# langchain Text 
from langchain.text_splitter import RecursiveCharacterTextSplitter

# langchain
from langchain.chat_models import init_chat_model
from langchain.load import loads, dumps 

# Embeddings
from langchain.embeddings import HuggingFaceEmbeddings

# Vector Database 
from pymongo import MongoClient
from pymongo.server_api import ServerApi
from langchain_mongodb import MongoDBAtlasVectorSearch

  from .autonotebook import tqdm as notebook_tqdm


# Load Data 

In [3]:
book_pdf_filepath = "../data/Harry Potter - Book 1 - The Sorcerers Stone.pdf"

# Extract the text of every page, join the pages with single spaces and
# normalise tab characters to spaces so the book becomes one plain string.
with open(book_pdf_filepath, "rb") as pdf_file:
    data = PyPDF2.PdfReader(pdf_file)
    full_text = " ".join(page.extract_text() for page in data.pages).replace("\t", " ")

# Wrap the whole book in a single Document; it is chunked later on.
full_text_doc = Document(
    page_content=full_text,
    metadata={
        "source": "github",
        "topic": "Harrypotter - Book",
        "chapter": "all_topics",
    },
)

In [4]:
# Load the three pre-processed artifacts written by the ingestion step.
# Each load is best-effort so one missing file does not block the others, but
# only `Exception` is caught (a bare `except:` would also swallow
# KeyboardInterrupt / SystemExit) and the cause is printed so a failure can be
# diagnosed. On failure the corresponding variable is left undefined.
# NOTE(review): `loads` rebuilds LangChain objects from the JSON text, so only
# point it at files produced by your own ingestion run.
try:
    with open("../data/ingestion_processed/serialized_chapter.json", "r") as f:
        serialized_chapter = f.read()
    chapters = loads(serialized_chapter)
    print("==== SUCCESS LOAD: LANGCHAIN SERIALIZED OBJECT -> CHAPTERS ====")
except Exception as e:
    print("==== FAILURE LOAD: LANGCHAIN SERIALIZED OBJECT -> CHAPTERS ====", e)


try:
    with open("../data/ingestion_processed/serialized_chapter_summaries.json", "r") as f:
        serialized_chapter_summaries = f.read()
    chapter_summaries = loads(serialized_chapter_summaries)
    print("==== SUCCESS LOAD: LANGCHAIN SERIALIZED OBJECT -> CHAPTER SUMMARIES ====")
except Exception as e:
    print("==== FAILURE LOAD: LANGCHAIN SERIALIZED OBJECT -> CHAPTER SUMMARIES ====", e)



try:
    with open("../data/ingestion_processed/serialized_quotes_documents.json", "r") as f:
        serialized_quotes_documents = f.read()
    quotes_documents = loads(serialized_quotes_documents)
    print("==== SUCCESS : LANGCHAIN SERIALIZED OBJECT -> QUOTE DOCUMENTS ====")
except Exception as e:
    print("==== FAILURE: LANGCHAIN SERIALIZED OBJECT -> QUOTE DOCUMENTS ====", e)

==== SUCCESS LOAD: LANGCHAIN SERIALIZED OBJECT -> CHAPTERS ====
==== SUCCESS LOAD: LANGCHAIN SERIALIZED OBJECT -> CHAPTER SUMMARIES ====
==== SUCCESS : LANGCHAIN SERIALIZED OBJECT -> QUOTE DOCUMENTS ====


  chapters = loads(serialized_chapter)


# 3. Get Retriever

#### Initialize MongoDB Vector Database

In [5]:
# Prompt for the Atlas connection string at run time (input is not echoed),
# so the credential is never hard-coded or saved in the notebook.
MONGODB_ATLAS_CLUSTER_URI = getpass.getpass("MongoDB Atlas Cluster URI:")

In [6]:
# Database, collection and search-index identifiers.
DB_NAME = "harry_potter_db"
DB_COLLECTION_NAME = "harry_potter_collection"
DB_SEARCH_INDEX_NAME = "langchain-test-index-vectorstore_1"

# Connect with the URI entered above; certifi provides the CA bundle for TLS.
client = MongoClient(
    MONGODB_ATLAS_CLUSTER_URI,
    server_api=ServerApi("1"),
    tlsCAFile=certifi.where(),
)
MONGODB_COLLECTION = client[DB_NAME][DB_COLLECTION_NAME]

# Send a ping to confirm the deployment is reachable; a failure is only
# reported, it does not stop the notebook.
try:
    client.admin.command("ping")
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as err:
    print(err)

Pinged your deployment. You successfully connected to MongoDB!


#### 1. Creating Retriever for Chapters

In [7]:
class GenerateRetriever:
    """
        Class : Generate Retriever

        Splits documents into chunks, embeds them with a HuggingFace model,
        stores them in a MongoDB Atlas vector store and exposes that store
        as a LangChain retriever (see ``generateEmbeddings``).

        Params :
            chunk_size : RecursiveCharacterTextSplitter parameter
            chunk_overlap : RecursiveCharacterTextSplitter parameter
            db_name : MongoDB DB Name
            db_search_index_name : MongoDB Atlas vector search index name
            collection_name : MongoDB Collection Name
            model_name : HuggingFace embedding model name
            documents : list of Documents to split, embed and store
            search_type : search type handed to ``as_retriever``
            top_k_documents : ``k`` handed to the retriever's search_kwargs
            device : device handed to the embedding model
                     (default "mps", the previously hard-coded value)
            embedding_dimensions : vector size used when creating the search
                     index (default 384, the previously hard-coded value)
            mongo_client : MongoDB client to use; when None the notebook-level
                     ``client`` variable is used, as before

    """

    def __init__(self, chunk_size: int, chunk_overlap: int, db_name: str, db_search_index_name:str, 
                collection_name:str, model_name:str, documents: "List[Document]", search_type:str, top_k_documents:int,
                device: str = "mps", embedding_dimensions: int = 384, mongo_client = None):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.db_name = db_name
        self.collection_name = collection_name 
        self.model_name = model_name
        self.db_search_index_name =  db_search_index_name
        self.search_type = search_type 
        self.top_k_documents = top_k_documents
        self.documents = documents
        self.device = device
        self.embedding_dimensions = embedding_dimensions
        self.mongo_client = mongo_client

    def getMongoDBclient(self, embeddings):
        """
            Create the MongoDB Atlas vector store and request its search index.

            Params :
                embeddings : embedding model used by the vector store

            Returns : the MongoDBAtlasVectorSearch instance (also kept on
                      ``self.vector_store``)

            Raises : whatever the vector store constructor raises. A failure
                     while creating the search index is only reported.
        """
        # Fall back to the notebook-level `client` when none was injected.
        mongo_client = self.mongo_client if self.mongo_client is not None else client

        # Collection Name and Database Name
        collection = mongo_client[self.db_name][self.collection_name]

        try:
            self.vector_store = MongoDBAtlasVectorSearch(
                collection = collection, 
                embedding = embeddings, 
                index_name = self.db_search_index_name,
                relevance_score_fn = "cosine",
            )
        except Exception as e:
            # Without a vector store nothing downstream can work, so surface
            # the real error instead of failing later on a missing attribute.
            print("===== Failure : Store Embeddings in Vector Store =====", e)
            raise

        try:
            self.vector_store.create_vector_search_index(dimensions = self.embedding_dimensions)
            print("===== Success : Store Embeddings in Vector Store =====")
        except Exception as e:
            # Best effort: index creation can be refused by the server; the
            # store itself is still usable for inserting documents.
            print("===== Failure : Store Embeddings in Vector Store =====", e)

        return self.vector_store 

    def getRecursiveCharacterTextSplitter(self):
        """
            Build the text splitter and keep it on ``self.text_splitter``.

            Raises : the underlying error, after reporting it.
        """
        try:
            self.text_splitter = RecursiveCharacterTextSplitter(
                chunk_size = self.chunk_size, 
                chunk_overlap = self.chunk_overlap, 
                length_function = len
            )
            print("===== Success : Load Recursive Character Text Splitter =====")
        except Exception as e:
            print("===== Failure : Load Recursive Character Text Splitter =====", e)
            raise

    
    def getEmbeddingsModel(self):
        """
            Load the embedding model and keep it on ``self.embedding_model``.

            Raises : the underlying error, after reporting it.
        """
        try:
            model_kwargs = {"device": self.device}
            self.embedding_model = HuggingFaceEmbeddings(model_name = self.model_name,
                                             model_kwargs = model_kwargs)
            # Smoke test: embed one sentence so a broken model fails here.
            self.embedding_model.embed_query(text = "Are you working perfectly fine?")
            print("==== Success : Initiate Embedding Model =====")    
        except Exception as e:
            print("===== Failure : Initiate Embedding Model=====", e)
            raise

        
    def generateEmbeddings(self):
        """
            Run the whole pipeline and return the retriever.

            Loads the embedding model and the splitter, splits
            ``self.documents``, stores the chunks in the vector store and
            returns the store as a retriever configured with
            ``search_type`` and ``top_k_documents``.

            NOTE(review): every call stores the chunks under fresh UUIDs, so
            re-running a cell presumably adds the same chunks again - confirm
            whether the collection should be cleared first.
        """

        # load embedding model
        self.getEmbeddingsModel()

        # load recursive character text splitter 
        self.getRecursiveCharacterTextSplitter()

        # Split the documents 
        chunks = self.text_splitter.split_documents(self.documents)

        # Generate Embeddings and Save the documents in MongoDB Vector Database
        self.vector_store = self.getMongoDBclient(embeddings=self.embedding_model)
        uuids = [str(uuid4()) for _ in chunks]
        self.vector_store.add_documents(documents=chunks, ids = uuids)

        # Convert Vector Store as Retriever
        retriever = self.vector_store.as_retriever(search_type = self.search_type, 
                                                   search_kwargs = {"k": self.top_k_documents})

        return retriever



In [8]:
# Retriever over the chapter documents loaded from the serialized file.
chapter_retriever_model = GenerateRetriever(
    # what to index and how to chunk / embed it
    documents=chapters,
    model_name="all-MiniLM-L6-v2",
    chunk_size=1000,
    chunk_overlap=100,
    # where the vectors are stored
    db_name="HarryPotter_db",
    collection_name="HarryPotterCollection_Chapter",
    db_search_index_name="chapterIndex_1",
    # how the retriever searches
    search_type="similarity",
    top_k_documents=5,
)
chapter_retriever = chapter_retriever_model.generateEmbeddings()

  self.embedding_model = HuggingFaceEmbeddings(model_name = self.model_name,


==== Success : Initiate Embedding Model =====
===== Success : Load Recursive Character Text Splitter =====
===== Success : Store Embeddings in Vector Store =====


#### 2. Creating Retriever for Chapter Summaries

In [13]:
# Retriever over the chapter summaries loaded from the serialized file.
chapter_summaries_retriever_model = GenerateRetriever(
    # what to index and how to chunk / embed it
    documents=chapter_summaries,
    model_name="all-MiniLM-L6-v2",
    chunk_size=1000,
    chunk_overlap=100,
    # where the vectors are stored
    db_name="HarryPotter_db",
    collection_name="HarryPotterCollection_ChapterSummaries",
    db_search_index_name="HarryPotterSearch_ChapterSummariesIndex",
    # how the retriever searches
    search_type="similarity",
    top_k_documents=5,
)
chapter_summaries_retriever = chapter_summaries_retriever_model.generateEmbeddings()

==== Success : Initiate Embedding Model =====
===== Success : Load Recursive Character Text Splitter =====
===== Success : Store Embeddings in Vector Store =====


#### 3. Creating Retriever for Quotes

In [16]:
# Retriever over the quote documents loaded from the serialized file.
# NOTE(review): the recorded run of this cell reported a search-index creation
# failure ("maximum number of FTS indexes has been reached for this instance
# size"), so this retriever may not be searchable until an index slot is free.
quotes_retriever_model = GenerateRetriever(
    # what to index and how to chunk / embed it
    documents=quotes_documents,
    model_name="all-MiniLM-L6-v2",
    chunk_size=1000,
    chunk_overlap=100,
    # where the vectors are stored
    db_name="HarryPotter_db",
    collection_name="HarryPotterCollection_Quotes",
    db_search_index_name="HarryPotterSearch_QuotesIndex",
    # how the retriever searches
    search_type="similarity",
    top_k_documents=5,
)
quotes_retriever = quotes_retriever_model.generateEmbeddings()

==== Success : Initiate Embedding Model =====
===== Success : Load Recursive Character Text Splitter =====
===== Failure : Store Embeddings in Vector Store ===== The maximum number of FTS indexes has been reached for this instance size., full error: {'ok': 0.0, 'errmsg': 'The maximum number of FTS indexes has been reached for this instance size.', 'code': 20, 'codeName': 'IllegalOperation', '$clusterTime': {'clusterTime': Timestamp(1759122390, 38), 'signature': {'hash': b"\xe7\x828\xbf\x07\x13\xbd\x03\x98\xff\xbcm'\x8a\xd9!\x19\x9bd\xed", 'keyId': 7495132185410666498}}, 'operationTime': Timestamp(1759122390, 38)}


#### 4. Creating Retriever for Full Text

In [19]:
# Retriever over the whole book, passed as a one-element list of Documents.
fulltext_retriever_model = GenerateRetriever(
    # what to index and how to chunk / embed it
    documents=[full_text_doc],
    model_name="all-MiniLM-L6-v2",
    chunk_size=1000,
    chunk_overlap=100,
    # where the vectors are stored
    db_name="HarryPotter_db",
    collection_name="HarryPotterCollection_FullText",
    db_search_index_name="HarryPotterSearch_FullTextIndex",
    # how the retriever searches
    search_type="similarity",
    top_k_documents=5,
)
fulltext_retriever = fulltext_retriever_model.generateEmbeddings()

==== Success : Initiate Embedding Model =====
===== Success : Load Recursive Character Text Splitter =====
===== Success : Store Embeddings in Vector Store =====


#### 5. Input the Query 

In [22]:
# Probe text sent to each retriever in the cells that follow.
query = (
    'In the first chapter of "Harry Potter and the Sorcerer\'s Stone," '
    "we are introduced to the Dursley family, who live on Privet Drive "
    "and consider themselves to be perfectly normal and ordinary."
)

In [25]:
# Render the first chunk returned by the full-text retriever for `query`.
# Indexing [0] raises IndexError if the retriever returns no documents.
display(Markdown(fulltext_retriever.invoke(query)[0].page_content))

HP 1 - Harry Potter and the
Sorcerer's Stone
Harry Potter and the Sorcerer's Stone
 
 
Harry Potter
&
The Sorcerer’s Stone
 
 
by 
J.K. Rowling
 
 
 
 
  HP 1 - Harry Potter and the
Sorcerer's Stone CHAPTER ONE
 
THE BOY WHO LIVED
 
      
M 
r. and Mrs. Dursley, of number four, Privet Drive, were proud to say
that they were perfectly normal, thank you very much. They were the last people
you’d expect to be involved in anything strange or mysterious, because they just
didn’t hold with such nonsense.
      Mr. Dursley was the director of a firm called Grunnings, which made
drills. He was a big, beefy man with hardly any neck, although he did have a
very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the
usual amount of neck, which came in very useful as she spent so much of her
time craning over garden fences, spying on the neighbors. The Dursleys had a
small son called Dudley and in their opinion there was no finer boy anywhere.

In [26]:
# Render the first chunk returned by the chapter-summaries retriever for `query`.
# Indexing [0] raises IndexError if the retriever returns no documents.
display(Markdown(chapter_summaries_retriever.invoke(query)[0].page_content))

In the first chapter of "Harry Potter and the Sorcerer's Stone," we are introduced to the Dursley family, who live on Privet Drive and consider themselves to be perfectly normal and ordinary. Mr. Dursley works at a drill company and Mrs. Dursley is preoccupied with spying on their neighbors. They have a son named Dudley whom they dote on. The Dursleys have a deep secret that they fear will be discovered, concerning Mrs. Dursley's sister, Lily Potter, and her family.

One morning, as Mr. Dursley goes about his usual routine, he notices strange occurrences around town, such as people in cloaks and oddly dressed individuals whispering excitedly. He becomes increasingly agitated and worried, especially when he overhears discussions about the Potters. He dismisses these concerns as paranoia and tries to focus on work.

In [27]:
# Render the first chunk returned by the chapter retriever for `query`.
# Indexing [0] raises IndexError if the retriever returns no documents.
# NOTE(review): the recorded output is tab-separated, unlike the full-text
# chunk whose tabs were replaced on load - presumably the serialized chapters
# were saved without that normalisation; confirm.
display(Markdown(chapter_retriever.invoke(query)[0].page_content))

CHAPTER	ONE
	
THE	BOY	WHO	LIVED
	
						
M	
r.	and	Mrs.	Dursley,	of	number	four,	Privet	Drive,	were	proud	to	say
that	they	were	perfectly	normal,	thank	you	very	much.	They	were	the	last	people
you’d	expect	to	be	involved	in	anything	strange	or	mysterious,	because	they	just
didn’t	hold	with	such	nonsense.
						Mr.	Dursley	was	the	director	of	a	firm	called	Grunnings,	which	made
drills.	He	was	a	big,	beefy	man	with	hardly	any	neck,	although	he	did	have	a
very	large	mustache.	Mrs.	Dursley	was	thin	and	blonde	and	had	nearly	twice	the
usual	amount	of	neck,	which	came	in	very	useful	as	she	spent	so	much	of	her
time	craning	over	garden	fences,	spying	on	the	neighbors.	The	Dursleys	had	a
small	son	called	Dudley	and	in	their	opinion	there	was	no	finer	boy	anywhere.
						The	Dursleys	had	everything	they	wanted,	but	they	also	had	a	secret,	and
their	greatest	fear	was	that	somebody	would	discover	it.	They	didn’t	think	they
could	bear	it	if	anyone	found	out	about	the	Potters.	Mrs.	Potter	was	Mrs.