# Import Required Packages

In [61]:
import os
import getpass
import PyPDF2
import certifi
from typing import List, Dict
from dotenv import load_dotenv
from uuid import uuid4

# Display and progress-bar helpers
from IPython.display import display, Markdown
from tqdm import tqdm

# langchain core packages
from langchain_core.documents import Document
from langchain_core.prompts import (PromptTemplate, 
                                    ChatPromptTemplate)


# langchain text splitters
from langchain.text_splitter import RecursiveCharacterTextSplitter

# langchain
from langchain.chat_models import init_chat_model
from langchain.load import loads, dumps 

# Embeddings
from langchain.embeddings import HuggingFaceEmbeddings

# Vector Database 
from pymongo import MongoClient
from pymongo.server_api import ServerApi
from langchain_mongodb import MongoDBAtlasVectorSearch

In [72]:
# Scratch preview: ten freshly generated random UUID4 values rendered as strings.
list(map(str, (uuid4() for _ in range(10))))

['92c0f325-41e2-486f-a36d-38083e97c16c',
 '49393096-5d78-42ce-9eb1-8c47bede667e',
 '455d7f85-704e-4034-8c8a-95db2ca17a7b',
 'a558e1f5-eb75-4e4f-af12-b52b053ffd33',
 'a6a4be8b-4b09-4ee1-ac78-e3155ed52e03',
 'e1542944-4893-429c-b4fe-7be2cdd03dfe',
 '3ffba627-1f88-4321-9c32-e9938aeaddff',
 'b11da869-e90d-4c05-9835-407b5022b57a',
 '14a0b23c-a2af-4615-a6be-96f029d2056d',
 '85201f23-2719-44c8-80ca-ec6c0101b73a']

# Load Data 

In [87]:
# Location of the source PDF, relative to the notebook's working directory.
book_pdf_filepath = "../data/Harry Potter - Book 1 - The Sorcerers Stone.pdf"

# Extract the text of every page, join the pages with single spaces and
# replace tab characters with spaces.
with open(book_pdf_filepath, "rb") as pdf_file:
    data = PyPDF2.PdfReader(pdf_file)
    full_text = " ".join(page.extract_text() for page in data.pages).replace("\t", " ")

# Wrap the whole book in a single LangChain Document.
full_text_doc = Document(
    page_content=full_text,
    metadata=dict(source="github", topic="Harrypotter - Book", chapter="all_topics"),
)

In [80]:
# Each block below reads a JSON file written by LangChain's serializer and
# rebuilds the objects with `loads`. Loading is best-effort: a failure is
# reported (with the underlying error) and the notebook continues, but the
# corresponding variable is then left undefined for later cells.
# `except Exception` is used instead of a bare `except` so that
# KeyboardInterrupt / SystemExit are not swallowed.

# Chapters
try:
    with open("../data/ingestion_processed/serialized_chapter.json", "r") as f:
        serialized_chapter = f.read()
    chapters = loads(serialized_chapter)
    print("==== SUCCESS LOAD: LANGCHAIN SERIALIZED OBJECT -> CHAPTERS ====")
except Exception as e:
    print("==== FAILURE LOAD: LANGCHAIN SERIALIZED OBJECT -> CHAPTERS ====", e)

# Chapter summaries
try:
    with open("../data/ingestion_processed/serialized_chapter_summaries.json", "r") as f:
        serialized_chapter_summaries = f.read()
    chapter_summaries = loads(serialized_chapter_summaries)
    print("==== SUCCESS LOAD: LANGCHAIN SERIALIZED OBJECT -> CHAPTER SUMMARIES ====")
except Exception as e:
    print("==== FAILURE LOAD: LANGCHAIN SERIALIZED OBJECT -> CHAPTER SUMMARIES ====", e)

# Quote documents
try:
    with open("../data/ingestion_processed/serialized_quotes_documents.json", "r") as f:
        serialized_quotes_documents = f.read()
    quotes_documents = loads(serialized_quotes_documents)
    print("==== SUCCESS : LANGCHAIN SERIALIZED OBJECT -> QUOTE DOCUMENTS ====")
except Exception as e:
    print("==== FAILURE: LANGCHAIN SERIALIZED OBJECT -> QUOTE DOCUMENTS ====", e)

==== SUCCESS LOAD: LANGCHAIN SERIALIZED OBJECT -> CHAPTERS ====
==== SUCCESS LOAD: LANGCHAIN SERIALIZED OBJECT -> CHAPTER SUMMARIES ====
==== SUCCESS : LANGCHAIN SERIALIZED OBJECT -> QUOTE DOCUMENTS ====


# 3. Get Retriever

#### Initialize MongoDB Vector Database

In [119]:
# Prefer the MONGODB_ATLAS_CLUSTER_URI environment variable so the notebook can
# run unattended (Restart & Run All); fall back to a no-echo prompt when the
# variable is unset or empty. The URI is never hard-coded in the notebook.
MONGODB_ATLAS_CLUSTER_URI = (
    os.environ.get("MONGODB_ATLAS_CLUSTER_URI")
    or getpass.getpass("MongoDB Atlas Cluster URI:")
)

In [120]:
# Open the cluster connection, pinning server API version '1' and pointing TLS
# verification at the CA file reported by certifi.
client = MongoClient(
    MONGODB_ATLAS_CLUSTER_URI,
    server_api=ServerApi("1"),
    tlsCAFile=certifi.where(),
)

# Notebook-level database / collection / search-index names and the
# collection handle derived from them.
DB_NAME = "harry_potter_db"
DB_COLLECTION_NAME = "harry_potter_collection"
DB_SEARCH_INDEX_NAME = "langchain-test-index-vectorstore_1"
MONGODB_COLLECTION = client[DB_NAME][DB_COLLECTION_NAME]

# Send a ping to check the connection; any error is printed rather than raised.
try:
    client.admin.command("ping")
except Exception as exc:
    print(exc)
else:
    print("Pinged your deployment. You successfully connected to MongoDB!")

Pinged your deployment. You successfully connected to MongoDB!


#### 1. Creating Retriever for Chapters

In [143]:
class GenerateRetriever:

    """
        Class : Generate Retriever

        Builds a LangChain retriever on top of a MongoDB Atlas vector store.
        `generateEmbeddings` runs the whole pipeline: load the HuggingFace
        embedding model, split `documents` into chunks, write the chunks to
        the target collection and return the vector store as a retriever.

        Params :
            chunk_size : RecursiveCharacterTextSplitter parameter (measured with `len`)
            chunk_overlap : RecursiveCharacterTextSplitter parameter
            db_name : MongoDB DB Name
            db_search_index_name : name of the Atlas Vector Search index to query;
                when empty, no index name is passed and the library default is used
            collection_name : MongoDB Collection Name
            model_name : HuggingFace embedding model name
            documents : list of LangChain Documents to split and store
            search_type : `search_type` forwarded to `as_retriever`
            top_k_documents : number of documents to retrieve (`k`)
            device : device handed to the embedding model (defaults to "mps",
                the value that was previously hard-coded)
            mongo_client : optional MongoClient; when omitted the notebook-level
                `client` variable is used

    """

    def __init__(self, chunk_size: int, chunk_overlap: int, db_name: str, db_search_index_name:str, 
                collection_name:str, model_name:str, documents: "List[Document]", search_type:str, top_k_documents:int,
                device: str = "mps", mongo_client = None):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.db_name = db_name
        self.collection_name = collection_name 
        self.model_name = model_name
        self.db_search_index_name =  db_search_index_name
        self.search_type = search_type 
        self.top_k_documents = top_k_documents
        self.documents = documents
        self.device = device
        self.mongo_client = mongo_client

    def getMongoDBclient(self, embeddings):
        """
            Create the MongoDBAtlasVectorSearch store for the configured collection.

            Params :
                embeddings : embedding model handed to the vector store

            Returns : the created vector store (also kept on `self.vector_store`)
            Raises : re-raises whatever the vector store constructor raises,
                     after printing the failure banner
        """
        # Use the injected client when there is one; otherwise fall back to the
        # notebook-level `client`. Compared with `is not None` on purpose, so the
        # client object itself is never evaluated for truthiness.
        mongo_client = self.mongo_client if self.mongo_client is not None else client

        # Collection Name and Database Name
        collection = mongo_client[self.db_name][self.collection_name]

        store_kwargs = {
            "collection": collection,
            "embedding": embeddings,
            "relevance_score_fn": "cosine",
        }
        # Query the index the caller asked for. This used to be hard-coded to
        # "_id", so the configured search index was never used.
        if self.db_search_index_name:
            store_kwargs["index_name"] = self.db_search_index_name

        try:
            self.vector_store = MongoDBAtlasVectorSearch(**store_kwargs)
        except Exception as e:
            print("===== Failure : Store Embeddings in Vector Store =====", e)
            # Surface the real error instead of failing later on a missing attribute.
            raise
        print("===== Success : Store Embeddings in Vector Store =====")

        return self.vector_store 

    def getRecursiveCharacterTextSplitter(self):
        """
            Create the text splitter and keep it on `self.text_splitter`.

            Raises : re-raises any construction error after printing the failure banner
        """
        try:
            self.text_splitter = RecursiveCharacterTextSplitter(
                chunk_size = self.chunk_size, 
                chunk_overlap = self.chunk_overlap, 
                length_function = len
            )
        except Exception as e:
            print("===== Failure : Load Recursive Character Text Splitter =====", e)
            raise
        print("===== Success : Load Recursive Character Text Splitter =====")

    
    def getEmbeddingsModel(self):
        """
            Load the HuggingFace embedding model onto `self.device` and keep it
            on `self.embedding_model`. One test query is embedded right away so
            that a broken model or device fails here rather than during ingestion.

            Raises : re-raises any load / embedding error after printing the failure banner
        """
        try:
            model_kwargs = {"device": self.device}
            self.embedding_model = HuggingFaceEmbeddings(model_name = self.model_name,
                                             model_kwargs = model_kwargs)
            self.embedding_model.embed_query(text = "Are you working perfectly fine?")
        except Exception as e:
            print("===== Failure : Initiate Embedding Model=====", e)
            raise
        print("==== Success : Initiate Embedding Model =====")    

        
    def generateEmbeddings(self):
        """
            Run the full pipeline and return a retriever.

            Every call writes the chunks again under freshly generated UUIDs,
            so re-running this on the same collection stores the chunks once
            more rather than updating the earlier copies.

            Returns : the vector store wrapped by `as_retriever`, using
                      `search_type` and `k = top_k_documents`
        """

        # load embedding model
        self.getEmbeddingsModel()

        # load recursive character text splitter 
        self.getRecursiveCharacterTextSplitter()

        # Split the documents 
        documents = self.text_splitter.split_documents(self.documents)

        # Generate Embeddings and Save the documents in MongoDB Vector Database
        self.vector_store = self.getMongoDBclient(embeddings=self.embedding_model)
        if documents:
            # One random id per chunk; nothing is written when there are no chunks.
            uuids = [str(uuid4()) for _ in documents]
            self.vector_store.add_documents(documents=documents, ids = uuids)

        # Convert Vector Store as Retriever
        retriever = self.vector_store.as_retriever(search_type = self.search_type, 
                                                   search_kwargs = {"k": self.top_k_documents})

        return retriever

In [146]:
# Retriever over the chapter documents.
# NOTE(review): the search index name was an empty string here; the name below
# follows the pattern of the other three retriever cells. Confirm it matches
# the Atlas Vector Search index actually defined on this collection.
chapter_retriever_model = GenerateRetriever(
    chunk_size = 1000, 
    chunk_overlap = 100, 
    db_name = "HarryPotter_db", 
    db_search_index_name = "HarryPotterSearch_ChapterIndex", 
    collection_name = "HarryPotterCollection_Chapter", 
    model_name = "all-MiniLM-L6-v2", 
    search_type="similarity", 
    top_k_documents = 5,
    documents= chapters
)
chapters_retriever = chapter_retriever_model.generateEmbeddings()

==== Success : Initiate Embedding Model =====
===== Success : Load Recursive Character Text Splitter =====
===== Success : Store Embeddings in Vector Store =====


In [149]:
# Smoke-test the chapter retriever; the returned list is shown as the cell output.
chapters_retriever.invoke("Chapter One")

[]

#### 2. Creating Retriever for Chapter Summaries

In [None]:
# Retriever over the chapter summaries: same chunking, model and search
# settings as the other retrievers, with its own collection and index name.
chapter_summaries_retriever_model = GenerateRetriever(
    documents=chapter_summaries,
    model_name="all-MiniLM-L6-v2",
    chunk_size=1000,
    chunk_overlap=100,
    db_name="HarryPotter_db",
    collection_name="HarryPotterCollection_ChapterSummaries",
    db_search_index_name="HarryPotterSearch_ChapterSummariesIndex",
    search_type="similarity",
    top_k_documents=5,
)
chapter_summaries_retriever = chapter_summaries_retriever_model.generateEmbeddings()

==== Success : Initiate Embedding Model =====
===== Success : Load Recursive Character Text Splitter =====


#### 3. Creating Retriever for Quotes

In [83]:
# Retriever over the quote documents: same chunking, model and search
# settings as the other retrievers, with its own collection and index name.
quotes_retriever_model = GenerateRetriever(
    documents=quotes_documents,
    model_name="all-MiniLM-L6-v2",
    chunk_size=1000,
    chunk_overlap=100,
    db_name="HarryPotter_db",
    collection_name="HarryPotterCollection_Quotes",
    db_search_index_name="HarryPotterSearch_QuotesIndex",
    search_type="similarity",
    top_k_documents=5,
)
quotes_retriever = quotes_retriever_model.generateEmbeddings()

==== Success : Initiate Embedding Model =====
===== Success : Load Recursive Character Text Splitter =====


#### 4. Creating Retriever for Full Text

In [89]:
# Retriever over the whole book: the single full-text Document is passed as a
# one-element list and chunked like the other sources.
fulltext_retriever_model = GenerateRetriever(
    documents=[full_text_doc],
    model_name="all-MiniLM-L6-v2",
    chunk_size=1000,
    chunk_overlap=100,
    db_name="HarryPotter_db",
    collection_name="HarryPotterCollection_FullText",
    db_search_index_name="HarryPotterSearch_FullTextIndex",
    search_type="similarity",
    top_k_documents=5,
)
fulltext_retriever = fulltext_retriever_model.generateEmbeddings()

==== Success : Initiate Embedding Model =====
===== Success : Load Recursive Character Text Splitter =====


In [91]:
# Smoke-test the full-text retriever with a short query; the returned list is
# shown as the cell output.
fulltext_retriever.invoke("Chapter One")

[]

In [102]:
# Probe query used against the full-text retriever in the next cell.
# NOTE(review): presumably a sentence quoted from the book text — confirm.
query = 'Little tyke," chortled Mr. Dursley as he left the house.'

In [104]:
# `get_relevant_documents` is deprecated in LangChain; `invoke` is the
# supported entry point and matches the other retriever cells.
fulltext_retriever.invoke(query)

[]