In [43]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
from dotenv import load_dotenv
# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# API keys are looked up under the project-specific names "gemini_api" and
# "openai_api"; either is None when the variable is not defined.
gemini_api = os.getenv("gemini_api")
openai_api = os.getenv("openai_api")

# os.environ only accepts strings: assigning None raises TypeError. Export the
# key only when it was actually found, so a GOOGLE_API_KEY that is already set
# in the environment keeps working when "gemini_api" is absent.
if gemini_api is not None:
    os.environ['GOOGLE_API_KEY'] = gemini_api
from langchain_google_genai.chat_models import ChatGoogleGenerativeAI
from langchain_openai.chat_models import ChatOpenAI
# Chat model shared by the rest of the notebook. The Gemini model is the
# active one; the OpenAI alternative is kept below for quick switching:
# llm = ChatOpenAI(model="gpt-4o-mini",api_key=openai_api, temperature=0)
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash-exp",   api_key=gemini_api)

from IPython.display import display, Markdown,Image

from typing import List, Union, Optional, TypedDict

In [44]:
from sentence_transformers import SentenceTransformer
import faiss
from typing import List, Union
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Set up a joblib cache that persists results on disk under the
# "cache_directory" folder (relative path); verbose=0 is passed to joblib.
# `memory` is used by DocHandle below.
from joblib import Memory
memory = Memory(location="cache_directory", verbose=0)

class DocHandle:
    """Split a document into chunks, embed them and answer similarity queries.

    On construction the text is split into overlapping chunks, every chunk is
    embedded with a SentenceTransformer model, and the embeddings are stored
    in a flat L2 FAISS index. ``retrieved_text`` then returns the chunks whose
    embeddings are closest to a query.

    Attributes:
        text: The full source document.
        embedding_model_name: Name the embedding model was loaded from.
        embedding_model: The loaded SentenceTransformer instance.
        text_chunks: Chunks produced by ``text_splitter``.
        index: FAISS index holding one vector per entry of ``text_chunks``.
    """

    def __init__(self, text: str, embedding_model_name: str = "all-MiniLM-L6-v2"):
        """Load the embedding model, chunk ``text`` and build the index.

        Args:
            text: Document to index.
            embedding_model_name: Model identifier passed to SentenceTransformer.
        """
        self.text = text
        # Kept so the embedding cache can be keyed on the model's name rather
        # than on the (large) model object itself.
        self.embedding_model_name = embedding_model_name
        self.embedding_model = SentenceTransformer(embedding_model_name)
        self.text_chunks = self.text_splitter()
        self.index = self.get_faiss_index()

    def text_splitter(self) -> List[str]:
        """Split ``self.text`` on newlines into chunks of up to 1000 characters.

        Consecutive chunks overlap by up to 300 characters.

        Returns:
            The list of text chunks.
        """
        splitter = RecursiveCharacterTextSplitter(
            separators=["\n"], chunk_size=1000, chunk_overlap=300, length_function=len
        )
        return splitter.split_text(self.text)

    @staticmethod
    def _encode_chunks(model, model_name: str, chunks: List[str]):
        """Encode ``chunks`` with ``model``.

        ``model_name`` is not used in the body; it exists so that the joblib
        cache key (built from the non-ignored arguments) changes when a
        different embedding model is selected.
        """
        return model.encode(chunks)

    def get_text_embeddings(self):
        """Return the embeddings of ``self.text_chunks``, cached on disk.

        joblib's ``Memory.cache`` must not decorate a method directly, so the
        work is delegated to a static helper. The model object is excluded
        from the cache key via ``ignore``; the key is the model name plus the
        chunk list.

        Returns:
            Whatever ``self.embedding_model.encode`` returns for the chunks.
        """
        cached_encode = memory.cache(DocHandle._encode_chunks, ignore=["model"])
        return cached_encode(
            self.embedding_model, self.embedding_model_name, self.text_chunks
        )

    def get_faiss_index(self):
        """Build a flat L2 FAISS index over the chunk embeddings.

        The index is rebuilt in memory on every call rather than cached.

        Returns:
            A ``faiss.IndexFlatL2`` containing one vector per chunk.
        """
        embeddings = self.get_text_embeddings()
        dimension = embeddings.shape[1]
        index = faiss.IndexFlatL2(dimension)
        index.add(embeddings)
        return index

    def retrieved_text(self, query: Union[List[str], str], k: int = 5) -> List[str]:
        """Return the chunks nearest to ``query``.

        Args:
            query: A query string, or a list of query strings. Only the
                results for the first query in the list are returned.
            k: Maximum number of chunks to return (default 5).

        Returns:
            Up to ``k`` chunks, in the reverse of the order reported by the
            index search (FAISS documents results as sorted by increasing
            distance, so the closest chunk comes last).

        Raises:
            ValueError: If ``k`` is not positive.
        """
        if k <= 0:
            raise ValueError("k must be a positive integer")
        if isinstance(query, str):
            query = [query]
        # Never ask for more neighbours than there are chunks.
        k = min(k, len(self.text_chunks))
        if k == 0:
            return []
        query_embed = self.embedding_model.encode(query)
        _scores, idx = self.index.search(query_embed, k)
        doc_ids = idx[0][::-1]
        # FAISS pads missing neighbours with -1, which would otherwise index
        # the last chunk; drop those ids.
        return [self.text_chunks[i] for i in doc_ids if i >= 0]


# Example usage: index a book and print the chunks retrieved for one query.
# Read the whole book into memory as UTF-8 text.
with open("data/book1.txt", "r", encoding="utf-8") as file:
    book = file.read()

# Constructing the handler chunks the text, embeds it and builds the index.
doc_handler = DocHandle(book)
query = "What is the main idea of the text?"
results = doc_handler.retrieved_text(query)

# Print the retrieved chunks in the order returned, numbered from 1.
for i, result in enumerate(results):
    print(f"Result {i+1}:\n{result}\n")


Result 1:
arrogance, but, on the contrary, a true spirit of fearless, but patient
and candid, inquiry. It is a mighty problem of which he proposes a
solution, and he does no more than propose it: in his Preface declaring
that, to himself at least, his arguments “appear to be of no small
philosophical force, though he is quite ready to weigh carefully and
candidly any answer which may be offered to them.”

We feel grateful to the accomplished Essayist for the storehouse of
authentic facts, and the novel combination of inferences from them, with
which he has presented us; and we are not aware that he has given us
just reason to regret confiding in his correctness or candour. And in
travelling with him through his vast and chequered course, we feel that
we have accompanied not only the philosopher and the divine, but the
gentleman: one who, while manifestly knowing what is due to himself, as
manifestly respects his intelligent reader. In several of his

Result 2:
intimated in our last Num