In [1]:
from langchain_community.document_loaders import TextLoader,DirectoryLoader
from langchain_community.vectorstores import FAISS,Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.messages import AIMessage,HumanMessage,SystemMessage,BaseMessage,ToolMessage
from langchain_core.prompts import ChatPromptTemplate,HumanMessagePromptTemplate,SystemMessagePromptTemplate
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableMap,RunnableLambda,RunnableParallel,RunnablePassthrough
from langchain.utilities import WikipediaAPIWrapper, ArxivAPIWrapper
from langchain_community.tools.arxiv.tool import ArxivQueryRun
from langchain.tools import WikipediaQueryRun
from langchain.tools.retriever import create_retriever_tool
from langchain_tavily import TavilySearch
from langgraph.graph import END,START,StateGraph
from langgraph.graph.message import add_messages
from langgraph.prebuilt import ToolNode
from pydantic import BaseModel,Field
from typing import Literal
from typing_extensions import TypedDict

from dotenv import load_dotenv
load_dotenv()

llm_model = ChatGoogleGenerativeAI(model = 'gemini-2.0-flash',max_retries=2)
embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2',) ##embedding model

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
docs = DirectoryLoader(
    path = 'policies',
    glob="*.txt",
    loader_cls=TextLoader,
    loader_kwargs={'encoding':'utf-8'}
).load()
print(f"Number of docs loaded: {len(docs)}")
print(f"Document preview: {docs[2].page_content[:200]}")
print(f"Meta data preview: {docs[2].metadata}")



doc_chunks = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap = 50,
    separators=["\n"," "]
).split_documents(docs)
print("\n")
print(f"Number of chunks created : {len(doc_chunks)}")
print(f"Chunks preview (200 characters) : {doc_chunks[0]}")

### vector store ###
persist_dir = "./chromadb"
vector_store1 = Chroma.from_documents(
    documents=doc_chunks,
    embedding= embedding_model,
    collection_name='vec_store1',
    persist_directory=persist_dir
)

print('\n')
retreiver1 = vector_store1.as_retriever(
    search_type = 'mmr', search_kwargs = {'k':3})
print(f"Number of vectors stored : {vector_store1._collection.count()}")

Number of docs loaded: 3
Document preview: ﻿POLICY CONTRACT: SENTINEL 'SILVER' HEALTH SURAKSHA
UIN: SHA-HLT-SLV-2025-V2 | CATEGORY: RETAIL INDEMNITY
VERSION: 4.2 


PREAMBLE
WHEREAS the Insured named in the Schedule hereto has by a proposal an
Meta data preview: {'source': 'policies\\silver_policy.txt'}


Number of chunks created : 24
Chunks preview (200 characters) : page_content='﻿POLICY CONTRACT: SENTINEL 'GOLD' PRIVILEGE
UIN: SHA-HLT-GLD-2025-V1 | CATEGORY: COMPREHENSIVE
VERSION: 2.1 


SECTION 1: OPERATIVE CLAUSE
The Company undertakes to indemnify the Insured Person against Medically Necessary expenses incurred for In-patient Care, Day Care Treatment, and Domiciliary Hospitalization, subject to the terms and sub-limits herein.


SECTION 2: CORE BENEFITS
2.1. ROOM RENT ELIGIBILITY
    (a) The Policy covers expenses for a "Single Private A/C Room".' metadata={'source': 'policies\\gold_policy.txt'}


Number of vectors stored : 24
