In [1]:
#importing the module
from langchain_openai import OpenAIEmbeddings #this class we used to load openai embedding models
from langchain_core.prompts import PromptTemplate,ChatPromptTemplate #this class we used to create dynamic or role based prompt
import os
from langchain_core.runnables import RunnableBranch,RunnableParallel,RunnablePassthrough,RunnableLambda,RunnableSequence
from langchain_openai import ChatOpenAI #this class we used to load openai llm models
from langchain_chroma import Chroma #this class we used to create chroma vector store database
from langchain_community.document_loaders import TextLoader,PDFPlumberLoader,DirectoryLoader
from langchain_core.output_parsers import StrOutputParser,PydanticOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter,CharacterTextSplitter
from dotenv import load_dotenv
from typing import TypedDict,Literal,Annotated
from pydantic import BaseModel,Field,computed_field


In [2]:
load_dotenv()


#creating an object of embedding models
emb_models = OpenAIEmbeddings(
    model="text-embedding-3-large",
    dimensions=128
)


#creating and object llm chatmodel 
chat_model = ChatOpenAI(
    temperature=0 #we called as creative parameter
)


In [3]:
# Custom Loader Class
class CustomTextLoader(TextLoader):
    def lazy_load(self):
        # File name extract
        filename = os.path.basename(self.file_path)   # e.g. "RCB_virat.txt"
        team_name = filename.split("_")[0]            # e.g. "RCB"
        
        # Iterate lazily from parent class
        for doc in super().lazy_load():
            doc.metadata["team"] = team_name
            yield doc

In [4]:
os.getcwd()

'd:\\Langchain\\VectorStore'

In [5]:
os.chdir('../')
%pwd

'd:\\Langchain'

In [6]:
# DirectoryLoader with CustomTextLoader
loader = DirectoryLoader(
    path=r"VectorStore\textDocument",
    loader_cls=CustomTextLoader,
    show_progress=True,
    glob="*.txt"
)

# lazy load documents
documents = list(loader.lazy_load())
print(documents)


100%|██████████| 5/5 [00:00<00:00, 5005.14it/s]

[Document(metadata={'source': 'VectorStore\\textDocument\\CSK_dhoni.txt', 'team': 'CSK'}, page_content='MS Dhoni, famously known as Captain Cool, has led Chennai Super Kings to multiple IPL titles. \nHis finishing skills, wicketkeeping, and leadership are legendary.'), Document(metadata={'source': 'VectorStore\\textDocument\\CSK_jadeja.txt', 'team': 'CSK'}, page_content='Ravindra Jadeja is a dynamic all-rounder who contributes with both bat and ball. \nRepresenting Chennai Super Kings, his quick fielding and match-winning performances make him a key player.'), Document(metadata={'source': 'VectorStore\\textDocument\\MI_bumrah.txt', 'team': 'MI'}, page_content='Jasprit Bumrah is considered one of the best fast bowlers in T20 cricket. \nPlaying for Mumbai Indians, he is known for his yorkers and death-over expertise.'), Document(metadata={'source': 'VectorStore\\textDocument\\MI_rohit.txt', 'team': 'MI'}, page_content="Rohit Sharma is the most successful captain in IPL history, leading M




In [7]:
#now creating object chroma vector store database by using Chroma class.
chroma = Chroma(
    #embedding_function parameter we have to define embedding models who will convert doc to embedding vectors
    embedding_function=emb_models, 
    persist_directory= "chroma_db", #means which loaction u want to store embedding vectors
    collection_name="sample"
    
)



In [8]:
#Now adding the documents to vector database
chroma.add_documents(documents)


['63447478-e734-41c7-bf9f-f314a18b7880',
 '15456d66-b0f7-458d-bb2d-a93040306a54',
 '615e61f7-1687-4e20-ab02-2583dff5e43c',
 '6955a72f-fec7-440e-ae7a-e632fdae9574',
 'ca83dbc1-ea29-4585-8d8f-757b063f534f']

In [9]:

#if i want to show the documents from vector store database
chroma.get(include=["embeddings", "metadatas", "documents"])

{'ids': ['63447478-e734-41c7-bf9f-f314a18b7880',
  '15456d66-b0f7-458d-bb2d-a93040306a54',
  '615e61f7-1687-4e20-ab02-2583dff5e43c',
  '6955a72f-fec7-440e-ae7a-e632fdae9574',
  'ca83dbc1-ea29-4585-8d8f-757b063f534f'],
 'embeddings': array([[ 0.21147315, -0.12693517, -0.04423499,  0.10522371,  0.02577169,
          0.06312568, -0.03609318, -0.04530346, -0.02159394, -0.03085764,
          0.01772605,  0.05141516, -0.01458472, -0.07962298, -0.06551906,
          0.01970273, -0.05372307,  0.15420273, -0.09445345, -0.14676613,
          0.0188159 , -0.07192993, -0.11650683,  0.00033006, -0.03252447,
          0.04466238, -0.0295541 , -0.09146171, -0.05624468, -0.08338401,
          0.05197076,  0.07688767,  0.23916809, -0.12941405, -0.18155576,
          0.07970845, -0.03596497, -0.13727804,  0.06410868, -0.05004751,
          0.04645742, -0.01444582, -0.03357157, -0.09069241, -0.00969644,
          0.04991929, -0.25728947, -0.13009787, -0.05727042,  0.02649825,
          0.15317699, -0.082

In [10]:
#now doing semantic search in vector store database.
chroma.similarity_search(query="who among these are bowler?",k=2)

[Document(id='615e61f7-1687-4e20-ab02-2583dff5e43c', metadata={'source': 'VectorStore\\textDocument\\MI_bumrah.txt', 'team': 'MI'}, page_content='Jasprit Bumrah is considered one of the best fast bowlers in T20 cricket. \nPlaying for Mumbai Indians, he is known for his yorkers and death-over expertise.'),
 Document(id='15456d66-b0f7-458d-bb2d-a93040306a54', metadata={'source': 'VectorStore\\textDocument\\CSK_jadeja.txt', 'team': 'CSK'}, page_content='Ravindra Jadeja is a dynamic all-rounder who contributes with both bat and ball. \nRepresenting Chennai Super Kings, his quick fielding and match-winning performances make him a key player.')]

In [14]:
#now doing semantic search and also want to see scores in vector store database.
chroma.similarity_search_with_score(
    query="who among these are a bowler?",
    k=2
    
)

[(Document(id='615e61f7-1687-4e20-ab02-2583dff5e43c', metadata={'source': 'VectorStore\\textDocument\\MI_bumrah.txt', 'team': 'MI'}, page_content='Jasprit Bumrah is considered one of the best fast bowlers in T20 cricket. \nPlaying for Mumbai Indians, he is known for his yorkers and death-over expertise.'),
  0.8214887976646423),
 (Document(id='15456d66-b0f7-458d-bb2d-a93040306a54', metadata={'source': 'VectorStore\\textDocument\\CSK_jadeja.txt', 'team': 'CSK'}, page_content='Ravindra Jadeja is a dynamic all-rounder who contributes with both bat and ball. \nRepresenting Chennai Super Kings, his quick fielding and match-winning performances make him a key player.'),
  0.9553366899490356)]

In [None]:
#now doing semantic search and also want to see scores in vector store database.
chroma.similarity_search_with_score(
    query="who among these are a bowler?",
    k=2
    
)

In [16]:
#using metadata se we can filter the documents
chroma.similarity_search_with_score(
    query = "who is the best batsman in IPL Team",
    k = 2,
    filter={'team':'RCB'}
)

[(Document(id='ca83dbc1-ea29-4585-8d8f-757b063f534f', metadata={'source': 'VectorStore\\textDocument\\RCB_virat.txt', 'team': 'RCB'}, page_content='Virat Kohli is one of the most successful and consistent batsmen in IPL history. \nKnown for his aggressive batting style and fitness, he has led the Royal Challengers Bangalore in multiple seasons.'),
  0.6539552211761475)]

In [19]:
#using metadata se we can filter the documents
chroma.similarity_search_with_score(
    query = "",
    k = 2,
    filter={'team':'CSK'}
)

[(Document(id='63447478-e734-41c7-bf9f-f314a18b7880', metadata={'team': 'CSK', 'source': 'VectorStore\\textDocument\\CSK_dhoni.txt'}, page_content='MS Dhoni, famously known as Captain Cool, has led Chennai Super Kings to multiple IPL titles. \nHis finishing skills, wicketkeeping, and leadership are legendary.'),
  1.671717643737793),
 (Document(id='15456d66-b0f7-458d-bb2d-a93040306a54', metadata={'team': 'CSK', 'source': 'VectorStore\\textDocument\\CSK_jadeja.txt'}, page_content='Ravindra Jadeja is a dynamic all-rounder who contributes with both bat and ball. \nRepresenting Chennai Super Kings, his quick fielding and match-winning performances make him a key player.'),
  1.7176929712295532)]

# how to update the vectors or document in vector store database

In [20]:
# update documents
from langchain.schema import Document
updated_doc1 = Document(
    page_content="Virat Kohli, the former captain of Royal Challengers Bangalore (RCB), is renowned for his aggressive leadership and consistent batting performances. He holds the record for the most runs in IPL history, including multiple centuries in a single season. Despite RCB not winning an IPL title under his captaincy, Kohli's passion and fitness set a benchmark for the league. His ability to chase targets and anchor innings has made him one of the most dependable players in T20 cricket.",
    metadata={"team": "RCB"}
)
updated_doc1


Document(metadata={'team': 'RCB'}, page_content="Virat Kohli, the former captain of Royal Challengers Bangalore (RCB), is renowned for his aggressive leadership and consistent batting performances. He holds the record for the most runs in IPL history, including multiple centuries in a single season. Despite RCB not winning an IPL title under his captaincy, Kohli's passion and fitness set a benchmark for the league. His ability to chase targets and anchor innings has made him one of the most dependable players in T20 cricket.")

In [21]:
chroma.update_document(
    document_id='ca83dbc1-ea29-4585-8d8f-757b063f534f',
    document=updated_doc1
    
)

In [22]:
# view documents
chroma.get(include=['embeddings','documents', 'metadatas'])

{'ids': ['63447478-e734-41c7-bf9f-f314a18b7880',
  '15456d66-b0f7-458d-bb2d-a93040306a54',
  '615e61f7-1687-4e20-ab02-2583dff5e43c',
  '6955a72f-fec7-440e-ae7a-e632fdae9574',
  'ca83dbc1-ea29-4585-8d8f-757b063f534f'],
 'embeddings': array([[ 2.11473152e-01, -1.26935169e-01, -4.42349873e-02,
          1.05223708e-01,  2.57716868e-02,  6.31256774e-02,
         -3.60931829e-02, -4.53034639e-02, -2.15939395e-02,
         -3.08576412e-02,  1.77260488e-02,  5.14151566e-02,
         -1.45847239e-02, -7.96229765e-02, -6.55190647e-02,
          1.97027326e-02, -5.37230708e-02,  1.54202729e-01,
         -9.44534466e-02, -1.46766126e-01,  1.88158955e-02,
         -7.19299316e-02, -1.16506830e-01,  3.30059498e-04,
         -3.25244665e-02,  4.46623750e-02, -2.95540988e-02,
         -9.14617106e-02, -5.62446788e-02, -8.33840147e-02,
          5.19707650e-02,  7.68876672e-02,  2.39168093e-01,
         -1.29414052e-01, -1.81555763e-01,  7.97084495e-02,
         -3.59649658e-02, -1.37278035e-01,  6.41

# to delete the vector in vector store database

In [23]:
chroma.delete(
    ids ='ca83dbc1-ea29-4585-8d8f-757b063f534f'
    
)

In [24]:
# view documents
chroma.get(include=['embeddings','documents', 'metadatas'])

{'ids': ['63447478-e734-41c7-bf9f-f314a18b7880',
  '15456d66-b0f7-458d-bb2d-a93040306a54',
  '615e61f7-1687-4e20-ab02-2583dff5e43c',
  '6955a72f-fec7-440e-ae7a-e632fdae9574'],
 'embeddings': array([[ 0.21147315, -0.12693517, -0.04423499,  0.10522371,  0.02577169,
          0.06312568, -0.03609318, -0.04530346, -0.02159394, -0.03085764,
          0.01772605,  0.05141516, -0.01458472, -0.07962298, -0.06551906,
          0.01970273, -0.05372307,  0.15420273, -0.09445345, -0.14676613,
          0.0188159 , -0.07192993, -0.11650683,  0.00033006, -0.03252447,
          0.04466238, -0.0295541 , -0.09146171, -0.05624468, -0.08338401,
          0.05197076,  0.07688767,  0.23916809, -0.12941405, -0.18155576,
          0.07970845, -0.03596497, -0.13727804,  0.06410868, -0.05004751,
          0.04645742, -0.01444582, -0.03357157, -0.09069241, -0.00969644,
          0.04991929, -0.25728947, -0.13009787, -0.05727042,  0.02649825,
          0.15317699, -0.08231554, -0.01988437,  0.08321306, -0.008654