ChromaDB

In [3]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter

loader = TextLoader("sample.txt")
doc = loader.load()
doc


[Document(metadata={'source': 'sample.txt'}, page_content="LangChain is an open-source framework designed to help developers build applications powered by large language models (LLMs).\n\nIt provides tools for loading, processing, and managing different types of data sources such as text files, PDFs, web pages, and databases.\n\nUsing LangChain's document loaders, we can efficiently fetch data from multiple sources and utilize it for various AI-based tasks.\n")]

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100,chunk_overlap=10)
docs = text_splitter.split_documents(doc)
docs

[Document(metadata={'source': 'sample.txt'}, page_content='LangChain is an open-source framework designed to help developers build applications powered by'),
 Document(metadata={'source': 'sample.txt'}, page_content='by large language models (LLMs).'),
 Document(metadata={'source': 'sample.txt'}, page_content='It provides tools for loading, processing, and managing different types of data sources such as'),
 Document(metadata={'source': 'sample.txt'}, page_content='such as text files, PDFs, web pages, and databases.'),
 Document(metadata={'source': 'sample.txt'}, page_content="Using LangChain's document loaders, we can efficiently fetch data from multiple sources and utilize"),
 Document(metadata={'source': 'sample.txt'}, page_content='utilize it for various AI-based tasks.')]

In [5]:
from langchain_community.embeddings import OllamaEmbeddings
embeddings = OllamaEmbeddings(model='gemma2:2b')

  embeddings = OllamaEmbeddings(model='gemma2:2b')


In [6]:
from langchain_chroma import Chroma
vectordb = Chroma.from_documents(documents=docs, embedding=embeddings)
vectordb

<langchain_chroma.vectorstores.Chroma at 0x158e2804be0>

In [9]:
query = "What is Langchain"
query_result = vectordb.similarity_search(query= query)
query_result[0].page_content

'by large language models (LLMs).'

In [10]:
# Save to Disk
vectordb = Chroma.from_documents(documents=docs, embedding=embeddings, persist_directory='./chroma_db')

In [11]:
#  load vector db
chroma_db = Chroma(persist_directory="./chroma_db", embedding_function=embeddings)

In [12]:
docs = chroma_db.similarity_search(query)
docs[0].page_content

'by large language models (LLMs).'

In [14]:
docs_score = vectordb.similarity_search_with_score(query)
docs_score

[(Document(id='dbf85943-5e3a-4a56-a0c1-6bf5ba2f7eef', metadata={'source': 'sample.txt'}, page_content='by large language models (LLMs).'),
  8560.378163357245),
 (Document(id='34b65273-e607-4b0a-ae8d-1d28f51feaa4', metadata={'source': 'sample.txt'}, page_content='utilize it for various AI-based tasks.'),
  9334.688244095338),
 (Document(id='80e030e6-e9fb-4b32-92f2-09cd971e94b2', metadata={'source': 'sample.txt'}, page_content="Using LangChain's document loaders, we can efficiently fetch data from multiple sources and utilize"),
  11855.716835248231),
 (Document(id='8f1ea4d7-6b73-4fe2-83cc-9c3009263fff', metadata={'source': 'sample.txt'}, page_content='such as text files, PDFs, web pages, and databases.'),
  12009.64020022119)]

#### As a Retriever

We can aslos convert the vectore store into a Retriever calss. This allows us to easily use it in other LangChain methods, wich largely work with retrievers.

In [16]:
retriever = vectordb.as_retriever()
query_results = retriever.invoke(query)
query_results[0].page_content

'by large language models (LLMs).'