In [2]:
from pathlib import Path

from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader #load
from langchain_text_splitters import RecursiveCharacterTextSplitter#split
from langchain_community.embeddings import OllamaEmbeddings#embeddings
 

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read speech.txt (path is relative to the notebook's working directory) and
# show what the loader returned; the recorded output is a list holding a
# single Document whose metadata carries the source file name.
text_loader = TextLoader("speech.txt")
documents = text_loader.load()
documents


[Document(metadata={'source': 'speech.txt'}, page_content="I am honored to be with you today at your commencement from one of the finest universities in the world. I never graduated from college. Truth be told, this is the closest I've ever gotten to a college graduation. Today I want to tell you three stories from my life. That's it. No big deal. Just three stories.\n\nThe first story is about connecting the dots. I dropped out of Reed College after the first 6 months, but then stayed around as a drop-in for another 18 months or so before I really quit. So why did I drop out?\n\nIt started before I was born. My biological mother was a young, unwed college graduate student, and she decided to put me up for adoption. She felt very strongly that I should be adopted by college graduates, so everything was all set for me to be adopted at birth by a lawyer and his wife. Except that when I popped out they decided at the last minute that they really wanted a girl.")]

In [4]:
# Cut the loaded document into small overlapping chunks so each one can be
# embedded and retrieved on its own: at most 100 characters per chunk, with
# 20 characters shared between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
)
documents_splitted = text_splitter.split_documents(documents)
documents_splitted

[Document(metadata={'source': 'speech.txt'}, page_content='I am honored to be with you today at your commencement from one of the finest universities in the'),
 Document(metadata={'source': 'speech.txt'}, page_content="universities in the world. I never graduated from college. Truth be told, this is the closest I've"),
 Document(metadata={'source': 'speech.txt'}, page_content="is the closest I've ever gotten to a college graduation. Today I want to tell you three stories"),
 Document(metadata={'source': 'speech.txt'}, page_content="you three stories from my life. That's it. No big deal. Just three stories."),
 Document(metadata={'source': 'speech.txt'}, page_content='The first story is about connecting the dots. I dropped out of Reed College after the first 6'),
 Document(metadata={'source': 'speech.txt'}, page_content='after the first 6 months, but then stayed around as a drop-in for another 18 months or so before I'),
 Document(metadata={'source': 'speech.txt'}, page_content='or so b

In [7]:
# Create the embedding model wrapper, then embed every chunk and index the
# vectors in a Chroma store. No persist_directory is given here, so nothing
# is written to a named location on disk by this cell.
# NOTE(review): presumably requires an Ollama server with "mxbai-embed-large"
# available - confirm in the environment setup.
embeddings = OllamaEmbeddings(model="mxbai-embed-large")
vector_db = Chroma.from_documents(
    documents=documents_splitted,
    embedding=embeddings,
)

In [8]:
# The vector store was already built in the previous cell, which passes the
# embedding model through the `embedding=` keyword. Calling
# Chroma.from_documents a second time would re-embed every chunk and may add
# the same chunks again to the in-memory collection (showing up as duplicate
# search hits), so `vector_db` from the previous cell is reused as-is and
# nothing is rebuilt here.

In [9]:
# Display the store object's repr to confirm that it was created.
vector_db

<langchain_chroma.vectorstores.Chroma at 0x2531efc3700>

In [20]:
# Ask a natural-language question and fetch the chunks most similar to it.
# Note: despite its name, `vectordb` holds the list of matching Document
# objects (four in the recorded output), not the vector store itself - that
# is `vector_db`.
QUERY = "Who adopted the speaker?"
vectordb = vector_db.similarity_search(QUERY)
vectordb

[Document(id='fded065a-61d4-4d6d-8225-3b8847651c16', metadata={'source': 'speech.txt'}, page_content='be adopted at birth by a lawyer and his wife. Except that when I popped out they decided at the'),
 Document(id='11ab0872-792e-4ffe-8ca4-af0286f3b756', metadata={'source': 'speech.txt'}, page_content='student, and she decided to put me up for adoption. She felt very strongly that I should be adopted'),
 Document(id='0094c2d2-d4fd-48fa-924d-b8b98e98c56c', metadata={'source': 'speech.txt'}, page_content='I should be adopted by college graduates, so everything was all set for me to be adopted at birth'),
 Document(id='9e644115-2b45-4a25-a222-22ac17284aaf', metadata={'source': 'speech.txt'}, page_content='It started before I was born. My biological mother was a young, unwed college graduate student, and')]

In [21]:
# Text of the first search hit.
vectordb[0].page_content

'be adopted at birth by a lawyer and his wife. Except that when I popped out they decided at the'

In [17]:
# Character count of the first hit's text (95 in the recorded output, i.e.
# within the splitter's chunk_size of 100).
len(vectordb[0].page_content)

95

In [22]:
# Show the full list of search hits again.
vectordb

[Document(id='fded065a-61d4-4d6d-8225-3b8847651c16', metadata={'source': 'speech.txt'}, page_content='be adopted at birth by a lawyer and his wife. Except that when I popped out they decided at the'),
 Document(id='11ab0872-792e-4ffe-8ca4-af0286f3b756', metadata={'source': 'speech.txt'}, page_content='student, and she decided to put me up for adoption. She felt very strongly that I should be adopted'),
 Document(id='0094c2d2-d4fd-48fa-924d-b8b98e98c56c', metadata={'source': 'speech.txt'}, page_content='I should be adopted by college graduates, so everything was all set for me to be adopted at birth'),
 Document(id='9e644115-2b45-4a25-a222-22ac17284aaf', metadata={'source': 'speech.txt'}, page_content='It started before I was born. My biological mother was a young, unwed college graduate student, and')]

In [None]:
# Persist the Chroma vector database to disk, building it only once.
# Chroma.from_documents inserts the chunks every time it is called, so
# re-running this cell against an already populated "chromadb_data" directory
# would store the same chunks again and similarity searches would start
# returning duplicate hits. Reuse the saved store when it exists; delete the
# directory to force a rebuild (e.g. after speech.txt or the splitter
# settings change, since the saved store is not refreshed automatically).
persist_path = Path("chromadb_data")
if persist_path.is_dir() and any(persist_path.iterdir()):
    # A populated directory is already there: open it instead of re-embedding.
    vector_db = Chroma(
        persist_directory=str(persist_path),
        embedding_function=embeddings,
    )
else:
    # First run (or directory removed): embed the chunks and write them out.
    vector_db = Chroma.from_documents(
        documents=documents_splitted,
        embedding=embeddings,
        persist_directory=str(persist_path),
    )

In [23]:
# Re-open the vector database saved under "chromadb_data" instead of
# rebuilding it. The same embedding model is supplied so that queries are
# embedded the same way as the stored chunks. This rebinds `vector_db` to the
# on-disk store and assumes the directory was populated by an earlier run of
# the persisting cell.
vector_db = Chroma(
    persist_directory="chromadb_data",
    embedding_function=embeddings,
)

In [24]:
# Run the same query but return (Document, score) pairs instead of bare
# Documents. In the recorded output the scores rise down the list while the
# hit order matches the plain similarity search, so a lower score means a
# closer match.
# NOTE(review): presumably the score is a distance under the collection's
# configured metric - confirm before comparing scores across collections.
vector_db.similarity_search_with_score(QUERY)

[(Document(id='fded065a-61d4-4d6d-8225-3b8847651c16', metadata={'source': 'speech.txt'}, page_content='be adopted at birth by a lawyer and his wife. Except that when I popped out they decided at the'),
  170.6599884033203),
 (Document(id='11ab0872-792e-4ffe-8ca4-af0286f3b756', metadata={'source': 'speech.txt'}, page_content='student, and she decided to put me up for adoption. She felt very strongly that I should be adopted'),
  189.53292846679688),
 (Document(id='0094c2d2-d4fd-48fa-924d-b8b98e98c56c', metadata={'source': 'speech.txt'}, page_content='I should be adopted by college graduates, so everything was all set for me to be adopted at birth'),
  195.22096252441406),
 (Document(id='9e644115-2b45-4a25-a222-22ac17284aaf', metadata={'source': 'speech.txt'}, page_content='It started before I was born. My biological mother was a young, unwed college graduate student, and'),
  232.674560546875)]

In [28]:
# Wrap the vector store in a retriever object and display it; the recorded
# repr shows no custom search_kwargs, i.e. default search settings.
retriever = vector_db.as_retriever()
retriever

VectorStoreRetriever(tags=['Chroma', 'OllamaEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x000002531EFC3A00>, search_kwargs={})

In [32]:
# Run the query through the retriever and display the Documents it returns
# (the recorded output matches the hits from the direct similarity search).
RESULTS = retriever.invoke(QUERY)
RESULTS

[Document(id='fded065a-61d4-4d6d-8225-3b8847651c16', metadata={'source': 'speech.txt'}, page_content='be adopted at birth by a lawyer and his wife. Except that when I popped out they decided at the'),
 Document(id='11ab0872-792e-4ffe-8ca4-af0286f3b756', metadata={'source': 'speech.txt'}, page_content='student, and she decided to put me up for adoption. She felt very strongly that I should be adopted'),
 Document(id='0094c2d2-d4fd-48fa-924d-b8b98e98c56c', metadata={'source': 'speech.txt'}, page_content='I should be adopted by college graduates, so everything was all set for me to be adopted at birth'),
 Document(id='9e644115-2b45-4a25-a222-22ac17284aaf', metadata={'source': 'speech.txt'}, page_content='It started before I was born. My biological mother was a young, unwed college graduate student, and')]

In [33]:
# Print the text of the first retrieved chunk as plain text (no repr quotes).
print(RESULTS[0].page_content)

be adopted at birth by a lawyer and his wife. Except that when I popped out they decided at the
