Creating a Chroma VectorStore

In [1]:
import getpass
import os
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain_text_splitters import CharacterTextSplitter, MarkdownHeaderTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_core.documents import Document
from langchain_chroma import Chroma

In [2]:
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

In [3]:
loader_docx = Docx2txtLoader("../../data/docs/Introduction_to_Data_and_Data_Science_2.docx")
pages = loader_docx.load()
md_splitter = MarkdownHeaderTextSplitter(headers_to_split_on = [("#", "Course Title"), ("##", "Lecture Title")] )

In [4]:
pages_md_split = md_splitter.split_text(pages[0].page_content)

In [5]:
for i in range(len(pages_md_split)):
    pages_md_split[i].page_content  = ' '.join(pages_md_split[i].page_content.split())

In [6]:
char_splitter = CharacterTextSplitter(separator=".", chunk_size=500, chunk_overlap=50)
pages_char_split = char_splitter.split_documents(pages_md_split)

In [7]:
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

In [8]:
len(pages_char_split)

20

Embedd all document simaltaneously into the vector store (chroma db)
- vectorstore needs to know which embedding function to use to maintain consistency and accuracy in the representation of existing and newly added documents

In [9]:
vectorstore = Chroma.from_documents(documents= pages_char_split, embedding=embeddings, persist_directory = "./vectorstore/rag-practice" )

In [10]:
vectorstore_from_directory = Chroma(persist_directory="./vectorstore/rag-practice", embedding_function=embeddings)

Inspecting and Manding Documents in a vectorstore

In [11]:
vectorstore_from_directory.get(ids ="123ef422-8ec3-4cd7-89ad-e095e77998fd" , include=["embeddings"])

{'ids': ['123ef422-8ec3-4cd7-89ad-e095e77998fd'],
 'embeddings': array([[ 0.00478017, -0.01535145,  0.02508651, ...,  0.02121745,
         -0.01364157, -0.00687695]], shape=(1, 1536)),
 'documents': None,
 'uris': None,
 'included': ['embeddings'],
 'data': None,
 'metadatas': None}

#How to add a document 

In [12]:
added_document = Document(page_content='Alright! So… Let’s discuss the not-so-obvious differences between the terms analysis and analytics. Due to the similarity of the words, some people believe they share the same meaning, and thus use them interchangeably. Technically, this isn’t correct. There is, in fact, a distinct difference between the two. And the reason for one often being used instead of the other is the lack of a transparent understanding of both. So, let’s clear this up, shall we? First, we will start with analysis', 
                          metadata={'Course Title': 'Introduction to Data and Data Science', 
                                    'Lecture Title': 'Analysis vs Analytics'})

In [13]:
vectorstore_from_directory.add_documents([added_document])

['7ba74a51-2063-4b25-b6d5-10ce3cd08851']

In [14]:
vectorstore_from_directory.get("9ef73267-b650-4011-82ec-025a21e1095d")

{'ids': ['9ef73267-b650-4011-82ec-025a21e1095d'],
 'embeddings': None,
 'documents': ['Alright! So… Let’s discuss the not-so-obvious differences between the terms analysis and analytics. Due to the similarity of the words, some people believe they share the same meaning, and thus use them interchangeably. Technically, this isn’t correct. There is, in fact, a distinct difference between the two. And the reason for one often being used instead of the other is the lack of a transparent understanding of both. So, let’s clear this up, shall we? First, we will start with analysis'],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': [{'Lecture Title': 'Analysis vs Analytics',
   'Course Title': 'Introduction to Data and Data Science'}]}

In [15]:
updated_document = Document(page_content='Great! We hope we gave you a good idea about the level of applicability of the most frequently used programming and software tools in the field of data science. Thank you for watching!', 
                            metadata={'Course Title': 'Introduction to Data and Data Science', 
                                     'Lecture Title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need'})

In [17]:
vectorstore_from_directory.update_document(document_id="9ef73267-b650-4011-82ec-025a21e1095d", document = updated_document)

In [18]:
vectorstore_from_directory.get("9ef73267-b650-4011-82ec-025a21e1095d")

{'ids': ['9ef73267-b650-4011-82ec-025a21e1095d'],
 'embeddings': None,
 'documents': ['Great! We hope we gave you a good idea about the level of applicability of the most frequently used programming and software tools in the field of data science. Thank you for watching!'],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': [{'Lecture Title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need',
   'Course Title': 'Introduction to Data and Data Science'}]}

In [19]:
vectorstore_from_directory.delete("9ef73267-b650-4011-82ec-025a21e1095d")

In [20]:
vectorstore_from_directory.get("9ef73267-b650-4011-82ec-025a21e1095d")

{'ids': [],
 'embeddings': None,
 'documents': [],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': []}