# Loading and embedding additional knowledge into a vector store

In [58]:
# import needed libraries
import os

from dotenv import dotenv_values
from langchain.document_loaders import DirectoryLoader, PyPDFDirectoryLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

In [59]:
def getOpenAIKey(path=""):
    """Gets the OpenAI API key from a .env file.

    Args:
        path: Optional path to a .env file. When a non-empty value is given,
            the key is read from that file only and the built-in fallback
            locations are ignored.

    Returns:
        The value stored under ``OPENAI_API_KEY``. When no ``path`` is given,
        the fallback locations are tried in order and the first one that
        provides the key wins; ``None`` is returned if none of them does.

    Raises:
        KeyError: if an explicit ``path`` is given but the values read from it
            do not contain ``OPENAI_API_KEY``.
    """
    key_name = 'OPENAI_API_KEY'
    # Machine-specific fallback locations, tried in order when no path is given.
    paths = ["/Users/samisaf/openai.env", "C:/Users/samis/openai.env", "C:/Users/samisaf/openai.env"]
    if path:
        # An explicit path is authoritative: fail loudly if the key is absent.
        return dotenv_values(path)[key_name]
    for p in paths:
        # Parse each candidate only once. A candidate that yields no values, or
        # that has values but not this key, is skipped so the next one is tried.
        key = dotenv_values(p).get(key_name)
        if key is not None:
            return key
    return None

In [60]:
def loadDirectorySplitDocs(path: str, chunk_size=250, chunk_overlap=25):
    """Load every file found by DirectoryLoader under ``path`` and chunk it.

    Args:
        path: Directory handed to ``DirectoryLoader``.
        chunk_size: Forwarded to ``CharacterTextSplitter.from_tiktoken_encoder``.
        chunk_overlap: Forwarded to ``CharacterTextSplitter.from_tiktoken_encoder``.

    Returns:
        The result of ``split_documents`` applied to the loaded documents.
        A one-line summary with both counts is printed as a side effect.
    """
    loaded = DirectoryLoader(path=path).load()
    splitter = CharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    pieces = splitter.split_documents(loaded)
    n_loaded, n_pieces = len(loaded), len(pieces)
    print(f"loaded {path}. Got {n_loaded} documents. Splitted into {n_pieces} parts.")
    return pieces

In [61]:
# Read every PDF found under the data directory into LangChain documents.
# The documents are used exactly as loaded; loadDirectorySplitDocs is not applied here.
path = './data-kpmp-oct-23'
pdf_loader = PyPDFDirectoryLoader(path)
docs = pdf_loader.load()
len(docs)

224

In [62]:
# Build the vector store: embed the loaded documents with OpenAI embeddings
# and persist the result locally through chromadb.
#
# Chroma.from_documents() adds to whatever collection already lives in
# persist_directory, so re-running this cell against a populated directory
# would embed every document again and store duplicates. Only build when the
# directory is missing or empty; delete the directory to force a rebuild.
dbpath = "./db-kpmp-oct-23"
embedding = OpenAIEmbeddings(openai_api_key=getOpenAIKey())
if os.path.isdir(dbpath) and os.listdir(dbpath):
    # Reuse the store that is already on disk instead of re-embedding.
    db = Chroma(persist_directory=dbpath, embedding_function=embedding)
    print(f"database already exists at {dbpath}, skipped embedding")
else:
    db = Chroma.from_documents(docs, embedding, persist_directory=dbpath)
    db.persist()  # write the freshly built collection to dbpath
    print(f"created database at {dbpath}")

created database at ./db-kpmp-oct-23


# Loading the vector store and performing a similarity search

In [63]:
# Re-open the persisted Chroma store from disk and run a sample query against it.
query = "hemoglobin drop"
embedding = OpenAIEmbeddings(openai_api_key=getOpenAIKey())
readdb = Chroma(embedding_function=embedding, persist_directory='./db-kpmp-oct-23')
res = readdb.similarity_search(query)
print(res)
len(res)

[Document(page_content='Document ID: OPS002     5/10/2023     80 \nVersion #: 16  2. Vital sign s are monitored every 15 minutes for an hour, every 30 minutes for an hour, and then \nhourly.  Only the vitals (pulse and blood pressure) at the four -hour mark post -biopsy need to be \nrecorded in REDCap.  Pain level is assessed immediately and 4 hours after th e procedure.   \n3. Participants void their bladder at least once post -procedure to verify no obstruction from clot. A \nbedside commode is permitted, as appropriate after 2 hours of bed rest if initial urine is clear of \ngross blood.  \n4. If there is gross blood in the urine, participants remain at bed rest until urine is clear.  \n5. Participants are observed for 4 hours in the post -procedure recovery area at the participating \nsite. At the end of 4 hours, hemoglobin is measured.  If participant s are stable, they continue to \nmaintain bedrest with bathroom privileges to complete the period of observation.  \n6. If particip

4