# loading and embedding additional knowledge into a vectorstore

In [1]:
# import needed libraries
from dotenv import dotenv_values
from langchain.document_loaders import DirectoryLoader, PyPDFDirectoryLoader, UnstructuredFileLoader, UnstructuredMarkdownLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

In [6]:
def getOpenAIKey(path=""):
    """Gets OpenAI API key from .env file"""
    paths = ["/Users/samisaf/openai.env", "C:/Users/samis/openai.env", "C:/Users/samisaf/openai.env"]
    if len(path) > 0:
        return dotenv_values(path)['OPENAI_API_KEY']
    else:
        for p in paths:
            if len(dotenv_values(p)) > 0:
                return dotenv_values(p)['OPENAI_API_KEY']
    return None

In [2]:
def loadDirectorySplitDocs(path: str, chunk_size=250, chunk_overlap=25, loader_cls=UnstructuredFileLoader):
    """Loads files in a directory, then splits them"""
    documents = DirectoryLoader(path=path, loader_cls=loader_cls).load()
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    splitDocs = text_splitter.split_documents(documents)
    print(f"loaded {path}. Got {len(documents)} documents. Splitted into {len(splitDocs)} parts.")
    return splitDocs

In [4]:
# load the PDF documents and then split them
path = './data-kpmp-oct-23'
docs = loadDirectorySplitDocs(path=path, loader_cls=UnstructuredMarkdownLoader)
len(docs)

Created a chunk of size 288, which is longer than the specified 250
Created a chunk of size 331, which is longer than the specified 250
Created a chunk of size 338, which is longer than the specified 250
Created a chunk of size 344, which is longer than the specified 250
Created a chunk of size 309, which is longer than the specified 250
Created a chunk of size 300, which is longer than the specified 250
Created a chunk of size 398, which is longer than the specified 250
Created a chunk of size 359, which is longer than the specified 250
Created a chunk of size 303, which is longer than the specified 250
Created a chunk of size 418, which is longer than the specified 250
Created a chunk of size 660, which is longer than the specified 250
Created a chunk of size 392, which is longer than the specified 250
Created a chunk of size 265, which is longer than the specified 250
Created a chunk of size 300, which is longer than the specified 250
Created a chunk of size 700, which is longer tha

loaded ./data-kpmp-oct-23. Got 2 documents. Splitted into 397 parts.


397

In [7]:
# create vector dataset 
# then embed the data into a vector store
# last store it locally through chromadb
dbpath = "./db-kpmp-oct-23"
embedding = OpenAIEmbeddings(openai_api_key=getOpenAIKey())
db = Chroma.from_documents(docs, embedding, persist_directory=dbpath)
db.persist()
print(f"created database at {dbpath}")

created database at ./db-kpmp-oct-23


# loading vector store, and performing similarity search

In [8]:
embedding = OpenAIEmbeddings(openai_api_key=getOpenAIKey())
readdb = Chroma(persist_directory='./db-kpmp-oct-23', embedding_function=embedding)
res = readdb.similarity_search("hemoglobin drop")
print(res)
len(res)

[Document(page_content='If participant is deemed unstable by the biopsy operator, they are admitted to the observation unit or inpatient service consistent with their condition and as assessed by the operating clinician, and care is provided as clinically appropriate.\n\nStability is defined by appropriate clinical and laboratory metrics including vital signs, symptoms such as flank pain, gross hematuria or other evidence of post-biopsy bleeding such as abdominal tenderness on the side of the biopsy. A fall in hemoglobin of more than 2 grams/dL or more than 1 gram/dL to less than 9 grams/dL requires extended observation, with follow-up measurement of hemoglobin after an interval of at least 2 hours. Additional observation, including overnight observation or hospital admission, will be done if clinically indicated.\na. Note that an Adverse Event should be reported for a hemoglobin drop of more than 1 gram/dL.\n\nIf hematuria, falling hematocrit, or local pain suggests a hematoma, the pe

4