In [10]:
import os
import shutil
import openai
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.chroma import Chroma
from dotenv import dotenv_values

# All locations below are relative paths. DATA_PATH is resolved against the
# working directory the kernel starts in; everything after the os.chdir call
# (ENV_PATH, CHROMA_PATH, and DATA_PATH itself when reused later) is resolved
# against DATA_PATH instead.
DATA_PATH = "../Vijay"
CHROMA_PATH = "../Jonathan/chroma"
ENV_PATH = "../../.env"
# NOTE: this changes the working directory for the whole kernel, so every
# relative path used in later cells is affected by it.
os.chdir(DATA_PATH)

# dotenv_values returns an empty mapping when the file cannot be found, so check
# the key explicitly and fail with a message that points at the likely cause
# instead of a bare KeyError (or a TypeError from os.environ on an empty entry).
config = dotenv_values(ENV_PATH)
if not config.get("OPENAI_API_KEY"):
    raise KeyError(
        f"OPENAI_API_KEY is missing or empty in {ENV_PATH!r} "
        f"(resolved relative to {os.getcwd()!r})"
    )
# Expose the key both to the openai module and to libraries that read it from
# the process environment.
openai.api_key = config["OPENAI_API_KEY"]
os.environ["OPENAI_API_KEY"] = config["OPENAI_API_KEY"]

In [11]:
def load_documents():
    """Load every file matching ``*.md`` under DATA_PATH and return the loaded documents."""
    return DirectoryLoader(DATA_PATH, glob="*.md").load()

def split_text(documents: list[Document]):
    """Split documents into overlapping chunks and print a short report.

    The splitter is configured for chunks of up to 300 characters (length is
    measured with ``len``) with 100 characters of overlap, and records each
    chunk's start offset in its metadata (``add_start_index=True``).

    Args:
        documents: The loaded documents to split.

    Returns:
        The list of chunks produced by the splitter.

    Side effects:
        Prints the document/chunk counts, then the content and metadata of one
        sample chunk. The sample is chunk 10 when it exists, otherwise the last
        chunk; nothing is previewed when there are no chunks.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    # Preview a single chunk as a sanity check. The index is clamped so small
    # corpora (fewer than 11 chunks) no longer raise IndexError.
    if chunks:
        preview_index = min(10, len(chunks) - 1)
        document = chunks[preview_index]
        print(document.page_content)
        print(document.metadata)

    return chunks


def save_to_chroma(chunks: list[Document]):
    """Rebuild the Chroma store at CHROMA_PATH from ``chunks``.

    Any existing path at CHROMA_PATH is removed first; a new store is then
    created from the chunks using OpenAIEmbeddings and persisted to that
    directory. Prints a one-line summary when done.
    """
    # Start from a clean slate: drop whatever is already on disk.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    # Embed the chunks and write them into a fresh persistent store.
    embedding_function = OpenAIEmbeddings()
    vector_store = Chroma.from_documents(
        documents=chunks,
        embedding=embedding_function,
        persist_directory=CHROMA_PATH,
    )
    vector_store.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

In [12]:
# Run the pipeline end to end: load the markdown files, split them into chunks,
# then rebuild the Chroma store from those chunks.
# NOTE: save_to_chroma deletes any existing store at CHROMA_PATH before writing.
documents = load_documents()
chunks = split_text(documents)
save_to_chroma(chunks)

Split 4 documents into 107 chunks.
[Company Name] follows a risk-based approach to customer due diligence, which requires employees to verify the identity of clients, assess their risk profile, and monitor their transactions. Employees must conduct thorough due diligence on new and existing clients to ensure they are not involved in
{'source': '..\\Vijay\\anti-money-laundary-policy.md', 'start_index': 1257}
Saved 107 chunks to chroma.
