In [1]:
import os
from pathlib import Path
from langchain.vectorstores import ZepVectorStore
from langchain.vectorstores.zep import CollectionConfig
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from dotenv import load_dotenv

In [3]:
# Load settings from a local .env file (if one exists) into the process environment
load_dotenv()

# Zep connection settings, read from the environment so no secret lives in the notebook.
# The API key is left optional here — presumably an unauthenticated Zep server
# does not need one; TODO confirm against the deployment.
ZEP_API_URL = os.environ.get('ZEP_API_URL')
ZEP_API_KEY = os.environ.get('ZEP_API_KEY')

# Directories holding the raw training documents (relative to the working directory)
JSON_DATA_PATH = 'training-data/json'
PDF_DATA_PATH = 'training-data/pdf'

# Fail fast: without a URL the upload cell at the end of the notebook cannot work,
# and it is cheaper to find that out before loading and splitting every document.
if not ZEP_API_URL:
    raise RuntimeError(
        "ZEP_API_URL is not set; define it in the environment or in a .env file "
        "before running this notebook."
    )

In [6]:
# Chunking policy shared by every loader: chunks of at most 200 units with a
# 20-unit overlap, where length is measured with the built-in len().
text_splitter = RecursiveCharacterTextSplitter(
    is_separator_regex=False,
    length_function=len,
    chunk_overlap=20,
    chunk_size=200,
)

# JSON training data: every *.json file under JSON_DATA_PATH is read through
# TextLoader, i.e. as raw text rather than parsed JSON.
json_loader = DirectoryLoader(
    path=JSON_DATA_PATH,
    loader_cls=TextLoader,
    glob="*.json",
    use_multithreading=True,
    show_progress=True,
)

# PDF training data: loader pointed at the PDF_DATA_PATH directory.
pdf_loader = PyPDFDirectoryLoader(PDF_DATA_PATH)

In [None]:
# Run every loader through the shared splitter and pool the resulting chunks
# into a single list, JSON chunks first and PDF chunks after.
docs = []
loaders = [json_loader, pdf_loader]

for loader in loaders:
    docs.extend(loader.load_and_split(text_splitter=text_splitter))

# An empty result would otherwise surface as a bare IndexError on the preview
# below; raise a message that points at the likely cause instead.
if not docs:
    raise ValueError(
        "No documents were loaded; check that the JSON and PDF training-data "
        "directories exist and contain files."
    )

# Preview the text of the first chunk as a quick sanity check
docs[0].page_content

In [8]:
# Embedding model used to vectorise the chunks (library defaults)
embeddings = OpenAIEmbeddings()


# Settings for the Zep collection; only needed when the collection does not
# exist yet and has to be created.
collection_config = CollectionConfig(
    name='dumbledore',
    description="documents for quipbot regarding online dating and social media text game",
    metadata={},
    # NOTE(review): 1536 presumably matches the vector size produced by the
    # default OpenAIEmbeddings model — confirm if the model is ever changed.
    embedding_dimensions=1536,
    # Auto-embedding is switched off; presumably because vectors are supplied
    # from `embeddings` rather than computed by Zep — TODO confirm.
    is_auto_embedded=False,
)

In [None]:
# Build the Zep vector store from the chunked documents.
# The collection name is taken from collection_config so the name used here and
# the name in the config cannot drift apart.
vectordb = ZepVectorStore.from_documents(
    documents=docs,
    collection_name=collection_config.name,
    embedding=embeddings,
    config=collection_config,
    api_url=ZEP_API_URL,
    api_key=ZEP_API_KEY,
)