In [12]:
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OpenAIEmbeddings, GPT4AllEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.llms import OpenAI, CTransformers
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import TextLoader, DirectoryLoader
from dotenv import load_dotenv
import os

In [13]:
# Pull settings from a local .env file into the process environment.
load_dotenv()

# Root folder of the project; every data/model path below is derived from it.
project_directory = os.getenv("PROJECT_DIR")
if project_directory is None:
    # Fail here with an actionable message instead of an opaque TypeError
    # from os.path.join further down.
    raise RuntimeError(
        "PROJECT_DIR is not set; define it in the environment or in the .env file."
    )

# Pass each path component separately so the platform's separator is used
# (a literal backslash is only a path separator on Windows).
data_directory = os.path.join(project_directory, "Data", "cleaned_data")
model_file = os.path.join(project_directory, "Models", "llama-2-7b-chat.Q8_0.gguf")

# Expose the key under the OPENAI_API_KEY variable name, but only when it is
# actually configured: assigning None to os.environ raises TypeError, and the
# cells visible here use local GPT4All embeddings rather than OpenAI.
if os.getenv("OPENAI_API"):
    os.environ["OPENAI_API_KEY"] = os.environ["OPENAI_API"]

In [None]:
# Load the files matching "./*.txt" under data_directory, decoding each as UTF-8.
text_loader_kwargs = dict(encoding='utf-8')
loader = DirectoryLoader(
    data_directory,
    glob="./*.txt",
    loader_cls=TextLoader,
    loader_kwargs=text_loader_kwargs,
)
documents = loader.load()

# Break the loaded documents into overlapping chunks for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
texts = text_splitter.split_documents(documents)

In [14]:
# Folder handed to Chroma as its persistence location.
# NOTE(review): this is a relative path, so it resolves against the kernel's
# working directory rather than project_directory -- confirm that is intended.
persist_directory = 'Data/db'

# Embedding model run on the CPU via GPT4All.
embedding = GPT4AllEmbeddings(
    model_name='all-MiniLM-L6-v2.gguf2.f16.gguf',
    device="cpu",
)

# Chroma store bound to the persistence folder and the embedding function above.
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding,
)

In [None]:
# Number of chunks sent to the vector store per add_documents call.
batch_size = 100
# Ceiling division so a final, partially filled batch is counted too.
total_batches = (len(texts) + batch_size - 1) // batch_size

for batch_number, start in enumerate(range(0, len(texts), batch_size), start=1):
    batch = texts[start:start + batch_size]
    # Report progress with counts only; printing the batch itself would dump
    # the full content of every chunk into the notebook output.
    print(f"Batch {batch_number}/{total_batches}: adding {len(batch)} chunks")
    vectordb.add_documents(documents=batch)
    # Persist after every batch so work already done is kept if a later batch fails.
    # No extra persist is needed after the loop: the last iteration already did it.
    vectordb.persist()
    print(f"Batch {batch_number}/{total_batches}: stored")