In [12]:
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import VectorDBQA
from langchain.document_loaders import TextLoader

In [13]:
# Splitter settings: chunk_size=1000 with no overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)

# Load the judgement text file, then cut the loaded documents into chunks.
loader = TextLoader('.//case_files//5022_2016_Judgement_06-Sep-2017.txt')
documents = loader.load()
texts = text_splitter.split_documents(documents)


In [14]:
# Preview only the first two chunks; echoing the whole list floods the cell output.
texts[:2]

[Document(page_content='NON-REPORTABLE\n                                        IN THE SUPREME COURT OF INDIA\n                                         CIVIL APPELLATE JURISDICTION\n\n\n                                      CIVIL APPEAL NO(S). 12847 OF 2017\n                              [@ SPECIAL LEAVE PETITION (C) NO(S).11275/2016]\n\n\n                         RAJBIR AND   ORS.                         APPELLANT(S)\n\n                                                     VERSUS\n\n                         STATE OF HARYANA AND ORS.                 RESPONDENT(S)\n\n                                                  WITH', metadata={'source': './/case_files//5022_2016_Judgement_06-Sep-2017.txt'}),
 Document(page_content='CIVIL APPEAL NO(S).14637-645 OF 2017 @ SLP(C) NOS.24124-24132/2016,\n     CIVIL APPEAL NO(S). 14601 OF 2017 @ SLP(C) No.11064/2016,\n     CIVIL APPEAL NO(S). 14599 OF 2017 @ SLP(C) No.11274/2016,\n     CIVIL APPEAL NO(S). 14611-14632 OF 2017 @ SLP(C) No.17260-81/2016,\n 

In [19]:
# Number of chunks handed to the vector store per call; assigned by hand.
# NOTE(review): the original remark ("Have to assign this manually as ...") was
# left unfinished -- presumably the limit comes from the embedding backend; confirm.
max_batch_size = 2

# Total number of chunks produced by the text splitter.
no_of_documents = len(texts)

print("Number of documents = ", no_of_documents)
print("Max batch Size = ", max_batch_size)

Number of documents =  10
Max batch Size =  2


In [22]:
# Embed the first batch of chunks and create the Chroma store from it.
# A persist_directory is supplied so the embeddings are stored on disk.

from my_embedding_function import CustomEmbeddings

persist_directory = './/chromadb_langchain'
custom_embedding_function = CustomEmbeddings()

# NOTE(review): this store is never referenced again in the visible cells; it is
# kept only in case a later cell relies on it -- confirm and remove if unused.
vectorstore = Chroma("langchain_store", custom_embedding_function)

# max_batch_size is defined once in the configuration cell above. It is not
# re-assigned here, so changing it there cannot be silently undone by this cell.
print(f"Max batch size: {max_batch_size}")

# Seed the store with exactly one batch (previously a hard-coded texts[:2]);
# the next cell adds the remaining chunks.
vectordb = Chroma.from_documents(
    documents=texts[:max_batch_size],
    embedding=custom_embedding_function,
    persist_directory=persist_directory,
)


Max batch size: 2


In [25]:
from tqdm import tqdm
import time

# The store-creation cell already inserted the first batch, so batching resumes
# at offset max_batch_size rather than at a hard-coded 2.
document_range = range(max_batch_size, no_of_documents, max_batch_size)

# One progress tick per batch. The context manager closes the bar even if
# add_documents raises part-way through.
with tqdm(total=len(document_range), desc="Processing Documents") as progress_bar:
    for batch_start in document_range:
        # Slicing past the end of the list is safe, so the last batch may be shorter.
        vectordb.add_documents(texts[batch_start:batch_start + max_batch_size])
        progress_bar.update(1)

Processing Documents: 100%|██████████| 4/4 [00:03<00:00,  1.06it/s]
