In [1]:
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
import time

In [2]:
# --- Configuration ---
# Tunable values read by the cells below; change them here rather than inline.
# Path of the text file handed to TextLoader (relative to the working directory).
DATA_FILE = "text.txt"
# Model name passed to OllamaEmbeddings.
EMBEDDING_MODEL = "nomic-embed-text"
# Directory passed to Chroma as persist_directory when the index is built.
CHROMA_PATH = "./my_chroma_db"

In [3]:
# --- 1. Load Data ---
# Read DATA_FILE into `data`, the list of documents consumed by the splitting step.
#
# The encoding is pinned so the file decodes the same way on every machine;
# with no encoding given, TextLoader falls back to the platform default, which
# is not UTF-8 on every OS. autodetect_encoding=True lets the loader try to
# detect the real encoding if the file turns out not to be valid UTF-8.
# NOTE(review): assumes the source file is UTF-8 — confirm for your data.
loaders = TextLoader(DATA_FILE, encoding="utf-8", autodetect_encoding=True)
data = loaders.load()

In [4]:
# --- 2. Optimize Splitting for Batching ---
# Split the loaded documents into large chunks: each chunk is embedded
# separately later on, so fewer, bigger chunks mean fewer items to embed.
#   chunk_size    = 4000 characters per chunk (upper bound)
#   chunk_overlap = 200 characters shared between neighbouring chunks for context
# The recorded run below produced 3 chunks for the sample file.
print("Splitting documents with larger chunk size...")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=200)

# NOTE: despite its name, `splitter` holds the resulting list of chunks,
# not the splitter object (that one is `text_splitter`).
splitter = text_splitter.split_documents(data)
print(f"File split into {len(splitter)} chunks.")

Splitting documents with larger chunk size...
File split into 3 chunks.


In [6]:
# --- 3. Embedding and Indexing ---
# Embed every chunk with the Ollama model and store the vectors in a Chroma
# collection persisted under CHROMA_PATH.
print(f"Initializing embedding model: {EMBEDDING_MODEL}")
embedding = OllamaEmbeddings(model=EMBEDDING_MODEL)

# Deterministic ids (source file + chunk position) make this cell safe to
# re-run: without explicit ids every run gets fresh random ids, so the
# persisted collection accumulates duplicate copies of the same chunks.
# With stable ids a re-run overwrites the existing entries instead.
# NOTE(review): if a later run yields fewer chunks, the higher-numbered entries
# from the earlier run stay in the store; delete CHROMA_PATH for a clean rebuild.
chunk_ids = [f"{DATA_FILE}-chunk-{index}" for index in range(len(splitter))]

# Start timer to measure how long embedding + indexing takes
start_time = time.time()

# One call embeds all chunks and writes them to the persistent store
print("Starting embedding process with Chroma...")
vectordb = Chroma.from_documents(
    documents=splitter,
    embedding=embedding,
    ids=chunk_ids,
    persist_directory=CHROMA_PATH,
)

end_time = time.time()
elapsed_time = end_time - start_time

print("Embedding Complete!")
print(f"Total time taken: {elapsed_time:.2f} seconds.")
print(f"Vector store saved to: {CHROMA_PATH}")

Initializing embedding model: nomic-embed-text
Starting embedding process with Chroma...
Embedding Complete!
Total time taken: 6.58 seconds.
Vector store saved to: ./my_chroma_db


In [11]:
# Reopen the persisted vector store from local disk.
# The directory must be the one the index was written to (CHROMA_PATH);
# pointing at any other path opens a different store that does not contain
# the chunks embedded above.
new_vector_db = Chroma(
    persist_directory=CHROMA_PATH,
    embedding_function=embedding,
)