In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import CSVLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

import chromadb

# On-disk Chroma client shared by everything below in this file.
client = chromadb.PersistentClient(path="../job_profiles_db")

# Make sure the "job_profiles" collection exists before anything writes to it;
# the metadata sets the collection's "hnsw:batch_size" option to 10000.
collection = client.get_or_create_collection(
    name="job_profiles",
    metadata={"hnsw:batch_size": 10000},
)

def create_vectorstore_with_batching(documents, batch_size=10):  # Reduced batch size
    """Add ``documents`` to the "job_profiles" Chroma collection in batches.

    A ``Chroma`` store is opened on the module-level ``client`` with
    ``all-MiniLM-L6-v2`` HuggingFace embeddings, and ``documents`` is then
    passed to it ``batch_size`` items at a time. After each batch the
    collection's current count is printed as a sanity check.

    Args:
        documents: Sliceable sequence of documents to hand to
            ``Chroma.add_documents``.
        batch_size: Number of documents submitted per ``add_documents`` call.

    Returns:
        The ``Chroma`` vector store. An exception raised while batching is
        printed instead of propagated, so the returned store may contain only
        the batches that were added before the failure.
    """
    minilm = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

    # The store is bound to the shared module-level client rather than being
    # given its own persist_directory.
    store = Chroma(
        collection_name="job_profiles",
        embedding_function=minilm,
        client=client,
    )

    try:
        batch_starts = range(0, len(documents), batch_size)
        for batch_no, start in enumerate(batch_starts, start=1):
            current = documents[start:start + batch_size]
            print(f"Processing batch {batch_no}, size: {len(current)}")

            store.add_documents(documents=current)

            # Verification step: show how many records the collection holds now.
            print(f"Current document count: {store._collection.count()}")
    except Exception as exc:
        # Best-effort: report the problem and still return whatever was stored.
        print(f"Error during processing: {exc!s}")

    return store

# Load the job-profile CSV, using the "title" and "overview" columns as the
# document content. The "utf-8-sig" codec drops a leading BOM if present.
loader = CSVLoader(
    file_path="../data/job profiles/2025-02-07_profiles.csv",
    content_columns=["title", "overview"],
    encoding="utf-8-sig",
)
documents = loader.load()

# Split into chunks of at most 1000 characters with a 200-character overlap.
# Separators are listed from coarsest (blank line) to finest (empty string).
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=1000,
    chunk_overlap=200,
)
chunks = text_splitter.split_documents(documents)

print(f"Documents adding: {len(chunks)}")
print("creating vector store..")

# Embed and store the chunks using the function's default batch size.
vectorstore = create_vectorstore_with_batching(chunks)

# Final check: number of records now held by the underlying collection.
print(f"Collection count: {vectorstore._collection.count()}")

Documents adding: 525
creating vector store..
Processing batch 1, size: 10
Current document count: 10
Processing batch 2, size: 10
Current document count: 20
Processing batch 3, size: 10
Current document count: 30
Processing batch 4, size: 10
Current document count: 40
Processing batch 5, size: 10
Current document count: 50
Processing batch 6, size: 10
Current document count: 60
Processing batch 7, size: 10
Current document count: 70
Processing batch 8, size: 10
Current document count: 80
Processing batch 9, size: 10
Current document count: 90
Processing batch 10, size: 10
Current document count: 100
Processing batch 11, size: 10
Current document count: 110
Processing batch 12, size: 10
Current document count: 120
Processing batch 13, size: 10
Current document count: 130
Processing batch 14, size: 10
Current document count: 140
Processing batch 15, size: 10
Current document count: 150
Processing batch 16, size: 10
Current document count: 160
Processing batch 17, size: 10
Current docume