In [20]:
import pandas as pd
import chromadb
from openai import OpenAI
import os
from dotenv import load_dotenv
from chromadb.utils import embedding_functions
load_dotenv()
openai_client = OpenAI(api_key=os.getenv('OPEN_AI_API_KEY'))
from chromadb.utils import embedding_functions

In [21]:
text_file_path = '../data/text-files'

def split_text(text, chunk_size=1000, chunk_overlap=20):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start = end - chunk_overlap
    return chunks

def load_documents_from_directory(directory_path):
    print("==== Loading documents from directory ====")
    documents = []
    for filename in os.listdir(directory_path):
        if filename.endswith(".txt"):
            with open(
                os.path.join(directory_path, filename), "r", encoding="utf-8"
            ) as file:
                documents.append({"id": filename, "text": file.read()})
    return documents


In [22]:
# creating the client
chroma_client = chromadb.PersistentClient('../data/chroma_persist.db')
collection_name = 'test_collection'
default_ef = embedding_functions.DefaultEmbeddingFunction()
collection = chroma_client.get_or_create_collection(collection_name, embedding_function=default_ef)

In [17]:
# Loading the txt files from the document into the chroma db
documents = load_documents_from_directory(text_file_path)
print(f'Loaded: {len(documents)} documents')

# Split the documents into chunks
chunked_documents = []
print("==== Splitting docs into chunks ====")
for doc in documents:
    chunks = split_text(doc["text"])
    for i, chunk in enumerate(chunks):
        chunked_documents.append({"id": f"{doc['id']}_chunk{i+1}", "text": chunk})

# Generate embeddings for the document chunks
for doc in chunked_documents:
    print("==== Generating embeddings... ====")
    doc["embedding"] = get_openai_embedding(doc["text"])

==== Loading documents from directory ====
Loaded: 21 documents
==== Splitting docs into chunks ====


In [19]:
for doc in documents:
    print(doc)
    doc[]
    break

{'id': '05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt', 'text': 'Signaling that investments in the supply chain sector remain robust, Pando, a startup developing fulfillment management technologies, today announced that it raised $30 million in a Series B round, bringing its total raised to $45 million.\n\nIron Pillar and Uncorrelated Ventures led the round, with participation from existing investors Nexus Venture Partners, Chiratae Ventures and Next47. CEO and founder Nitin Jayakrishnan says that the new capital will be put toward expanding Pando’s global sales, marketing and delivery capabilities.\n\n“We will not expand into new industries or adjacent product areas,” he told TechCrunch in an email interview. “Great talent is the foundation of the business — we will continue to augment our teams at all levels of the organization. Pando is also open to exploring strategic partnerships and acquisitions with this round of funding.”\n\nPando was co-launched by Jayak

In [18]:
# Load the chunked documents into the database
for doc in chunked_documents:
    print("==== Inserting chunks into db;;; ====")
    collection.upsert(
        ids=[doc["id"]], documents=[doc["text"]], embeddings=[doc["embedding"]]
    )

==== Inserting chunks into db;;; ====


KeyError: 'embedding'

In [None]:
documents = [
{'id' : 'doc1', 'text' : 'Hello world'},
{'id' : 'doc2', 'text' : 'How are you doing today'},
{'id' : 'doc3', 'text' : 'Goodbye, See you later'},
{'id' : 'doc4', 'text' : 'Welcome again!'},
]

# Adding the documents into the collection

for doc in documents:
    collection.upsert(ids = doc['id'], documents = doc['text'])

query = "Hello"
results = collection.query(query_texts = [query], n_results=3)
