In [11]:
import logging

import chromadb
import numpy as np
import pandas as pd
from chromadb.config import Settings
from langchain_chroma import Chroma
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from tqdm import tqdm

In [2]:
# Configure the root logger at INFO level so the progress message logged
# further down in this notebook is actually emitted.
logging.basicConfig(level=logging.INFO)

In [3]:
# Embedding backend configuration: model file name and thread count
# handed to the GPT4All embeddings wrapper.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2.gguf2.f16.gguf"
EMBEDDING_THREADS = 8

embeddings = GPT4AllEmbeddings(
    n_threads=EMBEDDING_THREADS,
    model_name=EMBEDDING_MODEL_NAME,
)

Failed to load libllamamodel-mainline-cuda.so: dlopen: libcudart.so.11.0: cannot open shared object file: No such file or directory
Failed to load libllamamodel-mainline-cuda-avxonly.so: dlopen: libcudart.so.11.0: cannot open shared object file: No such file or directory


In [4]:
# Load the article table from parquet. Later cells read its `summary`,
# `title` and `url` columns and use the frame's index as the chunk source id.
# NOTE(review): the path is relative, so it presumably assumes the notebook's
# own directory is the working directory — confirm.
articles_df = pd.read_parquet("../data/input/wikipedia_articles.parquet")

In [5]:
# Chunking parameters. Sizes are counted in characters because `len` is the
# length function passed to the splitter.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200
SPLIT_SEPARATORS = ["\n\n", "\n", " ", ""]

text_splitter = RecursiveCharacterTextSplitter(
    separators=SPLIT_SEPARATORS,
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)

In [6]:
# One text per article, paired positionally with one metadata dict per article.
summary_texts = articles_df["summary"].tolist()

# Each metadata dict carries the frame's index value (as a string) plus the
# article title and URL.
chunk_metadatas = [
    {"source": str(article_index), "title": article_title, "url": article_url}
    for article_index, article_title, article_url in zip(
        articles_df.index, articles_df["title"], articles_df["url"]
    )
]

documents = text_splitter.create_documents(
    summary_texts,
    metadatas=chunk_metadatas,
)

In [7]:
# Report how many chunks the splitter produced from the loaded articles.
# The counts are passed as lazy %-style arguments so the message is only
# formatted when the record is actually emitted.
logging.info(
    "Split %d articles into %d chunks.",
    len(articles_df),
    len(documents),
)

INFO:root:Split 4573 articles into 7873 chunks.


In [8]:
# Name of the Chroma collection and on-disk location of the database.
INDEX_NAME = "wikipedia-index"
CHROMA_DB_PATH = "../data/database/wikipedia.db"

# Persistent (on-disk) Chroma client, created with `allow_reset=True`
# in its settings.
chroma_settings = Settings(allow_reset=True)
chroma_client = chromadb.PersistentClient(
    path=CHROMA_DB_PATH,
    settings=chroma_settings,
)

# LangChain vector-store wrapper bound to that client and collection, using
# the `embeddings` object as its embedding function.
vector_store = Chroma(
    embedding_function=embeddings,
    collection_name=INDEX_NAME,
    client=chroma_client,
)

INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [10]:
# Uncomment and run to reset the collection before re-ingesting (destructive):
# vector_store.reset_collection()

In [None]:
# Ingest the chunks in fixed-size batches so the progress bar advances per
# batch rather than only after the whole corpus has been embedded.
document_batch_size = 100

# Plain list slicing keeps every batch a `list` of the original document
# objects (no coercion into a NumPy object array), yields exactly
# ceil(len(documents) / document_batch_size) batches, and produces no batch
# at all when `documents` is empty.
document_batches = [
    documents[start : start + document_batch_size]
    for start in range(0, len(documents), document_batch_size)
]

# NOTE(review): no explicit ids are passed, so re-running this cell presumably
# inserts the same chunks again — reset the collection first if that matters.
for document_batch in tqdm(document_batches, desc="Ingesting documents"):
    vector_store.add_documents(document_batch)

Ingesting documents:  23%|██▎       | 18/79 [03:38<11:02, 10.86s/it]

In [None]:
# Display the number of records currently stored in the collection, as a
# sanity check after ingestion.
# NOTE(review): `_chroma_collection` is a private attribute of the wrapper
# (leading underscore), so it may change between library releases.
vector_store._chroma_collection.count()

5318

In [None]:
# Smoke-test retrieval: request the 5 chunks most similar to the query text.
vector_store.similarity_search("donald trump", k=5)