In [1]:
import chromadb
from chromadb.config import Settings
import os
import glob
import uuid
from tqdm import tqdm

In [19]:
# Configure Chroma to use the "duckdb+parquet" implementation and keep its
# files under the relative directory "chroma_data".
# NOTE(review): confirm whether this backend needs an explicit
# chroma_client.persist() call to flush data to disk.
chroma_settings = Settings(
    chroma_db_impl="duckdb+parquet",
    persist_directory="chroma_data",
)
chroma_client = chromadb.Client(chroma_settings)

In [21]:
# Use get_or_create_collection so this cell can be re-run (or run against an
# already-populated persist directory) without failing because the
# "wiki_articles" collection already exists.
# Note: the ingestion cell assigns random uuid ids, so re-running ingestion
# against an existing collection appends additional copies of the chunks.
collection = chroma_client.get_or_create_collection(name="wiki_articles")

No embedding_function provided, using default embedding function: DefaultEmbeddingFunction https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2


In [22]:
# One bodyText.txt per article directory under data/.
# glob returns paths in arbitrary (filesystem) order; sort them so any later
# slice of this list selects the same articles on every run.
article_body_files = sorted(glob.glob("data/*/bodyText.txt"))

In [23]:
def process_document(document_text, document_link):
    """Split an article into per-line chunks ready to be added to a collection.

    Args:
        document_text: Full article body; chunks are separated by newlines.
        document_link: Source link recorded in every chunk's metadata.

    Returns:
        A ``(documents, ids, metadatas)`` tuple of three equal-length lists:
        the non-blank lines of ``document_text``, a fresh random hex id for
        each line, and one ``{"source": document_link}`` dict for each line.
    """
    processed_documents = [line for line in document_text.split('\n') if line.strip()]
    ids = [uuid.uuid4().hex for _ in processed_documents]
    # Build a separate dict per chunk: ``[d] * n`` would repeat one shared
    # object, so mutating any single entry would silently change all of them.
    metadata = [{"source": document_link} for _ in processed_documents]

    return processed_documents, ids, metadata

In [24]:
# Ingest the first 20 articles: read each body and its link, split the body
# into chunks, and add the chunks to the collection.
for article_body_file in tqdm(article_body_files[:20]):
    # articleLink.txt is looked up in the same directory as bodyText.txt.
    article_link_file = os.path.join(os.path.dirname(article_body_file), 'articleLink.txt')

    # Explicit encoding so decoding does not depend on the platform default.
    # Assumes the data files were written as UTF-8 -- TODO confirm.
    with open(article_body_file, 'r', encoding='utf-8') as file:
        document_text = file.read()

    with open(article_link_file, 'r', encoding='utf-8') as file:
        # Strip surrounding whitespace (e.g. a trailing newline) so the
        # stored "source" metadata is a clean link.
        document_link = file.read().strip()

    processed_documents, ids, metadata = process_document(document_text, document_link)

    # An article with no non-blank lines yields nothing to store; skip it
    # instead of calling add() with empty lists.
    if not processed_documents:
        tqdm.write(f"Skipping page {document_link}: no text to add")
        continue

    # tqdm.write prints without corrupting the active progress bar.
    tqdm.write(f"Adding page {document_link} to db")
    collection.add(documents=processed_documents, metadatas=metadata, ids=ids)

  0%|                                                                                                                                                              | 0/20 [00:00<?, ?it/s]

Adding page https://en.wikipedia.org//wiki/Jerzy_Neyman to db


  5%|███████▌                                                                                                                                              | 1/20 [00:00<00:04,  4.35it/s]

Adding page https://en.wikipedia.org//wiki/Sn%C4%9B%C5%BEka to db


 15%|██████████████████████▌                                                                                                                               | 3/20 [00:04<00:25,  1.51s/it]

Adding page https://en.wikipedia.org//wiki/Absolute_Monarchs to db
Adding page https://en.wikipedia.org//wiki/On_Spirals to db
Adding page https://en.wikipedia.org//wiki/SignWriting to db


 25%|█████████████████████████████████████▌                                                                                                                | 5/20 [00:05<00:14,  1.03it/s]

Adding page https://en.wikipedia.org//wiki/Ch%E1%BB%AF_Qu%E1%BB%91c_ng%E1%BB%AF to db


 30%|█████████████████████████████████████████████                                                                                                         | 6/20 [00:08<00:19,  1.40s/it]

Adding page https://en.wikipedia.org//wiki/Love_Sux to db


 35%|████████████████████████████████████████████████████▌                                                                                                 | 7/20 [00:09<00:15,  1.22s/it]

Adding page https://en.wikipedia.org//wiki/Israel_Houghton to db


 40%|████████████████████████████████████████████████████████████                                                                                          | 8/20 [00:09<00:11,  1.05it/s]

Adding page https://en.wikipedia.org//wiki/Lee_Roy_Selmon to db


 45%|███████████████████████████████████████████████████████████████████▌                                                                                  | 9/20 [00:09<00:08,  1.31it/s]

Adding page https://en.wikipedia.org//wiki/1904_World_Wrestling_Championships to db
Adding page https://en.wikipedia.org//wiki/Molla_Panah_Vagif to db


 55%|█████████████████████████████████████████████████████████████████████████████████▉                                                                   | 11/20 [00:09<00:04,  2.15it/s]

Adding page https://en.wikipedia.org//wiki/Kreuzlingen to db


 65%|████████████████████████████████████████████████████████████████████████████████████████████████▊                                                    | 13/20 [00:10<00:02,  2.61it/s]

Adding page https://en.wikipedia.org//wiki/Eustache_Charles_d%27Aoust to db
Adding page https://en.wikipedia.org//wiki/Construction_and_management_simulation to db


 70%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                            | 14/20 [00:10<00:02,  2.69it/s]

Adding page https://en.wikipedia.org//wiki/Net_income to db


 80%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                             | 16/20 [00:11<00:01,  3.11it/s]

Adding page https://en.wikipedia.org//wiki/Red_herring to db
Adding page https://en.wikipedia.org//wiki/Google_Clips to db


 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌       | 19/20 [00:11<00:00,  5.67it/s]

Adding page https://en.wikipedia.org//wiki/Lauren_Beck to db
Adding page https://en.wikipedia.org//wiki/Battle_of_Jinan to db
Adding page https://en.wikipedia.org//wiki/Law_of_Denmark to db


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:11<00:00,  1.70it/s]


In [27]:
# Sanity check: ask the collection a natural-language question and request
# two results; the returned dict is displayed as the cell output.
question = 'who won the battle of jinan'
collection.query(query_texts=[question], n_results=2)

{'ids': [['182d064af4414ac98ccc8a2c33b627a9',
   'f20402594c784b989286c8303b1e51c2']],
 'embeddings': None,
 'documents': [['The Battle of Jinan  was a critical engagement fought between the Kuomintang (KMT or Chinese Nationalist Party) and the Chinese Communist Party (CCP) from September 16 to September 24, 1948 during the Chinese Civil War. The communist Eastern China Field Army besieged and finally captured the city of Jinan, the capital of Shandong Province and a major urban center as well as a transportation hub in northeastern China that had a population of about 600,000 at the time of the battle.  The communist victory set the stage for the Huaihai Campaign.',
   'Jinan was the first major urban center to be captured by the communists (the Siege of Changchun had begun on May 23 already, but the city was only captured on October 19, 1948). Zhou Enlai hence referred to the Battle of Jinan as the starting point for the "three great battles" (Chinese: 三大战役; pinyin: sān dà zhànyì), n

In [20]:
# Preview the collection's contents; the returned dict (ids, embeddings,
# documents, metadatas) is displayed as the cell output.
collection.peek()

{'ids': [], 'embeddings': [], 'documents': [], 'metadatas': []}

In [17]:
# Called with no ids or filters.
# NOTE(review): the empty peek() output recorded in this notebook suggests
# this removed every entry in the collection -- confirm against the installed
# chromadb version, and be aware that a top-to-bottom run would execute this
# right after ingestion.
collection.delete()