In [1]:
import chromadb
import pandas as pd

from chromadb.config import Settings

The data comes from the [Medium Post Titles dataset on Kaggle](https://www.kaggle.com/datasets/nulldata/medium-post-titles).

In [2]:
# Load the raw data; kept under its own name so the unfiltered frame
# stays available for inspection after the filtering below.
posts_raw = pd.read_csv('medium_post_titles.csv')

# Define the topics of interest
topics_of_interest = ['data-science']

# Drop the rows with missing values
posts_complete = posts_raw.dropna()

# Keep only rows whose subtitle was not truncated and whose category is
# one of the topics of interest.
is_untruncated = ~posts_complete['subtitle_truncated_flag']
is_on_topic = posts_complete['category'].isin(topics_of_interest)

# .copy() makes `df` an independent frame, so the column assignments below
# do not trigger pandas' SettingWithCopyWarning on a filtered slice.
df = posts_complete[is_untruncated & is_on_topic].copy()

# Combine title and subtitle into the text that is stored as the document
df['text'] = df['title'] + ' ' + df['subtitle']

# Build one metadata dict per row ({'text': ..., 'category': ...}) directly
# from the two columns instead of calling a Python lambda once per row.
df['meta'] = df[['text', 'category']].to_dict('records')

In [3]:
# Chroma client configured with the duckdb+parquet backend and the
# 'medium-chroma-db' persist directory.
chroma_client = chromadb.Client(
    Settings(
        persist_directory='medium-chroma-db',
        chroma_db_impl='duckdb+parquet',
    )
)

# Fetch the collection if it already exists, otherwise create it.
# This replaces a try/except that treated *any* exception from
# create_collection as "already exists", which could hide unrelated errors.
# NOTE(review): assumes the installed chromadb version provides
# get_or_create_collection — confirm against the pinned version.
article_collection = chroma_client.get_or_create_collection(name='medium_posts')

In [4]:
# Each row is stored under its DataFrame index label (as a string), with the
# combined title/subtitle text as the document and the prepared dict as
# its metadata.
record_ids = list(map(str, df.index.tolist()))
record_texts = df['text'].tolist()
record_metas = df['meta'].tolist()

article_collection.upsert(
    ids=record_ids,
    documents=record_texts,
    metadatas=record_metas,
)

In [5]:
# Free-text question to look up in the collection.
query_string = "best ai library?"

# Ask for the single closest match; left as the cell's last expression so
# the notebook displays the returned result.
article_collection.query(
    query_texts=query_string,
    n_results=1,
)

{'ids': [['85765']],
 'embeddings': None,
 'documents': [['The A-Z of AI and Machine Learning: Comprehensive Glossary Ultimate Terminology You Need to Know']],
 'metadatas': [[{'text': 'The A-Z of AI and Machine Learning: Comprehensive Glossary Ultimate Terminology You Need to Know',
    'category': 'data-science'}]],
 'distances': [[0.9730660319328308]]}