In [1]:
import chromadb

In [2]:
# Create the Chroma client, then obtain the article collection
# (get_or_create_collection reuses the collection if it already exists).
client = chromadb.Client()
collection = client.get_or_create_collection(name="article_collection")

In [3]:
# Toy corpus. Each row is (title, author, date, category); keeping the fields
# side by side guarantees that documents[i] and metadata[i] describe the same article.
_article_rows = [
    ("Understanding machine learning basics",  "John Doe",      "2022-05-10", "AI"),
    ("Advances in deep learning techniques",   "Jane Smith",    "2021-11-15", "Deep Learning"),
    ("Neural networks and their applications", "John Doe",      "2023-03-20", "Neural Networks"),
    ("AI for healthcare systems",              "Alice Johnson", "2020-12-05", "AI"),
    ("ML models in production",                "Bob Brown",     "2022-08-25", "ML"),
]

# Text that gets stored in the collection
documents = [title for title, *_ in _article_rows]

# Per-document metadata; note that "date" is kept as a plain string here
metadata = [
    {"author": author, "date": date, "category": category}
    for _, author, date, category in _article_rows
]

In [4]:
# One unique ID per document, derived from its position: doc_0, doc_1, ...
# (any other unique identifiers would work as well)
ids = [f"doc_{position}" for position, _ in enumerate(documents)]

In [5]:
# Store the documents together with their metadata under the generated IDs.
# No embeddings are passed explicitly, so Chroma is expected to compute them
# from the document text using the collection's embedding function.
collection.add(
    ids=ids,  # unique ID for each document
    documents=documents,
    metadatas=metadata,
)

print("Documents added successfully!")

Documents added successfully!


#### Query 1: Filter by Author

In [7]:
# Semantic search for "machine learning", restricted to articles by "John Doe"
author_filter = {"author": "John Doe"}

results = collection.query(
    query_texts=["machine learning"],
    n_results=3,
    where=author_filter,
)

dict(results)

{'ids': [['doc_0', 'doc_2']],
 'distances': [[0.7523747086524963, 0.9583499431610107]],
 'metadatas': [[{'author': 'John Doe', 'category': 'AI', 'date': '2022-05-10'},
   {'author': 'John Doe',
    'category': 'Neural Networks',
    'date': '2023-03-20'}]],
 'embeddings': None,
 'documents': [['Understanding machine learning basics',
   'Neural networks and their applications']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

- Step 1: The query generates an embedding for "machine learning".
- Step 2: The similarity search is conducted across all documents in the collection, calculating their similarity to the generated embedding.
- Step 3: The database identifies the top-N documents (in this case, the top 3) that are most similar to "machine learning".
- Step 4: From those top 3 results, the where filter is applied, and it checks which of these documents also have the author "John Doe".

Identify the potential risk!

In [9]:
# Query for documents written by "Bob Brown".
# The query text ("Understanding") does not appear in his only article,
# so this checks what the metadata filter returns for a weakly related query.
results = collection.query(
    query_texts = ["Understanding"],
    n_results   = 3,
    where       = {"author": "Bob Brown"}
)

dict(results)

{'ids': [['doc_4']],
 'distances': [[1.6258388757705688]],
 'metadatas': [[{'author': 'Bob Brown',
    'category': 'ML',
    'date': '2022-08-25'}]],
 'embeddings': None,
 'documents': [['ML models in production']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

#### Query 2: Filter by Category and Date

In [10]:
from datetime import datetime

In [11]:
# Turn the cut-off date into a Unix timestamp (whole seconds).
# The parsed datetime is naive, so .timestamp() interprets it in the local time zone.
date_string = "2021-01-01"
parsed_date = datetime.strptime(date_string, "%Y-%m-%d")
date_timestamp = int(parsed_date.timestamp())

In [12]:
# Query for documents in the "AI" category dated on or after `date_string`.
# NOTE: the metadata added earlier stores "date" as a string, while the
# threshold below is an integer timestamp -- presumably the reason this
# query matches no documents until the dates are stored as numbers.
category_and_date_filter = {
    "$and": [
        {"category": "AI"},
        {"date": {"$gte": date_timestamp}},  # using the converted timestamp
    ]
}

results = collection.query(
    query_texts=["AI"],  # example query text
    n_results=3,
    where=category_and_date_filter,
)

dict(results)

{'ids': [[]],
 'distances': [[]],
 'metadatas': [[]],
 'embeddings': None,
 'documents': [[]],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

In [None]:
# Example date strings
date_strings = [
    "2022-05-10",
    "2021-11-15",
    "2023-03-20",
    "2020-12-05",
    "2022-08-25",
]

# Convert date strings to timestamps
metadata = [
    {"author": "John Doe", "date": int(datetime.strptime(date_strings[0], "%Y-%m-%d").timestamp()), "category": "AI"},
    {"author": "Jane Smith", "date": int(datetime.strptime(date_strings[1], "%Y-%m-%d").timestamp()), "category": "Deep Learning"},
    {"author": "John Doe", "date": int(datetime.strptime(date_strings[2], "%Y-%m-%d").timestamp()), "category": "Neural Networks"},
    {"author": "Alice Johnson", "date": int(datetime.strptime(date_strings[3], "%Y-%m-%d").timestamp()), "category": "AI"},
    {"author": "Bob Br

| Use Case                     | Description                                                                                                          | Example                                                                                                      |
|------------------------------|----------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------|
| User-Specific Content Retrieval | Filtering documents based on user attributes (e.g., preferences, location) to generate tailored responses.        | Retrieving articles related to “machine learning” for users located in “New York” by applying a filter on the user's city attribute. |
| Contextual Relevance         | Enhancing response accuracy by filtering content based on relevant metadata (e.g., topics, categories).             | Generating responses only from documents categorized under “Healthcare” when the query pertains to medical topics.                  |
| Temporal Filtering           | Ensuring that responses are based on the most recent information by filtering based on publication dates.            | In a news summarization application, filtering articles published within the last week to provide up-to-date summaries.                |
