This notebook illustrates how to use an LLM to dynamically generate a metadata filter for a Ragie.ai retrieval. Provide a RAGIE_API_KEY and an OPENAI_API_KEY where noted in the code. You can adjust the mock data, messages, and filter descriptions to better reflect your use case. Your examples and filter descriptions will require some amount of tuning to match your metadata taxonomy. This notebook is useful for iteratively tweaking them until you get reliable results.


In [None]:
import json
import time
from datetime import datetime, timedelta
from IPython.display import display, JSON

from ragie import Ragie

# Shared reference point for this run: the mock email dates, the partition
# name and the prompt placeholders are all derived from `now`, formatted with
# `datetime_format`.
datetime_format = "%Y-%m-%d %H:%M:%S"
now = datetime.now()

# Ragie API client. Replace the placeholder string with your real API key.
ragie = Ragie(auth="RAGIE_API_KEY")

# Turn an ISO 8601 date string into a numeric timestamp (float seconds).
# The metadata filter comparison operators ($gt, $lt, $gte, $lte) work on
# numbers, whereas LLMs naturally produce ISO 8601 date strings, so date
# values go through this helper before they are stored or filtered on.
def to_timestamp(iso_date_str):
    # Spell a "Z" (UTC) suffix out as an explicit offset before parsing;
    # datetime.fromisoformat did not accept "Z" before Python 3.11.
    normalized = iso_date_str.replace('Z', '+00:00')
    return datetime.fromisoformat(normalized).timestamp()

# Partition this notebook run works in. The name is derived from the current
# time so that each run's test data stays isolated from other runs.
# ':' and ' ' in the formatted time are swapped for '-' and '_'.
_partition_suffix = now.strftime(datetime_format).lower().translate(str.maketrans(": ", "-_"))
run_partition = f"dynamic_filter_notebook_{_partition_suffix}"
print(f"Running in partition: {run_partition}")

# Mock emails whose sent_at values are expressed relative to `now`.
# Replace these with records that look like your own data.
def _mock_email(sender, age, subject, body, labels):
    # `age` is a timedelta: how long before `now` the email was sent.
    return {
        "sender": sender,
        "sent_at": (now - age).strftime(datetime_format),
        "subject": subject,
        "body": body,
        "labels": labels,
    }


docs = [
    _mock_email(
        sender="John Smith <john.smith@acme.com>",
        age=timedelta(days=7),
        subject="Quarterly Sales Meeting Reminder",
        body="Dear Team, please remember that our quarterly sales meeting is scheduled for next Monday at 10 AM. Your attendance is required. Best regards, John.",
        labels=["meeting", "sales", "reminder", "acme", "urgent"],
    ),
    _mock_email(
        sender="Emily Davis <emily.davis@conteso.com>",
        age=timedelta(days=4),
        subject="Request for Project Update",
        body="Hi, could you please provide an update on the XYZ project by the end of the day? We need to prepare for the client meeting tomorrow. Thank you, Emily.",
        labels=["project", "urgent", "client", "conteso"],
    ),
    _mock_email(
        sender="Michael Brown <michael.brown@financecorp.com>",
        age=timedelta(days=3),
        subject="Budget Approval Needed",
        body="Good morning, the budget for Q1 requires your approval. Please review the attached document and let me know if you have any questions. Regards, Michael.",
        labels=["finance", "approval", "budget", "financecorp"],
    ),
    _mock_email(
        sender="Linda Johnson <linda.johnson@hrdept.com>",
        age=timedelta(days=1),
        subject="Update to Annual Leave Policy",
        body="Dear all, please note that there have been updates to our annual leave policy. Refer to the HR portal for details. Best, Linda.",
        labels=["HR", "policy", "announcement", "hrdept"],
    ),
    _mock_email(
        sender="David Wilson <david.wilson@itservices.com>",
        age=timedelta(hours=3),
        subject="Scheduled System Maintenance",
        body="Attention, the IT department will perform system maintenance tonight from 10 PM to 2 AM. Services may be unavailable during this time. Thank you for your understanding, David.",
        labels=["IT", "maintenance", "notification", "itservices"],
    ),
]

# Upload every mock email through Ragie's "raw" create call, attaching the
# metadata fields we want to be able to filter on later.
document_ids = []
total_docs = len(docs)
for position, doc in enumerate(docs, start=1):
    print(f"Loading doc  {position} of {total_docs}")
    print(doc)

    sender = doc["sender"]  # formatted as "Name <address>"
    metadata = {
        "sender": sender,
        # Text between the angle brackets
        "sender_email": sender.split("<")[1].split(">")[0],
        # Text before the opening angle bracket
        "sender_name": sender.split("<")[0].strip(),
        # Stored as a numeric timestamp so comparison operators can be used
        "sent_at": to_timestamp(doc["sent_at"]),
        "labels": doc["labels"],
    }
    res = ragie.documents.create_raw(request={
        "data": f"sent by: {sender}\n{doc['subject']}\n{doc['body']}",
        "partition": run_partition,
        "metadata": metadata,
    })
    display(JSON(res.model_dump()))
    document_ids.append(res.id)

# Poll Ragie until every uploaded document has finished processing.
# Each pass builds a fresh list of the documents that are still pending rather
# than removing entries from the list being iterated (which skips the element
# following each removal).
while document_ids:
    still_processing = []
    for document_id in document_ids:
        res = ragie.documents.get(document_id=document_id)
        print(f"Document {document_id} status: {res.status}")
        if res.status == "failed":
            # A failed document will never become "ready"; stop instead of
            # polling forever.
            raise RuntimeError(f"Document {document_id} failed to process")
        if res.status != "ready":
            still_processing.append(document_id)
    document_ids = still_processing
    # Only pause when there is something left to wait for
    if document_ids:
        time.sleep(1)



In [None]:
from string import Template
from openai import OpenAI
from pydantic import BaseModel, Field

# The user messages we want to generate metadata filters for.
# Replace these with the kinds of queries you expect to receive.
user_messages = [
    "Have I received any emails today?",
    "What have David Wilson and I discussed recently.",
    "Have I received any urgent sales emails this week?",
]

# OpenAI client used to generate the filters. Replace the placeholder string
# with your real API key.
openai = OpenAI(api_key="OPENAI_API_KEY")

# Structured-output schema used to make OpenAI return a metadata filter.
# Adapt the description to your own metadata schema, and be explicit about
# which operators apply to datetime and array fields.
# The "not equal" operator is spelled $ne, matching the operator list given in
# the system message; the description must not advertise any other spelling.
# NOTE(review): documented with comments rather than a class docstring, since
# pydantic is understood to copy a model's docstring into the JSON schema that
# is sent to the model - confirm before adding one.
class MetadataFilter(BaseModel):
    # The generated filter, serialized as a JSON string
    filter: str = Field(
        description=(
            "The metadata filter, in the Metadata Filter Query Language, generated based on the user message. "
            "Filter is formatted as a JSON string. "
            "May include filters on sender_email, sender_name, sent_at, and labels. "
            "sender_email should always use $eq or $ne operators. "
            "sender_name should always use $eq or $ne operators. "
            "sent_at should always use $gte or $lte operators for filtering by dates and times. "
            "labels should use $in or $nin for filtering by arrays. "
            "NEVER guess email addresses or labels not included in the user message."
        )
    )

# System message provides a description of the metadata schema, query language, 
# and examples of queries and the filter they should produce.
# Modify the metadata schema and examples to match your data.
# Note double $$ in the template are to escape $ for string formatting. Operators are $eq, etc...
system_message_template = Template("""
You will be provided with a user message that will later be passed to RAG system for a semantic search. The RAG system associates metadata with each chunk in its database. The metadata includes:

sender_email: string
the email address of the sender

sender_name: string
the name of the sender

sent_at: string
a timestamp in the format YYYY-MM-DD HH:MM that may be used for filtering

labels: Array<string>
a list of labels that may be used for filtering

You are required to generate a metadata filter, based on the above schema, that can be used to filter the chunks in the database based on the user message. The filter you're generating supports a query language we'll refer to as Metadata Filter Query Language. Here are the available operators for the Metadata Filter Query Language:

$$eq - Equal to (number, string, boolean)
$$ne - Not equal to (number, string, boolean)
$$gt - Greater than (number)
$$gte - Greater than or equal to (number)
$$lt - Less than (number)
$$lte - Less than or equal to (number)
$$in - In array (string or number). This is will match if any of the values in the array match. If multiple need to be matched they must be combined with an $$and operator.
$$nin - Not in array (string or number)

The metadata filters can be combined with AND and OR using the $$and and $$or operators.

- ALWAYS use $$gte or $$lte when filtering by dates and times
- NEVER set date and time values like "2024-10-20 00:00" without a comparison operator
- ALWAYS use $$in or $$nin when filtering by arrays                                   
- NEVER set array value filters like "['urgent', 'sales']" without an $$in or $$nin operator
- If multiple values in an array must match, an $$and and an $$in operator must be used like: "$$and": [{ "labels": {"$$in": ["urgent"]} }, { "labels": {"$$in": ["IT", "HR"]} }]
- NEVER add a top level $$and operators. The filter implicitly combines all top level filters with an AND operator.
- The query language used for filtering is a subset of MongoDB query language, so use your knowledge of MongoDB queries to generate the filter.
- Take it step by step and double check your work, especially when creating date and time filters based on the user message.

Here are some examples of how to generate a filter from a user message:

### Examples

Example 1:
User message: "Have I received any emails today?"
Metadata filter:
{
    "sent_at": {"$$gte": "$start_of_day", "$$lt": "$end_of_day"}
}

Example 2:
User message: "Summarize my recent emails from John Smith."
Metadata filter:
{
    "sender_name": {"$$eq": "John Smith"},
    "sent_at": {"$$gte": "$last_week", "$$lt": "$now"}
}

Example 3:
User message: "Show me emails from labeled urgent also labeled IT or HR."
Metadata filter:
{
    "$$and": [
        {
            "labels": {"$$in": ["urgent"]}
        },
        {
            "labels": {"$$in": ["IT", "HR"]}
        }
    ]
}

### End Examples

Today's date and time are: $now
Last week was: $last_week
Last month was: $last_month

### User message
""")

# Fill in the template's date placeholders. Every value is rendered with
# datetime_format, the same format used for the mock emails' sent_at values.
_prompt_moments = {
    "now": now,
    "start_of_day": now.replace(hour=0, minute=0, second=0),
    "end_of_day": now.replace(hour=23, minute=59, second=59),
    "last_week": now - timedelta(days=7),
    "last_month": now - timedelta(days=30),
}
system_message = system_message_template.substitute(
    {name: moment.strftime(datetime_format) for name, moment in _prompt_moments.items()}
)

print(system_message)

def gen_filter(user_message: str, max_retries: int = 5):
    """Generate a metadata filter for ``user_message`` via OpenAI structured output.

    Structured output occasionally yields a filter string that is not valid
    JSON, so a failed attempt is retried. Retries are bounded and spaced out
    with a short exponential backoff, so a persistent failure (for example an
    invalid API key) surfaces as an error instead of retrying forever.

    Args:
        user_message: The user's query to derive a filter from.
        max_retries: Number of additional attempts made after the first failure.

    Returns:
        The filter decoded from the JSON string produced by the model.

    Raises:
        RuntimeError: If every attempt fails; chained from the last error seen.
    """
    import time  # local import: keeps this cell independent of earlier cells' imports

    attempts = max(max_retries, 0) + 1
    last_error = None
    for attempt in range(attempts):
        try:
            completion = openai.beta.chat.completions.parse(
                model="gpt-4o-2024-08-06",
                messages=[
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": user_message},
                ],
                response_format=MetadataFilter,
            )
            parsed = completion.choices[0].message.parsed
            # `parsed.filter` is a JSON string; a missing `parsed` raises here
            # and is handled like any other failed attempt.
            return json.loads(parsed.filter)
        # Deliberately broad: API errors, missing parsed output and invalid
        # JSON are all treated as a failed attempt.
        except Exception as e:
            print(f"Error generating filter for user message: {user_message}")
            print(e)
            last_error = e
            if attempt < attempts - 1:
                # Back off 1s, 2s, 4s, ... capped at 8s between attempts
                time.sleep(min(2 ** attempt, 8))
    raise RuntimeError(
        f"Could not generate a filter for user message after {attempts} attempts: {user_message}"
    ) from last_error



# Generate one filter per user message, displaying each as it is produced.
# `filters` ends up in the same order as `user_messages`.
print("Generating filters for user messages:")
filters = []
for user_message in user_messages:
    generated = gen_filter(user_message)
    filters.append(generated)
    print("User message:", user_message)
    display(JSON(generated))



In [None]:
# Convert date time strings to timestamps for comparison filtering
def convert_date_time_fields(data: dict) -> dict:
    """Replace date strings under datetime fields with numeric timestamps.

    Walks the filter recursively through nested dicts and lists (for example
    the operands of $and / $or). For every datetime field whose value is an
    operator dict such as {"$gte": "..."}, each string operand is converted
    with to_timestamp. Operands that are not strings (for example a number)
    are kept unchanged instead of being passed to to_timestamp, which only
    accepts strings.

    The structure is modified in place and also returned, so callers that keep
    using the object they passed in see the converted values.

    Args:
        data: The filter, or while recursing any nested dict, list or scalar.

    Returns:
        The same object that was passed in, with datetime operands converted.
    """
    datetime_fields = ("sent_at",)
    if isinstance(data, dict):
        # Only values of existing keys are reassigned, so iterating the dict
        # while updating it is safe.
        for key, value in data.items():
            if key not in datetime_fields:
                data[key] = convert_date_time_fields(value)
            elif isinstance(value, dict):
                data[key] = {
                    operator: to_timestamp(operand) if isinstance(operand, str) else operand
                    for operator, operand in value.items()
                }
            # A datetime field given without an operator dict is left untouched
    elif isinstance(data, list):
        for i, item in enumerate(data):
            data[i] = convert_date_time_fields(item)
    return data

# Do a ragie retrieval for each user message using its generated metadata filter
print("Querying partition:", run_partition)
for i, (user_message, generated_filter) in enumerate(zip(user_messages, filters), start=1):
    # Date strings become numeric timestamps so comparison operators apply
    converted_filter = convert_date_time_fields(generated_filter)
    print(f"User message {i}:", user_message)
    print("Metadata filter:")
    display(JSON(converted_filter))
    response = ragie.retrievals.retrieve(
        request={
            "query": user_message,
            "partition": run_partition,
            # Send the converted filter explicitly rather than relying on the
            # conversion having mutated the original object
            "filter_": converted_filter,
            "top_k": 5
        }
    )
    # Guard before dereferencing, and report an empty result set as well
    scored_chunks = response.scored_chunks if response else []
    print("Retrieved chunk count:", len(scored_chunks))
    if not scored_chunks:
        print("No documents found")
    for scored_chunk in scored_chunks:
        display(JSON(scored_chunk.model_dump()))

# Prototype your gen ai features here using the retrieval results