# Automate filtering with LLMs

This notebook is an end-to-end implementation of the workflow presented in the [Automate filtering with LLMs tutorial](https://qdrant.tech/documentation/database-tutorials/automate-filtering-with-llms/). It shows how Large Language Models can be used to automate the extraction of filters from text queries, considering only the filters that make sense in the context of a specific Qdrant collection.

The tutorial uses [Anthropic](https://www.anthropic.com/) Claude 3.5 Sonnet combined with [Instructor](https://python.useinstructor.com/) for structured output. 

In [1]:
!pip install "instructor[anthropic]" "qdrant-client>=1.12.0"


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import os

import instructor
from anthropic import Anthropic

# Build the Anthropic client and wrap it with Instructor.
# The API key is taken from the ANTHROPIC_API_KEY environment variable so a
# real secret never has to be typed into (and saved with) the notebook; the
# "YOUR_API_KEY" placeholder is used only when the variable is not set.
anthropic_client = instructor.from_anthropic(
    client=Anthropic(
        api_key=os.environ.get("ANTHROPIC_API_KEY", "YOUR_API_KEY")
    )
)

In [3]:
from qdrant_client import models

# Only the raw query text is sent here; the request carries no information
# about which payload fields exist in the collection.
qdrant_filter = anthropic_client.messages.create(
    messages=[{"role": "user", "content": "red T-shirt"}],
    response_model=models.Filter,  # the reply comes back as a Filter object
    model="claude-3-5-sonnet-latest",
    max_tokens=1024,
)
qdrant_filter

Filter(should=FieldCondition(key='product_type', match=MatchValue(value='t-shirt'), range=None, geo_bounding_box=None, geo_radius=None, geo_polygon=None, values_count=None), min_should=None, must=FieldCondition(key='color', match=MatchValue(value='red'), range=None, geo_bounding_box=None, geo_radius=None, geo_polygon=None, values_count=None), must_not=None)

In [4]:
from qdrant_client import QdrantClient

# Client for the Qdrant instance expected at http://localhost:6333.
client = QdrantClient("http://localhost:6333")

In [5]:
# This tutorial assumes that a "test_filter" collection already exists.
# If it does not, you can create one by uncommenting the following snippet.
# Since we only create filters here and don't actually filter the
# collection, there is no need to add any data to it.

# client.create_collection(
#     "test_filter", 
#     vectors_config=models.VectorParams(
#         size=2, 
#         distance=models.Distance.COSINE,
#     )
# )
# client.create_payload_index("test_filter", "color", models.PayloadSchemaType.KEYWORD)
# client.create_payload_index("test_filter", "fabric", models.PayloadSchemaType.KEYWORD)
# client.create_payload_index("test_filter", "price", models.PayloadSchemaType.FLOAT)
# client.create_payload_index("test_filter", "city.name", models.PayloadSchemaType.KEYWORD)
# client.create_payload_index("test_filter", "city.location", models.PayloadSchemaType.GEO)

In [6]:
# Fetch the description of the "test_filter" collection and keep its payload
# schema: a mapping from each indexed payload field name to its index info.
collection_info = client.get_collection(collection_name="test_filter")
indexes = collection_info.payload_schema
indexes

{'fabric': PayloadIndexInfo(data_type=<PayloadSchemaType.KEYWORD: 'keyword'>, params=None, points=0),
 'city.location': PayloadIndexInfo(data_type=<PayloadSchemaType.GEO: 'geo'>, params=None, points=0),
 'price': PayloadIndexInfo(data_type=<PayloadSchemaType.FLOAT: 'float'>, params=None, points=0),
 'color': PayloadIndexInfo(data_type=<PayloadSchemaType.KEYWORD: 'keyword'>, params=None, points=0),
 'city.name': PayloadIndexInfo(data_type=<PayloadSchemaType.KEYWORD: 'keyword'>, params=None, points=0)}

In [7]:
# Render every payload index as a "- <field> - <TYPE>" bullet, one per line,
# so the list can be embedded into a prompt.
formatted_indexes = "\n".join(
    f"- {field} - {schema.data_type.name}"
    for field, schema in indexes.items()
)
print(formatted_indexes)

- fabric - KEYWORD
- city.location - GEO
- price - FLOAT
- color - KEYWORD
- city.name - KEYWORD


In [8]:
# This time the list of available payload indexes is appended to the query,
# each part wrapped in its own tag.
qdrant_filter = anthropic_client.messages.create(
    messages=[
        {
            "role": "user",
            "content": (
                "<query>color is red</query>"
                f"<indexes>\n{formatted_indexes}\n</indexes>"
            ),
        },
    ],
    response_model=models.Filter,
    model="claude-3-5-sonnet-latest",
    max_tokens=1024,
)
qdrant_filter

Filter(should=None, min_should=None, must=FieldCondition(key='color', match=MatchValue(value='red'), range=None, geo_bounding_box=None, geo_radius=None, geo_polygon=None, values_count=None), must_not=None)

In [9]:
# Probe with a constraint (calories) that has no matching payload index in
# the list sent along with the query.
qdrant_filter = anthropic_client.messages.create(
    messages=[
        {
            "role": "user",
            "content": (
                "<query>fruit salad with no more than 100 calories</query>"
                f"<indexes>\n{formatted_indexes}\n</indexes>"
            ),
        },
    ],
    response_model=models.Filter,
    model="claude-3-5-sonnet-latest",
    max_tokens=1024,
)
qdrant_filter

Filter(should=None, min_should=None, must=FieldCondition(key='price', match=None, range=Range(lt=None, gt=None, gte=None, lte=100.0), geo_bounding_box=None, geo_radius=None, geo_polygon=None, values_count=None), must_not=None)

In [10]:
SYSTEM_PROMPT = """
You are extracting filters from a text query. Please follow the following rules:
1. Query is provided in the form of a text enclosed in <query> tags.
2. Available indexes are put at the end of the text in the form of a list enclosed in <indexes> tags.
3. You cannot use any field that is not available in the indexes.
4. Generate a filter only if you are certain that user's intent matches the field name.
5. Prices are always in USD.
6. It's better not to generate a filter than to generate an incorrect one.
"""

In [11]:
# The rules are delivered as an opening user turn, acknowledged by a scripted
# assistant turn, and only then is the actual query (with the index list) sent.
qdrant_filter = anthropic_client.messages.create(
    messages=[
        {"role": "user", "content": SYSTEM_PROMPT.strip()},
        {"role": "assistant", "content": "Okay, I will follow all the rules."},
        {
            "role": "user",
            "content": (
                "<query>fruit salad with no more than 100 calories</query>"
                f"<indexes>\n{formatted_indexes}\n</indexes>"
            ),
        },
    ],
    response_model=models.Filter,
    model="claude-3-5-sonnet-latest",
    max_tokens=1024,
)

qdrant_filter

Filter(should=None, min_should=None, must=FieldCondition(key='calories', match=None, range=Range(lt=None, gt=None, gte=None, lte=100.0), geo_bounding_box=None, geo_radius=None, geo_polygon=None, values_count=None), must_not=None)

In [12]:
# A compound query that touches colour, distance from a city, exclusion of
# the city itself, a price limit and a fabric to avoid. The conversation has
# the same shape as before: rules, scripted acknowledgement, then the query.
qdrant_filter = anthropic_client.messages.create(
    messages=[
        {"role": "user", "content": SYSTEM_PROMPT.strip()},
        {"role": "assistant", "content": "Okay, I will follow all the rules."},
        {
            "role": "user",
            "content": (
                "<query>"
                "white T-shirt available no more than 30 miles from London, "
                "but not in the city itself, below $15.70, not made from polyester"
                "</query>\n"
                "<indexes>\n"
                f"{formatted_indexes}\n"
                "</indexes>"
            ),
        },
    ],
    response_model=models.Filter,
    model="claude-3-5-sonnet-latest",
    max_tokens=1024,
)
qdrant_filter

Filter(should=None, min_should=None, must=[FieldCondition(key='color', match=MatchValue(value='white'), range=None, geo_bounding_box=None, geo_radius=None, geo_polygon=None, values_count=None), FieldCondition(key='price', match=None, range=Range(lt=15.7, gt=None, gte=None, lte=None), geo_bounding_box=None, geo_radius=None, geo_polygon=None, values_count=None), FieldCondition(key='city.location', match=None, range=None, geo_bounding_box=None, geo_radius=GeoRadius(center=GeoPoint(lon=-0.1278, lat=51.5074), radius=48280.0), geo_polygon=None, values_count=None), FieldCondition(key='fabric', match=MatchExcept(except_=['polyester']), range=None, geo_bounding_box=None, geo_radius=None, geo_polygon=None, values_count=None)], must_not=[FieldCondition(key='city.name', match=MatchValue(value='London'), range=None, geo_bounding_box=None, geo_radius=None, geo_polygon=None, values_count=None)])