In [1]:
import instructor
from openai import OpenAI
from typing import Iterable, Literal, List, Optional, Tuple
from pydantic import BaseModel, Field 
import pandas as pd
from dotenv import find_dotenv, load_dotenv

# Locate a .env file with find_dotenv() and load it via load_dotenv(); the return value is
# deliberately bound to `_` so the cell does not echo it.
# NOTE(review): presumably this supplies the API key that OpenAI() reads later — confirm.
_ = load_dotenv(find_dotenv())

In [2]:
# Shared client used by every annotation helper below: an OpenAI client wrapped by instructor
# in TOOLS mode so that `response_model=` can be passed to `chat.completions.create`.
client = instructor.from_openai(OpenAI(), mode=instructor.Mode.TOOLS)

class Facet(BaseModel):
    """
    Facet is a search filter e.g. color, size, price, etc.
    """
    # NOTE(review): the docstring above and the Field descriptions below are presumably part of
    # the schema handed to the model, so treat their wording as prompt text, not just docs.

    # Name of the filter dimension (required).
    name: str = Field(..., description="Common filters in search e.g. color, size, price, gender, brand, fit, cut,datetime, season, pattern, material, dimension, etc.")
    # One or more values for that dimension (required); always a list of strings.
    values: List[str] = Field(..., description="Values of the facet")

class QueryAnnotation(BaseModel):
    # Structured annotation of one search query; used as the `response_model` element type by
    # the annotation helpers in this notebook.
    #
    # Documented with `#` comments rather than a class docstring on purpose: a docstring on a
    # pydantic model ends up in the generated schema description, which would change what is
    # sent to the model.
    #
    # Every field is required (`Field(...)`); the `Optional[...]` ones may be null but must
    # still be present in the response.

    # The query being annotated.
    query: str = Field(..., description="Query to search for relevant content")
    # A normalised, shorter rewrite of the query.
    simplified_query: str = Field(..., description="Simplified query e.g. wristwatches for men -> men's wristwatches, red dress shirt for husband -> men's red dress shirt")
    # Synthetic product descriptions that would satisfy the query (nullable).
    hypothetical_product_description: Optional[List[str]] = Field(..., description="Given a query, generate unique and diverse product descriptions that satisfy the query")
    # Keywords taken from the query.
    keywords: List[str] = Field(..., description="Keywords to search for")
    # Synonyms / related terms that broaden the keyword set.
    expanded_keywords: List[str] = Field(..., description="Expanded keywords to search for e.g. synonyms, related words, etc.")
    # One of three fixed intent labels.
    intent: Literal["informational", "navigational", "transactional"] = Field(..., description="Intent of the user")
    # English rendering of the query (nullable).
    translated_query: Optional[str] = Field(..., description="English query to search for")
    # Language/locale code for the query (nullable).
    iso_language_code: Optional[str] = Field(..., description="ISO language code to search e.g. en-US, in-IN")
    # Structured filters extracted from the query.
    facets: List[Facet] = Field(..., description="Facets which are used to filter the search results")

    async def execute(self):
        """Print a one-line summary of how the query was decomposed.

        The original message interpolated ``self.semantic_query``, which is not a field of
        this model, so awaiting this coroutine raised ``AttributeError``. The summary now
        reports the declared ``keywords`` field instead.
        """
        print(
            f"Decomposing query `{self.query}` into `{self.simplified_query}` and keywords `{self.keywords}`"
        )

In [3]:
# Read the query file as JSON Lines (`lines=True`: one JSON record per line) into a DataFrame,
# then preview the first rows as the cell output.
df = pd.read_json("amzn_esci_train_query_info.jsonl", lines=True)
df.head()

Unnamed: 0,qid,q_text
0,1,!awnmower tires without rims
1,5,# 10 self-seal envelopes without window
2,6,# 2 pencils not sharpened
3,9,# mom life
4,11,#1 best and not expensive bath back brush crea...


query = f(queries, products, user_actions)

In [4]:
def query_annotation(data: str) -> Iterable[QueryAnnotation]:
    """Ask the model to annotate a text search query.

    Sends a single user message containing ``data`` to ``client.chat.completions.create``
    with ``response_model=Iterable[QueryAnnotation]``.

    Args:
        data: The user query text to annotate.

    Returns:
        Whatever the client returns for that response model; callers in this notebook
        iterate over it and treat each item as a ``QueryAnnotation``.
    """
    prompt = f"Consider the user query below: '\n{data}' and annotate the query"
    conversation = [{"role": "user", "content": prompt}]

    # Fixed model, token budget, temperature and seed, as in every call made from this notebook.
    return client.chat.completions.create(
        model="gpt-4o",
        response_model=Iterable[QueryAnnotation],
        messages=conversation,
        max_tokens=1000,
        temperature=0.0,
        seed=42,
    )

def image_query_annotation(url: str, query: str) -> Iterable[QueryAnnotation]:
    """Ask the model to annotate a search query in the context of an image.

    Sends one user message with two content parts, in this order: the image (by URL) and a
    text instruction embedding ``query``.

    Args:
        url: URL of the image to include in the request.
        query: The user query text to annotate.

    Returns:
        Whatever the client returns for ``response_model=Iterable[QueryAnnotation]``;
        callers in this notebook iterate over it and treat each item as a ``QueryAnnotation``.
    """
    image_part = {"type": "image_url", "image_url": {"url": url}}
    text_part = {
        "type": "text",
        "text": f"Analyze the image and annotate the query: {query}",
    }
    conversation = [{"role": "user", "content": [image_part, text_part]}]

    return client.chat.completions.create(
        model="gpt-4o",
        response_model=Iterable[QueryAnnotation],
        messages=conversation,
        max_tokens=1000,
        temperature=0.0,
        seed=42,
    )

# Smoke-test the text annotator on one natural-language request and pretty-print each
# returned annotation as indented JSON.
for query in query_annotation("I'm looking for a red dress shirt for my husband for a Bertem wedding"):
    print(query.model_dump_json(indent=2))

{
  "query": "I'm looking for a red dress shirt for my husband for a Bertem wedding",
  "simplified_query": "men's red dress shirt",
  "hypothetical_product_description": [
    "A stylish men's red dress shirt perfect for weddings and formal occasions, featuring a slim fit and high-quality fabric.",
    "Elegant red dress shirt for men, ideal for weddings, with a classic collar and button-down design.",
    "Men's red dress shirt crafted from breathable material, suitable for weddings and special events, offering comfort and style."
  ],
  "keywords": [
    "red dress shirt",
    "men's dress shirt",
    "wedding shirt"
  ],
  "expanded_keywords": [
    "men's red shirt",
    "formal red shirt",
    "wedding attire",
    "husband's dress shirt"
  ],
  "intent": "transactional",
  "translated_query": null,
  "iso_language_code": null,
  "facets": [
    {
      "name": "color",
      "values": [
        "red"
      ]
    },
    {
      "name": "gender",
      "values": [
        "men"
  

In [5]:
from IPython.display import Image
from IPython.core.display import HTML

def display_image_from_url(url: str):
    """Build an IPython ``Image`` object that points at ``url``.

    Args:
        url: Address of the image.

    Returns:
        The ``Image`` instance; the caller is responsible for passing it to ``display``.
    """
    remote_image = Image(url=url)
    return remote_image

# Show the sample image under a heading, then annotate a query against that same image.
# The URL is bound once so the displayed image and the annotated image cannot drift apart.
sample_image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/d5/Blue_sari_2.jpg/920px-Blue_sari_2.jpg"
sample_image_query = "I'm looking for a kanchipuram silk saree"

display(HTML("<h3>Sample Image</h3>"))
display(display_image_from_url(sample_image_url))

for query in image_query_annotation(sample_image_url, sample_image_query):
    print(query.model_dump_json(indent=2))

{
  "query": "kanchipuram silk saree",
  "simplified_query": "kanchipuram silk saree",
  "hypothetical_product_description": [
    "A luxurious Kanchipuram silk saree with intricate zari work, perfect for weddings and special occasions.",
    "Elegant Kanchipuram silk saree featuring traditional motifs and a rich pallu, ideal for festive celebrations.",
    "Classic Kanchipuram silk saree in vibrant colors with a contrasting border, suitable for formal events."
  ],
  "keywords": [
    "kanchipuram",
    "silk",
    "saree"
  ],
  "expanded_keywords": [
    "kanchipuram saree",
    "silk saree",
    "kanchipuram silk",
    "traditional saree",
    "wedding saree"
  ],
  "intent": "transactional",
  "translated_query": "kanchipuram silk saree",
  "iso_language_code": "en-US",
  "facets": [
    {
      "name": "material",
      "values": [
        "silk"
      ]
    },
    {
      "name": "type",
      "values": [
        "kanchipuram"
      ]
    }
  ]
}


In [6]:
# Pull 10 random queries from the df and display their annotations
def display_annotated_queries(queries: List[str]):
    """Annotate each query and print every annotation next to its source query.

    For each input string, ``query_annotation`` is called once; every item it yields is
    printed as the original query, the annotation as indented JSON, and a blank separator.

    Args:
        queries: Query strings to annotate. An empty list prints nothing.
    """
    for raw_query in queries:
        for annotation in query_annotation(raw_query):
            print(f"Query: {raw_query}")
            print(f"Annotated Query: {annotation.model_dump_json(indent=2)}")
            print("\n")

# Fixed random_state so the same 10 rows of `q_text` are drawn on every run.
seed = 69
queries = df["q_text"].sample(10, random_state=seed).tolist()
# One model request is issued per sampled query.
display_annotated_queries(queries)

Query: waterproof kid gloves small
Annotated Query: {
  "query": "waterproof kid gloves small",
  "simplified_query": "small waterproof gloves for kids",
  "hypothetical_product_description": [
    "These small waterproof gloves for kids are perfect for rainy days and outdoor adventures. Made with durable, water-resistant material, they keep little hands dry and comfortable.",
    "Designed for children, these small waterproof gloves offer excellent protection against the elements. Ideal for playing in the snow or rain, they ensure warmth and dryness.",
    "Keep your child's hands warm and dry with these small waterproof gloves. Perfect for winter sports or rainy weather, they are both functional and stylish."
  ],
  "keywords": [
    "waterproof",
    "kid gloves",
    "small"
  ],
  "expanded_keywords": [
    "children's waterproof gloves",
    "small size gloves for kids",
    "water-resistant gloves for children"
  ],
  "intent": "transactional",
  "translated_query": null,
  "iso