In [1]:
%cd ..

/Users/shivamkaushik/Code/ik-agent


In [2]:
import pandas as pd

In [31]:
# Load the transform-parameter catalogue and turn the comma-separated `tags`
# column into a list of clean tag strings per row.
df = pd.read_csv("static/ik_transforms.csv")
# fillna("") keeps rows without tags from crashing on `.split` (NaN is a float);
# empty fragments, e.g. the middle one in "resize,,crop", are dropped.
df["tags"] = df["tags"].fillna("").apply(
    lambda raw: [tag.strip() for tag in raw.split(",") if tag.strip()]
)
df.head()

Unnamed: 0,parameter_name,ik_key,value_type,allowed_values,constraints,tags,used_in,description,example_usage
0,width,width,"number, arithmetic_expression, """"""auto""""""",integer > 1 (px) OR 0 < w < 1 (percentage) OR ...,Auto width from Client Hint Sec-CH-Width.,"[resize, crop]",image,"Output width. If only w is provided, height au...",### Example 1\n\n**Integer value**\n\n```pytho...
1,height,height,"number, arithmetic_expression, """"""auto""""""",integer > 1 (px) OR 0 < w < 1 (percentage) OR ...,,[resize],image,"Output height. If only h is provided, width au...",### Example 1\n\n**Integer value**\n\n```pytho...
2,Aspect Ratio,aspect_ratio,"<w>-<h>, arithmetic_expression","width-height, arithmetic expression",Must be used with either w or h. Ignored if bo...,[resize],image,Aspect ratio (width:height).,"### Example 1\n\n**width = 400, height = 300, ..."
3,Crop Mode,crop_mode,enum,pad_resize | pad_extract | extract,pad_resize\n\n* Requires **both `w` and `h`** ...,"[resize, crop]",image,`cm-pad_resize` resizes the image to fit withi...,## pad_resize\n\n### Example: Equal padding on...
4,Crop,crop,enum,force | at_max_enlarge | at_least | maintain_r...,## force (`force`)\n\n* Requires **both `w` an...,[crop],image,**force (`force`)**\nResizes the image to the ...,---\n\n## force (`force`)\n\n### This example ...


In [32]:
# Every distinct tag that appears in any row's tag list.
unique_tags = {tag for row_tags in df.tags.values for tag in row_tags}

unique_tags

{'crop', 'resize', 'smart_crop'}

In [None]:
from dotenv import load_dotenv
from openai import AsyncOpenAI

# Load variables from a local .env file into the process environment
# (presumably where the OpenAI credentials come from — confirm).
load_dotenv()

# Async OpenAI client used by the LLM helper functions defined in this notebook.
client = AsyncOpenAI()


In [None]:
import json

# Prompt for the small classifier model. It is filled in with str.format():
# {methods_json}, {tags_json} and {user_query} are placeholders, while the
# doubled braces in the "Format" section render as literal { } in the prompt.
SMALL_LLM_PROMPT_TEMPLATE = """
You are a strict classifier.

Your task:
Given a user query, identify:
1. Which ImageKit transformation METHODS are required
2. Which semantic TAGS are relevant

Rules:
- You MUST choose tags ONLY from the provided list
- You MUST choose methods ONLY from the provided list
- Do NOT invent new tags or methods
- Do NOT generate parameter values
- Do NOT generate ImageKit keys

Valid methods:
{methods_json}

Valid tags:
{tags_json}

Output STRICT JSON only.

Format:
{{
  "methods": ["method_name"],
  "tags": ["tag1", "tag2"]
}}

User query:
{user_query}
""".strip()


def build_tag_block(tags: set[str]) -> str:
    """Serialize ``tags`` as an alphabetically sorted, 2-space-indented JSON array."""
    ordered_tags = list(tags)
    ordered_tags.sort()
    return json.dumps(ordered_tags, indent=2)


def build_small_llm_prompt(
    user_query: str,
    valid_methods: list[str],
    valid_tags: set[str],
) -> str:
    """
    Fill the small-classifier prompt template for one user query.

    Methods are rendered in the order given; tags are sorted first so the
    prompt text does not depend on set iteration order. Both are embedded as
    2-space-indented JSON arrays.
    """
    rendered_methods = json.dumps(valid_methods, indent=2)
    rendered_tags = json.dumps(sorted(valid_tags), indent=2)
    placeholders = {
        "user_query": user_query,
        "methods_json": rendered_methods,
        "tags_json": rendered_tags,
    }
    return SMALL_LLM_PROMPT_TEMPLATE.format(**placeholders)


In [None]:
# Render the classifier prompt for a placeholder query to eyeball the final text.
user_query = "user_query"

prompt = build_small_llm_prompt(
    user_query,
    ["resize_and_crop", "image_overlay", "text_overlay"],
    unique_tags,
)
print(prompt)

You are a strict classifier.

Your task:
Given a user query, identify:
1. Which ImageKit transformation METHODS are required
2. Which semantic TAGS are relevant

Rules:
- You MUST choose tags ONLY from the provided list
- You MUST choose methods ONLY from the provided list
- Do NOT invent new tags or methods
- Do NOT generate parameter values
- Do NOT generate ImageKit keys

Valid methods:
[
  "resize_and_crop",
  "image_overlay",
  "text_overlay"
]

Valid tags:
[
  "crop",
  "resize",
  "smart_crop"
]

Output STRICT JSON only.

Format:
{
  "methods": ["method_name"],
  "tags": ["tag1", "tag2"]
}

User query:
user_query


In [None]:
import json
from typing import Dict, List


async def small_llm_filter(user_query: str) -> Dict[str, List[str]]:
    """
    Classify a user query with a small, cheap model.

    Builds the classifier prompt from the fixed method list below and the
    notebook-level ``unique_tags`` set, requests a JSON object from the model
    and parses it.

    Parameters
    ----------
    user_query : str
        Free-text request to classify.

    Returns
    -------
    dict
        The parsed reply. It always contains ``"methods"`` and ``"tags"``
        lists, each restricted to the values offered in the prompt; any other
        keys the model returned are passed through unchanged.

    Raises
    ------
    json.JSONDecodeError
        If the reply is not valid JSON.
    """
    valid_methods = [
        "resize_and_crop",
        "image_overlay",
        "text_overlay",
    ]

    prompt = build_small_llm_prompt(
        user_query=user_query,
        valid_methods=valid_methods,
        valid_tags=unique_tags,
    )

    response = await client.chat.completions.create(
        model="gpt-4o-mini",  # cheap + fast
        messages=[
            {"role": "system", "content": "You are a strict JSON-only classifier."},
            {"role": "user", "content": prompt},
        ],
        temperature=0,
        # JSON mode: the reply is a bare JSON object, never fenced or wrapped in prose.
        response_format={"type": "json_object"},
    )

    # `content` can be None (e.g. a refusal); treat that as "nothing classified".
    parsed = json.loads(response.choices[0].message.content or "{}")
    if not isinstance(parsed, dict):
        parsed = {}

    # Enforce the prompt's "choose ONLY from the provided list" rule in code and
    # guarantee both keys exist, since later cells index the result directly.
    allowed_values = {"methods": set(valid_methods), "tags": set(unique_tags)}
    for key, permitted in allowed_values.items():
        candidates = parsed.get(key)
        if not isinstance(candidates, list):
            candidates = []
        parsed[key] = [
            value
            for value in candidates
            if isinstance(value, str) and value in permitted
        ]

    return parsed


In [28]:
# Classify a sample request with the small model; `output` holds the parsed
# JSON reply that the tag-mask cell further down indexes with "tags".
user_query = "resize image to 300x300 and add padding of 50 pixels"

output = await small_llm_filter(user_query)

In [None]:
# def filter_csv_rows(
#     df,
#     methods: List[str],
#     tags: List[str],
# ) -> Dict[str, List[dict]]:
#     """
#     Filters CSV rows based on used_in and tags.
#     Groups results by method.
#     """

#     result: Dict[str, List[dict]] = {}

#     for row in rows:
#         if row["used_in"] not in methods:
#             continue

#         row_tags = [t.strip() for t in row["tags"].split(",")]
#         if not any(tag in row_tags for tag in tags):
#             continue

#         result.setdefault(row["used_in"], []).append(row)

#     return result


In [None]:
# Boolean mask: True for rows that share at least one tag with the classifier output.
cond = df.tags.apply(
    lambda row_tags: not set(output["tags"]).isdisjoint(set(row_tags))
)

In [None]:
# Matching rows as a list of plain dicts (one per CSV row) for prompt serialization.
matching_rows = df.loc[cond]
filtered_metadata = matching_rows.to_dict("records")

In [None]:
# Prompt for the larger planning model. The schema section contains literal
# braces, so the two {{...}} markers are substituted with str.replace()
# (see big_llm_generate) rather than str.format().
BIG_LLM_PROMPT = """
You are an ImageKit transformation generator.

You are given:
1. A user query
2. A list of VALID parameters for a specific method
   (including constraints, allowed values, and examples)

Rules:
- Use ONLY the provided parameters
- Use parameter_name (NOT ImageKit short keys)
- Do NOT invent parameters
- Do NOT invent methods
- Do NOT include null or unused parameters
- Output MUST be valid JSON
- Output MUST match the schema exactly

Schema:
[
  {
    "method": "<method_name>",
    "params": { "<parameter_name>": <value> }
  }
]

If the query cannot be satisfied using the provided parameters,
return an empty array [].

User query:
{{USER_QUERY}}

Allowed parameters metadata:
{{FILTERED_PARAMETER_METADATA}}
""".strip()


async def big_llm_generate(
    user_query: str,
    filtered_metadata: List[dict],
) -> List[dict]:
    """
    Generate a transformation plan ``[{method, params}, ...]`` for a query.

    Parameters
    ----------
    user_query : str
        Free-text request from the user.
    filtered_metadata : list of dict
        Parameter metadata records to offer the model (in this notebook, the
        CSV rows selected by the tag mask). Serialized into the prompt as JSON.

    Returns
    -------
    list of dict
        The parsed plan. An empty reply yields ``[]``; a single JSON object is
        wrapped in a list so the return shape is always a list.

    Raises
    ------
    json.JSONDecodeError
        If the reply is non-empty but not valid JSON.
    """
    metadata_json = json.dumps(filtered_metadata, indent=2)
    prompt = BIG_LLM_PROMPT.replace("{{USER_QUERY}}", user_query).replace(
        "{{FILTERED_PARAMETER_METADATA}}", metadata_json
    )

    response = await client.chat.completions.create(
        model="gpt-4.1",  # stronger reasoning
        messages=[
            {"role": "system", "content": "You output valid JSON only."},
            {"role": "user", "content": prompt},
        ],
        temperature=0,
    )

    # `content` can be None (e.g. a refusal); treat that as "no plan".
    reply = (response.choices[0].message.content or "").strip()

    # Models sometimes wrap JSON in a markdown fence (```json ... ```); unwrap it
    # so an otherwise valid plan is not rejected by json.loads.
    if reply.startswith("```"):
        reply = reply.strip("`").strip()
        if reply[:4].lower() == "json":
            reply = reply[4:]

    if not reply.strip():
        return []

    plan = json.loads(reply)
    # The schema asks for an array; tolerate a lone step object.
    if isinstance(plan, dict):
        plan = [plan]
    return plan


In [40]:
# Ask the larger model for a structured plan built from the filtered metadata.
# NOTE(review): this rebinds `output`, which until now held the classifier
# reply; re-running the tag-mask cell after this one would index a list with
# "tags" and fail. A distinct name for the plan would avoid that.
output = await big_llm_generate(
    user_query=user_query,
    filtered_metadata=filtered_metadata,
)

In [41]:
output

[{'method': 'resize_and_crop',
  'params': {'width': 300, 'height': 300, 'crop_mode': 'pad_resize'}}]

In [44]:
import json
import os

from typing import Any, Dict, List, Optional
from strands import tool
from src.utils.utils import (
    embed_query,
    detect_sources,
    get_query_keywords_using_model,
    maybe_filter,
)

from src.utils.utils import ImagekitInformationSource
from src.config import TYPESENSE_CLIENT, TYPESENSE_MODEL_PAYLOAD

('imagekit_guides', 'imagekit_community')

In [48]:
async def search_docs(
    *,
    query: str,
    sources: Optional[List[str]] = None,
    conversation_id: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Run a hybrid (keyword + vector) Typesense search over ImageKit docs.

    The query is enriched with keywords from ``get_query_keywords_using_model``,
    embedded with ``embed_query``, and sent as a single ``multi_search`` request.

    Parameters
    ----------
    query : str
        User question or search phrase.
    sources : list of str, optional
        Values for the ``source`` filter. When omitted or empty, the ImageKit
        guides and community sources are searched.
    conversation_id : str, optional
        Forwarded as ``conversation_id`` in the common params when given.

    Returns
    -------
    dict
        Whatever ``TYPESENSE_CLIENT.multi_search.perform`` returns for the
        request.
    """
    # Only fall back to the default sources when the caller did not choose any.
    if sources:
        sources = list(sources)
    else:
        sources = [
            ImagekitInformationSource.ImagekitGuides.value,
            ImagekitInformationSource.ImagekitCommunity.value,
        ]

    keywords = await get_query_keywords_using_model(query)
    enriched_query = f"{query}, Keywords: {', '.join(keywords)}" if keywords else query

    vector = await embed_query(enriched_query)
    # Compact separators keep the (long) embedding literal as short as possible.
    embed_str = json.dumps(vector, separators=(",", ":"))

    search_params = {
        "collection": os.getenv("TYPESENSE_COLLECTION", ""),
        "query_by": (
            "section_content,summary,page_description,keywords,"
            "lvl0,lvl1,lvl2,lvl3,lvl4,lvl5,lvl6"
        ),
        # One weight per query_by field, in the same order.
        "query_by_weights": "3,2,2,1,1,1,1,1,1,1,1",
        "vector_query": f"content_embedding:({embed_str},k:60)",
        "limit": 10,
        "rerank_hybrid_matches": True,
        "exclude_fields": "content_embedding",
        # NOTE(review): this interpolates Python's list repr (quoted items);
        # confirm Typesense's filter syntax accepts it as intended.
        "filter_by": f"source:={sources}",
    }

    common_params: Dict[str, Any] = {
        "q": enriched_query,
        "conversation": False,
        "conversation_model_id": TYPESENSE_MODEL_PAYLOAD["id"],
    }
    if conversation_id:
        common_params["conversation_id"] = conversation_id

    payload = {"searches": [search_params]}
    return TYPESENSE_CLIENT.multi_search.perform(payload, common_params=common_params)


In [108]:
# Smoke-test the docs search with a sample query; `results` is the raw search response.
results = await search_docs(query="video smart crop")

In [None]:
from typing import List, Tuple


def _hit_rank_score(hit: dict) -> float:
    """Return the hit's rank-fusion score, or -inf when it is missing."""
    fusion_info = hit.get("hybrid_search_info") or {}
    score = fusion_info.get("rank_fusion_score")
    return score if isinstance(score, (int, float)) else float("-inf")


def _hit_page_position(hit: dict) -> tuple:
    """Sort key ordering a hit by its document's (source_url, line_start)."""
    doc = hit.get("document") or {}
    line_start = doc.get("line_start")
    if not isinstance(line_start, (int, float)):
        # Sections without a usable line number go after the numbered ones.
        line_start = float("inf")
    return (doc.get("source_url") or "", line_start)


def group_search_results(search_results: dict) -> dict[str, dict]:
    """
    Group the hits of a search response into one entry per ``source_url``.

    Structure:
        {
            source_url: {
                "page_title": str,
                "page_description": str,
                "content": str,  # concatenated section text with summaries
            },
            ...
        }

    Parameters
    ----------
    search_results : dict
        Search response whose hits live under ``["results"][0]["hits"]``.
        Each hit wraps its fields in ``hit["document"]`` and may carry
        ``hit["hybrid_search_info"]["rank_fusion_score"]``.

    Returns
    -------
    dict
        Pages keyed by ``source_url``, in ascending URL order. Within a page,
        sections are ordered by ``line_start`` when the document provides it;
        ties (and sections without it) keep descending rank-fusion order.
        Hits without a ``source_url`` are skipped. A response with no results
        or no hits yields ``{}``.
    """
    searches = (search_results or {}).get("results") or []
    hits = (searches[0].get("hits") or []) if searches else []

    # Rank first, then order by page position. sorted() is stable, so the rank
    # order survives as the tie-breaker. The position key must read from the
    # wrapped document: the hit wrapper itself has no source_url/line_start.
    ranked_hits = sorted(hits, key=_hit_rank_score, reverse=True)
    ordered_hits = sorted(ranked_hits, key=_hit_page_position)

    final_docs: dict[str, dict] = {}

    for hit in ordered_hits:
        doc = hit.get("document") or {}
        source_url = doc.get("source_url")
        if not source_url:
            # Skip if no source_url (invalid record)
            continue

        # Page-level fields come from the first section seen for this URL.
        page_entry = final_docs.setdefault(
            source_url,
            {
                "page_title": doc.get("lvl0", ""),
                "page_description": doc.get("page_description", ""),
                "content": "",
            },
        )

        # Build breadcrumb from the lvl1-lvl6 heading hierarchy.
        breadcrumb = " > ".join(
            doc[f"lvl{level}"] for level in range(1, 7) if doc.get(f"lvl{level}")
        )

        # `or ""` also covers fields that are present but null.
        section_content = (doc.get("section_content") or "").strip()
        summary = (doc.get("summary") or "").strip()

        page_entry["content"] += (
            f"\n"
            f"## {breadcrumb or '(No Section Title)'}\n"
            f"**Summary:** {summary or '(No summary)'}\n\n"
            f"{section_content}\n"
            f"---\n"
        )

    return final_docs

In [115]:
# Collapse the raw hits into one entry per documentation page.
search_results = group_search_results(results)
# results['results'][0]['hits'][0]

In [116]:
search_results

{'https://imagekit.io/docs/video-resize-and-crop': {'page_title': 'Resize and Crop Videos',
  'page_description': 'Learn how to resize and crop videos using the ImageKit.io URL-based transformation parameters.',
 'https://imagekit.io/docs/video-transformation': {'page_title': 'Video Transformation',
  'page_description': 'Learn how to transform videos using the ImageKit.io URL-based transformation parameters.',
  'content': '\n## Pricing\n**Summary:** This section defines how Video Processing Units (VPUs) are calculated for video transformations based on output duration, resolution, and codec, including formulas, per-second unit rates, and a resolution classification table, plus special rules for audio extraction, adaptive bitrate streaming, thumbnails, and smart crop. It matters because it explains exactly which transformations incur processing costs and how to estimate or optimize billing impact.\n\nEvery new video transformation that has never been done before will contribute toward

In [159]:
# Prompt for the documentation-fallback extractor. {user_query} and
# {doc_context} are str.format() placeholders.
# NOTE(review): the rules forbid ImageKit short keys, yet the requested output
# is a list of "param-value" strings and the sample run in this notebook
# returned short-key tokens such as 'fo-face' — confirm which form is intended.
DOC_PARAM_EXTRACTION_PROMPT = """
You are an ImageKit documentation interpreter.

Your task:
Given:
1. A user query
2. Documentation search results (raw content from ImageKit docs)

Extract ONLY the transformation parameters that are:
- Explicitly supported by the documentation
- Directly relevant to the user query

--------------------------------
STRICT RULES (VERY IMPORTANT)
--------------------------------
- Extract ONLY parameters that appear explicitly in the documentation
- Do NOT invent parameters
- Do NOT invent default values
- Do NOT output ImageKit short keys
- Do NOT include pricing, cost formulas, or explanations
- Do NOT include parameters that are restricted or incompatible
- Respect documented limitations and ordering rules
- If the query cannot be fulfilled reliably, return an empty list

--------------------------------
OUTPUT FORMAT (STRICT JSON ONLY)
--------------------------------
["param-value", ....]

--------------------------------
USER QUERY
--------------------------------
{user_query}

--------------------------------
DOCUMENTATION CONTEXT
--------------------------------
{doc_context}
""".strip()


In [160]:
def flatten_search_docs(search_docs_result: Dict[str, Any]) -> str:
    """
    Render grouped documentation pages as one plain-text string for an LLM prompt.

    Each page becomes a SOURCE / TITLE / DESCRIPTION / CONTENT block with
    surrounding whitespace trimmed; blocks are separated by a ``---`` rule.
    """
    rendered_pages = []

    for page_url, page in search_docs_result.items():
        page_text = "\n".join(
            (
                f"SOURCE: {page_url}",
                f'TITLE: {page.get("page_title")}',
                f'DESCRIPTION: {page.get("page_description")}',
                "",
                "CONTENT:",
                f'{page.get("content")}',
            )
        )
        rendered_pages.append(page_text.strip())

    return "\n\n---\n\n".join(rendered_pages)


In [161]:
async def extract_params_from_docs(
    *,
    user_query: str,
    search_docs_result: Dict[str, Any],
) -> Any:
    """
    Extract ImageKit transformation parameters from documentation context.

    This function is intended to be used ONLY as a fallback when
    CSV/schema-based transformation planning fails.

    Parameters
    ----------
    user_query : str
        Original user query (e.g. "video-smart-crop for face-crop")

    search_docs_result : dict
        Pages grouped by URL:
        {
          "url": {
              "page_title": "...",
              "page_description": "...",
              "content": "..."
          }
        }

    Returns
    -------
    Any
        The parsed JSON reply. The prompt asks the model for a JSON list of
        "param-value" strings, so that is the expected shape. ``{}`` is
        returned when the reply is empty or is not valid JSON.
    """

    doc_context = flatten_search_docs(search_docs_result)
    prompt = DOC_PARAM_EXTRACTION_PROMPT.format(
        user_query=user_query,
        doc_context=doc_context,
    )

    response = await client.chat.completions.create(
        model="gpt-4.1",
        messages=[
            {
                "role": "system",
                "content": "You extract parameters strictly from documentation and output JSON only.",
            },
            {
                "role": "user",
                "content": prompt,
            },
        ],
        temperature=0,
    )

    # `content` can be None (e.g. a refusal); fall through to the {} fallback.
    content = (response.choices[0].message.content or "").strip()

    # Unwrap a markdown fence (```json ... ```) so a valid answer is not
    # silently discarded by the fallback below.
    if content.startswith("```"):
        content = content.strip("`").strip()
        if content[:4].lower() == "json":
            content = content[4:]

    try:
        return json.loads(content)
    except json.JSONDecodeError:
        # Absolute safety: never propagate malformed output
        return {}


In [None]:
# Docs-fallback walkthrough for a video smart-crop request: search first,
# then group the hits per documentation page. The extraction call itself
# runs in the next cell.
query = "video-smart-crop for face-crop for 300x300"
results = await search_docs(query=query)
search_results = group_search_results(results)

# params = await extract_params_from_docs(
#     user_query=query,
#     search_docs_result=search_results,
# )


In [166]:
# Ask the model to pull parameters for `query` out of the grouped doc pages.
params = await extract_params_from_docs(
    user_query=query,
    search_docs_result=search_results,
)

In [168]:
params

['fo-face', 'w-300', 'h-300']

In [None]:
from src.tools.transformations.transformation import (
    load_transform_metadata,
    extract_unique_tags,
    small_llm_filter,
    filter_metadata,
    big_llm_generate,
    search_docs,
    group_search_results,
    extract_params_from_docs,
    resolve_imagekit_transform
)

  return meta(
  return meta(


In [3]:
# Same pipeline, now driven through the helpers imported from
# src.tools.transformations.transformation instead of the notebook-local ones.
df = load_transform_metadata("static/ik_transforms.csv")
unique_tags = extract_unique_tags(df)

user_query="resize image to 300x300 and add padding of 50 pixels, add image overlays"

# Only "resize_and_crop" is offered as a valid method here, so the overlay
# part of the query cannot be mapped to a method by the classifier.
output = await small_llm_filter(
    user_query=user_query,
    valid_methods=["resize_and_crop"],
    valid_tags=unique_tags,
)
print(output)

{'methods': ['resize_and_crop'], 'tags': ['resize'], 'unresolved_intent': 'search for methods to add padding and image overlays'}


In [4]:
# Narrow the parameter catalogue using the tags the classifier selected.
filtered_metadata = filter_metadata(
    df=df,
    tags=output["tags"],
)

In [5]:
# Ask the larger model for a structured plan from the filtered metadata.
structured_plan = await big_llm_generate(
    user_query=user_query,
    filtered_metadata=filtered_metadata,
)

In [8]:
if (not structured_plan) or output.get("unresolved_intent"):
    raw_results = await search_docs(query=output.get("unresolved_intent"))
    grouped = group_search_results(raw_results)
    doc_params = await extract_params_from_docs(
        user_query=user_query,
        search_docs_result=grouped,
    )

In [9]:
doc_params

{'params': {'w': '300', 'h': '300', 'cm': 'pad_resize'}}

In [10]:
structured_plan

[{'method': 'resize_and_crop',
  'params': {'width': 300, 'height': 300, 'crop_mode': 'pad_resize'}}]

In [15]:
from src.utils.utils import get_transform_key

# Translate the doc-derived parameter names once via get_transform_key.
# A missing or non-dict "params" entry simply yields no doc-derived parameters.
raw_doc_params = doc_params.get("params") if isinstance(doc_params, dict) else None
doc_derived_params = {
    get_transform_key(name): value
    for name, value in (raw_doc_params if isinstance(raw_doc_params, dict) else {}).items()
}

final_params_for_transformation = []
if structured_plan:
    normal_transformations = {}
    overlay_transformations = []
    for transformation in structured_plan:
        step_params = transformation.get("params") or {}
        # Overlay steps stay separate entries; all other steps merge into one dict.
        if "overlay" in (transformation.get("method") or ""):
            overlay_transformations.append(step_params)
        else:
            normal_transformations.update(step_params)

    # Doc-derived values are applied last, so they override plan values for the same key.
    normal_transformations.update(doc_derived_params)

    final_params_for_transformation.append(normal_transformations)
    final_params_for_transformation.extend(overlay_transformations)
elif doc_params:
    # No structured plan: the doc-derived parameters are the whole result.
    final_params_for_transformation.append(doc_derived_params)

In [16]:
final_params_for_transformation

[{'width': '300', 'height': '300', 'crop_mode': 'pad_resize'}]

In [4]:
# End-to-end run through the imported resolve_imagekit_transform helper
# (presumably wraps the classify -> plan -> docs-fallback steps above — confirm).
user_query = "resize image to 300x300 and add focus on face"
transformation = await resolve_imagekit_transform(
    user_query=user_query,
    csv_path="static/ik_transforms.csv",
)

In [5]:
transformation

[{'width': '300', 'height': '300', 'focus': 'face'}]

In [None]:
from src.modules.ik_transforms.transforms.resize_n_crop import ResizeAndCropTransforms

print(ResizeAndCropTransforms.resize_and_crop.__doc__)


        Validate and normalize ImageKit resize & crop parameters.

        Parameters
        ----------
        width : int | float | str, optional
            Output width. Accepts:
            - int/float > 0
            - string tokens like "auto"
            - arithmetic expressions as strings (passed through)

        height : int | float | str, optional
            Output height. Same acceptance as width.

        aspect_ratio : str, optional
            Aspect ratio as "<w>-<h>" (e.g. "16-9") or an arithmetic expression string.

        crop : {"force","at_max_enlarge","at_least","maintain_ratio"}, optional
            Default: "maintain_ratio"
            Resize/crop strategy. Important:
            - When crop='force', focus and zoom are not allowed.

        crop_mode : {"pad_resize","pad_extract","extract"}, optional
            Crop mode controlling padding/extraction.
            - Coordinates (x,y,x_center,y_center) are ONLY allowed with crop_mode in {"extract","pad_ext