# 01 · Extract Clarification Data

Fetch menu-query clarification logs via the analytics API and store the raw payload for downstream notebooks.

## Usage
- Ensure the FastAPI service is running with `/analytics/menu-query/clarifications` enabled.
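- Provide `RAG_ADMIN_API_KEY` as an environment variable or in a `.env` file located in the notebook directory or any of its parent directories (`.env` loading requires `python-dotenv`). `ANALYTICS_API_URL`, `ANALYTICS_FETCH_LIMIT`, `ANALYTICS_START_AT`, and `ANALYTICS_END_AT` are optional.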
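- The notebook searches the current working directory and each of its parents for `notebooks/menu_query_training` or `chat-infrastructure/rag_service/notebooks/menu_query_training`, and writes to that directory's `artifacts` subfolder. If no match is found, the canonical `chat-infrastructure/rag_service/notebooks/menu_query_training/artifacts` directory is created under the current working directory.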


In [1]:
import os
from pathlib import Path

try:
    from dotenv import load_dotenv
except ImportError:
    load_dotenv = None

import pandas as pd
import requests

def resolve_notebook_dir() -> Path:
    """Locate the ``menu_query_training`` notebook directory.

    Starting at the current working directory and then walking up through each
    of its parents, two layouts are probed per directory, in this order:

    1. ``<dir>/notebooks/menu_query_training``
    2. ``<dir>/chat-infrastructure/rag_service/notebooks/menu_query_training``

    Returns:
        The first probed path that exists. When none exists, the second layout
        rooted at the current working directory is returned; this function
        does not create it.
    """
    short_layout = Path("notebooks", "menu_query_training")
    canonical_layout = Path(
        "chat-infrastructure", "rag_service", "notebooks", "menu_query_training"
    )

    here = Path.cwd()
    for base in (here, *here.parents):
        for layout in (short_layout, canonical_layout):
            probe = base / layout
            if probe.exists():
                return probe

    # Nothing found anywhere up the tree: hand back the canonical location
    # relative to the working directory.
    return here / canonical_layout

# Locate the notebook folder and make sure its artifacts directory exists; the
# export cell writes its Parquet output there.
NOTEBOOK_DIR = resolve_notebook_dir()
DATA_DIR = NOTEBOOK_DIR / "artifacts"
DATA_DIR.mkdir(parents=True, exist_ok=True)

# python-dotenv is optional (load_dotenv is None when it is not installed).
# When available, load every .env found from the notebook folder upwards.
# override=False keeps values already present in the environment, so real
# environment variables and .env files closer to the notebook take precedence.
if load_dotenv:
    env_seen = set()
    for root in [NOTEBOOK_DIR, *NOTEBOOK_DIR.parents]:
        env_path = root / ".env"
        if env_path.exists() and env_path not in env_seen:
            load_dotenv(env_path, override=False)
            env_seen.add(env_path)

# Use `or` instead of a getenv default so that a blank entry such as
# `ANALYTICS_API_URL=` (exported as an empty string) is treated as unset,
# matching how blank ANALYTICS_START_AT / ANALYTICS_END_AT are handled below.
BASE_URL = (os.getenv("ANALYTICS_API_URL") or "http://localhost:8081").rstrip('/')
ADMIN_KEY = os.getenv("RAG_ADMIN_API_KEY")
if not ADMIN_KEY:
    raise ValueError("RAG_ADMIN_API_KEY is required to call the analytics endpoint.")

# Query parameters: the row limit is always sent, the time window only when set.
# A blank ANALYTICS_FETCH_LIMIT falls back to 500 instead of crashing int("").
params = {
    "limit": int(os.getenv("ANALYTICS_FETCH_LIMIT") or "500"),
}
start_at = os.getenv("ANALYTICS_START_AT")
end_at = os.getenv("ANALYTICS_END_AT")
if start_at:
    params["start_at"] = start_at
if end_at:
    params["end_at"] = end_at

# Fetch the clarification logs; any non-2xx response raises immediately.
response = requests.get(
    f"{BASE_URL}/analytics/menu-query/clarifications",
    headers={"x-rag-admin-key": ADMIN_KEY},
    params=params,
    timeout=30
)
response.raise_for_status()
payload = response.json()
items = payload.get("items", [])
print(f"Fetched {len(items)} clarification rows")
if not items:
    # Stop here so the export cell never writes an empty artifact.
    raise ValueError("No clarifications returned; adjust filters or confirm data exists.")

df = pd.DataFrame(items)
print(f"Notebook directory: {NOTEBOOK_DIR}")
# Last expression of the cell: Jupyter renders the first rows as a preview.
df.head()


Fetched 8 clarification rows
Notebook directory: e:\Omnichannel\Omnichannel\chat-infrastructure\rag_service\notebooks\menu_query_training


Unnamed: 0,clarification_id,query_time,raw_query,normalized_query,tokens,intents,ambiguity_score,query_metadata,question_text,clarification_metadata,user_reply,resolved_intent,resolution_status,resolved_item_id,resolved_item_name
0,ec9d7853-8fb4-42e3-b9e9-1605e55541b9,2025-10-29T06:33:42,Need gluten-free options how about drink ?,need gluten-free options how about drink ?,"[need, gluten, free, options, how, about, drink]","{'spice': None, 'courses': ['beverage'], 'temp...",0.997,"{'available': True, 'tokenCount': 7, 'queryLen...",Would you like to focus on Wine & Cocktails or...,"{'options': ['Wine & Cocktails', 'Beverages'],...",Beverages,freeform-input,CLARIFIED,7f10fc4b-1749-4708-85de-ee4edba3935d,Fresh Orange Juice
1,2cc72e88-1c50-48ff-b320-ee0b248c8a83,2025-10-29T06:33:26,Need gluten-free options,need gluten-free options,"[need, gluten, free, options]","{'spice': None, 'courses': [], 'temperature': ...",0.997,"{'available': True, 'tokenCount': 4, 'queryLen...","Would you like to focus on Main Courses, Desse...","{'options': ['Main Courses', 'Desserts', 'Wine...",how about drink ?,freeform-input,CLARIFIED,afe68abc-405b-4a1b-bde9-d9f1f92457cb,Aperol Spritz
2,a0ea29ce-5422-4e87-8341-648cf46b3c82,2025-10-29T02:29:12,Any vegetarian pasta?,any vegetarian pasta?,"[vegetarian, pasta]","{'spice': None, 'courses': ['pasta'], 'tempera...",0.978,"{'available': True, 'tokenCount': 2, 'queryLen...","Would you like to focus on Pasta & Risotto, Pi...","{'options': ['Pasta & Risotto', 'Pizza', 'Soup...",Soups & Salads,freeform-input,CLARIFIED,dc289bdb-6d3e-46a6-a4b8-bb0b90ef9745,Roasted Tomato Soup
3,4d24c2a3-5fbd-43fa-a180-8162ddfe8b20,2025-10-28T17:37:56,dishes today,dishes today,"[dishes, today]","{'spice': None, 'courses': [], 'temperature': ...",0.958,"{'available': True, 'tokenCount': 2, 'queryLen...","Would you like to focus on Main Courses, Pasta...","{'options': ['Main Courses', 'Pasta & Risotto'...",Main Courses,freeform-input,CLARIFIED,9cd431c3-48be-459a-a325-574dad59174c,Seared Scallops
4,e35286d1-7d4c-42ae-8d18-311abfc0ccb3,2025-10-28T17:37:24,Any vegetarian pasta?,any vegetarian pasta?,"[vegetarian, pasta]","{'spice': None, 'courses': ['pasta'], 'tempera...",0.978,"{'available': True, 'tokenCount': 2, 'queryLen...","Would you like to focus on Pasta & Risotto, Pi...","{'options': ['Pasta & Risotto', 'Pizza', 'Soup...",Soups & Salads,freeform-input,CLARIFIED,dc289bdb-6d3e-46a6-a4b8-bb0b90ef9745,Roasted Tomato Soup


In [2]:
import json

# Guard against out-of-order execution: this cell needs the `df` global that
# the extraction cell creates, so stop with a clear message when it is missing.
if 'df' not in globals():
    raise RuntimeError('Run the extraction cell first to populate the dataframe.')

def _to_serializable(value):
    if isinstance(value, (dict, list)):
        return json.dumps(value, ensure_ascii=False)
    return value

# Work on a copy so the in-memory `df` keeps its native dict/list values.
serializable_df = df.copy()

# Only object-typed columns can hold dicts or lists; pick them out first.
object_columns = [
    name for name in serializable_df.columns
    if serializable_df[name].dtype == "object"
]
for name in object_columns:
    series = serializable_df[name]
    holds_nested = series.apply(lambda cell: isinstance(cell, (dict, list))).any()
    if holds_nested:
        # Encode nested values as JSON strings; other cells pass through as-is.
        serializable_df[name] = series.apply(_to_serializable)

# Write the flattened frame into the artifacts directory.
output_path = DATA_DIR / "clarifications_raw.parquet"
serializable_df.to_parquet(output_path, index=False)
print(f"Saved {len(serializable_df)} rows to {output_path}")
# Last expression of the cell: Jupyter echoes the output location.
output_path


Saved 8 rows to e:\Omnichannel\Omnichannel\chat-infrastructure\rag_service\notebooks\menu_query_training\artifacts\clarifications_raw.parquet


WindowsPath('e:/Omnichannel/Omnichannel/chat-infrastructure/rag_service/notebooks/menu_query_training/artifacts/clarifications_raw.parquet')