# 01 · Extract Clarification Data

Fetch menu-query clarification logs through the analytics API and cache the raw payload for downstream processing.

## Prerequisites
- FastAPI service is running with the analytics router enabled and reachable at `ANALYTICS_API_URL` (defaults to `http://localhost:8081`).
- Admin key available in `RAG_ADMIN_API_KEY` (matches backend `.env`).
- Optional filters via `ANALYTICS_START_AT`, `ANALYTICS_END_AT`, and `ANALYTICS_FETCH_LIMIT` (sent as `limit`, default 500).


In [None]:
pip install python-dotenv


Note: you may need to restart the kernel to use updated packages.


In [None]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


Saved 6 rows to notebooks\menu_query_training\artifacts\clarifications_raw.parquet


WindowsPath('notebooks/menu_query_training/artifacts/clarifications_raw.parquet')

In [2]:
import os
from pathlib import Path

try:
    from dotenv import load_dotenv
except ImportError:
    load_dotenv = None

import pandas as pd
import requests

# Artifacts directory, relative to the current working directory; created if missing.
DATA_DIR = Path("notebooks/menu_query_training/artifacts")
DATA_DIR.mkdir(parents=True, exist_ok=True)

# load_dotenv is None when python-dotenv is not installed; .env loading is then skipped.
if load_dotenv:
    env_candidates = [
        Path.cwd() / ".env",
        Path.cwd().parent / ".env",
        Path.cwd().parent.parent / ".env",
        Path.cwd() / "chat-infrastructure" / "rag_service" / ".env",
        Path.cwd() / "be" / ".env",
    ]
    for env_path in env_candidates:
        if env_path.exists():
            # override=False keeps variables that are already set, so the real
            # environment and earlier candidates take precedence.
            load_dotenv(env_path, override=False)

# `or` (instead of a getenv default) also covers variables that are set but
# blank, e.g. `ANALYTICS_API_URL=` in a .env file.
BASE_URL = (os.getenv("ANALYTICS_API_URL") or "http://localhost:8081").rstrip('/')
ADMIN_KEY = os.getenv("RAG_ADMIN_API_KEY")
if not ADMIN_KEY:
    raise ValueError("RAG_ADMIN_API_KEY is required to call the analytics endpoint.")

# Query parameters: the limit is always sent; a blank ANALYTICS_FETCH_LIMIT
# falls back to 500 instead of failing in int("").
params = {
    "limit": int(os.getenv("ANALYTICS_FETCH_LIMIT") or "500"),
}
# Time-window filters are only sent when configured.
start_at = os.getenv("ANALYTICS_START_AT")
end_at = os.getenv("ANALYTICS_END_AT")
if start_at:
    params["start_at"] = start_at
if end_at:
    params["end_at"] = end_at

response = requests.get(
    f"{BASE_URL}/analytics/menu-query/clarifications",
    headers={"x-rag-admin-key": ADMIN_KEY},
    params=params,
    timeout=30
)
# Surface HTTP errors before attempting to parse the body.
response.raise_for_status()
payload = response.json()
# A missing or null "items" field is treated as an empty result so the
# explicit "No clarifications returned" error below is raised.
items = payload.get("items") or []
print(f"Fetched {len(items)} clarification rows")
if not items:
    raise ValueError("No clarifications returned; adjust filters or confirm data exists.")

# One row per returned item; preview the first rows as the cell output.
df = pd.DataFrame(items)
df.head()


Fetched 6 clarification rows


Unnamed: 0,clarification_id,query_time,raw_query,normalized_query,tokens,intents,ambiguity_score,query_metadata,question_text,clarification_metadata,user_reply,resolved_intent,resolution_status,resolved_item_id,resolved_item_name
0,a0ea29ce-5422-4e87-8341-648cf46b3c82,2025-10-29T02:29:12,Any vegetarian pasta?,any vegetarian pasta?,"[vegetarian, pasta]","{'spice': None, 'courses': ['pasta'], 'tempera...",0.978,"{'available': True, 'tokenCount': 2, 'queryLen...","Would you like to focus on Pasta & Risotto, Pi...","{'options': ['Pasta & Risotto', 'Pizza', 'Soup...",Soups & Salads,freeform-input,CLARIFIED,dc289bdb-6d3e-46a6-a4b8-bb0b90ef9745,Roasted Tomato Soup
1,4d24c2a3-5fbd-43fa-a180-8162ddfe8b20,2025-10-28T17:37:56,dishes today,dishes today,"[dishes, today]","{'spice': None, 'courses': [], 'temperature': ...",0.958,"{'available': True, 'tokenCount': 2, 'queryLen...","Would you like to focus on Main Courses, Pasta...","{'options': ['Main Courses', 'Pasta & Risotto'...",Main Courses,freeform-input,CLARIFIED,9cd431c3-48be-459a-a325-574dad59174c,Seared Scallops
2,e35286d1-7d4c-42ae-8d18-311abfc0ccb3,2025-10-28T17:37:24,Any vegetarian pasta?,any vegetarian pasta?,"[vegetarian, pasta]","{'spice': None, 'courses': ['pasta'], 'tempera...",0.978,"{'available': True, 'tokenCount': 2, 'queryLen...","Would you like to focus on Pasta & Risotto, Pi...","{'options': ['Pasta & Risotto', 'Pizza', 'Soup...",Soups & Salads,freeform-input,CLARIFIED,dc289bdb-6d3e-46a6-a4b8-bb0b90ef9745,Roasted Tomato Soup
3,98e40dc4-bae8-48fb-8709-af3bf0c49c7e,2025-10-28T17:23:25,I want some cold drinks,i want some cold drinks,"[cold, drinks]","{'spice': None, 'courses': ['beverage'], 'temp...",0.8,"{'available': True, 'tokenCount': 2, 'queryLen...",Would you like to focus on Beverages or Wine &...,"{'options': ['Beverages', 'Wine & Cocktails'],...",Wine & Cocktails,freeform-input,CLARIFIED,afe68abc-405b-4a1b-bde9-d9f1f92457cb,Aperol Spritz
4,b83cfad9-be67-4dc0-9f63-0d0a08b15b9d,2025-10-28T17:11:37,I want some cold drinks,i want some cold drinks,"[cold, drinks]","{'spice': None, 'courses': ['beverage'], 'temp...",0.8,"{'available': True, 'tokenCount': 2, 'queryLen...",Would you like to focus on Beverages or Wine &...,"{'options': ['Beverages', 'Wine & Cocktails'],...",with alcohol?,freeform-input,CLARIFIED,afe68abc-405b-4a1b-bde9-d9f1f92457cb,Aperol Spritz


In [3]:
import os
from pathlib import Path

import pandas as pd
import requests

# NOTE(review): this cell repeats the fetch from the preceding cell, minus the
# .env loading; it relies on the variables already being in the environment.

# Artifacts directory, relative to the current working directory; created if missing.
DATA_DIR = Path("notebooks/menu_query_training/artifacts")
DATA_DIR.mkdir(parents=True, exist_ok=True)

# `or` (instead of a getenv default) also covers variables that are set but blank.
BASE_URL = (os.getenv("ANALYTICS_API_URL") or "http://localhost:8081").rstrip('/')
ADMIN_KEY = os.getenv("RAG_ADMIN_API_KEY")
if not ADMIN_KEY:
    raise ValueError("RAG_ADMIN_API_KEY is required to call the analytics endpoint.")

# A blank ANALYTICS_FETCH_LIMIT falls back to 500 instead of failing in int("").
params = {
    "limit": int(os.getenv("ANALYTICS_FETCH_LIMIT") or "500"),
}
# Time-window filters are only sent when configured.
start_at = os.getenv("ANALYTICS_START_AT")
end_at = os.getenv("ANALYTICS_END_AT")
if start_at:
    params["start_at"] = start_at
if end_at:
    params["end_at"] = end_at

response = requests.get(
    f"{BASE_URL}/analytics/menu-query/clarifications",
    headers={"x-rag-admin-key": ADMIN_KEY},
    params=params,
    timeout=30
)
# Surface HTTP errors before attempting to parse the body.
response.raise_for_status()
payload = response.json()
# A missing or null "items" field is treated as an empty result so the
# explicit "No clarifications returned" error below is raised.
items = payload.get("items") or []
print(f"Fetched {len(items)} clarification rows")
if not items:
    raise ValueError("No clarifications returned; adjust filters or confirm data exists.")

# One row per returned item; preview the first rows as the cell output.
df = pd.DataFrame(items)
df.head()


Fetched 6 clarification rows


Unnamed: 0,clarification_id,query_time,raw_query,normalized_query,tokens,intents,ambiguity_score,query_metadata,question_text,clarification_metadata,user_reply,resolved_intent,resolution_status,resolved_item_id,resolved_item_name
0,a0ea29ce-5422-4e87-8341-648cf46b3c82,2025-10-29T02:29:12,Any vegetarian pasta?,any vegetarian pasta?,"[vegetarian, pasta]","{'spice': None, 'courses': ['pasta'], 'tempera...",0.978,"{'available': True, 'tokenCount': 2, 'queryLen...","Would you like to focus on Pasta & Risotto, Pi...","{'options': ['Pasta & Risotto', 'Pizza', 'Soup...",Soups & Salads,freeform-input,CLARIFIED,dc289bdb-6d3e-46a6-a4b8-bb0b90ef9745,Roasted Tomato Soup
1,4d24c2a3-5fbd-43fa-a180-8162ddfe8b20,2025-10-28T17:37:56,dishes today,dishes today,"[dishes, today]","{'spice': None, 'courses': [], 'temperature': ...",0.958,"{'available': True, 'tokenCount': 2, 'queryLen...","Would you like to focus on Main Courses, Pasta...","{'options': ['Main Courses', 'Pasta & Risotto'...",Main Courses,freeform-input,CLARIFIED,9cd431c3-48be-459a-a325-574dad59174c,Seared Scallops
2,e35286d1-7d4c-42ae-8d18-311abfc0ccb3,2025-10-28T17:37:24,Any vegetarian pasta?,any vegetarian pasta?,"[vegetarian, pasta]","{'spice': None, 'courses': ['pasta'], 'tempera...",0.978,"{'available': True, 'tokenCount': 2, 'queryLen...","Would you like to focus on Pasta & Risotto, Pi...","{'options': ['Pasta & Risotto', 'Pizza', 'Soup...",Soups & Salads,freeform-input,CLARIFIED,dc289bdb-6d3e-46a6-a4b8-bb0b90ef9745,Roasted Tomato Soup
3,98e40dc4-bae8-48fb-8709-af3bf0c49c7e,2025-10-28T17:23:25,I want some cold drinks,i want some cold drinks,"[cold, drinks]","{'spice': None, 'courses': ['beverage'], 'temp...",0.8,"{'available': True, 'tokenCount': 2, 'queryLen...",Would you like to focus on Beverages or Wine &...,"{'options': ['Beverages', 'Wine & Cocktails'],...",Wine & Cocktails,freeform-input,CLARIFIED,afe68abc-405b-4a1b-bde9-d9f1f92457cb,Aperol Spritz
4,b83cfad9-be67-4dc0-9f63-0d0a08b15b9d,2025-10-28T17:11:37,I want some cold drinks,i want some cold drinks,"[cold, drinks]","{'spice': None, 'courses': ['beverage'], 'temp...",0.8,"{'available': True, 'tokenCount': 2, 'queryLen...",Would you like to focus on Beverages or Wine &...,"{'options': ['Beverages', 'Wine & Cocktails'],...",with alcohol?,freeform-input,CLARIFIED,afe68abc-405b-4a1b-bde9-d9f1f92457cb,Aperol Spritz


In [5]:
import json

# Ensure pandas and Path are available without re-importing if already present
if 'pd' not in globals():
    import pandas as pd

from pathlib import Path

# Reuse DATA_DIR when an earlier cell already defined it; otherwise define and create it here.
if 'DATA_DIR' not in globals():
    DATA_DIR = Path("notebooks/menu_query_training/artifacts")
    DATA_DIR.mkdir(parents=True, exist_ok=True)

# With no in-memory df, fall back to the parquet file cached by a previous run.
if 'df' not in globals():
    parquet_path = DATA_DIR / 'clarifications_raw.parquet'
    # Guard clause: nothing in memory and nothing on disk means there is no data to work with.
    if not parquet_path.exists():
        raise RuntimeError(
            "No dataframe 'df' found in the notebook and clarifications_raw.parquet does not exist. "
            "Run the extraction cell first to populate the dataframe or ensure the parquet file is present."
        )
    try:
        df = pd.read_parquet(parquet_path)
        print(f"Loaded dataframe from {parquet_path}")
    except Exception as read_error:
        # Any read failure is re-raised as RuntimeError naming the offending file.
        raise RuntimeError(f"Failed to read {parquet_path}: {read_error}")

def _to_serializable(value):
    if isinstance(value, (dict, list)):
        return json.dumps(value, ensure_ascii=False)
    return value

# Work on a copy so the in-memory df keeps its original nested objects.
serializable_df = df.copy()

# Only object-dtype columns are candidates for holding dict/list cells.
object_columns = [
    name for name in serializable_df.columns
    if serializable_df[name].dtype == 'object'
]
for name in object_columns:
    has_nested_cells = serializable_df[name].apply(
        lambda cell: isinstance(cell, (dict, list))
    ).any()
    if has_nested_cells:
        # JSON-encode the nested cells; other cells pass through unchanged.
        serializable_df[name] = serializable_df[name].apply(_to_serializable)

output_path = DATA_DIR / 'clarifications_raw.parquet'
serializable_df.to_parquet(output_path, index=False)
print(f'Saved {len(serializable_df)} rows to {output_path}')
output_path


Saved 6 rows to notebooks\menu_query_training\artifacts\clarifications_raw.parquet


WindowsPath('notebooks/menu_query_training/artifacts/clarifications_raw.parquet')

In [None]:
pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-22.0.0-cp313-cp313-win_amd64.whl.metadata (3.3 kB)
Downloading pyarrow-22.0.0-cp313-cp313-win_amd64.whl (28.0 MB)
   ---------------------------------------- 0.0/28.0 MB ? eta -:--:--
   - -------------------------------------- 0.8/28.0 MB 10.7 MB/s eta 0:00:03
   --------- ------------------------------ 6.8/28.0 MB 38.0 MB/s eta 0:00:01
   ------------------------------- -------- 21.8/28.0 MB 46.2 MB/s eta 0:00:01
   ---------------------------------------- 28.0/28.0 MB 45.2 MB/s  0:00:00
Installing collected packages: pyarrow
Successfully installed pyarrow-22.0.0
Note: you may need to restart the kernel to use updated packages.


In [None]:
pip install fastparquet

Note: you may need to restart the kernel to use updated packages.


In [6]:
# Write the same JSON-encoded representation that the previous cell caches:
# dict/list cells are stored as JSON strings rather than raw nested objects,
# so running this cell no longer replaces the normalised file at the same
# path with a differently-shaped one.
cache_df = df.copy()
for column in cache_df.columns:
    if cache_df[column].dtype == 'object':
        # _to_serializable leaves non-dict/list cells unchanged.
        cache_df[column] = cache_df[column].apply(_to_serializable)

output_path = DATA_DIR / "clarifications_raw.parquet"
cache_df.to_parquet(output_path, index=False)
print(f"Saved {len(cache_df)} rows to {output_path}")
output_path


Saved 6 rows to notebooks\menu_query_training\artifacts\clarifications_raw.parquet


WindowsPath('notebooks/menu_query_training/artifacts/clarifications_raw.parquet')