# 02 · Prepare Training Dataset

Flatten the raw clarification export into model-ready features and labels.


In [None]:
from pathlib import Path
import json

import pandas as pd

def resolve_notebook_dir() -> Path:
    """Locate the ``menu_query_training`` notebook directory.

    Starting at the current working directory and then walking up through
    each of its parents, two relative layouts are probed at every level, in
    this order:

    1. ``notebooks/menu_query_training``
    2. ``chat-infrastructure/rag_service/notebooks/menu_query_training``

    Returns:
        The first probed path that exists. If none exists, the second layout
        joined onto the current working directory is returned without any
        existence check.
    """
    relative_layouts = (
        Path("notebooks") / "menu_query_training",
        Path("chat-infrastructure") / "rag_service" / "notebooks" / "menu_query_training",
    )
    here = Path.cwd()
    for base in (here, *here.parents):
        for layout in relative_layouts:
            probe = base / layout
            if probe.exists():
                return probe
    # Nothing found: hand back the nested layout relative to the working directory.
    return here / relative_layouts[1]

# Resolve the notebook folder once; artifacts live in its "artifacts" subfolder.
NOTEBOOK_DIR = resolve_notebook_dir()
DATA_DIR = NOTEBOOK_DIR / "artifacts"
# Create the artifacts folder (and any missing parents); no error if it already exists.
DATA_DIR.mkdir(parents=True, exist_ok=True)

raw_path = DATA_DIR / "clarifications_raw.parquet"
# Fail fast, pointing at the upstream notebook, when the raw extract is absent.
if not raw_path.exists():
    raise FileNotFoundError(f"Missing raw extract at {raw_path}. Run 01_extract_data.ipynb first.")

# Load the raw extract into the DataFrame that the following cells mutate in place.
df = pd.read_parquet(raw_path)
print(f"Loaded {len(df)} rows from {raw_path}")
print (df)


Loaded 8 rows from e:\Omnichannel\Omnichannel\chat-infrastructure\rag_service\notebooks\menu_query_training\artifacts\clarifications_raw.parquet


Unnamed: 0,clarification_id,query_time,raw_query,normalized_query,tokens,intents,ambiguity_score,query_metadata,question_text,clarification_metadata,user_reply,resolved_intent,resolution_status,resolved_item_id,resolved_item_name
0,ec9d7853-8fb4-42e3-b9e9-1605e55541b9,2025-10-29T06:33:42,Need gluten-free options how about drink ?,need gluten-free options how about drink ?,"[""need"", ""gluten"", ""free"", ""options"", ""how"", ""...","{""spice"": null, ""courses"": [""beverage""], ""temp...",0.997,"{""available"": true, ""tokenCount"": 7, ""queryLen...",Would you like to focus on Wine & Cocktails or...,"{""options"": [""Wine & Cocktails"", ""Beverages""],...",Beverages,freeform-input,CLARIFIED,7f10fc4b-1749-4708-85de-ee4edba3935d,Fresh Orange Juice
1,2cc72e88-1c50-48ff-b320-ee0b248c8a83,2025-10-29T06:33:26,Need gluten-free options,need gluten-free options,"[""need"", ""gluten"", ""free"", ""options""]","{""spice"": null, ""courses"": [], ""temperature"": ...",0.997,"{""available"": true, ""tokenCount"": 4, ""queryLen...","Would you like to focus on Main Courses, Desse...","{""options"": [""Main Courses"", ""Desserts"", ""Wine...",how about drink ?,freeform-input,CLARIFIED,afe68abc-405b-4a1b-bde9-d9f1f92457cb,Aperol Spritz
2,a0ea29ce-5422-4e87-8341-648cf46b3c82,2025-10-29T02:29:12,Any vegetarian pasta?,any vegetarian pasta?,"[""vegetarian"", ""pasta""]","{""spice"": null, ""courses"": [""pasta""], ""tempera...",0.978,"{""available"": true, ""tokenCount"": 2, ""queryLen...","Would you like to focus on Pasta & Risotto, Pi...","{""options"": [""Pasta & Risotto"", ""Pizza"", ""Soup...",Soups & Salads,freeform-input,CLARIFIED,dc289bdb-6d3e-46a6-a4b8-bb0b90ef9745,Roasted Tomato Soup
3,4d24c2a3-5fbd-43fa-a180-8162ddfe8b20,2025-10-28T17:37:56,dishes today,dishes today,"[""dishes"", ""today""]","{""spice"": null, ""courses"": [], ""temperature"": ...",0.958,"{""available"": true, ""tokenCount"": 2, ""queryLen...","Would you like to focus on Main Courses, Pasta...","{""options"": [""Main Courses"", ""Pasta & Risotto""...",Main Courses,freeform-input,CLARIFIED,9cd431c3-48be-459a-a325-574dad59174c,Seared Scallops
4,e35286d1-7d4c-42ae-8d18-311abfc0ccb3,2025-10-28T17:37:24,Any vegetarian pasta?,any vegetarian pasta?,"[""vegetarian"", ""pasta""]","{""spice"": null, ""courses"": [""pasta""], ""tempera...",0.978,"{""available"": true, ""tokenCount"": 2, ""queryLen...","Would you like to focus on Pasta & Risotto, Pi...","{""options"": [""Pasta & Risotto"", ""Pizza"", ""Soup...",Soups & Salads,freeform-input,CLARIFIED,dc289bdb-6d3e-46a6-a4b8-bb0b90ef9745,Roasted Tomato Soup


In [2]:
def ensure_list(value):
    """Coerce a raw cell value into a plain Python list.

    Conversion rules, applied in order:

    * ``None`` or a float NaN (missing cell)  -> ``[]``
    * ``list`` / ``tuple``                     -> shallow copy as a list
    * object with ``tolist()`` that yields a list/tuple -> that sequence as a list
    * ``str``: parsed as JSON; a JSON list is returned as-is, a JSON object
      contributes its values, any other JSON value is wrapped in a
      one-element list; text that is not valid JSON is wrapped unchanged
    * ``dict``                                 -> list of its values
    * anything else                            -> ``[value]``

    Args:
        value: The raw value to normalise.

    Returns:
        A list derived from ``value``; never ``None``.
    """
    import math  # local import so the cell does not depend on extra top-level imports

    if value is None:
        return []
    # NaN is the float stand-in for a missing cell. Without this guard it
    # would be wrapped as ``[nan]`` and counted as one element, whereas
    # ``ensure_dict`` already maps the same input to an empty container.
    if isinstance(value, float) and math.isnan(value):
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    if hasattr(value, "tolist"):
        try:
            converted = value.tolist()
        except Exception:
            # Deliberate best effort: an object with a broken ``tolist`` is
            # handled by the remaining strategies instead of raising.
            converted = None
        # Only accept a real sequence. Scalar-like objects return a scalar
        # from ``tolist()``; calling ``list()`` on a returned string would
        # split it into characters, so such values fall through instead.
        if isinstance(converted, (list, tuple)):
            return list(converted)
    if isinstance(value, str):
        try:
            parsed = json.loads(value)
        except json.JSONDecodeError:
            # Not JSON: keep the raw text as a single element.
            return [value]
        if isinstance(parsed, list):
            return parsed
        if isinstance(parsed, dict):
            return list(parsed.values())
        return [parsed]
    if isinstance(value, dict):
        return list(value.values())
    return [value]

def ensure_dict(value):
    """Coerce a raw cell value into a dictionary, defaulting to ``{}``.

    Strategies, tried in order:

    * ``None``                     -> ``{}``
    * ``dict``                     -> returned unchanged (same object)
    * object exposing ``items``    -> ``dict(value)`` when that succeeds
    * ``str``                      -> parsed as JSON; only a JSON object is
      kept, anything else (including invalid JSON) yields ``{}``
    * ``list`` / ``tuple``         -> elements keyed by their index as a string
    * object exposing ``tolist``   -> the result if it is a dict, otherwise
      its elements keyed by index as a string

    Args:
        value: The raw value to normalise.

    Returns:
        A dictionary derived from ``value``; ``{}`` when nothing applies.
    """

    def index_keyed(items):
        # Key each element by its position rendered as a string.
        return {str(position): element for position, element in enumerate(items)}

    if value is None:
        return {}
    if isinstance(value, dict):
        return value

    if hasattr(value, "items"):
        try:
            return dict(value)
        except Exception:
            pass  # not actually mapping-like; try the remaining strategies

    if isinstance(value, str):
        try:
            decoded = json.loads(value)
        except json.JSONDecodeError:
            return {}
        return decoded if isinstance(decoded, dict) else {}

    if isinstance(value, (list, tuple)):
        return index_keyed(value)

    if hasattr(value, "tolist"):
        try:
            unpacked = value.tolist()
            return unpacked if isinstance(unpacked, dict) else index_keyed(unpacked)
        except Exception:
            pass  # tolist failed or produced something non-iterable

    return {}

# Normalise the raw structured columns into native Python containers so the
# feature cells below can call len() / .get() on every row.
df["tokens_list"] = df["tokens"].apply(ensure_list)
df["intents_dict"] = df["intents"].apply(ensure_dict)
df["clarification_dict"] = df["clarification_metadata"].apply(ensure_dict)

# Preview the parsed columns (last expression in the cell, so it is displayed).
df[["tokens_list", "intents_dict", "clarification_dict"]].head()


Unnamed: 0,tokens_list,intents_dict,clarification_dict
0,"[need, gluten, free, options, how, about, drink]","{'spice': None, 'courses': ['beverage'], 'temp...","{'options': ['Wine & Cocktails', 'Beverages'],..."
1,"[need, gluten, free, options]","{'spice': None, 'courses': [], 'temperature': ...","{'options': ['Main Courses', 'Desserts', 'Wine..."
2,"[vegetarian, pasta]","{'spice': None, 'courses': ['pasta'], 'tempera...","{'options': ['Pasta & Risotto', 'Pizza', 'Soup..."
3,"[dishes, today]","{'spice': None, 'courses': [], 'temperature': ...","{'options': ['Main Courses', 'Pasta & Risotto'..."
4,"[vegetarian, pasta]","{'spice': None, 'courses': ['pasta'], 'tempera...","{'options': ['Pasta & Risotto', 'Pizza', 'Soup..."


In [3]:
def _reply_matches_option(reply, options):
    """Return 1 when ``reply`` equals one of ``options``, else 0.

    The comparison ignores surrounding whitespace and letter case. A missing
    reply (``None``, NaN, ``pd.NA``) never matches: stringifying it would turn
    it into text such as ``"none"`` that could spuriously equal an option, and
    ``answer_length`` already treats a missing reply as empty.

    Args:
        reply: The user's reply for one row (normally a string).
        options: Iterable of option values offered for that row.

    Returns:
        ``1`` if the normalised reply is among the normalised options, else ``0``.
    """
    if reply is None or (pd.api.types.is_scalar(reply) and pd.isna(reply)):
        return 0
    normalized_reply = str(reply).strip().lower()
    normalized_options = {str(opt).strip().lower() for opt in options}
    return int(normalized_reply in normalized_options)


# Per-row list of the options stored under the "options" key of the
# clarification metadata (empty list when the key is absent).
option_sets = df["clarification_dict"].apply(lambda x: ensure_list(x.get("options")))

# Number of tokens in the parsed token list.
df["token_count"] = df["tokens_list"].apply(len)
# 1 when the reply matches one of the row's options, else 0.
df["has_answer_in_options"] = [
    _reply_matches_option(reply, opts)
    for reply, opts in zip(df["user_reply"], option_sets)
]
# Character length of the reply; a missing reply counts as length 0.
df["answer_length"] = df["user_reply"].fillna("").astype(str).str.len()

def coerce_intent_value(val):
    """Collapse an intent value into a 0/1 presence flag.

    * ``None``                          -> 0
    * ``bool`` / ``int`` / ``float``    -> 1 when truthy, else 0
    * ``list`` / ``tuple`` / ``set`` / ``dict`` -> 1 when non-empty, else 0
    * anything else                     -> 1 when its string form is
      non-empty after stripping whitespace, else 0

    Args:
        val: The raw intent value.

    Returns:
        ``0`` or ``1``.
    """
    if val is None:
        return 0
    if isinstance(val, (bool, int, float)):
        is_set = bool(val)
    elif isinstance(val, (list, tuple, set, dict)):
        is_set = len(val) > 0
    else:
        is_set = bool(str(val).strip())
    return int(is_set)

# Union of every key seen in any row's intents dict, sorted so the generated
# column order is deterministic.
intent_keys = sorted({key for intents in df["intents_dict"] for key in intents.keys()})
for key in intent_keys:
    # ``k=key`` binds the current key as a default argument, so the lambda
    # does not depend on the loop variable's later value. Keys missing from a
    # row's dict yield None from .get(), which coerce_intent_value maps to 0.
    df[f"intent_{key}"] = df["intents_dict"].apply(lambda d, k=key: coerce_intent_value(d.get(k)))

# Columns persisted as model features: the three scalar features plus one
# 0/1 flag per observed intent key.
feature_cols = [
    "token_count",
    "has_answer_in_options",
    "answer_length",
    *[f"intent_{key}" for key in intent_keys]
]
# Preview the feature matrix (last expression in the cell, so it is displayed).
df[feature_cols].head()


Unnamed: 0,token_count,has_answer_in_options,answer_length,intent_alcoholPreference,intent_avoidAllergens,intent_courses,intent_ingredientFocus,intent_requireDietary,intent_spice,intent_temperature
0,7,1,9,0,1,1,0,0,0,0
1,4,0,17,0,1,0,0,0,0,0
2,2,1,14,0,0,1,1,1,0,0
3,2,1,12,0,0,0,0,0,0,0
4,2,1,14,0,0,1,1,1,0,0


In [4]:
# Binary label: 1 when resolution_status equals "CLARIFIED", otherwise 0.
df["label_resolved"] = df["resolution_status"].eq("CLARIFIED").astype(int)
# Show the class balance; dropna=False keeps any missing labels in the tally.
df["label_resolved"].value_counts(dropna=False)


label_resolved
1    8
Name: count, dtype: int64

In [5]:
# Destination for the flattened dataset, alongside the raw extract in DATA_DIR.
dataset_path = DATA_DIR / "clarifications_model_ready.parquet"
# Identifier/context columns and the label first, followed by every feature column.
save_cols = [
    "clarification_id",
    "query_time",
    "raw_query",
    "user_reply",
    "resolved_item_id",
    "resolved_item_name",
    "label_resolved",
    *feature_cols
]
# index=False: the DataFrame index is not written to the parquet file.
df[save_cols].to_parquet(dataset_path, index=False)
print(f"Saved dataset to {dataset_path}")
# Last expression in the cell, so the path is displayed as the cell result.
dataset_path


Saved dataset to e:\Omnichannel\Omnichannel\chat-infrastructure\rag_service\notebooks\menu_query_training\artifacts\clarifications_model_ready.parquet


WindowsPath('e:/Omnichannel/Omnichannel/chat-infrastructure/rag_service/notebooks/menu_query_training/artifacts/clarifications_model_ready.parquet')