# 02 · Prepare Training Dataset

Transform the raw clarification export into a clean dataframe with flattened features and labels.

In [1]:
from pathlib import Path
import json

import pandas as pd

# Directory holding the artifacts exchanged between the notebooks of this series.
DATA_DIR = Path("notebooks/menu_query_training/artifacts")
raw_path = DATA_DIR.joinpath("clarifications_raw.parquet")

# Stop early, pointing at the upstream notebook, when the extract is absent.
raw_extract_present = raw_path.exists()
if not raw_extract_present:
    raise FileNotFoundError(f"Missing raw extract at {raw_path}. Run 01_extract_data.ipynb first.")

df = pd.read_parquet(raw_path)
df.head()

Unnamed: 0,clarification_id,query_time,raw_query,normalized_query,tokens,intents,ambiguity_score,query_metadata,question_text,clarification_metadata,user_reply,resolved_intent,resolution_status,resolved_item_id,resolved_item_name
0,a0ea29ce-5422-4e87-8341-648cf46b3c82,2025-10-29T02:29:12,Any vegetarian pasta?,any vegetarian pasta?,"[vegetarian, pasta]","{'alcoholPreference': None, 'avoidAllergens': ...",0.978,"{'available': True, 'clarificationOptions': ['...","Would you like to focus on Pasta & Risotto, Pi...","{'answerLength': 14, 'options': ['Pasta & Riso...",Soups & Salads,freeform-input,CLARIFIED,dc289bdb-6d3e-46a6-a4b8-bb0b90ef9745,Roasted Tomato Soup
1,4d24c2a3-5fbd-43fa-a180-8162ddfe8b20,2025-10-28T17:37:56,dishes today,dishes today,"[dishes, today]","{'alcoholPreference': None, 'avoidAllergens': ...",0.958,"{'available': True, 'clarificationOptions': ['...","Would you like to focus on Main Courses, Pasta...","{'answerLength': 12, 'options': ['Main Courses...",Main Courses,freeform-input,CLARIFIED,9cd431c3-48be-459a-a325-574dad59174c,Seared Scallops
2,e35286d1-7d4c-42ae-8d18-311abfc0ccb3,2025-10-28T17:37:24,Any vegetarian pasta?,any vegetarian pasta?,"[vegetarian, pasta]","{'alcoholPreference': None, 'avoidAllergens': ...",0.978,"{'available': True, 'clarificationOptions': ['...","Would you like to focus on Pasta & Risotto, Pi...","{'answerLength': 14, 'options': ['Pasta & Riso...",Soups & Salads,freeform-input,CLARIFIED,dc289bdb-6d3e-46a6-a4b8-bb0b90ef9745,Roasted Tomato Soup
3,98e40dc4-bae8-48fb-8709-af3bf0c49c7e,2025-10-28T17:23:25,I want some cold drinks,i want some cold drinks,"[cold, drinks]","{'alcoholPreference': None, 'avoidAllergens': ...",0.8,"{'available': True, 'clarificationOptions': ['...",Would you like to focus on Beverages or Wine &...,"{'answerLength': 16, 'options': ['Beverages', ...",Wine & Cocktails,freeform-input,CLARIFIED,afe68abc-405b-4a1b-bde9-d9f1f92457cb,Aperol Spritz
4,b83cfad9-be67-4dc0-9f63-0d0a08b15b9d,2025-10-28T17:11:37,I want some cold drinks,i want some cold drinks,"[cold, drinks]","{'alcoholPreference': None, 'avoidAllergens': ...",0.8,"{'available': True, 'clarificationOptions': ['...",Would you like to focus on Beverages or Wine &...,"{'answerLength': 13, 'options': ['Beverages', ...",with alcohol?,freeform-input,CLARIFIED,afe68abc-405b-4a1b-bde9-d9f1f92457cb,Aperol Spritz


## Expand JSON columns

Flatten `tokens`, `intents`, and clarification metadata so the ML model receives explicit features.

In [3]:
def ensure_list(value):
    """Coerce a raw cell value into a plain Python list.

    Accepts real lists/tuples, array-likes exposing ``tolist()`` (e.g. NumPy
    arrays), JSON-encoded strings, dicts, scalars and missing values.

    Args:
        value: Raw cell content of any type.

    Returns:
        list: ``[]`` for missing input (``None``, float NaN, or a string that
        decodes to JSON ``null``/``NaN``); the elements of a list, tuple or
        array-like; the values of a dict (native or JSON-decoded); otherwise
        a one-element list wrapping the scalar. A string that is not valid
        JSON is returned as a one-element list containing that string.
    """
    def _is_missing(candidate):
        # NaN is the only float that is not equal to itself.
        return candidate is None or (isinstance(candidate, float) and candidate != candidate)

    if isinstance(value, (list, tuple)):
        return list(value)

    if hasattr(value, 'tolist'):
        try:
            converted = value.tolist()
        except Exception:
            # Best effort: treat the object as an opaque scalar below.
            converted = value
        if isinstance(converted, (list, tuple)):
            return list(converted)
        # tolist() on a scalar-like yields a scalar; calling list() on it would
        # split strings into characters, so route it through the checks below.
        value = converted

    if _is_missing(value):
        return []

    if isinstance(value, str):
        try:
            parsed = json.loads(value)
        except json.JSONDecodeError:
            # Not JSON: keep the text itself as the single element.
            return [value]
        if _is_missing(parsed):
            return []
        if isinstance(parsed, list):
            return parsed
        if isinstance(parsed, dict):
            return list(parsed.values())
        return [parsed]

    if isinstance(value, dict):
        return list(value.values())

    return [value]

def ensure_dict(value):
    """Coerce a raw cell value into a dict.

    Args:
        value: Raw cell content of any type.

    Returns:
        dict: The input itself when it already is a dict; ``dict(value)`` for
        objects exposing ``items``; the decoded object for a JSON string that
        holds an object; a mapping keyed by stringified position for
        lists, tuples and ``tolist()``-capable objects; ``{}`` for ``None``
        and for anything that cannot be converted.
    """
    def _by_position(elements):
        # Sequences have no keys of their own, so key them by index.
        return {str(pos): element for pos, element in enumerate(elements)}

    if value is None:
        return {}
    if isinstance(value, dict):
        return value

    if hasattr(value, 'items'):
        try:
            return dict(value)
        except Exception:
            # Not actually mapping-like; try the remaining strategies.
            pass

    if isinstance(value, str):
        try:
            decoded = json.loads(value)
        except json.JSONDecodeError:
            return {}
        return decoded if isinstance(decoded, dict) else {}

    if isinstance(value, (list, tuple)):
        return _by_position(value)

    if hasattr(value, 'tolist'):
        try:
            unpacked = value.tolist()
            return unpacked if isinstance(unpacked, dict) else _by_position(unpacked)
        except Exception:
            # tolist() failed or produced something non-iterable.
            pass

    return {}

# (source column, parsed column, parser) for every nested column to expand.
parsed_columns = [
    ("tokens", "tokens_list", ensure_list),
    ("intents", "intents_dict", ensure_dict),
    ("clarification_metadata", "clarification_dict", ensure_dict),
]
for source_col, parsed_col, parser in parsed_columns:
    df[parsed_col] = df[source_col].apply(parser)

df[[parsed_col for _, parsed_col, _ in parsed_columns]].head()


Unnamed: 0,tokens_list,intents_dict,clarification_dict
0,"[vegetarian, pasta]","{'alcoholPreference': None, 'avoidAllergens': ...","{'answerLength': 14, 'options': ['Pasta & Riso..."
1,"[dishes, today]","{'alcoholPreference': None, 'avoidAllergens': ...","{'answerLength': 12, 'options': ['Main Courses..."
2,"[vegetarian, pasta]","{'alcoholPreference': None, 'avoidAllergens': ...","{'answerLength': 14, 'options': ['Pasta & Riso..."
3,"[cold, drinks]","{'alcoholPreference': None, 'avoidAllergens': ...","{'answerLength': 16, 'options': ['Beverages', ..."
4,"[cold, drinks]","{'alcoholPreference': None, 'avoidAllergens': ...","{'answerLength': 13, 'options': ['Beverages', ..."


## Build feature columns

Example features: token counts, intent flags, whether the answer matches one of the suggested options, etc. Modify the feature engineering to match your model's needs.

In [4]:
# Options offered to the user for each clarification, as plain lists.
option_sets = df["clarification_dict"].apply(
    lambda meta: ensure_list(meta.get("options"))
)

df["token_count"] = df["tokens_list"].apply(len)

# Exact-match membership of the reply in that row's offered options.
df["has_answer_in_options"] = [
    int(reply in offered)
    for reply, offered in zip(df["user_reply"], option_sets)
]

# Missing replies count as empty text, i.e. length 0.
reply_text = df["user_reply"].fillna("")
df["answer_length"] = reply_text.str.len()

def coerce_intent_value(val):
    """Collapse an intent value into a 0/1 presence flag.

    Args:
        val: Value stored under one intent key. May be a scalar, a string, a
            sized container, an array-like exposing ``tolist()`` (e.g. a
            NumPy array), or missing.

    Returns:
        int: ``0`` for ``None``, NaN, falsy numbers, blank strings and empty
        containers; ``1`` otherwise.
    """
    # Unwrap array-likes first. Left as-is they would reach the str() fallback,
    # where an empty array renders as the non-blank text "[]" and is wrongly
    # flagged as present.
    if hasattr(val, 'tolist') and not isinstance(val, str):
        try:
            val = val.tolist()
        except Exception:
            # Best effort: keep the original object and classify it below.
            pass

    if val is None:
        return 0
    if isinstance(val, str):
        return int(bool(val.strip()))
    if isinstance(val, (bool, int, float)):
        # NaN is truthy in Python but denotes a missing value here.
        if val != val:
            return 0
        return int(bool(val))

    # Any sized container (list, tuple, set, dict, ...) is present iff non-empty.
    try:
        return int(len(val) > 0)
    except TypeError:
        pass

    # Unsized objects: fall back to whether their text form is non-blank.
    return int(bool(str(val).strip()))

# Collect every intent key seen in any row; sorting keeps column order stable.
seen_intent_keys = set()
for intents in df["intents_dict"]:
    seen_intent_keys.update(intents.keys())
intent_keys = sorted(seen_intent_keys)


def _intent_flag(name):
    """Build the per-row extractor that flags whether intent `name` is set."""
    def extract(row_intents):
        return coerce_intent_value(row_intents.get(name))
    return extract


for key in intent_keys:
    df[f"intent_{key}"] = df["intents_dict"].apply(_intent_flag(key))

intent_feature_cols = [f"intent_{key}" for key in intent_keys]
feature_cols = ["token_count", "has_answer_in_options", "answer_length"] + intent_feature_cols
df[feature_cols].head()


Unnamed: 0,token_count,has_answer_in_options,answer_length,intent_alcoholPreference,intent_avoidAllergens,intent_courses,intent_ingredientFocus,intent_requireDietary,intent_spice,intent_temperature
0,2,1,14,0,1,1,1,1,0,0
1,2,1,12,0,1,1,1,1,0,0
2,2,1,14,0,1,1,1,1,0,0
3,2,1,16,0,1,1,1,1,0,1
4,2,0,13,0,1,1,1,1,0,1


## Create labels

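Here we derive a simple binary label: whether the clarification was resolved (`resolution_status == 'CLARIFIED'`). Adjust to match your training objective (intent classification, reranker fine-tuning, etc.). Note that the sample output below contains only `CLARIFIED` rows, so this label is single-class until the extract includes other statuses.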

In [5]:
# 1 when the row's status is exactly "CLARIFIED", 0 for anything else (including missing).
is_clarified = df["resolution_status"] == "CLARIFIED"
df["label_resolved"] = is_clarified.astype(int)

# Cross-tabulate status against the label to eyeball the class balance.
df[["resolution_status", "label_resolved"]].value_counts(dropna=False)

resolution_status  label_resolved
CLARIFIED          1                 6
Name: count, dtype: int64

## Persist the modeling dataset

In [6]:
dataset_path = DATA_DIR.joinpath("clarifications_model_ready.parquet")

# Identifying/context columns first, then the label, then the engineered features.
context_cols = [
    "clarification_id",
    "query_time",
    "raw_query",
    "user_reply",
    "resolved_item_id",
    "resolved_item_name",
]
save_cols = context_cols + ["label_resolved"] + list(feature_cols)

model_ready = df[save_cols]
model_ready.to_parquet(dataset_path, index=False)
dataset_path

WindowsPath('notebooks/menu_query_training/artifacts/clarifications_model_ready.parquet')