# 03 · Train Clarification Model

Fit a lightweight classifier on the prepared clarification dataset. Run after refreshing data with `01_extract_data.ipynb` and `02_prepare_dataset.ipynb`.

In [1]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Artifact directory shared by the cells below; the path is relative, so it is
# resolved against the kernel's current working directory.
DATA_DIR = Path("notebooks/menu_query_training/artifacts")
dataset_path = DATA_DIR.joinpath("clarifications_model_ready.parquet")

# Load the prepared dataset, or stop right away with a hint on how to create it.
if dataset_path.exists():
    df = pd.read_parquet(dataset_path)
else:
    raise FileNotFoundError(
        f"Prepared dataset missing at {dataset_path}. Run 02_prepare_dataset.ipynb first."
    )

# Preview the first rows as the cell output.
df.head()


Unnamed: 0,clarification_id,query_time,raw_query,user_reply,resolved_item_id,resolved_item_name,label_resolved,token_count,has_answer_in_options,answer_length,intent_alcoholPreference,intent_avoidAllergens,intent_courses,intent_ingredientFocus,intent_requireDietary,intent_spice,intent_temperature
0,a0ea29ce-5422-4e87-8341-648cf46b3c82,2025-10-29T02:29:12,Any vegetarian pasta?,Soups & Salads,dc289bdb-6d3e-46a6-a4b8-bb0b90ef9745,Roasted Tomato Soup,1,2,1,14,0,1,1,1,1,0,0
1,4d24c2a3-5fbd-43fa-a180-8162ddfe8b20,2025-10-28T17:37:56,dishes today,Main Courses,9cd431c3-48be-459a-a325-574dad59174c,Seared Scallops,1,2,1,12,0,1,1,1,1,0,0
2,e35286d1-7d4c-42ae-8d18-311abfc0ccb3,2025-10-28T17:37:24,Any vegetarian pasta?,Soups & Salads,dc289bdb-6d3e-46a6-a4b8-bb0b90ef9745,Roasted Tomato Soup,1,2,1,14,0,1,1,1,1,0,0
3,98e40dc4-bae8-48fb-8709-af3bf0c49c7e,2025-10-28T17:23:25,I want some cold drinks,Wine & Cocktails,afe68abc-405b-4a1b-bde9-d9f1f92457cb,Aperol Spritz,1,2,1,16,0,1,1,1,1,0,1
4,b83cfad9-be67-4dc0-9f63-0d0a08b15b9d,2025-10-28T17:11:37,I want some cold drinks,with alcohol?,afe68abc-405b-4a1b-bde9-d9f1f92457cb,Aperol Spritz,1,2,0,13,0,1,1,1,1,0,1


In [2]:
# Build the feature matrix X and the binary target y from the prepared frame,
# then create a stratified train/test split when both classes are present.
label_col = "label_resolved"
base_features = ["token_count", "has_answer_in_options", "answer_length"]
# Every column prefixed with "intent_" is treated as a feature; sorting keeps
# the feature order stable regardless of the column order in the parquet file.
intent_cols = sorted([col for col in df.columns if col.startswith("intent_")])
feature_cols = base_features + intent_cols

# Fail early when an expected feature column is absent from the dataset.
missing = [col for col in feature_cols if col not in df.columns]
if missing:
    raise ValueError(f"Missing expected feature columns: {missing}")

# Non-numeric values are coerced to NaN and then replaced by 0.0.
X = df[feature_cols].apply(pd.to_numeric, errors="coerce").fillna(0.0)
y = df[label_col].astype(int)

if y.nunique() < 2:
    # Only one label value exists, so no split is made; the training cell
    # applies the same check and fits a constant baseline instead.
    majority = int(y.mode().iloc[0])
    print("Only one class present in labels. Using constant baseline classifier.")
else:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42, stratify=y
    )
    # A bare expression nested inside if/else is not echoed by Jupyter (only
    # the last top-level expression of a cell is), so print the shapes.
    print((X_train.shape, X_test.shape))


((4, 10), (2, 10))

In [4]:
# Train the clarification classifier and report its quality.
#   * single-class labels -> constant baseline fitted and scored on all rows
#   * otherwise           -> scaled logistic regression scored on the hold-out
# DummyClassifier is imported in the notebook's top import cell.
if y.nunique() < 2:
    constant = int(y.mode().iloc[0])
    clf = DummyClassifier(strategy="constant", constant=constant)
    clf.fit(X, y)
    print(f"Trained constant classifier with label {constant}")
    y_pred = clf.predict(X)
    # Only one class was seen during fit, so column 0 is that class's
    # probability (there is no column 1 in this branch).
    y_proba = clf.predict_proba(X)[:, 0]
    # Accuracy is measured on the training rows; there is no hold-out here.
    print("Accuracy:", (y_pred == y).mean())
else:
    clf = Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("model", LogisticRegression(max_iter=1000, class_weight="balanced"))
    ])
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # Column 1 is the probability of the positive class (label 1).
    y_proba = clf.predict_proba(X_test)[:, 1]
    # zero_division=0 reports 0 for a class that is never predicted instead of
    # emitting an undefined-metric warning.
    print(classification_report(y_test, y_pred, zero_division=0))
    if y_test.nunique() < 2:
        # With very small or imbalanced data the hold-out can contain a single
        # class even with stratification; ROC-AUC is undefined in that case.
        roc_auc = float("nan")
        print("ROC-AUC: undefined (test split contains a single class)")
    else:
        roc_auc = roc_auc_score(y_test, y_proba)
        print(f"ROC-AUC: {roc_auc:.3f}")


Trained constant classifier with label 1
Accuracy: 1.0


In [5]:
# Persist the fitted estimator together with the feature order and a few
# summary statistics of the data it was trained on.
model_path = DATA_DIR.joinpath("clarification_model.joblib")

# Mean of every feature column, keyed by column name.
feature_means = {}
for col in feature_cols:
    feature_means[col] = float(X[col].mean())

training_metadata = {
    "sample_count": int(df.shape[0]),
    "positive_rate": float(y.mean()),
    "feature_means": feature_means,
}

# Single bundle written to disk: estimator, feature columns, training summary.
metadata = {
    "pipeline": clf,
    "feature_cols": feature_cols,
    "training_metadata": training_metadata,
}
joblib.dump(metadata, model_path)

# Show where the artifact was written.
model_path


WindowsPath('notebooks/menu_query_training/artifacts/clarification_model.joblib')