# 03 · Train Clarification Model

Fit a lightweight classifier on the prepared dataset, falling back to a constant predictor when only one class is available.


In [1]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

def resolve_notebook_dir() -> Path:
    """Locate the ``menu_query_training`` notebook directory.

    Starting at the current working directory and moving up through each
    ancestor, two layouts are probed per level, in this order:

    1. ``<level>/notebooks/menu_query_training``
    2. ``<level>/chat-infrastructure/rag_service/notebooks/menu_query_training``

    Returns:
        The first probed path that exists. When nothing matches, the second
        layout joined onto the current working directory is returned, whether
        or not it exists.
    """
    start = Path.cwd()
    short_layout = Path("notebooks", "menu_query_training")
    repo_layout = Path("chat-infrastructure", "rag_service") / short_layout

    for ancestor in (start, *start.parents):
        for layout in (short_layout, repo_layout):
            probe = ancestor / layout
            if probe.exists():
                return probe

    # No level of the tree matched: default to the repository layout under the CWD.
    return start / repo_layout

# Anchor every path on the discovered notebook directory.
NOTEBOOK_DIR = resolve_notebook_dir()
DATA_DIR = NOTEBOOK_DIR.joinpath("artifacts")
DATA_DIR.mkdir(parents=True, exist_ok=True)  # make sure the artifacts folder is present

# Load the model-ready table, failing fast with a pointer to the notebook that produces it.
dataset_path = DATA_DIR.joinpath("clarifications_model_ready.parquet")
if dataset_path.exists():
    df = pd.read_parquet(dataset_path)
else:
    raise FileNotFoundError(f"Prepared dataset missing at {dataset_path}. Run 02_prepare_dataset.ipynb first.")

print(f"Loaded {len(df)} rows from {dataset_path}")
df.head()


Loaded 8 rows from e:\Omnichannel\Omnichannel\chat-infrastructure\rag_service\notebooks\menu_query_training\artifacts\clarifications_model_ready.parquet


Unnamed: 0,clarification_id,query_time,raw_query,user_reply,resolved_item_id,resolved_item_name,label_resolved,token_count,has_answer_in_options,answer_length,intent_alcoholPreference,intent_avoidAllergens,intent_courses,intent_ingredientFocus,intent_requireDietary,intent_spice,intent_temperature
0,ec9d7853-8fb4-42e3-b9e9-1605e55541b9,2025-10-29T06:33:42,Need gluten-free options how about drink ?,Beverages,7f10fc4b-1749-4708-85de-ee4edba3935d,Fresh Orange Juice,1,7,1,9,0,1,1,0,0,0,0
1,2cc72e88-1c50-48ff-b320-ee0b248c8a83,2025-10-29T06:33:26,Need gluten-free options,how about drink ?,afe68abc-405b-4a1b-bde9-d9f1f92457cb,Aperol Spritz,1,4,0,17,0,1,0,0,0,0,0
2,a0ea29ce-5422-4e87-8341-648cf46b3c82,2025-10-29T02:29:12,Any vegetarian pasta?,Soups & Salads,dc289bdb-6d3e-46a6-a4b8-bb0b90ef9745,Roasted Tomato Soup,1,2,1,14,0,0,1,1,1,0,0
3,4d24c2a3-5fbd-43fa-a180-8162ddfe8b20,2025-10-28T17:37:56,dishes today,Main Courses,9cd431c3-48be-459a-a325-574dad59174c,Seared Scallops,1,2,1,12,0,0,0,0,0,0,0
4,e35286d1-7d4c-42ae-8d18-311abfc0ccb3,2025-10-28T17:37:24,Any vegetarian pasta?,Soups & Salads,dc289bdb-6d3e-46a6-a4b8-bb0b90ef9745,Roasted Tomato Soup,1,2,1,14,0,0,1,1,1,0,0


In [2]:
# --- Schema -----------------------------------------------------------------
label_col = "label_resolved"
base_features = ["token_count", "has_answer_in_options", "answer_length"]
# Intent flags are discovered from the frame; sorting keeps the feature order stable.
intent_cols = sorted(col for col in df.columns if col.startswith("intent_"))
feature_cols = base_features + intent_cols

# --- Split settings (named so they are easy to find and tune) ----------------
TEST_SIZE = 0.25
RANDOM_STATE = 42

# --- Input validation ---------------------------------------------------------
missing = [col for col in feature_cols if col not in df.columns]
if missing:
    raise ValueError(f"Missing expected feature columns: {missing}")
# The label gets the same explicit check as the features instead of a bare KeyError below.
if label_col not in df.columns:
    raise ValueError(f"Missing expected label column: {label_col!r}")
# An empty frame would otherwise reach the baseline branch and fail with an
# opaque IndexError when taking the mode of an empty label series.
if df.empty:
    raise ValueError("Prepared dataset contains no rows; cannot train a clarification model.")

# Non-numeric feature values are coerced to NaN and then filled with 0.0.
X_all = df[feature_cols].apply(pd.to_numeric, errors="coerce").fillna(0.0)
y_all = df[label_col].astype(int)

# --- Fit ------------------------------------------------------------------------
if y_all.nunique() < 2:
    # A discriminative model cannot be fit on a single class; always predict that class.
    print("Only one class present in labels; training constant baseline classifier.")
    clf = DummyClassifier(strategy="constant", constant=int(y_all.mode().iloc[0]))
    clf.fit(X_all, y_all)
    # Accuracy here is measured on the same rows the baseline was fit on.
    metrics = {"accuracy": float((clf.predict(X_all) == y_all).mean())}
    print("Baseline accuracy:", metrics["accuracy"])
    evaluation = metrics
else:
    # NOTE(review): a stratified split needs at least two rows per class; on very
    # small datasets train_test_split raises ValueError here - confirm desired handling.
    X_train, X_test, y_train, y_test = train_test_split(
        X_all, y_all, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y_all
    )
    clf = Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("model", LogisticRegression(max_iter=1000, class_weight="balanced"))
    ])
    # The persisted pipeline is the one fitted on the training split only.
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_proba = clf.predict_proba(X_test)[:, 1]  # probability of the second class
    print(classification_report(y_test, y_pred))
    roc_auc = roc_auc_score(y_test, y_proba)
    print(f"ROC-AUC: {roc_auc:.3f}")
    evaluation = {"roc_auc": float(roc_auc)}

# --- Persist ----------------------------------------------------------------------
# The fitted estimator is bundled with the exact feature order and summary
# statistics of the data it was trained on.
model_path = DATA_DIR / "clarification_model.joblib"
metadata = {
    "pipeline": clf,
    "feature_cols": feature_cols,
    "training_metadata": {
        "sample_count": int(len(df)),
        "positive_rate": float(y_all.mean()),
        "evaluation": evaluation
    }
}
joblib.dump(metadata, model_path)
print(f"Saved model to {model_path}")
model_path


Only one class present in labels; training constant baseline classifier.
Baseline accuracy: 1.0
Saved model to e:\Omnichannel\Omnichannel\chat-infrastructure\rag_service\notebooks\menu_query_training\artifacts\clarification_model.joblib


WindowsPath('e:/Omnichannel/Omnichannel/chat-infrastructure/rag_service/notebooks/menu_query_training/artifacts/clarification_model.joblib')