In [1]:
import time
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

def stage(msg: str):
    """Print ``msg`` as a section banner.

    Emits a blank line, an 80-character rule of ``=``, the message, and a
    second rule.
    """
    print("\n" + "="*80)
    print(msg)
    print("="*80)

def timed(msg: str):
    """Return a context manager that reports how long its ``with`` body took.

    On entry it stores the start time in ``t0`` and prints ``▶ <msg> ...``.
    On exit it prints the elapsed seconds (two decimals) as either a
    "Done" line or, if the body raised, a "Failed" line. ``__exit__`` returns
    ``None``, so an exception raised in the body still propagates.
    """
    class _T:
        def __enter__(self):
            self.t0 = time.time()
            print(f"▶ {msg} ...")
            return self
        def __exit__(self, exc_type, exc, tb):
            dt = time.time() - self.t0
            if exc_type is None:
                print(f"✅ Done in {dt:.2f}s")
            else:
                print(f"❌ Failed after {dt:.2f}s")
    return _T()

# This call was previously indented under ``timed`` after its ``return``,
# which made it unreachable: the Stage 1 banner was never printed.
stage("Stage 1/7 — Load dataset + verify columns")

DATA_PATH = "data.csv"
TARGET_COL = "risk"

# Input columns expected in the CSV. The spellings (including
# "VERTICAL_IRREGUARITY") are matched literally against df.columns below.
FEATURES = [
    "FAULT_DISTANCE",
    "BASIC_WIND_SPEED",
    "SLOPE",
    "ELEVATION",
    "POTENTIAL_LIQUEFACTION",
    "DISTANCE_TO_RIVERS_AND_SEAS",
    "SURFACE_RUN_OFF",
    "VERTICAL_IRREGUARITY",
    "BUILDING_PROXIMITY",
    "NUMBER_OF_BAYS",
    "COLUMN_SPACING",
    "MAXIMUM_CRACK",
    "ROOF_SLOPE",
    "ROOF_DESIGN",
    "ROOF_FASTENER_DISTANCE",
]

with timed(f"Reading {DATA_PATH}"):
    df = pd.read_csv(DATA_PATH)

print("Rows:", len(df))
print("Columns:", len(df.columns))

# Report every absent column at once instead of failing on the first KeyError.
missing = [c for c in FEATURES + [TARGET_COL] if c not in df.columns]
if missing:
    raise ValueError(f"Missing columns in CSV: {missing}")

# Keep only the model inputs and the label; copy so later edits don't touch the raw frame.
df = df[FEATURES + [TARGET_COL]].copy()

print("\nTarget distribution:")
print(df[TARGET_COL].value_counts(dropna=False))

# A bare `df.head(3)` is only rendered when it is the last expression of a
# cell; more code follows here, so print the preview explicitly.
print(df.head(3))

stage("Stage 2/7 — Define feature types")

# Columns handled as discrete categories by the preprocessing step.
categorical_features = [
    "POTENTIAL_LIQUEFACTION", "SURFACE_RUN_OFF", "ROOF_DESIGN",
    "VERTICAL_IRREGUARITY", "BUILDING_PROXIMITY",
]

# Every remaining feature, kept in FEATURES order, is handled as numeric.
numeric_features = list(
    filter(lambda name: name not in categorical_features, FEATURES)
)

print("Categorical features:", categorical_features)
print("Numeric features:", numeric_features)

# Model inputs and label.
X, y = df[FEATURES], df[TARGET_COL]

stage("Stage 3/7 — Build preprocessing pipeline")

# Numeric columns: fill gaps with the column median, then standardize.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group to its own transformer.
preprocess = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

print("✅ Preprocess pipeline created.")

stage("Stage 4/7 — Train/validation split")

with timed("Splitting train/val"):
    # Hold out 20% for validation. Stratify on the target unless it has a
    # single class, in which case a plain split is used.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y,
        test_size=0.2,
        random_state=42,
        stratify=(None if y.nunique() <= 1 else y),
    )

print("Train size:", len(X_train))
print("Val size:", len(X_val))
print("\nTrain target distribution:", y_train.value_counts(), sep="\n")
print("\nVal target distribution:", y_val.value_counts(), sep="\n")

stage("Stage 5/7 — Train RandomForest (with progress)")

model = RandomForestClassifier(
    n_estimators=150,          # kept small for prototyping speed; raise later if needed
    class_weight="balanced",
    random_state=42,
    n_jobs=-1,                 # IMPORTANT: run on every available core
    verbose=1,                 # emit training progress into the cell output
)

# Preprocessing and the forest are fitted together as one estimator.
clf = Pipeline([
    ("preprocess", preprocess),
    ("model", model),
])

with timed("Fitting model (watch verbose output below)"):
    clf.fit(X_train, y_train)

print("✅ Training finished.")

stage("Stage 6/7 — Evaluate")

with timed("Predicting on validation set"):
    pred = clf.predict(X_val)

# scikit-learn convention: rows are true labels, columns are predicted labels.
print("Confusion matrix:", confusion_matrix(y_val, pred), sep="\n")

print("\nClassification report:", classification_report(y_val, pred), sep="\n")

stage("Stage 7/7 — Save model for FastAPI")

# The whole fitted pipeline (preprocessing + forest) is written to this file.
MODEL_OUT = "model.joblib"

with timed(f"Saving model to {MODEL_OUT}"):
    joblib.dump(value=clf, filename=MODEL_OUT)

print(f"✅ Saved: {MODEL_OUT}")

In [1]:
import time
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

def stage(msg: str):
    """Print ``msg`` as a banner: blank line, 80 ``=`` characters, message, 80 ``=`` characters."""
    rule = "=" * 80
    # The leading empty string plus sep="\n" yields the blank line before the first rule.
    print("", rule, msg, rule, sep="\n")

def timed(msg: str):
    """Build a context manager that announces ``msg`` and reports the elapsed time.

    Entering prints ``▶ <msg> ...`` and stores the start time in ``t0``.
    Leaving prints a "Done" line on success or a "Failed" line if the body
    raised, each with the elapsed seconds to two decimals. Exceptions are
    not suppressed.
    """
    class _Stopwatch:
        def __enter__(self):
            self.t0 = time.time()
            print(f"▶ {msg} ...")
            return self

        def __exit__(self, exc_type, exc, tb):
            elapsed = time.time() - self.t0
            if exc_type is not None:
                print(f"❌ Failed after {elapsed:.2f}s")
                return
            print(f"✅ Done in {elapsed:.2f}s")

    return _Stopwatch()

# -----------------------------------------------------------------------------
# 1) Questionnaire schema
# -----------------------------------------------------------------------------

DATA_PATH = "data.csv"

# Name of the label column in the CSV — choose exactly one:
TARGET_COL = "risk"  # or "RISK_DESCRIPTION" etc.

# Every questionnaire item used as a model input (values stored as integers 1/2/3).
# The leading letter of each name is its section: A, B or C.
FEATURES = [
    # HAZARD (A)
    "A1_1_PEIS", "A1_2_FAULT_DISTANCE", "A1_3_SEISMIC_SOURCE_TYPE", "A1_4_LIQUEFACTION",
    "A2_1_BASIC_WIND_SPEED", "A2_2_BUILDING_VICINITY",
    "A3_1_SLOPE", "A3_2_ELEVATION", "A3_3_DISTANCE_TO_RIVERS_AND_SEAS",
    "A3_4_SURFACE_RUNOFF", "A3_5_BASE_HEIGHT", "A3_6_DRAINAGE_SYSTEM",

    # EXPOSURE (B)
    "B1_1_AESTHETIC_THEME", "B1_2_STYLE_UNIQUE", "B1_3_STYLE_TYPICAL",
    "B1_4_CITYSCAPE_INTEGRATION",
    "B2_1_AGE_OF_BUILDING", "B2_2_PAST_RELEVANCE", "B2_3_GEO_IMPACT",
    "B2_4_CULTURAL_HERITAGE_TIE", "B2_5_MESSAGE_WORTH_PRESERVING",
    "B3_1_NO_INITIATIVES", "B3_2_PROMINENT_SUPPORT", "B3_3_IMPORTANCE_DAILY_LIFE",
    "B3_4_NO_PROMOTION",
    "B4_1_TOURIST_MUST_SEE", "B4_2_TOURISM_CONTRIBUTION", "B4_3_VISITED_FOR_GOODS",
    "B4_4_CURRENT_USE_ADOPTS_NEEDS",

    # VULNERABILITY (C)
    "C1_1_CODE_YEAR_BUILT", "C1_2_PLAN_IRREGULARITY", "C1_3_VERTICAL_IRREGULARITY",
    "C1_4_BUILDING_PROXIMITY", "C1_5_NUMBER_OF_STOREYS", "C1_6_STRUCT_SYSTEM_MATERIAL",
    "C1_7_NUMBER_OF_BAYS", "C1_8_COLUMN_SPACING", "C1_9_BUILDING_ENCLOSURE",
    "C1_10_WALL_MATERIAL", "C1_11_FRAMING_TYPE", "C1_12_FLOORING_MATERIAL",
    "C2_1_CRACK_WIDTH", "C2_2_UNEVEN_SETTLEMENT", "C2_3_BEAM_COLUMN_DEFORMATION",
    "C2_4_FINISHING_DETERIORATION", "C2_5_MEMBER_DECAY", "C2_6_ADDITIONAL_LOADS",
    "C3_1_ROOF_DESIGN", "C3_2_ROOF_SLOPE", "C3_3_ROOFING_MATERIAL",
    "C4_1_ROOF_FASTENERS", "C4_2_FASTENER_SPACING",
]

# Weights from your questionnaire table (the left numeric weight per item).
# (These are constants used only to compute engineered totals.)
WEIGHTS = {
    # A - hazard
    "A1_1_PEIS": 3,
    "A1_2_FAULT_DISTANCE": 3,
    "A1_3_SEISMIC_SOURCE_TYPE": 3,
    "A1_4_LIQUEFACTION": 3,
    "A2_1_BASIC_WIND_SPEED": 2,
    "A2_2_BUILDING_VICINITY": 2,
    "A3_1_SLOPE": 1,
    "A3_2_ELEVATION": 1,
    "A3_3_DISTANCE_TO_RIVERS_AND_SEAS": 3,
    "A3_4_SURFACE_RUNOFF": 1,
    "A3_5_BASE_HEIGHT": 1,
    "A3_6_DRAINAGE_SYSTEM": 2,

    # B - exposure (keep weights you’re using in the sheet)
    "B1_1_AESTHETIC_THEME": 2,
    "B1_2_STYLE_UNIQUE": 1,
    "B1_3_STYLE_TYPICAL": 1,
    "B1_4_CITYSCAPE_INTEGRATION": 2,
    "B2_1_AGE_OF_BUILDING": 2,
    "B2_2_PAST_RELEVANCE": 3,
    "B2_3_GEO_IMPACT": 1,
    "B2_4_CULTURAL_HERITAGE_TIE": 2,
    "B2_5_MESSAGE_WORTH_PRESERVING": 2,
    "B3_1_NO_INITIATIVES": 3,
    "B3_2_PROMINENT_SUPPORT": 3,
    "B3_3_IMPORTANCE_DAILY_LIFE": 2,
    "B3_4_NO_PROMOTION": 3,
    "B4_1_TOURIST_MUST_SEE": 2,
    "B4_2_TOURISM_CONTRIBUTION": 1,
    "B4_3_VISITED_FOR_GOODS": 1,
    "B4_4_CURRENT_USE_ADOPTS_NEEDS": 2,

    # C - vulnerability
    "C1_1_CODE_YEAR_BUILT": 3,
    "C1_2_PLAN_IRREGULARITY": 3,
    "C1_3_VERTICAL_IRREGULARITY": 2,
    "C1_4_BUILDING_PROXIMITY": 1,
    "C1_5_NUMBER_OF_STOREYS": 2,
    "C1_6_STRUCT_SYSTEM_MATERIAL": 1,
    "C1_7_NUMBER_OF_BAYS": 3,
    "C1_8_COLUMN_SPACING": 1,
    "C1_9_BUILDING_ENCLOSURE": 3,
    "C1_10_WALL_MATERIAL": 3,
    "C1_11_FRAMING_TYPE": 3,
    "C1_12_FLOORING_MATERIAL": 1,
    "C2_1_CRACK_WIDTH": 2,
    "C2_2_UNEVEN_SETTLEMENT": 1,
    "C2_3_BEAM_COLUMN_DEFORMATION": 3,
    "C2_4_FINISHING_DETERIORATION": 3,
    "C2_5_MEMBER_DECAY": 3,
    "C2_6_ADDITIONAL_LOADS": 1,
    "C3_1_ROOF_DESIGN": 3,
    "C3_2_ROOF_SLOPE": 3,
    "C3_3_ROOFING_MATERIAL": 2,
    "C4_1_ROOF_FASTENERS": 2,
    "C4_2_FASTENER_SPACING": 2,
}

# FEATURES and WEIGHTS are two hand-maintained copies of the same item names.
# The weighted totals look weights up with a default of 1, so a misspelled or
# missing key would silently change the scores. Fail fast if the tables drift.
_unweighted = [c for c in FEATURES if c not in WEIGHTS]
_unknown_weights = [c for c in WEIGHTS if c not in FEATURES]
if _unweighted or _unknown_weights:
    raise ValueError(
        f"FEATURES/WEIGHTS mismatch: no weight for {_unweighted}; "
        f"weight for unknown feature {_unknown_weights}"
    )

# Column groups for the three weighted totals, selected by the leading letter
# of each feature name.
A_COLS = [c for c in FEATURES if c.startswith("A")]
B_COLS = [c for c in FEATURES if c.startswith("B")]
C_COLS = [c for c in FEATURES if c.startswith("C")]

def add_engineered_scores(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with four engineered score columns appended.

    ``HAZARD_SCORE``, ``EXPOSURE_SCORE`` and ``VULNERABILITY_SCORE`` are the
    weighted sums of the columns in ``A_COLS``, ``B_COLS`` and ``C_COLS``.
    Each column is cast to float and multiplied by its entry in ``WEIGHTS``
    (weight 1 when the column has no entry). ``RISK_INDEX`` is the plain sum
    of those three totals. The input frame is not modified.
    """
    scored = df.copy()

    score_groups = (
        ("HAZARD_SCORE", A_COLS),
        ("EXPOSURE_SCORE", B_COLS),
        ("VULNERABILITY_SCORE", C_COLS),
    )
    for score_name, group_cols in score_groups:
        total = 0
        for col in group_cols:
            total = total + scored[col].astype(float) * WEIGHTS.get(col, 1)
        scored[score_name] = total

    # Simple combined index (replace with the spreadsheet formula if there is one).
    scored["RISK_INDEX"] = (
        scored["HAZARD_SCORE"]
        + scored["EXPOSURE_SCORE"]
        + scored["VULNERABILITY_SCORE"]
    )
    return scored

# -----------------------------------------------------------------------------
# 2) Load + verify
# -----------------------------------------------------------------------------

# Stage banners are numbered out of 6: this cell has six stages.
stage("Stage 1/6 — Load dataset + verify columns")

with timed(f"Reading {DATA_PATH}"):
    df = pd.read_csv(DATA_PATH)

# Report every absent column at once instead of failing on the first KeyError.
missing = [c for c in FEATURES + [TARGET_COL] if c not in df.columns]
if missing:
    raise ValueError(f"Missing columns in CSV: {missing}")

df = df[FEATURES + [TARGET_COL]].copy()

# Ensure features are numeric 1/2/3 (strings -> numeric); values that cannot
# be parsed become NaN.
for c in FEATURES:
    df[c] = pd.to_numeric(df[c], errors="coerce")

# Add engineered totals (hazard/exposure/vulnerability + risk index)
# NOTE(review): a NaN in any item makes that row's total NaN, and the pipeline
# below then fills the total with the column's most frequent value — confirm
# this is the intended handling of incomplete questionnaires.
df = add_engineered_scores(df)

# NOTE(review): the recorded run of this cell scores 1.00 on every class.
# Check whether the target was derived from these totals (label leakage)
# before trusting the metric.
ALL_FEATURES = FEATURES + ["HAZARD_SCORE", "EXPOSURE_SCORE", "VULNERABILITY_SCORE", "RISK_INDEX"]

print("Rows:", len(df))
print("Target distribution:")
print(df[TARGET_COL].value_counts(dropna=False))

# -----------------------------------------------------------------------------
# 3) Train/val split
# -----------------------------------------------------------------------------

stage("Stage 2/6 — Train/validation split")

X = df[ALL_FEATURES]
y = df[TARGET_COL]

with timed("Splitting train/val"):
    # Stratify on the target unless it has a single class.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y,
        test_size=0.2,
        random_state=42,
        stratify=y if y.nunique() > 1 else None
    )

print("Train size:", len(X_train))
print("Val size:", len(X_val))

# -----------------------------------------------------------------------------
# 4) Preprocess + model
# -----------------------------------------------------------------------------

stage("Stage 3/6 — Preprocess + model pipeline")

# Everything is numeric (1/2/3 + engineered totals), so keep it simple:
preprocess = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("scaler", StandardScaler()),
])

model = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    class_weight="balanced",
    n_jobs=-1,
    verbose=1
)

clf = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", model),
])

# -----------------------------------------------------------------------------
# 5) Fit
# -----------------------------------------------------------------------------

stage("Stage 4/6 — Train")

with timed("Fitting model"):
    clf.fit(X_train, y_train)

# -----------------------------------------------------------------------------
# 6) Evaluate
# -----------------------------------------------------------------------------

stage("Stage 5/6 — Evaluate")

with timed("Predicting on validation set"):
    pred = clf.predict(X_val)

print("Confusion matrix:")
print(confusion_matrix(y_val, pred))

print("\nClassification report:")
print(classification_report(y_val, pred))

# -----------------------------------------------------------------------------
# 7) Save
# -----------------------------------------------------------------------------

stage("Stage 6/6 — Save model + schema")

MODEL_OUT = "questionnaire_model.joblib"
SCHEMA_OUT = "questionnaire_schema.joblib"

with timed(f"Saving model to {MODEL_OUT}"):
    joblib.dump(clf, MODEL_OUT)

# Save the exact feature order your FastAPI must send.
# The saved pipeline does NOT compute the engineered totals itself, so the
# caller has to rebuild them from the raw answers. The raw feature list and
# the weights are stored alongside so that can be done without re-typing them.
schema = {
    "features": ALL_FEATURES,   # column order the pipeline was fitted on
    "raw_features": FEATURES,   # questionnaire items only
    "weights": WEIGHTS,         # per-item weights used for the totals
    "target": TARGET_COL,
}
joblib.dump(schema, SCHEMA_OUT)

print(f"✅ Saved: {MODEL_OUT}")
print(f"✅ Saved: {SCHEMA_OUT}")



Stage 1/7 — Load dataset + verify columns
▶ Reading data.csv ...
✅ Done in 0.02s
Rows: 1000
Target distribution:
risk
HIGH        338
MODERATE    332
LOW         330
Name: count, dtype: int64

Stage 2/7 — Train/validation split
▶ Splitting train/val ...
✅ Done in 0.01s
Train size: 800
Val size: 200

Stage 3/7 — Preprocess + model pipeline

Stage 4/7 — Train
▶ Fitting model ...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    0.7s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 300 out of 300 | elapsed:    0.0s finished


✅ Done in 0.98s

Stage 5/7 — Evaluate
▶ Predicting on validation set ...
✅ Done in 0.10s
Confusion matrix:
[[68  0  0]
 [ 0 66  0]
 [ 0  0 66]]

Classification report:
              precision    recall  f1-score   support

        HIGH       1.00      1.00      1.00        68
         LOW       1.00      1.00      1.00        66
    MODERATE       1.00      1.00      1.00        66

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200


Stage 6/7 — Save model + schema
▶ Saving model to questionnaire_model.joblib ...
✅ Done in 0.30s
✅ Saved: questionnaire_model.joblib
✅ Saved: questionnaire_schema.joblib


In [3]:
import time
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

def stage(msg: str):
    """Print a section banner: a blank line, then ``msg`` framed by two 80-character ``=`` rules."""
    rule = "=" * 80
    print("\n" + rule)
    print(msg)
    print(rule)

def timed(msg: str):
    """Return a context manager that times its ``with`` body.

    Prints ``▶ <msg> ...`` on entry and stores the start time in ``t0``.
    On exit it prints the elapsed seconds (two decimals) in a "Done" line,
    or a "Failed" line if the body raised. The exception, if any, propagates.
    """
    class _Timer:
        def __enter__(self):
            print(f"▶ {msg} ...") if False else None  # placeholder removed below
            return self
    # Define the real implementation (kept separate for readability).
    class _Timer:  # noqa: F811 - intentional redefinition of the sketch above
        def __enter__(self):
            self.t0 = time.time()
            print(f"▶ {msg} ...")
            return self

        def __exit__(self, exc_type, exc, tb):
            dt = time.time() - self.t0
            succeeded = exc_type is None
            print(f"✅ Done in {dt:.2f}s" if succeeded else f"❌ Failed after {dt:.2f}s")

    return _Timer()

# -----------------------------------------------------------------------------
# Config
# -----------------------------------------------------------------------------
DATA_PATH = "data_2000_raw.csv"  # <-- change to your csv file
TARGET_COL = "risk"              # LOW / MEDIUM / HIGH

# Questionnaire items used as model inputs, one per line, grouped by the
# section letter that prefixes each name.
FEATURES = [
    # Section A
    "A1_1_PEIS",
    "A1_2_FAULT_DISTANCE",
    "A1_3_SEISMIC_SOURCE_TYPE",
    "A1_4_LIQUEFACTION",
    "A2_1_BASIC_WIND_SPEED",
    "A2_2_BUILDING_VICINITY",
    "A3_1_SLOPE",
    "A3_2_ELEVATION",
    "A3_3_DISTANCE_TO_RIVERS_AND_SEAS",
    "A3_4_SURFACE_RUNOFF",
    "A3_5_BASE_HEIGHT",
    "A3_6_DRAINAGE_SYSTEM",

    # Section B
    "B1_1_AESTHETIC_THEME",
    "B1_2_STYLE_UNIQUE",
    "B1_3_STYLE_TYPICAL",
    "B1_4_CITYSCAPE_INTEGRATION",
    "B2_1_AGE_OF_BUILDING",
    "B2_2_PAST_RELEVANCE",
    "B2_3_GEO_IMPACT",
    "B2_4_CULTURAL_HERITAGE_TIE",
    "B2_5_MESSAGE_WORTH_PRESERVING",
    "B3_1_NO_INITIATIVES",
    "B3_2_PROMINENT_SUPPORT",
    "B3_3_IMPORTANCE_DAILY_LIFE",
    "B3_4_NO_PROMOTION",
    "B4_1_TOURIST_MUST_SEE",
    "B4_2_TOURISM_CONTRIBUTION",
    "B4_3_VISITED_FOR_GOODS",
    "B4_4_CURRENT_USE_ADOPTS_NEEDS",

    # Section C
    "C1_1_CODE_YEAR_BUILT",
    "C1_2_PLAN_IRREGULARITY",
    "C1_3_VERTICAL_IRREGULARITY",
    "C1_4_BUILDING_PROXIMITY",
    "C1_5_NUMBER_OF_STOREYS",
    "C1_6_STRUCT_SYSTEM_MATERIAL",
    "C1_7_NUMBER_OF_BAYS",
    "C1_8_COLUMN_SPACING",
    "C1_9_BUILDING_ENCLOSURE",
    "C1_10_WALL_MATERIAL",
    "C1_11_FRAMING_TYPE",
    "C1_12_FLOORING_MATERIAL",
    "C2_1_CRACK_WIDTH",
    "C2_2_UNEVEN_SETTLEMENT",
    "C2_3_BEAM_COLUMN_DEFORMATION",
    "C2_4_FINISHING_DETERIORATION",
    "C2_5_MEMBER_DECAY",
    "C2_6_ADDITIONAL_LOADS",
    "C3_1_ROOF_DESIGN",
    "C3_2_ROOF_SLOPE",
    "C3_3_ROOFING_MATERIAL",
    "C4_1_ROOF_FASTENERS",
    "C4_2_FASTENER_SPACING",
]

# -----------------------------------------------------------------------------
# 1) Load + verify
# -----------------------------------------------------------------------------
stage("Stage 1/6 — Load dataset + verify columns")

with timed(f"Reading {DATA_PATH}"):
    df = pd.read_csv(DATA_PATH)

# Collect every required column the CSV lacks so the error names them all.
missing = [col for col in (*FEATURES, TARGET_COL) if col not in df.columns]
if missing:
    raise ValueError(f"Missing columns in CSV: {missing}")

# Narrow to inputs + label; the copy keeps later writes off the frame that was read.
df = df.loc[:, FEATURES + [TARGET_COL]].copy()

# Ensure numeric features: values that cannot be parsed are coerced to NaN.
for c in FEATURES:
    df[c] = pd.to_numeric(df[c], errors="coerce")

print("Rows:", len(df))
print("Target distribution:", df[TARGET_COL].value_counts(dropna=False), sep="\n")

# -----------------------------------------------------------------------------
# 2) Split
# -----------------------------------------------------------------------------
stage("Stage 2/6 — Train/validation split")

# Model inputs and label.
X, y = df[FEATURES], df[TARGET_COL]

with timed("Splitting train/val"):
    # 80/20 split, stratified on the target, with a fixed seed for repeatability.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

print("Train size:", len(X_train))
print("Val size:", len(X_val))

# -----------------------------------------------------------------------------
# 3) Preprocess + model
# -----------------------------------------------------------------------------
stage("Stage 3/6 — Preprocess + model pipeline")

# All inputs are numeric here: fill gaps with each column's most frequent
# value, then standardize.
preprocess = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("scaler", StandardScaler()),  # OK even if RF doesn't need it
])

model = RandomForestClassifier(
    n_estimators=400,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

# Preprocessing and classifier are fitted and saved as one estimator.
clf = Pipeline([
    ("preprocess", preprocess),
    ("model", model),
])

# -----------------------------------------------------------------------------
# 4) Fit
# -----------------------------------------------------------------------------
stage("Stage 4/6 — Train")

with timed("Fitting model"):
    clf.fit(X_train, y_train)

# -----------------------------------------------------------------------------
# 5) Evaluate
# -----------------------------------------------------------------------------
stage("Stage 5/6 — Evaluate")

with timed("Predicting on validation set"):
    pred = clf.predict(X_val)

# scikit-learn convention: rows are true labels, columns are predicted labels.
print("Confusion matrix:", confusion_matrix(y_val, pred), sep="\n")

print("\nClassification report:", classification_report(y_val, pred), sep="\n")

# -----------------------------------------------------------------------------
# 6) Save
# -----------------------------------------------------------------------------
stage("Stage 6/6 — Save model + schema")

MODEL_OUT = "questionnaire_model.joblib"
SCHEMA_OUT = "questionnaire_schema.joblib"

with timed(f"Saving model to {MODEL_OUT}"):
    joblib.dump(value=clf, filename=MODEL_OUT)

# The schema records the input column order and the label column name.
schema = dict(features=FEATURES, target=TARGET_COL)
with timed(f"Saving schema to {SCHEMA_OUT}"):
    joblib.dump(value=schema, filename=SCHEMA_OUT)

print(*(f"✅ Saved: {path}" for path in (MODEL_OUT, SCHEMA_OUT)), sep="\n")



Stage 1/6 — Load dataset + verify columns
▶ Reading data_2000_raw.csv ...
✅ Done in 0.05s
Rows: 2000
Target distribution:
risk
LOW       1006
MEDIUM     664
HIGH       330
Name: count, dtype: int64

Stage 2/6 — Train/validation split
▶ Splitting train/val ...
✅ Done in 0.01s
Train size: 1600
Val size: 400

Stage 3/6 — Preprocess + model pipeline

Stage 4/6 — Train
▶ Fitting model ...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    0.4s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 400 out of 400 | elapsed:    0.0s finished


✅ Done in 0.64s

Stage 5/6 — Evaluate
▶ Predicting on validation set ...
✅ Done in 0.08s
Confusion matrix:
[[ 49   0  17]
 [  0 195   6]
 [  2   7 124]]

Classification report:
              precision    recall  f1-score   support

        HIGH       0.96      0.74      0.84        66
         LOW       0.97      0.97      0.97       201
      MEDIUM       0.84      0.93      0.89       133

    accuracy                           0.92       400
   macro avg       0.92      0.88      0.90       400
weighted avg       0.92      0.92      0.92       400


Stage 6/6 — Save model + schema
▶ Saving model to questionnaire_model.joblib ...
✅ Done in 0.14s
▶ Saving schema to questionnaire_schema.joblib ...
✅ Done in 0.00s
✅ Saved: questionnaire_model.joblib
✅ Saved: questionnaire_schema.joblib
