In [1]:
import time
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

def stage(msg: str):
    print("\n" + "="*80)
    print(msg)
    print("="*80)

def timed(msg: str):
    # simple context manager-ish helper
    class _T:
        def __enter__(self):
            self.t0 = time.time()
            print(f"▶ {msg} ...")
            return self
        def __exit__(self, exc_type, exc, tb):
            dt = time.time() - self.t0
            if exc_type is None:
                print(f"✅ Done in {dt:.2f}s")
            else:
                print(f"❌ Failed after {dt:.2f}s")
    return _T()


In [2]:
stage("Stage 1/7 — Load dataset + verify columns")

DATA_PATH = "data.csv"
TARGET_COL = "risk"

FEATURES = [
    "FAULT_DISTANCE",
    "BASIC_WIND_SPEED",
    "SLOPE",
    "ELEVATION",
    "POTENTIAL_LIQUEFACTION",
    "DISTANCE_TO_RIVERS_AND_SEAS",
    "SURFACE_RUN_OFF",
    "VERTICAL_IRREGUARITY",
    "BUILDING_PROXIMITY",
    "NUMBER_OF_BAYS",
    "COLUMN_SPACING",
    "MAXIMUM_CRACK",
    "ROOF_SLOPE",
    "ROOF_DESIGN",
    "ROOF_FASTENER_DISTANCE",
]

with timed(f"Reading {DATA_PATH}"):
    df = pd.read_csv(DATA_PATH)

print("Rows:", len(df))
print("Columns:", len(df.columns))

missing = [c for c in FEATURES + [TARGET_COL] if c not in df.columns]
if missing:
    raise ValueError(f"Missing columns in CSV: {missing}")

df = df[FEATURES + [TARGET_COL]].copy()

print("\nTarget distribution:")
print(df[TARGET_COL].value_counts(dropna=False))
df.head(3)



Stage 1/7 — Load dataset + verify columns
▶ Reading data.csv ...
✅ Done in 0.01s
Rows: 600
Columns: 16

Target distribution:
risk
LOW       200
HIGH      200
MEDIUM    200
Name: count, dtype: int64


Unnamed: 0,FAULT_DISTANCE,BASIC_WIND_SPEED,SLOPE,ELEVATION,POTENTIAL_LIQUEFACTION,DISTANCE_TO_RIVERS_AND_SEAS,SURFACE_RUN_OFF,VERTICAL_IRREGUARITY,BUILDING_PROXIMITY,NUMBER_OF_BAYS,COLUMN_SPACING,MAXIMUM_CRACK,ROOF_SLOPE,ROOF_DESIGN,ROOF_FASTENER_DISTANCE,risk
0,46.5,147.16,5.74,116.33,MEDIUM,17.31,GOOD,NO,MODERATE,5,5.83,0.07,30.91,GABLE,9.3,LOW
1,12.62,181.23,2.42,45.84,LOW,2.03,GOOD,NO,MODERATE,5,5.16,0.17,34.37,HIP,14.67,LOW
2,0.97,225.99,26.19,12.54,HIGH,0.29,MODERATE,YES,CLOSE,3,8.29,3.54,6.78,FLAT,27.66,HIGH


In [3]:
stage("Stage 2/7 — Define feature types")

categorical_features = [
    "POTENTIAL_LIQUEFACTION",
    "SURFACE_RUN_OFF",
    "ROOF_DESIGN",
    "VERTICAL_IRREGUARITY",
    "BUILDING_PROXIMITY",
]

numeric_features = [c for c in FEATURES if c not in categorical_features]

print("Categorical features:", categorical_features)
print("Numeric features:", numeric_features)

X = df[FEATURES]
y = df[TARGET_COL]



Stage 2/7 — Define feature types
Categorical features: ['POTENTIAL_LIQUEFACTION', 'SURFACE_RUN_OFF', 'ROOF_DESIGN', 'VERTICAL_IRREGUARITY', 'BUILDING_PROXIMITY']
Numeric features: ['FAULT_DISTANCE', 'BASIC_WIND_SPEED', 'SLOPE', 'ELEVATION', 'DISTANCE_TO_RIVERS_AND_SEAS', 'NUMBER_OF_BAYS', 'COLUMN_SPACING', 'MAXIMUM_CRACK', 'ROOF_SLOPE', 'ROOF_FASTENER_DISTANCE']


In [4]:
stage("Stage 3/7 — Build preprocessing pipeline")

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

print("✅ Preprocess pipeline created.")



Stage 3/7 — Build preprocessing pipeline
✅ Preprocess pipeline created.


In [5]:
stage("Stage 4/7 — Train/validation split")

with timed("Splitting train/val"):
    X_train, X_val, y_train, y_val = train_test_split(
        X, y,
        test_size=0.2,
        random_state=42,
        stratify=y if y.nunique() > 1 else None
    )

print("Train size:", len(X_train))
print("Val size:", len(X_val))
print("\nTrain target distribution:")
print(y_train.value_counts())
print("\nVal target distribution:")
print(y_val.value_counts())



Stage 4/7 — Train/validation split
▶ Splitting train/val ...
✅ Done in 0.01s
Train size: 480
Val size: 120

Train target distribution:
risk
MEDIUM    160
HIGH      160
LOW       160
Name: count, dtype: int64

Val target distribution:
risk
HIGH      40
LOW       40
MEDIUM    40
Name: count, dtype: int64


In [6]:
stage("Stage 5/7 — Train RandomForest (with progress)")

model = RandomForestClassifier(
    n_estimators=150,        # prototype speed; increase later if needed
    random_state=42,
    class_weight="balanced",
    n_jobs=-1,               # IMPORTANT: use all cores
    verbose=1                # shows training progress in output
)

clf = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", model),
])

with timed("Fitting model (watch verbose output below)"):
    clf.fit(X_train, y_train)

print("✅ Training finished.")



Stage 5/7 — Train RandomForest (with progress)
▶ Fitting model (watch verbose output below) ...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    0.1s finished


✅ Done in 0.24s
✅ Training finished.


In [7]:
stage("Stage 6/7 — Evaluate")

with timed("Predicting on validation set"):
    pred = clf.predict(X_val)

print("Confusion matrix:")
print(confusion_matrix(y_val, pred))

print("\nClassification report:")
print(classification_report(y_val, pred))



Stage 6/7 — Evaluate
▶ Predicting on validation set ...
✅ Done in 0.04s
Confusion matrix:
[[40  0  0]
 [ 0 40  0]
 [ 0  0 40]]

Classification report:
              precision    recall  f1-score   support

        HIGH       1.00      1.00      1.00        40
         LOW       1.00      1.00      1.00        40
      MEDIUM       1.00      1.00      1.00        40

    accuracy                           1.00       120
   macro avg       1.00      1.00      1.00       120
weighted avg       1.00      1.00      1.00       120



[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 150 out of 150 | elapsed:    0.0s finished


In [8]:
stage("Stage 7/7 — Save model for FastAPI")

MODEL_OUT = "model.joblib"

with timed(f"Saving model to {MODEL_OUT}"):
    joblib.dump(clf, MODEL_OUT)

print(f"✅ Saved: {MODEL_OUT}")



Stage 7/7 — Save model for FastAPI
▶ Saving model to model.joblib ...
✅ Done in 0.06s
✅ Saved: model.joblib
