# In [None]:
# =========================
# 1) Install deps (if needed)
# =========================
# !pip install pandas scikit-learn joblib

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import joblib

# =========================
# 2) Load your data
# =========================
df = pd.read_csv("data.csv")

TARGET_COL = "risk"  # change if your label column name is different

# Expected feature columns. Each name must match the CSV header exactly,
# because the column selection below raises KeyError on any mismatch.
# NOTE(review): "VERTICAL_IRREGUARITY" looks like a misspelling of
# "IRREGULARITY" -- confirm against the CSV header before renaming it.
FEATURES = [
    "FAULT_DISTANCE",
    "BASIC_WIND_SPEED",
    "SLOPE",
    "ELEVATION",
    "POTENTIAL_LIQUEFACTION",
    "DISTANCE_TO_RIVERS_AND_SEAS",
    "SURFACE_RUN_OFF",
    "VERTICAL_IRREGUARITY",
    "BUILDING_PROXIMITY",
    "NUMBER_OF_BAYS",
    "COLUMN_SPACING",
    "MAXIMUM_CRACK",
    "ROOF_SLOPE",
    "ROOF_DESIGN",
    "ROOF_FASTENER_DISTANCE",
]

# Keep only relevant cols
df = df[FEATURES + [TARGET_COL]].copy()

# Rows without a label cannot be used for supervised training: a missing
# target would otherwise reach the split/fit steps and fail there. Missing
# *feature* values are fine -- the pipeline's imputers handle those.
n_unlabeled = int(df[TARGET_COL].isna().sum())
if n_unlabeled:
    print(f"Dropping {n_unlabeled} row(s) with missing '{TARGET_COL}' label")
    df = df.dropna(subset=[TARGET_COL])

if df.empty:
    raise ValueError(f"No labelled rows left in data.csv (target column: '{TARGET_COL}')")

X = df[FEATURES]
y = df[TARGET_COL]

# =========================
# 3) Define types
# =========================
# Columns routed to the categorical branch of the preprocessor
# (most-frequent imputation followed by one-hot encoding).
categorical_features = [
    "POTENTIAL_LIQUEFACTION",
    "SURFACE_RUN_OFF",
    "ROOF_DESIGN",
    "VERTICAL_IRREGUARITY",
    "BUILDING_PROXIMITY",
]

# Everything else in FEATURES is treated as numeric, in FEATURES order.
_categorical_lookup = set(categorical_features)
numeric_features = [col for col in FEATURES if col not in _categorical_lookup]

# =========================
# 4) Preprocess + model pipeline
# =========================
# Numeric branch: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" keeps transform() from raising
# on categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own branch.
preprocess = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

# class_weight="balanced" reweights classes inversely to their frequency;
# random_state pins the forest's randomness for reproducible runs.
model = RandomForestClassifier(
    n_estimators=400,
    class_weight="balanced",
    random_state=42,
)

# Single estimator object: preprocessing and model are fitted, applied and
# serialized together.
clf = Pipeline([
    ("preprocess", preprocess),
    ("model", model),
])

# =========================
# 5) Train/valid split + train
# =========================
# A stratified split needs more than one class AND at least two samples of
# every class; train_test_split raises ValueError when the least populated
# class has a single member. Checking only the number of classes is not
# enough, so fall back to a plain random split when either condition fails.
class_counts = y.value_counts()
can_stratify = len(class_counts) > 1 and class_counts.min() >= 2
if not can_stratify:
    print("Note: not enough samples per class to stratify; using a non-stratified split")

X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y if can_stratify else None,
)

# Fits the preprocessing steps and the classifier on the training rows only.
clf.fit(X_train, y_train)

# =========================
# 6) Evaluate
# =========================
# Predict on the held-out validation rows and print the two summaries.
pred = clf.predict(X_val)

val_confusion = confusion_matrix(y_val, pred)
print("Confusion matrix:", val_confusion, sep="\n")

val_report = classification_report(y_val, pred)
print("\nReport:", val_report, sep="\n")

# =========================
# 7) Export for FastAPI
# =========================
# Persist the whole fitted pipeline (preprocessing + classifier) so the
# serving side can call predict() on raw feature rows.
# NOTE: joblib files are pickle-based -- load only trusted artifacts, ideally
# with the same scikit-learn version that produced them.
model_path = "model.joblib"
joblib.dump(clf, model_path)
print(f"Saved {model_path}")
