In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from ngboost import NGBoost
from ngboost.distns import Bernoulli
from ngboost.scores import LogScore
import joblib, os

# Load the modelling table and separate the binary target from the features.
df = pd.read_csv("../DataSets/processed/risk_dataset_model.csv")
TARGET = "had_claim_within_1_year"
X = df.drop(columns=[TARGET])
y = df[TARGET].astype(int)

# Every non-numeric column is treated as categorical. Testing only for
# `dtype == "object"` misses pandas' dedicated string dtype and `category`
# columns, which would then be passed through to the model un-encoded.
# Bool columns count as numeric here, so they stay in `num_cols` as before.
cat_cols = [c for c in X.columns if not pd.api.types.is_numeric_dtype(X[c])]
num_cols = [c for c in X.columns if c not in cat_cols]

# 80/20 hold-out, stratified on the target so both splits keep its class mix.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [2]:
# Categorical columns are one-hot encoded (categories unseen at fit time are
# ignored at transform time); numeric columns are forwarded unchanged.
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
numeric_step = ("num", "passthrough", num_cols)

preprocessor = ColumnTransformer(
    transformers=[categorical_step, numeric_step],
    remainder="drop",
)

# Fit on the training split only, then apply the fitted encoder to validation.
X_train_enc = preprocessor.fit_transform(X_train)
X_val_enc = preprocessor.transform(X_val)


In [4]:
# Settings for the baseline (unweighted) NGBoost model with a Bernoulli output.
ngb_params = {
    "Dist": Bernoulli,
    "Score": LogScore,
    "n_estimators": 300,
    "learning_rate": 0.03,
    "natural_gradient": False,
    "random_state": 42,
    "verbose": True,
}
ngb = NGBoost(**ngb_params)

# Kept as the last expression so the fitted estimator is echoed by the cell.
ngb.fit(X_train_enc, y_train)


[iter 0] loss=0.5624 val_loss=0.0000 scale=8.0000 norm=3.0005
[iter 100] loss=0.5337 val_loss=0.0000 scale=8.0000 norm=2.8581
[iter 200] loss=0.5307 val_loss=0.0000 scale=8.0000 norm=2.8292


<ngboost.ngboost.NGBoost at 0x16830d67890>

In [5]:
# Predicted distribution for every validation row.
dist = ngb.pred_dist(X_val_enc)

# The parameter dict is read from `params` when present, else from `params_`.
if hasattr(dist, "params"):
    raw = dist.params
else:
    raw = dist.params_

# "p1" is the probability of the positive (claim) class, flattened to 1-D.
p = np.asarray(raw["p1"]).ravel()

auc = roc_auc_score(y_val, p)
print("NGBoost ROC-AUC:", round(auc, 4))

# Standard deviation of a Bernoulli(p) outcome; summarised by its quartiles.
percentile_points = [0, 25, 50, 75, 100]
uncertainty_std = np.sqrt(p * (1 - p))
print("Uncertainty std:", np.percentile(uncertainty_std, percentile_points))


NGBoost ROC-AUC: 0.6416
Uncertainty std: [0.26031858 0.37834135 0.4239344  0.46260245 0.49999997]


In [7]:
# Validation-set claim probabilities from the fitted `ngb` model.
dist = ngb.pred_dist(X_val_enc)

# The parameter dict is read from `params` when present, else from `params_`.
if hasattr(dist, "params"):
    raw = dist.params
else:
    raw = dist.params_

# "p1" holds the positive-class (claim) probability; flatten to a 1-D array.
y_val_prob = np.asarray(raw["p1"]).ravel()


In [8]:
from sklearn.metrics import roc_auc_score, precision_recall_curve, classification_report, confusion_matrix
import numpy as np

# Ranking quality of the predicted claim probabilities.
auc = roc_auc_score(y_val, y_val_prob)
print("NGBoost ROC-AUC:", round(auc, 4))

# F1 at every point of the precision-recall curve; 1e-9 avoids a 0/0 division.
prec, rec, thr = precision_recall_curve(y_val, y_val_prob)
f1_numerator = 2*prec*rec
f1_denominator = prec+rec+1e-9
f1 = f1_numerator/f1_denominator

best_idx = f1.argmax()
best_threshold_f1 = thr[best_idx]
print("Best F1 threshold:", float(best_threshold_f1))

# Hard labels at the F1-maximising threshold, then the usual summaries.
# NOTE(review): the threshold is selected and scored on the same validation
# split, so these numbers are optimistic for unseen data.
class_names = ["No Claim", "Claim"]
y_pred = (y_val_prob >= best_threshold_f1).astype(int)
print(classification_report(y_val, y_pred, target_names=class_names))
print("Confusion:\n", confusion_matrix(y_val, y_pred))


NGBoost ROC-AUC: 0.6416
Best F1 threshold: 0.2176232922548145
              precision    recall  f1-score   support

    No Claim       0.84      0.48      0.61      8999
       Claim       0.32      0.72      0.44      3001

    accuracy                           0.54     12000
   macro avg       0.58      0.60      0.53     12000
weighted avg       0.71      0.54      0.57     12000

Confusion:
 [[4322 4677]
 [ 834 2167]]


In [6]:
# Persist the model together with the fitted preprocessor so that inference
# can reproduce the exact encoding used at training time.
os.makedirs("../models", exist_ok=True)

bundle = {"model": ngb, "preprocessor": preprocessor}
joblib.dump(bundle, "../models/ngboost_bundle.pkl")

print("Saved: ../models/ngboost_bundle.pkl")


Saved: ../models/ngboost_bundle.pkl


In [10]:
import numpy as np
from sklearn.metrics import confusion_matrix

def best_threshold_by_cost(y_true, y_prob, fn_cost=5.0, fp_cost=1.0, grid_size=400):
    """Pick the decision threshold that minimises total misclassification cost.

    Candidate thresholds are ``grid_size`` evenly spaced points on
    [0.01, 0.99]. A row is predicted positive when ``y_prob >= threshold`` and
    the cost of a threshold is ``fn_cost * FN + fp_cost * FP``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels encoded as 0/1 (1 = positive class).
    y_prob : array-like
        Predicted positive-class probabilities, same length as ``y_true``.
    fn_cost, fp_cost : float
        Cost charged per false negative / per false positive.
    grid_size : int
        Number of candidate thresholds to evaluate.

    Returns
    -------
    tuple[float, float]
        The cheapest threshold (the smallest one on ties) and its total cost.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_prob`` do not have the same number of elements.
    """
    is_positive = np.asarray(y_true).reshape(-1) == 1
    scores = np.asarray(y_prob).reshape(-1)
    if is_positive.shape != scores.shape:
        raise ValueError(
            f"y_true and y_prob must have the same length, "
            f"got {is_positive.shape[0]} and {scores.shape[0]}"
        )
    is_negative = ~is_positive

    thresholds = np.linspace(0.01, 0.99, grid_size)
    costs = []
    for t in thresholds:
        flagged = scores >= t
        # FP/FN are counted directly rather than unpacked from a confusion
        # matrix: a matrix built from the labels present in the data collapses
        # to 1x1 when only one class occurs, and a four-way unpack then fails.
        fp = np.count_nonzero(flagged & is_negative)
        fn = np.count_nonzero(~flagged & is_positive)
        costs.append(fn_cost * fn + fp_cost * fp)

    best_idx = int(np.argmin(costs))
    return float(thresholds[best_idx]), float(costs[best_idx])

# Insurance example: a missed claim (FN) is charged five times a false alarm (FP).
best_t_cost, best_cost = best_threshold_by_cost(
    y_val,
    y_val_prob,
    fn_cost=5,
    fp_cost=1,
)
print("Best cost-based threshold:", best_t_cost, " | cost:", best_cost)


Best cost-based threshold: 0.17210526315789476  | cost: 8424.0


In [11]:
from sklearn.metrics import precision_recall_curve
import numpy as np

def best_threshold_fbeta(y_true, y_prob, beta=2.0):
    """Return the precision-recall threshold with the highest F-beta score.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_prob : array-like
        Predicted scores/probabilities for the positive class.
    beta : float
        Weight of recall relative to precision (2.0 favours recall).

    Returns
    -------
    tuple[float, float]
        The selected threshold and the F-beta score reached at it.
    """
    precision, recall, cutoffs = precision_recall_curve(y_true, y_prob)
    beta_sq = beta**2

    # 1e-9 in the denominator avoids 0/0 where precision and recall are both 0.
    scores = (1 + beta_sq) * (precision * recall) / ((beta_sq * precision) + recall + 1e-9)

    # NOTE(review): per the scikit-learn docs, precision/recall have one more
    # entry than the thresholds (a final precision=1, recall=0 point). That
    # entry scores 0 and argmax returns the first maximum, so the index should
    # always be valid for `cutoffs` — confirm if the scoring formula changes.
    winner = scores.argmax()
    return float(cutoffs[winner]), float(scores[winner])

# Recall-weighted (beta = 2) threshold on the validation split.
best_t_f2, best_f2 = best_threshold_fbeta(
    y_val,
    y_val_prob,
    beta=2.0,
)
print("Best F2 threshold:", best_t_f2, " | F2:", best_f2)


Best F2 threshold: 0.13540047450527778  | F2: 0.6312629670660074


In [9]:
from sklearn.utils.class_weight import compute_sample_weight

# Class-balanced sample weights: each class contributes the same total weight.
sw = compute_sample_weight(class_weight="balanced", y=y_train)

# Train the weighted variant as its own estimator. Calling `ngb.fit(...)` again
# would overwrite the baseline model in place, while `y_val_prob` and every
# threshold/report derived from it would still describe the unweighted fit.
ngb_weighted = NGBoost(
    Dist=Bernoulli,
    Score=LogScore,
    n_estimators=300,
    learning_rate=0.03,
    natural_gradient=False,
    random_state=42,
    verbose=True,
)

# Kept as the last expression so the fitted estimator is echoed by the cell.
ngb_weighted.fit(X_train_enc, y_train, sample_weight=sw)


[iter 0] loss=0.7813 val_loss=0.0000 scale=8.0000 norm=2.8170
[iter 100] loss=0.6458 val_loss=0.0000 scale=8.0000 norm=3.6646
[iter 200] loss=0.6433 val_loss=0.0000 scale=8.0000 norm=3.6562


<ngboost.ngboost.NGBoost at 0x16830d67890>

In [12]:
from sklearn.metrics import classification_report, confusion_matrix

def report_at_threshold(y_true, y_prob, t, title=""):
    """Print a classification report and confusion matrix at threshold ``t``.

    Rows with ``y_prob >= t`` are labelled 1 ("Claim"), all others 0
    ("No Claim"). Nothing is returned; the summaries are written to stdout.
    """
    hard_labels = (y_prob >= t).astype(int)
    class_names = ["No Claim","Claim"]

    print("\n" + title)
    print("Threshold:", t)
    print(classification_report(y_true, hard_labels, target_names=class_names))

    matrix = confusion_matrix(y_true, hard_labels)
    print("Confusion:\n", matrix)

# 1) Threshold that minimises 5*FN + 1*FP on the validation split
t_cost, _ = best_threshold_by_cost(
    y_val,
    y_val_prob,
    fn_cost=5,
    fp_cost=1,
)
report_at_threshold(
    y_val,
    y_val_prob,
    t_cost,
    title="COST-BASED (FN=5, FP=1)",
)

# 2) Threshold that maximises F2 (recall weighted above precision)
t_f2, _ = best_threshold_fbeta(
    y_val,
    y_val_prob,
    beta=2.0,
)
report_at_threshold(
    y_val,
    y_val_prob,
    t_f2,
    title="F2-OPTIMAL (recall-focused)",
)



COST-BASED (FN=5, FP=1)
Threshold: 0.17210526315789476
              precision    recall  f1-score   support

    No Claim       0.87      0.29      0.43      8999
       Claim       0.29      0.87      0.43      3001

    accuracy                           0.43     12000
   macro avg       0.58      0.58      0.43     12000
weighted avg       0.72      0.43      0.43     12000

Confusion:
 [[2565 6434]
 [ 398 2603]]

F2-OPTIMAL (recall-focused)
Threshold: 0.13540047450527778
              precision    recall  f1-score   support

    No Claim       0.90      0.13      0.23      8999
       Claim       0.27      0.95      0.42      3001

    accuracy                           0.34     12000
   macro avg       0.58      0.54      0.33     12000
weighted avg       0.74      0.34      0.28     12000

Confusion:
 [[1210 7789]
 [ 141 2860]]


In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from ngboost import NGBoost
from ngboost.distns import Bernoulli
from ngboost.scores import LogScore
import joblib, os

# Reload the modelling table; the target is left in its stored dtype here.
TARGET = "had_claim_within_1_year"
df = pd.read_csv("../DataSets/processed/risk_dataset_model.csv")

y = df[TARGET]
X = df.drop(columns=[TARGET])

# Same stratified 80/20 split and seed as the first part of the notebook.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [13]:
import pandas as pd

# Targets as 0/1 integers.
y_train_int = y_train.astype(int)
y_val_int   = y_val.astype(int)

# Each split is one-hot encoded on its own, so the dummy columns can differ.
train_dummies = pd.get_dummies(X_train, drop_first=False)
val_dummies = pd.get_dummies(X_val, drop_first=False)

# join="left" keeps exactly the training columns: dummies that exist only in
# validation are dropped, and training columns missing there are filled with 0.
X_train_enc, X_val_enc = train_dummies.align(
    val_dummies, join="left", axis=1, fill_value=0
)

print("Encoded train shape:", X_train_enc.shape)
print("Encoded val shape  :", X_val_enc.shape)


Encoded train shape: (48000, 60)
Encoded val shape  : (12000, 60)


In [14]:
# NGBoost with a Bernoulli output, trained on the get_dummies-encoded features.
ngb_settings = dict(
    Dist=Bernoulli,
    Score=LogScore,
    n_estimators=300,
    learning_rate=0.03,
    natural_gradient=False,
    random_state=42,
    verbose=True,
)
ngb_model = NGBoost(**ngb_settings)

# Kept as the last expression so the fitted estimator is echoed by the cell.
ngb_model.fit(X_train_enc, y_train_int)

[iter 0] loss=0.5624 val_loss=0.0000 scale=8.0000 norm=3.0005
[iter 100] loss=0.5337 val_loss=0.0000 scale=8.0000 norm=2.8579
[iter 200] loss=0.5307 val_loss=0.0000 scale=8.0000 norm=2.8295


<ngboost.ngboost.NGBoost at 0x231fff2f610>

In [15]:
import numpy as np
# Predicted distribution for every validation row.
dist = ngb_model.pred_dist(X_val_enc)

# The parameter dict is read from `params` when present, else from `params_`.
if hasattr(dist, "params"):
    raw = dist.params
else:
    raw = dist.params_

# "p1" is the claim probability; flatten it to a 1-D array.
y_val_prob_ngb = np.asarray(raw["p1"]).ravel()

print("Probability summary:")
prob_summary = pd.Series(y_val_prob_ngb).describe()
print(prob_summary)


Probability summary:
count    12000.000000
mean         0.250659
std          0.101648
min          0.071337
25%          0.172999
50%          0.234657
75%          0.310324
max          0.703310
dtype: float64


In [16]:
from sklearn.metrics import roc_auc_score

# ROC-AUC of the get_dummies-based model on the validation split.
auc_ngb = roc_auc_score(y_val_int, y_val_prob_ngb)
print("NGBoost ROC-AUC:", round(auc_ngb, 4))


NGBoost ROC-AUC: 0.6412


In [17]:
# Hard labels at a fixed 0.5 cut-off: 1 = predicted claim, 0 = no claim.
is_predicted_claim = y_val_prob_ngb >= 0.5
y_pred_ngb = is_predicted_claim.astype(int)


In [18]:
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report
)

# Headline metrics, printed in a fixed order with aligned labels.
headline_metrics = [
    ("Accuracy              :", accuracy_score),
    ("Balanced Accuracy     :", balanced_accuracy_score),
    ("Precision (Claim)     :", precision_score),
    ("Recall (Claim)        :", recall_score),
    ("F1-score (Claim)      :", f1_score),
]
for metric_label, metric_fn in headline_metrics:
    print(metric_label, round(metric_fn(y_val_int, y_pred_ngb), 4))

print("\nConfusion Matrix:")
print(confusion_matrix(y_val_int, y_pred_ngb))

print("\nClassification Report:")
report_text = classification_report(
    y_val_int,
    y_pred_ngb,
    target_names=["No Claim", "Claim"],
)
print(report_text)


Accuracy              : 0.7493
Balanced Accuracy     : 0.5142
Precision (Claim)     : 0.487
Recall (Claim)        : 0.0437
F1-score (Claim)      : 0.0801

Confusion Matrix:
[[8861  138]
 [2870  131]]

Classification Report:
              precision    recall  f1-score   support

    No Claim       0.76      0.98      0.85      8999
       Claim       0.49      0.04      0.08      3001

    accuracy                           0.75     12000
   macro avg       0.62      0.51      0.47     12000
weighted avg       0.69      0.75      0.66     12000



In [None]:
import joblib, os

# Persist the model with the training column order. No encoder object is
# stored, so a consumer has to rebuild these columns itself before predicting.
os.makedirs("../models", exist_ok=True)

feature_columns = X_train_enc.columns.tolist()
ngboost_bundle = dict(
    model=ngb_model,  # trained NGBoost
    feature_columns=feature_columns,
    model_type="NGBoost-Bernoulli",
    note="Final NGBoost risk model with one-hot encoded features",
)

joblib.dump(ngboost_bundle, "../models/ngboost_risk_bundle.pkl")

print("NGBoost bundle saved correctly")


NGBoost bundle saved correctly


: 