In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    roc_auc_score, accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, classification_report
)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from interpret.glassbox import ExplainableBoostingClassifier
import joblib, os

df = pd.read_csv("../DataSets/processed/risk_dataset_model.csv")
TARGET = "had_claim_within_1_year"
X = df.drop(columns=[TARGET])
y = df[TARGET].astype(int)
print("Shape:", df.shape)


Shape: (60000, 20)


In [2]:
cat_cols = [c for c in X.columns if X[c].dtype == "object"]
num_cols = [c for c in X.columns if c not in cat_cols]

print("Categorical:", len(cat_cols), cat_cols)
print("Numeric:", len(num_cols), num_cols)


Categorical: 10 ['driver_age_band', 'driver_gender', 'driver_occupation', 'vehicle_type', 'vehicle_segment', 'fuel_type', 'vehicle_age_band', 'vehicle_usage_type', 'registration_district', 'parking_type']
Numeric: 9 ['driver_age', 'years_of_driving_experience', 'member_automobile_assoc_ceylon', 'has_previous_motor_policy', 'accidents_last_3_years', 'ncb_percentage', 'engine_capacity_cc', 'vehicle_age_years', 'has_lpg_conversion']


In [3]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [4]:
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", "passthrough", num_cols),
    ],
    remainder="drop"
)

X_train_enc = preprocessor.fit_transform(X_train)
X_val_enc = preprocessor.transform(X_val)

print("Encoded train:", X_train_enc.shape)
print("Encoded val:", X_val_enc.shape)


Encoded train: (48000, 60)
Encoded val: (12000, 60)


In [5]:
ebm = ExplainableBoostingClassifier(
    random_state=42,
    interactions=0,  # avoid parallel interaction-search issues
    n_jobs=1
)

ebm.fit(X_train_enc, y_train)


0,1,2
,feature_names,
,feature_types,
,max_bins,1024
,max_interaction_bins,64
,interactions,0
,exclude,
,validation_size,0.15
,outer_bags,14
,inner_bags,0
,learning_rate,0.015


In [6]:
y_val_prob = ebm.predict_proba(X_val_enc)[:, 1]
auc = roc_auc_score(y_val, y_val_prob)
print("EBM ROC-AUC:", round(auc, 4))

# Threshold from F1-max (objective)
from sklearn.metrics import precision_recall_curve
prec, rec, thr = precision_recall_curve(y_val, y_val_prob)
f1 = (2*prec*rec)/(prec+rec+1e-9)
best_idx = np.argmax(f1)
best_threshold_f1 = thr[best_idx]
print("Best F1 threshold:", float(best_threshold_f1))

# Report @ best threshold
y_pred = (y_val_prob >= best_threshold_f1).astype(int)
print(classification_report(y_val, y_pred, target_names=["No Claim","Claim"]))
print("Confusion:\n", confusion_matrix(y_val, y_pred))


EBM ROC-AUC: 0.6399
Best F1 threshold: 0.2133702032008401
              precision    recall  f1-score   support

    No Claim       0.84      0.47      0.60      8999
       Claim       0.31      0.72      0.44      3001

    accuracy                           0.54     12000
   macro avg       0.58      0.60      0.52     12000
weighted avg       0.71      0.54      0.56     12000

Confusion:
 [[4260 4739]
 [ 827 2174]]


In [None]:
#os.makedirs("../models", exist_ok=True)

#joblib.dump(
    #{
       ## "model": ebm,
       # "preprocessor": preprocessor,
       # "threshold_f1": float(best_threshold_f1)
    #},
    #"../models/ebm_bundle.pkl"
#)

#print("Saved: ../models/ebm_bundle.pkl")


Saved: ../models/ebm_bundle.pkl


In [12]:
import numpy as np
from sklearn.metrics import confusion_matrix

def best_threshold_by_cost(y_true, y_prob, fn_cost=5.0, fp_cost=1.0, grid_size=400):
    thresholds = np.linspace(0.01, 0.99, grid_size)
    costs = []
    for t in thresholds:
        y_pred = (y_prob >= t).astype(int)
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        cost = fn_cost * fn + fp_cost * fp
        costs.append(cost)
    best_idx = int(np.argmin(costs))
    return float(thresholds[best_idx]), float(costs[best_idx])

# Example (insurance: FN is worse than FP)
best_t_cost, best_cost = best_threshold_by_cost(y_val, y_val_prob, fn_cost=5, fp_cost=1)
print("Best cost-based threshold:", best_t_cost, " | cost:", best_cost)


Best cost-based threshold: 0.1549122807017544  | cost: 8440.0


In [13]:
from sklearn.metrics import precision_recall_curve
import numpy as np

def best_threshold_fbeta(y_true, y_prob, beta=2.0):
    p, r, thr = precision_recall_curve(y_true, y_prob)
    fbeta = (1 + beta**2) * (p * r) / ((beta**2 * p) + r + 1e-9)
    best_idx = np.argmax(fbeta)
    return float(thr[best_idx]), float(fbeta[best_idx])

best_t_f2, best_f2 = best_threshold_fbeta(y_val, y_val_prob, beta=2.0)
print("Best F2 threshold:", best_t_f2, " | F2:", best_f2)


Best F2 threshold: 0.130788513013815  | F2: 0.6313769945429859


In [14]:
from sklearn.utils.class_weight import compute_sample_weight

sample_w = compute_sample_weight(class_weight="balanced", y=y_train)

ebm.fit(X_train_enc, y_train, sample_weight=sample_w)


0,1,2
,feature_names,
,feature_types,
,max_bins,1024
,max_interaction_bins,64
,interactions,0
,exclude,
,validation_size,0.15
,outer_bags,14
,inner_bags,0
,learning_rate,0.015


In [15]:
from sklearn.metrics import classification_report, confusion_matrix

def report_at_threshold(y_true, y_prob, t, title=""):
    y_pred = (y_prob >= t).astype(int)
    print("\n" + title)
    print("Threshold:", t)
    print(classification_report(y_true, y_pred, target_names=["No Claim","Claim"]))
    print("Confusion:\n", confusion_matrix(y_true, y_pred))

# 1) Cost-based threshold
t_cost, _ = best_threshold_by_cost(y_val, y_val_prob, fn_cost=5, fp_cost=1)
report_at_threshold(y_val, y_val_prob, t_cost, title="COST-BASED (FN=5, FP=1)")

# 2) F2 threshold
t_f2, _ = best_threshold_fbeta(y_val, y_val_prob, beta=2.0)
report_at_threshold(y_val, y_val_prob, t_f2, title="F2-OPTIMAL (recall-focused)")



COST-BASED (FN=5, FP=1)
Threshold: 0.1549122807017544
              precision    recall  f1-score   support

    No Claim       0.88      0.21      0.33      8999
       Claim       0.28      0.91      0.43      3001

    accuracy                           0.38     12000
   macro avg       0.58      0.56      0.38     12000
weighted avg       0.73      0.38      0.36     12000

Confusion:
 [[1849 7150]
 [ 258 2743]]

F2-OPTIMAL (recall-focused)
Threshold: 0.130788513013815
              precision    recall  f1-score   support

    No Claim       0.90      0.11      0.19      8999
       Claim       0.26      0.97      0.42      3001

    accuracy                           0.32     12000
   macro avg       0.58      0.54      0.30     12000
weighted avg       0.74      0.32      0.25     12000

Confusion:
 [[ 965 8034]
 [ 105 2896]]


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from interpret.glassbox import ExplainableBoostingClassifier
import joblib, os

df = pd.read_csv("../DataSets/processed/risk_dataset_model.csv")

TARGET = "had_claim_within_1_year"
X = df.drop(columns=[TARGET])
y = df[TARGET]

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

ebm_model = ExplainableBoostingClassifier(
    random_state=42,
    max_bins=256,
    interactions=0,   
    n_jobs=1          
)

ebm_model.fit(X_train, y_train)

val_prob = ebm_model.predict_proba(X_val)[:, 1]
auc = roc_auc_score(y_val, val_prob)
print("EBM ROC-AUC:", round(auc, 4))

os.makedirs("../models", exist_ok=True)
joblib.dump(ebm_model, "../models/ebm_risk.pkl")
print("Saved ../models/ebm_risk.pkl")


EBM ROC-AUC: 0.6403
Saved ../models/ebm_risk.pkl


In [12]:
import numpy as np

# probabilities
y_val_prob = ebm_model.predict_proba(X_val)[:, 1]

# default threshold = 0.5
y_val_pred = (y_val_prob >= 0.5).astype(int)


In [13]:
from sklearn.metrics import (
    roc_auc_score,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
    balanced_accuracy_score
)

print("ROC-AUC           :", round(roc_auc_score(y_val, y_val_prob), 4))
print("Accuracy          :", round(accuracy_score(y_val, y_val_pred), 4))
print("Balanced Accuracy :", round(balanced_accuracy_score(y_val, y_val_pred), 4))
print("Precision         :", round(precision_score(y_val, y_val_pred), 4))
print("Recall            :", round(recall_score(y_val, y_val_pred), 4))
print("F1-score          :", round(f1_score(y_val, y_val_pred), 4))

print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_val_pred))

print("\nClassification Report:")
print(classification_report(y_val, y_val_pred))


ROC-AUC           : 0.6403
Accuracy          : 0.7495
Balanced Accuracy : 0.5136
Precision         : 0.4902
Recall            : 0.0417
F1-score          : 0.0768

Confusion Matrix:
[[8869  130]
 [2876  125]]

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.99      0.86      8999
           1       0.49      0.04      0.08      3001

    accuracy                           0.75     12000
   macro avg       0.62      0.51      0.47     12000
weighted avg       0.69      0.75      0.66     12000



In [14]:
import joblib, os

os.makedirs("../models", exist_ok=True)
joblib.dump(ebm_model, "../models/ebm_risk.pkl")
print("EBM model saved")

EBM model saved
