<a href="https://colab.research.google.com/github/sanuthit/Risk-Based-Motor-Insurance-Premium-Calculation-System-/blob/risk-model-development/accident_risk_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Risk-Based Motor Insurance Premium + Premium Calculation** - Risk Model

In [53]:
import pandas as pd
import os

from google.colab import drive
drive.mount('/content/drive')
root = "/content/drive/MyDrive"
print("MyDrive exists:", os.path.exists(root))
print("Top folders:", os.listdir(root)[:30])
DATA_DIR = "/content/drive/MyDrive/Data/Datasets"
print("DATA_DIR exists:", os.path.exists(DATA_DIR))
print(os.listdir(DATA_DIR)[:30])

df = pd.read_csv("/content/drive/MyDrive/Data/Datasets/risk_dataset_60000_toyota_suzuki_v2_cleaned.csv", encoding="utf-8")

DATA_PATH = "/content/drive/MyDrive/Data/Datasets/risk_dataset_60000_toyota_suzuki_v2_cleaned.csv"
df = pd.read_csv(DATA_PATH)

print(df.shape)
df.head()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
MyDrive exists: True
Top folders: ['Colab Notebooks', 'Data']
DATA_DIR exists: True
['premium_dataset_60000_v3_toyota_suzuki_full.csv', 'risk_dataset_60000.csv', 'risk_dataset_60000_toyota_suzuki_v2_cleaned.csv']
(60000, 49)


Unnamed: 0,policy_id,customer_id,driver_age,driver_gender,driver_occupation,years_of_driving_experience,member_automobile_assoc_ceylon,has_previous_motor_policy,ncb_percentage,accidents_last_3_years,...,approx_market_value,sum_insured,total_claim_amount_within_1_year,hard_flag_blacklist,driver_age_band,vehicle_age_band,risk_exposure_proxy,doc_missing_score,compliance_risk_score,ncb_validity_flag
0,P000001,C00002,35,M,Accountant,17,1,1,20,1,...,9375583,7691446,0,0,35–44,13+,Low,1,1,0
1,P000002,C00003,40,M,Unemployed,16,0,0,0,0,...,8789777,8210229,0,0,35–44,13+,Low,0,1,0
2,P000003,C00004,33,F,Businessman,8,0,1,10,4,...,5143262,4628639,680769,0,25–34,13+,Low,1,0,0
3,P000004,C00005,45,F,Farmer,27,0,1,35,0,...,7518522,7142596,0,0,45–59,13+,High,0,0,0
4,P000005,C00006,51,F,Businessman,18,0,1,10,1,...,6677872,6343978,0,0,45–59,4–7,High,1,1,0


1. Define the FINAL risk feature list

In [54]:

RISK_FEATURES = [
    # Driver risk
    "driver_age",
    "driver_age_band",
    "driver_gender",
    "driver_occupation",
    "years_of_driving_experience",
    "member_automobile_assoc_ceylon",

    # Driving & claim history (inputs only)
    "has_previous_motor_policy",
    "accidents_last_3_years",
    "ncb_percentage",

    # Vehicle risk
    "vehicle_type",
    "vehicle_segment",
    "engine_capacity_cc",
    "fuel_type",
    "vehicle_age_years",
    "vehicle_age_band",
    "has_lpg_conversion",

    # Usage & exposure
    "vehicle_usage_type",
    "risk_exposure_proxy",
    "registration_district",
    "parking_type",

    # Behavioural / compliance proxy (optional but allowed)
    "doc_missing_score",
    "compliance_risk_score"
]


2. Define the target

In [55]:
TARGET = "had_claim_within_1_year"

In [56]:
df_risk = df[RISK_FEATURES + [TARGET]].copy()

print(df_risk.shape)
df_risk.head()


(60000, 23)


Unnamed: 0,driver_age,driver_age_band,driver_gender,driver_occupation,years_of_driving_experience,member_automobile_assoc_ceylon,has_previous_motor_policy,accidents_last_3_years,ncb_percentage,vehicle_type,...,vehicle_age_years,vehicle_age_band,has_lpg_conversion,vehicle_usage_type,risk_exposure_proxy,registration_district,parking_type,doc_missing_score,compliance_risk_score,had_claim_within_1_year
0,35,35–44,M,Accountant,17,1,1,1,20,Car,...,13,13+,0,Private,Low,Jaffna,Street,1,1,0
1,40,35–44,M,Unemployed,16,0,0,0,0,Car,...,22,13+,0,Private,Low,Kandy,Garage,0,1,0
2,33,25–34,F,Businessman,8,0,1,4,10,Car,...,21,13+,0,Private,Low,Colombo,Street,1,0,1
3,45,45–59,F,Farmer,27,0,1,0,35,SUV,...,14,13+,0,Hire,High,Kandy,Garage,0,0,0
4,51,45–59,F,Businessman,18,0,1,1,10,Car,...,7,4–7,0,Hire,High,Galle,Garage,1,1,0


3. Check

In [57]:
#missng values
df_risk.isnull().sum().sort_values(ascending=False)

Unnamed: 0,0
driver_age,0
driver_age_band,0
driver_gender,0
driver_occupation,0
years_of_driving_experience,0
member_automobile_assoc_ceylon,0
has_previous_motor_policy,0
accidents_last_3_years,0
ncb_percentage,0
vehicle_type,0


In [58]:
#Target balance
df_risk[TARGET].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
had_claim_within_1_year,Unnamed: 1_level_1
0,0.859833
1,0.140167


# 4. Split X and y

In [59]:
X = df_risk[RISK_FEATURES]
y = df_risk[TARGET]

print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (60000, 22)
y shape: (60000,)


In [60]:
from sklearn.model_selection import train_test_split

# 70% Train, 30% Temp
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=42
)

# Split temp into 15% Validation, 15% Test
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    stratify=y_temp,
    random_state=42
)

print("Train:", X_train.shape, y_train.shape)
print("Val:  ", X_val.shape, y_val.shape)
print("Test: ", X_test.shape, y_test.shape)


Train: (42000, 22) (42000,)
Val:   (9000, 22) (9000,)
Test:  (9000, 22) (9000,)


01. Encoding pipeline

In [61]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [62]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        # Ordinal bands
        ("ord", OrdinalEncoder(
            categories=[
                ["18-24", "25-34", "35-44", "45-59", "60+"],
                ["0-3", "4-7", "8-12", "13+"]
            ],
            handle_unknown="use_encoded_value",
            unknown_value=-1
        ), ["driver_age_band", "vehicle_age_band"]),

        # Nominal categories
        ("cat", OneHotEncoder(
            handle_unknown="ignore",
            sparse_output=False
        ), [
            "driver_gender",
            "driver_occupation",
            "vehicle_type",
            "vehicle_segment",
            "fuel_type",
            "vehicle_usage_type",
            "risk_exposure_proxy",
            "registration_district",
            "parking_type"
        ]),

        # Numeric / binary
        ("num", "passthrough", [
            "driver_age",
            "years_of_driving_experience",
            "member_automobile_assoc_ceylon",
            "has_previous_motor_policy",
            "accidents_last_3_years",
            "ncb_percentage",
            "engine_capacity_cc",
            "vehicle_age_years",
            "has_lpg_conversion",
            "doc_missing_score",
            "compliance_risk_score"
        ])
    ]
)


In [73]:
X_train_enc = preprocessor.fit_transform(X_train)
X_val_enc   = preprocessor.transform(X_val)
X_test_enc  = preprocessor.transform(X_test)

print(X_train_enc.shape)
print(X_val_enc.shape)
print(X_test_enc.shape)

(42000, 55)
(9000, 55)
(9000, 55)


In [74]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

classes = np.array([0, 1])
weights = compute_class_weight(
    class_weight="balanced",
    classes=classes,
    y=y_train      # TRAINING data only
)

class_weight = {0: weights[0], 1: weights[1]}

2. Define categorical columns (CatBoost)

In [42]:
CATEGORICAL_COLS = [
    "driver_gender",
    "driver_occupation",
    "vehicle_type",
    "vehicle_segment",
    "fuel_type",
    "vehicle_usage_type",
    "risk_exposure_proxy",
    "registration_district",
    "parking_type",
    "driver_age_band",
    "vehicle_age_band"
]

3. evaluater

In [43]:
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix, average_precision_score

def evaluate_with_threshold(y_true, proba, name="model"):
    roc = roc_auc_score(y_true, proba)
    pr_auc = average_precision_score(y_true, proba)

    thresholds = np.arange(0.05, 0.95, 0.01)
    f1s = []
    for t in thresholds:
        pred = (proba >= t).astype(int)
        f1s.append(f1_score(y_true, pred))
    best_t = thresholds[int(np.argmax(f1s))]
    best_f1 = max(f1s)

    pred_best = (proba >= best_t).astype(int)
    prec = precision_score(y_true, pred_best, zero_division=0)
    rec  = recall_score(y_true, pred_best, zero_division=0)

    return {
        "model": name,
        "roc_auc": roc,
        "pr_auc": pr_auc,
        "best_threshold": float(best_t),
        "f1_at_best_t": float(best_f1),
        "precision_at_best_t": float(prec),
        "recall_at_best_t": float(rec),
    }


# 5. Train Models

**5.1 CatBoost**

In [28]:
!pip -q install lightgbm xgboost catboost interpret


In [19]:
from catboost import CatBoostClassifier

cat = CatBoostClassifier(
    iterations=2000,
    learning_rate=0.05,
    depth=8,
    loss_function="Logloss",
    eval_metric="AUC",
    class_weights=[0.5815, 3.5672],
    random_seed=42,
    verbose=200
)

cat.fit(
    X_train, y_train,
    cat_features=CATEGORICAL_COLS,
    eval_set=(X_val, y_val),
    use_best_model=True
)

val_proba_cat = cat.predict_proba(X_val)[:, 1]

0:	test: 0.6114522	best: 0.6114522 (0)	total: 312ms	remaining: 10m 24s
200:	test: 0.6204124	best: 0.6238185 (80)	total: 57.2s	remaining: 8m 31s
400:	test: 0.6106703	best: 0.6238185 (80)	total: 1m 53s	remaining: 7m 31s
600:	test: 0.6004749	best: 0.6238185 (80)	total: 2m 38s	remaining: 6m 7s
800:	test: 0.5911873	best: 0.6238185 (80)	total: 3m 23s	remaining: 5m 5s
1000:	test: 0.5871122	best: 0.6238185 (80)	total: 4m 9s	remaining: 4m 8s
1200:	test: 0.5814633	best: 0.6238185 (80)	total: 4m 57s	remaining: 3m 18s
1400:	test: 0.5780762	best: 0.6238185 (80)	total: 5m 42s	remaining: 2m 26s
1600:	test: 0.5752910	best: 0.6238185 (80)	total: 6m 27s	remaining: 1m 36s
1800:	test: 0.5728769	best: 0.6238185 (80)	total: 7m 11s	remaining: 47.7s
1999:	test: 0.5684575	best: 0.6238185 (80)	total: 7m 56s	remaining: 0us

bestTest = 0.6238185275
bestIteration = 80

Shrink model to first 81 iterations.


In [20]:
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

val_proba = cat.predict_proba(X_val)[:, 1]

print("Validation ROC-AUC:", roc_auc_score(y_val, val_proba))

Validation ROC-AUC: 0.6238185274556299


In [21]:
import numpy as np
from sklearn.metrics import f1_score

thresholds = np.arange(0.05, 0.95, 0.01)
f1s = []

for t in thresholds:
    preds = (val_proba >= t).astype(int)
    f1s.append(f1_score(y_val, preds))

best_t = thresholds[np.argmax(f1s)]
print("Best threshold:", best_t, "Best F1:", max(f1s))


Best threshold: 0.49000000000000005 Best F1: 0.28833455612619224


In [22]:
test_proba = cat.predict_proba(X_test)[:, 1]
test_pred  = (test_proba >= best_t).astype(int)

from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
print("TEST ROC-AUC:", roc_auc_score(y_test, test_proba))
print(confusion_matrix(y_test, test_pred))
print(classification_report(y_test, test_pred, digits=4))


TEST ROC-AUC: 0.6319649008866695
[[4378 3361]
 [ 480  781]]
              precision    recall  f1-score   support

           0     0.9012    0.5657    0.6951      7739
           1     0.1886    0.6193    0.2891      1261

    accuracy                         0.5732      9000
   macro avg     0.5449    0.5925    0.4921      9000
weighted avg     0.8013    0.5732    0.6382      9000



In [23]:
risk_score = (test_proba * 100).round().astype(int)
risk_score[:20]


array([39, 72, 57, 54, 62, 29, 47, 45, 48, 49, 23, 25, 67, 57, 44, 39, 45,
       65, 50, 63])

**5.2 LightGBM**

In [87]:
import lightgbm as lgb
import numpy as np
import pandas as pd

feature_names = preprocessor.get_feature_names_out()

X_train_enc_np = np.asarray(X_train_enc)
X_val_enc_np   = np.asarray(X_val_enc)
X_test_enc_np  = np.asarray(X_test_enc)

feature_names = preprocessor.get_feature_names_out()
X_train_enc_df = pd.DataFrame(X_train_enc_np, columns=feature_names)
X_val_enc_df   = pd.DataFrame(X_val_enc_np, columns=feature_names)
X_test_enc_df  = pd.DataFrame(X_test_enc_np, columns=feature_names)

lgbm = lgb.LGBMClassifier(
    n_estimators=8000,
    learning_rate=0.01,
    min_child_samples=80,
num_leaves=63,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=2.0,
    class_weight=class_weight,
    random_state=42,
    n_jobs=-1
)

lgbm.fit(
    X_train_enc_df, y_train,
    eval_set=[(X_val_enc_df, y_val)],
    eval_metric="auc",
    callbacks=[lgb.early_stopping(100)]
)

val_proba_lgbm = lgbm.predict_proba(X_val_enc_df)[:, 1]



Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[47]	valid_0's auc: 0.622421	valid_0's binary_logloss: 0.676529


In [91]:
from sklearn.metrics import roc_auc_score, f1_score, classification_report, confusion_matrix
import numpy as np

print("Validation ROC-AUC:", roc_auc_score(y_val_np, val_proba))

thresholds = np.arange(0.05, 0.95, 0.01)
f1s = [f1_score(y_val_np, (val_proba >= t).astype(int)) for t in thresholds]
best_t = thresholds[int(np.argmax(f1s))]
print("Best threshold:", best_t, "Best F1:", max(f1s))

test_pred = (test_proba >= best_t).astype(int)
print("TEST ROC-AUC:", roc_auc_score(y_test_np, test_proba))
print(confusion_matrix(y_test_np, test_pred))
print(classification_report(y_test_np, test_pred, digits=4))


Validation ROC-AUC: 0.605010918188748
Best threshold: 0.15000000000000002 Best F1: 0.2733538642029623
TEST ROC-AUC: 0.6216261621852264
[[3211 4528]
 [ 307  954]]
              precision    recall  f1-score   support

           0     0.9127    0.4149    0.5705      7739
           1     0.1740    0.7565    0.2830      1261

    accuracy                         0.4628      9000
   macro avg     0.5434    0.5857    0.4267      9000
weighted avg     0.8092    0.4628    0.5302      9000



**5.3 XGBoost**

In [103]:
import xgboost as xgb
from sklearn.metrics import roc_auc_score

# Use numpy arrays (or df.values) for DMatrix
dtrain = xgb.DMatrix(X_train_enc_df.values, label=y_train.values)
dval   = xgb.DMatrix(X_val_enc_df.values,   label=y_val.values)

neg = (y_train == 0).sum()
pos = (y_train == 1).sum()
scale_pos_weight = neg / pos

params = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "learning_rate": 0.03,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "lambda": 1.0,
    "scale_pos_weight": scale_pos_weight,
    "seed": 42,
    "tree_method": "hist"
}

xgb_model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=3000,
    evals=[(dval, "val")],
    early_stopping_rounds=100,
    verbose_eval=100
)

# Probabilities on validation
val_proba_xgb = model.predict(dval)
print("VAL AUC:", roc_auc_score(y_val, val_proba_xgb))


[0]	val-auc:0.59923
[100]	val-auc:0.61969
[133]	val-auc:0.61747
VAL AUC: 0.6174748775159861


In [104]:
val_proba_xgb  = xgb_model.predict(dval)
test_proba_xgb = xgb_model.predict(dtest)

In [105]:
from sklearn.metrics import roc_auc_score, f1_score, classification_report, confusion_matrix
import numpy as np

print("Validation ROC-AUC:", roc_auc_score(y_val, val_proba_xgb))

thresholds = np.arange(0.05, 0.95, 0.01)
f1s = [f1_score(y_val, (val_proba_xgb >= t).astype(int)) for t in thresholds]
best_t = thresholds[np.argmax(f1s)]

print("Best threshold:", best_t)

test_pred = (test_proba_xgb >= best_t).astype(int)
print("TEST ROC-AUC:", roc_auc_score(y_test, test_proba_xgb))
print(confusion_matrix(y_test, test_pred))
print(classification_report(y_test, test_pred, digits=4))


Validation ROC-AUC: 0.6174748775159861
Best threshold: 0.4800000000000001
TEST ROC-AUC: 0.6256941499120954
[[4298 3441]
 [ 483  778]]
              precision    recall  f1-score   support

           0     0.8990    0.5554    0.6866      7739
           1     0.1844    0.6170    0.2839      1261

    accuracy                         0.5640      9000
   macro avg     0.5417    0.5862    0.4853      9000
weighted avg     0.7989    0.5640    0.6302      9000



**5.4 EBM**

In [106]:
from interpret.glassbox import ExplainableBoostingClassifier

ebm = ExplainableBoostingClassifier(
    random_state=42,
    max_bins=256,
    interactions=10
)

ebm.fit(X_train_enc, y_train)

val_proba_ebm = ebm.predict_proba(X_val_enc)[:, 1]


In [107]:
from sklearn.metrics import roc_auc_score, f1_score
import numpy as np

print("Validation ROC-AUC (EBM):",
      roc_auc_score(y_val, val_proba_ebm))

thresholds = np.arange(0.05, 0.95, 0.01)
f1s = [f1_score(y_val, (val_proba_ebm >= t).astype(int)) for t in thresholds]
best_t = thresholds[np.argmax(f1s)]

print("Best threshold:", best_t)
print("Best F1:", max(f1s))


Validation ROC-AUC (EBM): 0.6216323296354992
Best threshold: 0.13
Best F1: 0.28398687165313524


In [108]:
test_proba_ebm = ebm.predict_proba(X_test_enc)[:, 1]

print("TEST ROC-AUC (EBM):",
      roc_auc_score(y_test, test_proba_ebm))


TEST ROC-AUC (EBM): 0.6322572500386572


In [109]:
from interpret import show

show(ebm.explain_global())


In [111]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

val_proba_ebm  = ebm.predict_proba(X_val_enc)[:, 1]
test_proba_ebm = ebm.predict_proba(X_test_enc)[:, 1]

best_t = 0.13

val_pred_ebm  = (val_proba_ebm  >= best_t).astype(int)
test_pred_ebm = (test_proba_ebm >= best_t).astype(int)

print("VAL ROC-AUC:", roc_auc_score(y_val, val_proba_ebm))
print("TEST ROC-AUC:", roc_auc_score(y_test, test_proba_ebm))

print("Confusion matrix (TEST):")
print(confusion_matrix(y_test, test_pred_ebm))

print("\nClassification report (TEST):")
print(classification_report(y_test, test_pred_ebm, digits=4))


VAL ROC-AUC: 0.6216323296354992
TEST ROC-AUC: 0.6322572500386572
Confusion matrix (TEST):
[[4143 3596]
 [ 436  825]]

Classification report (TEST):
              precision    recall  f1-score   support

           0     0.9048    0.5353    0.6727      7739
           1     0.1866    0.6542    0.2904      1261

    accuracy                         0.5520      9000
   macro avg     0.5457    0.5948    0.4815      9000
weighted avg     0.8042    0.5520    0.6191      9000

