<a href="https://colab.research.google.com/github/sanuthit/Risk-Based-Motor-Insurance-Premium-Calculation-System-/blob/risk-model-development/accident_risk_DLmodel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import os

from google.colab import drive

# Mount Google Drive so the dataset folder is reachable from the Colab runtime.
drive.mount('/content/drive')
root = "/content/drive/MyDrive"
print("MyDrive exists:", os.path.exists(root))
print("Top folders:", os.listdir(root)[:30])

# Single source of truth for the dataset location: every path below is derived
# from `root`, so moving the data only requires changing one line.
DATA_DIR = os.path.join(root, "Data", "Datasets")
print("DATA_DIR exists:", os.path.exists(DATA_DIR))
print(os.listdir(DATA_DIR)[:30])

DATA_PATH = os.path.join(DATA_DIR, "risk_dataset_60000_toyota_suzuki_v2_cleaned.csv")

# Read the CSV exactly once (the file was previously parsed twice in a row).
# The encoding is stated explicitly because the band labels contain non-ASCII
# en dashes (e.g. "35–44" in the preview below).
df = pd.read_csv(DATA_PATH, encoding="utf-8")

print(df.shape)
df.head()


Mounted at /content/drive
MyDrive exists: True
Top folders: ['Colab Notebooks', 'Data']
DATA_DIR exists: True
['premium_dataset_60000_v3_toyota_suzuki_full.csv', 'risk_dataset_60000.csv', 'risk_dataset_60000_toyota_suzuki_v2_cleaned.csv']
(60000, 49)


Unnamed: 0,policy_id,customer_id,driver_age,driver_gender,driver_occupation,years_of_driving_experience,member_automobile_assoc_ceylon,has_previous_motor_policy,ncb_percentage,accidents_last_3_years,...,approx_market_value,sum_insured,total_claim_amount_within_1_year,hard_flag_blacklist,driver_age_band,vehicle_age_band,risk_exposure_proxy,doc_missing_score,compliance_risk_score,ncb_validity_flag
0,P000001,C00002,35,M,Accountant,17,1,1,20,1,...,9375583,7691446,0,0,35–44,13+,Low,1,1,0
1,P000002,C00003,40,M,Unemployed,16,0,0,0,0,...,8789777,8210229,0,0,35–44,13+,Low,0,1,0
2,P000003,C00004,33,F,Businessman,8,0,1,10,4,...,5143262,4628639,680769,0,25–34,13+,Low,1,0,0
3,P000004,C00005,45,F,Farmer,27,0,1,35,0,...,7518522,7142596,0,0,45–59,13+,High,0,0,0
4,P000005,C00006,51,F,Businessman,18,0,1,10,1,...,6677872,6343978,0,0,45–59,4–7,High,1,1,0


In [3]:

# Model inputs, grouped by the kind of risk signal they carry. The groups are
# concatenated in a fixed order, so the column order of RISK_FEATURES is stable.

# Driver risk
_DRIVER_FEATURES = [
    "driver_age",
    "driver_age_band",
    "driver_gender",
    "driver_occupation",
    "years_of_driving_experience",
    "member_automobile_assoc_ceylon",
]

# Driving & claim history (inputs only)
_HISTORY_FEATURES = [
    "has_previous_motor_policy",
    "accidents_last_3_years",
    "ncb_percentage",
]

# Vehicle risk
_VEHICLE_FEATURES = [
    "vehicle_type",
    "vehicle_segment",
    "engine_capacity_cc",
    "fuel_type",
    "vehicle_age_years",
    "vehicle_age_band",
    "has_lpg_conversion",
]

# Usage & exposure
_USAGE_FEATURES = [
    "vehicle_usage_type",
    "risk_exposure_proxy",
    "registration_district",
    "parking_type",
]

# Behavioural / compliance proxy (optional but allowed)
_COMPLIANCE_FEATURES = [
    "doc_missing_score",
    "compliance_risk_score",
]

RISK_FEATURES = [
    *_DRIVER_FEATURES,
    *_HISTORY_FEATURES,
    *_VEHICLE_FEATURES,
    *_USAGE_FEATURES,
    *_COMPLIANCE_FEATURES,
]


In [4]:
# Name of the label column the classifier is trained to predict.
TARGET: str = "had_claim_within_1_year"

In [5]:
# Keep only the model inputs plus the label; .copy() detaches the result from `df`.
model_columns = [*RISK_FEATURES, TARGET]
df_risk = df.loc[:, model_columns].copy()

print(df_risk.shape)
df_risk.head()


(60000, 23)


Unnamed: 0,driver_age,driver_age_band,driver_gender,driver_occupation,years_of_driving_experience,member_automobile_assoc_ceylon,has_previous_motor_policy,accidents_last_3_years,ncb_percentage,vehicle_type,...,vehicle_age_years,vehicle_age_band,has_lpg_conversion,vehicle_usage_type,risk_exposure_proxy,registration_district,parking_type,doc_missing_score,compliance_risk_score,had_claim_within_1_year
0,35,35–44,M,Accountant,17,1,1,1,20,Car,...,13,13+,0,Private,Low,Jaffna,Street,1,1,0
1,40,35–44,M,Unemployed,16,0,0,0,0,Car,...,22,13+,0,Private,Low,Kandy,Garage,0,1,0
2,33,25–34,F,Businessman,8,0,1,4,10,Car,...,21,13+,0,Private,Low,Colombo,Street,1,0,1
3,45,45–59,F,Farmer,27,0,1,0,35,SUV,...,14,13+,0,Hire,High,Kandy,Garage,0,0,0
4,51,45–59,F,Businessman,18,0,1,1,10,Car,...,7,4–7,0,Hire,High,Galle,Garage,1,1,0


In [6]:
# Separate the feature matrix from the label vector.
X = df_risk.loc[:, RISK_FEATURES]
y = df_risk.loc[:, TARGET]

for shape_label, part in (("X shape:", X), ("y shape:", y)):
    print(shape_label, part.shape)

X shape: (60000, 22)
y shape: (60000,)


In [7]:
from sklearn.model_selection import train_test_split

# Stage 1: keep 70% of the rows for training and hold out the other 30%,
# stratifying on the label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

# Stage 2: cut the hold-out in half -> 15% validation and 15% test overall,
# again stratified on the label.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=42
)

# Report the size of every split.
for split_label, split_X, split_y in (
    ("Train:", X_train, y_train),
    ("Val:  ", X_val, y_val),
    ("Test: ", X_test, y_test),
):
    print(split_label, split_X.shape, split_y.shape)


Train: (42000, 22) (42000,)
Val:   (9000, 22) (9000,)
Test:  (9000, 22) (9000,)


In [8]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [9]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

# Band labels in the dataset are written with an en dash (U+2013) -- e.g. "35–44"
# and "4–7" in the data previews -- not with an ASCII hyphen. Because the encoder
# below uses handle_unknown="use_encoded_value", a label that is missing from
# `categories` does not raise; it is silently encoded as -1. Listing the bands
# with "-" therefore collapsed every dashed band to -1. The lists are written
# with "-" for readability and converted to the en dash form here.
# NOTE(review): the previews only show some of the bands; confirm the full set
# with X_train["driver_age_band"].unique() / X_train["vehicle_age_band"].unique().
EN_DASH = "\u2013"
DRIVER_AGE_BANDS = [
    band.replace("-", EN_DASH)
    for band in ["18-24", "25-34", "35-44", "45-59", "60+"]
]
VEHICLE_AGE_BANDS = [
    band.replace("-", EN_DASH)
    for band in ["0-3", "4-7", "8-12", "13+"]
]

preprocessor = ColumnTransformer(
    transformers=[
        # Ordinal bands: encoded as 0..k-1 in the listed order; labels not in
        # the lists are mapped to -1 instead of raising.
        ("ord", OrdinalEncoder(
            categories=[DRIVER_AGE_BANDS, VEHICLE_AGE_BANDS],
            handle_unknown="use_encoded_value",
            unknown_value=-1
        ), ["driver_age_band", "vehicle_age_band"]),

        # Nominal categories: one-hot encoded into a dense array; categories
        # unseen during fit are ignored at transform time.
        ("cat", OneHotEncoder(
            handle_unknown="ignore",
            sparse_output=False
        ), [
            "driver_gender",
            "driver_occupation",
            "vehicle_type",
            "vehicle_segment",
            "fuel_type",
            "vehicle_usage_type",
            "risk_exposure_proxy",
            "registration_district",
            "parking_type"
        ]),

        # Numeric / binary: passed through unchanged.
        ("num", "passthrough", [
            "driver_age",
            "years_of_driving_experience",
            "member_automobile_assoc_ceylon",
            "has_previous_motor_policy",
            "accidents_last_3_years",
            "ncb_percentage",
            "engine_capacity_cc",
            "vehicle_age_years",
            "has_lpg_conversion",
            "doc_missing_score",
            "compliance_risk_score"
        ])
    ]
)


In [10]:
# Fit the encoders on the training split only, then apply the fitted
# transformer to the validation and test splits.
X_train_enc = preprocessor.fit_transform(X_train)
X_val_enc, X_test_enc = (preprocessor.transform(part) for part in (X_val, X_test))

for encoded in (X_train_enc, X_val_enc, X_test_enc):
    print(encoded.shape)

(42000, 55)
(9000, 55)
(9000, 55)


# 01. FT-Transformer

In [11]:
!pip -q install pytorch-tabular torchmetrics

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.8/163.8 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.5/44.5 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m891.4/891.4 kB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m112.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.6/316.6 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m815.2/815.2 kB[0m [31m47.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [12]:
import pandas as pd

def _attach_target(features, labels):
    """Return `features` with `labels` appended as an extra column."""
    return pd.concat([features, labels], axis="columns")


# One frame per split, features and label side by side.
train_df = _attach_target(X_train, y_train)
val_df   = _attach_target(X_val, y_val)
test_df  = _attach_target(X_test, y_test)

# Identify columns: object-dtype columns are treated as categorical, every
# other column as continuous. Column order follows X_train.
categorical_cols, continuous_cols = [], []
for col_name in X_train.columns:
    if X_train[col_name].dtype == "object":
        categorical_cols.append(col_name)
    else:
        continuous_cols.append(col_name)


In [15]:
from pytorch_tabular import TabularModel
from pytorch_tabular.config import DataConfig, TrainerConfig, OptimizerConfig
from pytorch_tabular.models.ft_transformer.config import FTTransformerConfig

# --- Model: FT-Transformer classifier -------------------------------------
model_config = FTTransformerConfig(
    task="classification",
    learning_rate=1e-3,
    num_heads=8,
    num_attn_blocks=4,
    attn_dropout=0.1,
    ff_dropout=0.1,
)

# --- Optimizer: library defaults --------------------------------------------
optimizer_config = OptimizerConfig()

# --- Training loop ------------------------------------------------------------
# Early stopping watches "valid_loss" with a patience of 10 epochs.
# NOTE(review): load_best=False presumably means the weights from the last
# epoch run are kept rather than the best checkpoint -- confirm this is intended.
trainer_config = TrainerConfig(
    max_epochs=50,
    accelerator="auto",
    devices=1,
    early_stopping="valid_loss",
    early_stopping_patience=10,
    batch_size=1024,
    load_best=False
)

# --- Data: which columns are the label, continuous and categorical ------------
data_config = DataConfig(
    target=[TARGET],
    continuous_cols=continuous_cols,
    categorical_cols=categorical_cols,
)

tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    trainer_config=trainer_config,
    optimizer_config=optimizer_config,
)

# Train on the training frame, using the validation frame for monitoring.
tabular_model.fit(train=train_df, validation=val_df)


INFO:pytorch_tabular.tabular_model:Experiment Tracking is turned off
INFO:lightning_fabric.utilities.seed:Seed set to 42
INFO:pytorch_tabular.tabular_model:Preparing the DataLoaders
INFO:pytorch_tabular.tabular_datamodule:Setting up the datamodule for classification task
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = d

Output()

INFO:pytorch_tabular.tabular_model:Training the model completed


<pytorch_lightning.trainer.trainer.Trainer at 0x7e3868d68cb0>

In [16]:
from sklearn.metrics import roc_auc_score, f1_score, classification_report, confusion_matrix
import numpy as np

# Score the validation and test frames with the fitted model.
pred_val  = tabular_model.predict(val_df)
pred_test = tabular_model.predict(test_df)

# Use the class-1 (claim) probability column from the prediction frame.
proba_col = f"{TARGET}_1_probability"
print("Using probability column:", proba_col)

val_proba  = pred_val[proba_col].values
test_proba = pred_test[proba_col].values

# Threshold-free ranking quality on both splits.
print("VAL ROC-AUC (FT):", roc_auc_score(val_df[TARGET], val_proba))
print("TEST ROC-AUC (FT):", roc_auc_score(test_df[TARGET], test_proba))

# Pick the decision threshold that maximises F1 on the validation split.
# Candidates are 0.05, 0.06, ..., 0.94. They are built from an integer range
# and divided by 100 because np.arange with a float step accumulates rounding
# error (e.g. 0.17000000000000004) and its length depends on that rounding.
thresholds = np.arange(5, 95) / 100
f1s = [f1_score(val_df[TARGET], (val_proba >= t).astype(int)) for t in thresholds]
best_idx = int(np.argmax(f1s))
best_t = thresholds[best_idx]
print("Best threshold:", best_t, "Best F1:", f1s[best_idx])

# Final test metrics, using the threshold chosen on validation only.
test_pred = (test_proba >= best_t).astype(int)

print("Confusion matrix (TEST):")
print(confusion_matrix(test_df[TARGET], test_pred))

print("\nClassification report (TEST):")
print(classification_report(test_df[TARGET], test_pred, digits=4))


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

Using probability column: had_claim_within_1_year_1_probability
VAL ROC-AUC (FT): 0.6212785278898179
TEST ROC-AUC (FT): 0.633281650484651
Best threshold: 0.17000000000000004 Best F1: 0.2867294368449103
Confusion matrix (TEST):
[[4208 3531]
 [ 442  819]]

Classification report (TEST):
              precision    recall  f1-score   support

           0     0.9049    0.5437    0.6793      7739
           1     0.1883    0.6495    0.2919      1261

    accuracy                         0.5586      9000
   macro avg     0.5466    0.5966    0.4856      9000
weighted avg     0.8045    0.5586    0.6250      9000



In [17]:
# Rescale the class-1 probability to an integer score (probability x 100,
# rounded to the nearest integer) and preview the first 20 values.
risk_score_ft = np.rint(test_proba * 100).astype(int)
risk_score_ft[:20]

array([15, 31, 17, 23, 28,  7, 19, 13, 21, 17,  4,  6, 33, 24, 12, 10, 14,
       33, 20, 30])