### ML Model Generation for Insurance Premium

In [40]:
%pip install -U scikit-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [262]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import pickle


In [238]:
# Load the raw insurance records from a CSV file in the working directory.
df = pd.read_csv('insurance_record.csv')

In [239]:
# Peek at five randomly chosen rows (no seed is set, so the rows differ on every run).
df.sample(5)

Unnamed: 0,age,weight,height,Annual_Income,smoker,city,Residency_status,occupation,insurance_premium_category
50,55,66.7,1.88,76381,True,Sengkang,PR,freelancer,Medium
31,39,51.1,1.83,35091,False,Pungol,Citizen,government_job,Low
2,39,56.8,1.64,251935,True,Outrampark,PR,retired,High
76,62,99.1,1.5,258871,True,Pasir Ris,PR,freelancer,High
98,27,101.1,1.82,213254,True,Bedok,Foreigner,business_owner,High


In [240]:
# List the distinct occupation labels present in the raw data.
df['occupation'].unique()

array(['retired', 'business_owner', 'student', 'freelancer',
       'government_job', 'private_job', 'unemployed'], dtype=object)

In [241]:
# Work on a copy so that feature engineering never modifies the raw frame `df`.
df_feat = df.copy()

In [242]:
# 1st Feature Engineering: BMI = weight / height^2
# NOTE(review): the sampled rows suggest weight is in kilograms and height in metres
# (e.g. 66.7 and 1.88) — confirm the units against the data source.
df_feat["bmi"] = df_feat["weight"] / (df_feat["height"]**2)

In [243]:
# Age Group function
def age_group(age):
    """Map a numeric age to a coarse age-group label.

    Parameters
    ----------
    age : int or float
        Age to classify. May be missing (None / NaN).

    Returns
    -------
    str
        "unknown" when the age is missing, "young" for age < 25,
        "adult" for 25 <= age < 45, "middle_aged" for 45 <= age < 60,
        and "senior" otherwise.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # age would fall through all branches and be labelled "senior".
    # Mirrors the missing-value handling in income_to_bucket.
    if pd.isna(age):
        return "unknown"
    if age < 25:
        return "young"
    if age < 45:
        return "adult"
    if age < 60:
        return "middle_aged"
    return "senior"


In [244]:
# 2nd Feature Engineering: AGE_GROUP
# Label every row's age element-wise with the age_group helper.
df_feat["age_group"] = df_feat["age"].map(age_group)

In [245]:
def lifestyle_risk(row):
    """Classify a record's lifestyle risk from its smoking flag and BMI.

    `row` is any mapping-like object exposing "smoker" and "bmi" keys
    (a DataFrame row when used with `DataFrame.apply(..., axis=1)`).

    Returns "high" for a smoker with BMI above 30, "medium" for any other
    smoker or for a non-smoker with BMI above 27, and "low" otherwise.
    """
    smokes = row["smoker"]
    bmi = row["bmi"]
    if smokes:
        # Smokers are never "low": obesity on top of smoking escalates to "high".
        return "high" if bmi > 30 else "medium"
    # Non-smokers top out at "medium", however high the BMI is.
    return "medium" if bmi > 27 else "low"

In [246]:
# 3rd Feature Engineering: LIFESTYLE_RISK
# axis=1 applies the function row by row, so each call receives one record and
# can read both its "smoker" and "bmi" values. The "bmi" column must already exist.
df_feat["lifestyle_risk"] = df_feat.apply(lifestyle_risk, axis=1)

In [247]:
def income_to_bucket(income):
    """Bucket an annual income into a coarse label.

    Returns "unknown" for a missing value, "low" below 50000, "medium" below
    180000, "high" below 300000 and "very_high" from 300000 upwards.
    """
    if pd.isna(income):
        return "unknown"
    # Exclusive upper bounds in ascending order; the first bound the income
    # falls under decides the bucket.
    upper_bounds = ((50000, "low"), (180000, "medium"), (300000, "high"))
    for bound, label in upper_bounds:
        if income < bound:
            return label
    return "very_high"

In [248]:
# 4th Feature Engineering: INCOME_BUCKET
# Bucket each Annual_Income value element-wise with the income_to_bucket helper.
df_feat["income_bucket"] = df_feat["Annual_Income"].map(income_to_bucket)

In [249]:
# Spot-check five random rows to confirm the four engineered columns were added.
df_feat.sample(5)

Unnamed: 0,age,weight,height,Annual_Income,smoker,city,Residency_status,occupation,insurance_premium_category,bmi,age_group,lifestyle_risk,income_bucket
23,35,70.3,1.78,85569,True,Pasir Ris,PR,freelancer,Medium,22.187855,adult,medium,medium
35,59,59.3,1.69,140455,True,Holland Village,Citizen,student,Medium,20.762578,middle_aged,medium,medium
58,72,95.9,1.79,233886,True,Outrampark,Foreigner,freelancer,High,29.930402,senior,medium,high
57,72,76.8,1.69,141330,True,Bugis,PR,student,Medium,26.889815,senior,medium,medium
55,47,75.7,1.73,20174,False,Boonlay,Citizen,unemployed,Low,25.293194,middle_aged,low,low


In [250]:
# Drop the raw columns that the engineered features (bmi, age_group,
# lifestyle_risk, income_bucket) were derived from.
# errors="ignore" keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because the columns are already gone from df_feat.
df_feat = df_feat.drop(
    columns=["age", "weight", "height", "smoker", "Annual_Income"],
    errors="ignore",
)

# Preview the engineered features + target
df_feat.sample(5)

Unnamed: 0,city,Residency_status,occupation,insurance_premium_category,bmi,age_group,lifestyle_risk,income_bucket
47,Holland Village,Citizen,government_job,Low,33.286625,middle_aged,medium,medium
91,Harbour Front,Foreigner,retired,High,38.675103,adult,high,high
90,Serangoon,Citizen,government_job,Low,21.09375,middle_aged,low,low
75,Woodland,PR,business_owner,High,20.577355,middle_aged,medium,very_high
5,Payalebar,Citizen,freelancer,Medium,22.826245,middle_aged,low,medium


In [251]:
# Feature matrix: the engineered columns plus the remaining categorical columns.
X = df_feat[
    [
        "bmi",
        "age_group",
        "lifestyle_risk",
        "city",
        "Residency_status",
        "occupation",
        "income_bucket",
    ]
]
# Target: the premium category label to predict.
y = df_feat["insurance_premium_category"]

In [252]:
# Display the feature matrix (pandas truncates the middle rows in the rendered output).
X

Unnamed: 0,bmi,age_group,lifestyle_risk,city,Residency_status,occupation,income_bucket
0,49.227482,senior,high,Sengkang,PR,retired,high
1,30.189017,adult,high,Tampines,Foreigner,business_owner,high
2,21.118382,adult,medium,Outrampark,PR,retired,high
3,45.535900,young,high,Serangoon,Citizen,student,medium
4,24.296875,senior,medium,Outrampark,Foreigner,retired,high
...,...,...,...,...,...,...,...
105,32.800735,middle_aged,high,Payalebar,Citizen,freelancer,medium
106,21.860828,adult,low,Boonlay,Citizen,retired,low
107,27.767889,senior,medium,Redhill,PR,student,medium
108,31.176471,middle_aged,high,Boonlay,Citizen,student,medium


In [253]:
# Display the target labels (pandas truncates the middle rows in the rendered output).
y

0        High
1        High
2        High
3      Medium
4        High
        ...  
105    Medium
106       Low
107    Medium
108    Medium
109    Medium
Name: insurance_premium_category, Length: 110, dtype: object

In [254]:
# Column groups consumed by the preprocessing step.
# NOTE(review): X also contains "city" and "income_bucket", which appear in neither
# list. The fitted ColumnTransformer shows remainder='drop', so those two columns
# never reach the classifier — confirm this is intentional.
cat_features = [
    "age_group",
    "lifestyle_risk",
    "Residency_status",
    "occupation",
]
# Only BMI is used as a numeric feature; raw Annual_Income was dropped from df_feat.
num_features = ["bmi"]


In [255]:
# Create column transformer for OneHotEncoding (OHE)
# - "cat": one-hot encodes the categorical columns; handle_unknown="ignore" makes a
#   category unseen during fit encode as all zeros instead of raising at predict time.
# - "num": numeric columns are passed through unchanged.
# Columns of X that are listed in neither group are dropped (remainder defaults to 'drop').
preprocessor = ColumnTransformer(
    transformers = [
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features),
        ("num", "passthrough", num_features)
    ]
)

In [256]:
# Create a pipeline with preprocessing and random forest classifier.
# Bundling both steps means the fitted object accepts the same column layout as X
# at predict time; random_state=42 fixes the forest's randomness for reproducibility.
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])

In [257]:
# Spot-check the engineered frame once more before splitting and training.
df_feat.sample(5)

Unnamed: 0,city,Residency_status,occupation,insurance_premium_category,bmi,age_group,lifestyle_risk,income_bucket
28,Serangoon,Citizen,retired,Low,31.584532,adult,medium,low
54,Pungol,Citizen,student,Medium,21.025423,senior,medium,medium
8,Holland Village,PR,freelancer,Medium,23.233456,senior,medium,medium
21,Bugis,Foreigner,retired,High,27.380671,senior,medium,high
98,Bedok,Foreigner,business_owner,High,30.521676,adult,high,high


In [258]:
# Split data and train model.
# 20% of the rows are held out for testing; random_state=1 makes the split reproducible.
# NOTE(review): the split is not stratified on y — with only 110 rows the class mix
# may differ between train and test; consider passing stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# Fit preprocessing and classifier together on the training portion only.
pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [259]:
# Predict and evaluate by checking the accuracy score.
# The score is computed on the held-out test split only (20% of 110 rows), so it is
# a coarse estimate: a single misclassified row shifts it by several points.
y_pred = pipeline.predict(X_test)
accuracy_score(y_test, y_pred)

0.9090909090909091

In [260]:
# Inspect a few held-out rows, e.g. to use as example inputs for manual predictions.
X_test.sample(5)

Unnamed: 0,bmi,age_group,lifestyle_risk,city,Residency_status,occupation,income_bucket
81,31.866055,adult,high,Pasir Ris,PR,freelancer,high
69,21.942857,middle_aged,medium,Pungol,PR,freelancer,high
89,30.458274,young,high,Tampines,PR,retired,high
88,31.443698,middle_aged,high,Serangoon,PR,business_owner,high
17,31.176471,senior,high,Boonlay,PR,freelancer,medium


In [263]:
# Save the trained pipeline using pickle.
# The whole pipeline (preprocessing + classifier) is written as one object to a file
# in the working directory.
# NOTE(review): pickled scikit-learn objects are generally only safe to load with the
# same library version they were saved with — record the version used alongside the file.
pickle_model_path = "insurancemodel.pkl"
with open(pickle_model_path, "wb") as f:
    pickle.dump(pipeline,f)