In [26]:
# core data libraries
import pandas as pd
import numpy as np

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score

import pickle

# warnings
# NOTE(review): `Warning` is the base class of every warning category, so this
# filter silences all warnings (deprecations, library diagnostics, ...) for the
# rest of the session. Consider narrowing `category` to the specific warnings
# that are known to be noise.
import warnings
warnings.filterwarnings(action="ignore", category=Warning)

In [2]:
# import data
df = pd.read_csv("data/insurance.csv")

In [3]:
df.head()

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
0,67,119.8,1.56,2.92,False,Jaipur,retired,High
1,36,101.1,1.83,34.28,False,Chennai,freelancer,Low
2,39,56.8,1.64,36.64,False,Indore,freelancer,Low
3,22,109.4,1.55,3.34,True,Mumbai,student,Medium
4,69,62.2,1.6,3.94,True,Indore,retired,High


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   age                         100 non-null    int64  
 1   weight                      100 non-null    float64
 2   height                      100 non-null    float64
 3   income_lpa                  100 non-null    float64
 4   smoker                      100 non-null    bool   
 5   city                        100 non-null    object 
 6   occupation                  100 non-null    object 
 7   insurance_premium_category  100 non-null    object 
dtypes: bool(1), float64(3), int64(1), object(3)
memory usage: 5.7+ KB


In [5]:
df.isnull().sum()

age                           0
weight                        0
height                        0
income_lpa                    0
smoker                        0
city                          0
occupation                    0
insurance_premium_category    0
dtype: int64

In [6]:
df['occupation'].unique()

array(['retired', 'freelancer', 'student', 'government_job',
       'business_owner', 'unemployed', 'private_job'], dtype=object)

In [7]:
df['occupation'].value_counts(normalize=True)

occupation
retired           0.26
unemployed        0.15
government_job    0.14
student           0.12
freelancer        0.11
business_owner    0.11
private_job       0.11
Name: proportion, dtype: float64

### Feature Engineering

In [8]:
df_feat = df.copy(deep=True)

In [9]:
## Feature 1: BMI
df_feat["bmi"] = df_feat["weight"] / (df_feat["height"] ** 2)

In [10]:
## Feature 2: Age Group
def age_group(age):
    """Bucket an age into a coarse life-stage label.

    Parameters
    ----------
    age : int or float
        Age in years.

    Returns
    -------
    str
        "young" for age <= 25, "adult" for 25 < age <= 45,
        "middle_aged" for 45 < age <= 65, and "senior" for anything else
        (which includes NaN, because every comparison with NaN is False).
    """
    # Only upper bounds are tested: each branch is reached only after the
    # previous bound failed, so the buckets are contiguous. With explicit
    # lower bounds (26, 46) a fractional age such as 25.5 or 45.5 matched no
    # range and was mislabelled "senior".
    if age <= 25:
        return "young"
    elif age <= 45:
        return "adult"
    elif age <= 65:
        return "middle_aged"
    else:
        return "senior"
    
df_feat["age_group"] = df_feat["age"].apply(age_group)

In [11]:
# Feature 3: Lifestyle Risk
def lifestyle_risk(row):
    """Classify a record's lifestyle risk from its smoking flag and BMI.

    Parameters
    ----------
    row : mapping-like
        Must provide ``row["smoker"]`` (truthy for smokers) and ``row["bmi"]``.

    Returns
    -------
    str
        "high" for a smoker with BMI above 30, "medium" for any other smoker
        or a non-smoker with BMI above 27, otherwise "low".
    """
    smokes = row["smoker"]
    bmi = row["bmi"]

    # A smoker is never below "medium"; BMI only decides whether it is "high".
    if smokes:
        return "high" if bmi > 30 else "medium"

    # For non-smokers BMI alone separates "medium" from "low".
    return "medium" if bmi > 27 else "low"

In [12]:
df_feat["lifestyle_risk"] = df_feat.apply(lifestyle_risk, axis=1)

In [13]:
# City names that city_tier() maps to tier 1. Membership is an exact,
# case-sensitive string match.
tier_1_cities = ["Mumbai", "Delhi", "Bangalore", "Chennai", "Kolkata", "Hyderabad", "Pune"]

# City names that city_tier() maps to tier 2; any city in neither list is
# treated as tier 3.
tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi", "Visakhapatnam", "Coimbatore",
    "Bhopal", "Nagpur", "Vadodara", "Surat", "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi",
    "Agra", "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana", "Nashik",
    "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum", "Salem", "Vijayawada", "Tiruchirappalli",
    "Bhavnagar", "Gwalior", "Dhanbad", "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal",
    "Kolhapur", "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri"
]

In [14]:
# Feature 4: City Tier
def city_tier(city):
    """Return the tier number for a city name.

    Parameters
    ----------
    city : str
        City name, compared verbatim against the module-level
        ``tier_1_cities`` and ``tier_2_cities`` lists.

    Returns
    -------
    int
        1 if the city is in ``tier_1_cities``, 2 if it is in
        ``tier_2_cities``, and 3 for every other value.
    """
    if city in tier_1_cities:
        return 1
    if city in tier_2_cities:
        return 2
    # Anything not explicitly listed falls into the lowest tier.
    return 3
     

df_feat["city_tier"] = df_feat["city"].apply(city_tier)

In [15]:
# Preview the engineered features next to the target. Selecting the columns
# explicitly already excludes the raw inputs, so a separate drop() is redundant.
df_feat[
    ['income_lpa', 'occupation', 'bmi', 'age_group', 'lifestyle_risk', 'city_tier', 'insurance_premium_category']
].sample(5)

Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle_risk,city_tier,insurance_premium_category
56,2.86,student,42.414152,young,high,1,Medium
55,24.93,unemployed,25.293194,middle_aged,low,1,Low
85,34.66,private_job,14.857209,adult,low,1,Low
35,43.28,private_job,20.762578,middle_aged,medium,2,Medium
45,18.39,unemployed,33.466667,middle_aged,medium,2,High


### Model Building

In [16]:
# Select features and target variable
X = df_feat[["bmi", "age_group", "lifestyle_risk", "city_tier", "income_lpa", "occupation"]]
y = df_feat['insurance_premium_category']

In [17]:
X.head()

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
0,49.227482,senior,medium,2,2.92,retired
1,30.189017,adult,medium,1,34.28,freelancer
2,21.118382,adult,low,2,36.64,freelancer
3,45.5359,young,high,1,3.34,student
4,24.296875,senior,medium,2,3.94,retired


In [18]:
y.head()

0      High
1       Low
2       Low
3    Medium
4      High
Name: insurance_premium_category, dtype: object

In [19]:
# select categorical and numerical features
categorical_features = ["age_group", "lifestyle_risk", "city_tier", "occupation"]
numerical_features = ["bmi", "income_lpa"]

In [20]:
# Create column transformer for preprocessing:
#   - categorical columns are one-hot encoded into dense columns
#   - numerical columns are passed through unchanged
# handle_unknown="ignore" encodes a category that was not present when the
# encoder was fitted as all zeros instead of raising at predict time. With a
# small training split a rare occupation/age group can easily be missing, and
# the exported pipeline will also see unseen values once it is used elsewhere.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numerical_features)
        ], remainder='passthrough', verbose_feature_names_out=False
)
# Emit DataFrames (with feature names) rather than bare numpy arrays.
preprocessor.set_output(transform="pandas")

In [21]:
# create a pipeline with preprocessing and model
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])

In [22]:
# split the data into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
# train the model
pipeline.fit(X_train, y_train)

In [24]:
# predict and evaluate the model
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

        High       0.86      0.50      0.63        12
         Low       0.60      1.00      0.75         3
      Medium       0.25      0.40      0.31         5

    accuracy                           0.55        20
   macro avg       0.57      0.63      0.56        20
weighted avg       0.67      0.55      0.57        20

Accuracy: 0.55


In [25]:
X_test.head()

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
83,24.338934,middle_aged,medium,3,2.16,retired
53,29.598247,adult,medium,1,30.0,government_job
70,36.694215,senior,medium,2,0.57,retired
45,33.466667,middle_aged,medium,2,18.39,unemployed
44,30.078125,middle_aged,high,2,50.0,private_job


In [27]:
# export the model using pickle
# The whole fitted pipeline object (preprocessor + classifier steps) is
# serialized, so a consumer can call .predict() on a frame with the same
# feature columns without re-creating the encoder.
# NOTE(review): the file is written relative to the current working directory;
# unpickling presumably needs a compatible scikit-learn version - confirm.
pickle_model_path = "model.pkl"
with open(pickle_model_path, "wb") as f:
    pickle.dump(pipeline, f)