In [2]:
import pandas as pd 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

In [19]:
df=pd.read_csv('insurance_data_updated.csv')

In [20]:
df.sample(5)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
488,22,62.4,1.73,47.13,True,Bangalore,Teacher,High
320,33,88.8,1.75,23.48,False,Delhi,Designer,Medium
82,45,62.8,1.68,43.27,True,Bhopal,Engineer,High
425,52,112.0,1.63,35.02,False,Mumbai,Lawyer,High
100,41,107.5,1.81,20.63,False,Patna,Doctor,Medium


In [48]:
df['occupation'].unique()

array(['Doctor', 'Teacher', 'Designer', 'Manager', 'Scientist',
       'Engineer', 'Lawyer'], dtype=object)

In [21]:
df_feat=df.copy()


In [22]:
df_feat['bmi']=df_feat['weight']/(df_feat['height']**2)

In [23]:
def age_group(age):
    if age<25:
        return 'young'
    elif age<45:
        return 'adult'
    elif age<60:
        return 'middle-aged'
    return 'senior'

In [24]:
df_feat['age_group']=df_feat['age'].apply(age_group)

In [25]:
# feature 3 lifestyle risk 
def lifestyle_risk(row):
    if row['smoker'] and row['bmi']>30:
        return 'high'
    elif row['smoker'] and row['bmi']>27:
        return 'medium'
    else :
        return 'low'

In [26]:
df_feat['lifestyle_risk']=df_feat.apply(lifestyle_risk,axis=1)

In [27]:

tier_1_cities=["Delhi", "Mumbai", "Bangalore", "Hyderabad", "Chennai", "Pune", "Kolkata"]
tier_2_cities=["Jaipur", "Ahmedabad", "Lucknow", "Indore", "Surat"]

In [None]:
# feature selection 4
def city_tier(city):
    if city in tier_1_cities:
        return 1
    elif city in tier_2_cities:
        return 2
    else:
        return 3
    

In [29]:
df_feat['city_tier']=df_feat['city'].apply(city_tier)

In [31]:
df_feat.drop(columns=['age','weight','height','smoker','city'])[['income_lpa','occupation','bmi','age_group','lifestyle_risk','city_tier','insurance_premium_category']]

Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle_risk,city_tier,insurance_premium_category
0,48.82,Doctor,41.339529,adult,high,1,High
1,24.02,Teacher,39.262792,middle-aged,high,1,High
2,38.09,Designer,25.832200,young,low,1,High
3,19.97,Teacher,25.012245,middle-aged,low,3,Medium
4,18.56,Manager,17.160917,senior,low,1,High
...,...,...,...,...,...,...,...
495,18.04,Scientist,30.983892,adult,low,3,Medium
496,26.41,Teacher,31.244993,young,high,1,High
497,7.40,Manager,33.024584,senior,high,2,High
498,15.66,Designer,30.354246,adult,high,1,High


In [34]:
# select features and target
x=df_feat[['bmi','age_group','lifestyle_risk','city_tier','income_lpa','occupation']]
y=df_feat['insurance_premium_category']

In [35]:
x

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
0,41.339529,adult,high,1,48.82,Doctor
1,39.262792,middle-aged,high,1,24.02,Teacher
2,25.832200,young,low,1,38.09,Designer
3,25.012245,middle-aged,low,3,19.97,Teacher
4,17.160917,senior,low,1,18.56,Manager
...,...,...,...,...,...,...
495,30.983892,adult,low,3,18.04,Scientist
496,31.244993,young,high,1,26.41,Teacher
497,33.024584,senior,high,2,7.40,Manager
498,30.354246,adult,high,1,15.66,Designer


In [36]:
y

0        High
1        High
2        High
3      Medium
4        High
        ...  
495    Medium
496      High
497      High
498      High
499      High
Name: insurance_premium_category, Length: 500, dtype: object

In [37]:
# define categorical and numerical features
categorical_features=['age_group','lifestyle_risk','occupation','city_tier']
numerical_features=['bmi','income_lpa']

In [38]:
# create coloumn transformer for OHE
preprocessor=ColumnTransformer(
    transformers=[
        ('cat',OneHotEncoder(),categorical_features),
        ('num','passthrough',numerical_features)
    ]
)

In [41]:
# create a pipeline with preprocessing and random forest classifier
pipeline=Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('classifier',RandomForestClassifier(random_state=42))
])

In [44]:
# split data and train model 
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=1)
pipeline.fit(x_train,y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [45]:
y_pred=pipeline.predict(x_test)
accuracy_score(y_test,y_pred)

0.86

In [46]:
x_test.sample(5)

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
189,32.981947,adult,high,2,11.37,Scientist
485,31.126501,adult,low,1,15.2,Lawyer
495,30.983892,adult,low,3,18.04,Scientist
192,42.232951,middle-aged,high,1,9.83,Manager
306,39.336645,middle-aged,high,2,25.26,Scientist


In [47]:
import pickle

# save the trained pipeline using pickle 
pickle_model_path='model.pkl'
with open(pickle_model_path,'wb') as f:
    pickle.dump(pipeline,f)