In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import numpy as np

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [8]:
data = pd.read_csv('insurance.csv')

In [9]:
data

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
0,67,119.8,1.56,2.92000,False,Jaipur,retired,High
1,36,101.1,1.83,34.28000,False,Chennai,freelancer,Low
2,39,56.8,1.64,36.64000,False,Indore,freelancer,Low
3,22,109.4,1.55,3.34000,True,Mumbai,student,Medium
4,69,62.2,1.60,3.94000,True,Indore,retired,High
...,...,...,...,...,...,...,...,...
95,36,52.8,1.57,19.64000,False,Indore,business_owner,Low
96,26,113.8,1.54,34.01000,False,Delhi,private_job,Low
97,52,60.8,1.80,44.86000,False,Hyderabad,freelancer,Low
98,27,101.1,1.82,28.30000,False,Kolkata,business_owner,Low


In [10]:
data = data.drop(columns=['city'], axis=1)

In [11]:
data['occupation'].unique()

array(['retired', 'freelancer', 'student', 'government_job',
       'business_owner', 'unemployed', 'private_job'], dtype=object)

In [None]:
# need to construct some column
# bmi
data['bmi'] = data['weight']/(data['height']**2)

In [13]:
# Lifestyle
def lifestyle_risk(row):
    if row["smoker"] and row['bmi'] > 30:
        return "high"
    elif row['smoker'] or row['bmi']> 27:
        return "medium"
    else:
        return "low"

In [14]:
data["lifestyle_risk"] = data.apply(lifestyle_risk, axis=1)

In [17]:
data.head()

Unnamed: 0,age,income_lpa,occupation,insurance_premium_category,bmi,lifestyle_risk
0,67,2.92,retired,High,49.227482,medium
1,36,34.28,freelancer,Low,30.189017,medium
2,39,36.64,freelancer,Low,21.118382,low
3,22,3.34,student,Medium,45.5359,high
4,69,3.94,retired,High,24.296875,medium


In [16]:
data = data.drop(columns=['weight', 'height','smoker'],axis=1)

In [18]:
x = data.drop(columns=['insurance_premium_category'], axis=1)
y = data['insurance_premium_category']

In [21]:
x.head()

Unnamed: 0,age,income_lpa,occupation,bmi,lifestyle_risk
0,67,2.92,retired,49.227482,medium
1,36,34.28,freelancer,30.189017,medium
2,39,36.64,freelancer,21.118382,low
3,22,3.34,student,45.5359,high
4,69,3.94,retired,24.296875,medium


In [22]:
# now we need to transform two columns.
cat_features = ['occupation', 'lifestyle_risk']
num_features = ['age', 'income_lpa', 'bmi']

In [23]:
# column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(), cat_features),
        ("num", "passthrough", num_features)
    ]
)

In [25]:
# creating pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])

In [27]:
# train and test split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=.2, random_state=1)
pipeline.fit(x_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [30]:
# predict and evaluate
y_pred = pipeline.predict(x_test)
accuracy_score(y_test, y_pred)

0.6

In [31]:
# now saving the model
import pickle

model_path = 'model.pkl'
with open(model_path, 'wb') as f:
    pickle.dump(pipeline,f)