In [3]:
import pickle

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [4]:
# Load the raw insurance records; the path is relative to the working directory.
df = pd.read_csv("insurance_data.csv")

In [5]:
# Peek at five random rows (no seed is passed, so the selection varies per run).
df.sample(5)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
16,66,70.2,1.59,0.61,False,Pune,retired,Medium
48,36,94.8,1.66,32.69,True,Chennai,unemployed,Medium
86,35,66.0,1.89,37.38,False,Hyderabad,freelancer,Low
46,42,83.0,1.57,25.57,True,Kolkata,unemployed,High
22,57,106.4,1.83,30.0,False,Chandigarh,government_job,Low


In [6]:
# Show column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   age                         100 non-null    int64  
 1   weight                      100 non-null    float64
 2   height                      100 non-null    float64
 3   income_lpa                  100 non-null    float64
 4   smoker                      100 non-null    bool   
 5   city                        100 non-null    object 
 6   occupation                  100 non-null    object 
 7   insurance_premium_category  100 non-null    object 
dtypes: bool(1), float64(3), int64(1), object(3)
memory usage: 5.7+ KB


In [8]:
# List the distinct occupation labels present in the data.
df['occupation'].unique()

array(['retired', 'freelancer', 'student', 'government_job',
       'business_owner', 'unemployed', 'private_job'], dtype=object)

In [41]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,age,weight,height,income_lpa
count,100.0,100.0,100.0,100.0
mean,47.18,83.894,1.7132,18.4006
std,16.649312,21.020278,0.110205,16.067465
min,18.0,51.1,1.5,0.53
25%,34.75,63.65,1.61,2.8975
50%,47.0,82.3,1.73,14.122583
75%,61.0,101.3,1.81,30.1625
max,75.0,119.8,1.9,50.0


In [42]:
# Work on a copy so the engineered columns are added without modifying df.
df_feat = df.copy()

In [43]:
# feature engineering

# feat 1: BMI = weight / height**2
# Both columns are read from df_feat so the result never depends on the row
# alignment of a second frame. Units are presumably kg and metres - TODO confirm.
df_feat['bmi'] = df_feat['weight']/(df_feat['height']**2)

In [44]:
#feat 2: Age group
def age_group(age):
  """Bucket an age into "young", "adult", "middle_age" or "senior".

  Upper bounds are exclusive: below 25 is young, below 45 is adult,
  below 60 is middle_age, and anything else is senior.
  """
  for upper_bound, label in ((25, "young"), (45, "adult"), (60, "middle_age")):
    if age < upper_bound:
      return label
  return "senior"

# Label every row with the bucket that age_group returns for its age.
df_feat['age_group'] = df_feat['age'].apply(age_group)

In [45]:
# feat 3: Lifestyle risk
def lifestyle_risk(row):
  """Classify a row as "high", "medium" or "low" lifestyle risk.

  Only smokers can be rated above "low": a smoker with bmi above 30 is
  "high", a smoker with bmi above 27 is "medium". The row is indexed by the
  keys 'smoker' and 'bmi'.
  """
  # Non-smokers are always "low"; bmi is not looked at for them.
  if not row['smoker']:
    return "low"
  bmi = row["bmi"]
  if bmi > 30:
    return "high"
  if bmi > 27:
    return "medium"
  return "low"
  
# axis=1 hands each row to lifestyle_risk, which reads that row's smoker and bmi values.
df_feat["lifestyle_risk"] = df_feat.apply(lifestyle_risk,axis=1)

In [46]:

# feat 4: City Tier
# Cities named in these lists map to tier 1 or tier 2; any other city is tier 3.
tier_1_cities = [
    "Mumbai", "Delhi", "Bangalore", "Chennai",
    "Kolkata", "Hyderabad", "Pune",
]
tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi",
    "Visakhapatnam", "Coimbatore", "Bhopal", "Nagpur", "Vadodara", "Surat",
    "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi", "Agra",
    "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana",
    "Nashik", "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum",
    "Salem", "Vijayawada", "Tiruchirappalli", "Bhavnagar", "Gwalior", "Dhanbad",
    "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal", "Kolhapur",
    "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri",
]

def city_tier(city):
  """Return 1 or 2 for a city found in the matching tier list, otherwise 3.

  The lookup is an exact, case-sensitive match against the list entries.
  """
  if city in tier_1_cities:
    return 1
  return 2 if city in tier_2_cities else 3

# Replace each city name with the tier number (1, 2 or 3) returned by city_tier.
df_feat['city_tier'] = df_feat['city'].apply(city_tier)

In [47]:
# Check the engineered columns next to the raw inputs they were derived from.
df_feat.head()

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,bmi,age_group,lifestyle_risk,city_tier
0,67,119.8,1.56,2.92,False,Jaipur,retired,High,49.227482,senior,low,2
1,36,101.1,1.83,34.28,False,Chennai,freelancer,Low,30.189017,adult,low,1
2,39,56.8,1.64,36.64,False,Indore,freelancer,Low,21.118382,adult,low,2
3,22,109.4,1.55,3.34,True,Mumbai,student,Medium,45.5359,young,high,1
4,69,62.2,1.6,3.94,True,Indore,retired,High,24.296875,senior,low,2


In [48]:
# Drop the raw inputs that the engineered features replace. Re-binding the
# result (instead of inplace=True) and ignoring columns that are already gone
# lets this cell be re-run without raising a KeyError.
df_feat = df_feat.drop(
  columns=["age", "weight", "height", "smoker", "city"],
  errors="ignore",
)

In [49]:
# Confirm which columns remain after the raw inputs were dropped.
df_feat.head()

Unnamed: 0,income_lpa,occupation,insurance_premium_category,bmi,age_group,lifestyle_risk,city_tier
0,2.92,retired,High,49.227482,senior,low,2
1,34.28,freelancer,Low,30.189017,adult,low,1
2,36.64,freelancer,Low,21.118382,adult,low,2
3,3.34,student,Medium,45.5359,young,high,1
4,3.94,retired,High,24.296875,senior,low,2


In [50]:
# Separate the label column (y) from the predictor columns (X)
y = df_feat['insurance_premium_category']
X = df_feat.drop(columns=['insurance_premium_category'])

In [51]:
# Column roles for preprocessing: categorical columns vs. numeric columns.
# city_tier holds integer tier codes but is listed with the categorical columns.
categorical_features = [
  "age_group",
  "lifestyle_risk",
  "occupation",
  "city_tier",
]
numeric_features = [
  "bmi",
  "income_lpa",
]

In [52]:
# Column transformer: one-hot encode the categorical columns and pass the
# numeric columns through unchanged.
# handle_unknown="ignore" encodes a category that was absent when fitting as an
# all-zero block instead of raising an error at transform/predict time.
preprocessor = ColumnTransformer(
  transformers=[
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ("num", "passthrough", numeric_features),
  ]
)

In [53]:

# Chain the preprocessing step and a seeded random forest into one estimator
pipeline = Pipeline([
  ("preprocessor", preprocessor),
  ("classifier", RandomForestClassifier(random_state=42)),
])

In [54]:
# NOTE(review): this evaluates the imported Pipeline class, not the lowercase
# `pipeline` object, so the cell only echoes the class - likely a leftover.
Pipeline

sklearn.pipeline.Pipeline

In [55]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.2,
  random_state=1,
)

In [56]:
# Inspect the training features.
X_train

Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle_risk,city_tier
2,36.64,freelancer,21.118382,adult,low,2
73,2.22,retired,32.121628,senior,high,1
97,44.86,freelancer,18.765432,middle_age,low,1
62,35.67,business_owner,21.738481,adult,low,1
19,2.79,student,43.437500,young,high,2
...,...,...,...,...,...,...
75,45.07,unemployed,20.577355,middle_age,low,1
9,43.07,business_owner,24.858833,middle_age,low,1
72,3.08,retired,35.499527,senior,low,2
12,17.58,freelancer,30.046711,adult,high,2


In [57]:
# Inspect the training labels.
y_train

2        Low
73      High
97       Low
62       Low
19      High
       ...  
75       Low
9        Low
72      High
12      High
37    Medium
Name: insurance_premium_category, Length: 80, dtype: object

In [58]:
# Fit the preprocessing step and the classifier on the training split only.
pipeline.fit(X_train,y_train)

In [59]:
# Evaluate on the held-out split: predict labels, then show the accuracy

y_pred = pipeline.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.75

In [61]:
# Spot-check five random held-out rows (unseeded, so the selection varies per run).
X_test.sample(5)

Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle_risk,city_tier
52,2.96,student,47.34472,young,low,2
69,6.034487,government_job,21.942857,middle_age,low,2
80,50.0,unemployed,34.350461,middle_age,low,2
44,50.0,private_job,30.078125,middle_age,high,2
32,50.0,private_job,31.495845,middle_age,low,2


In [62]:
# export the model

# Persist the whole pipeline (preprocessing + classifier) so whoever loads it
# can call predict on a frame with the same feature columns as X.
# NOTE(review): a pickle should be loaded with a compatible scikit-learn
# version - see the scikit-learn model persistence guide.
pickle_model_path = "model.pkl"
with open(pickle_model_path, "wb") as f:
  pickle.dump(pipeline, f)