In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report , accuracy_score
import numpy as np

In [3]:
df =pd.read_csv("insurance_data.csv")

In [5]:
df.sample(5)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
7,41,77,175,15.0,No,Kolkata,Bank Manager,Medium
12,52,92,180,25.0,Yes,Chandigarh,Industrialist,High
1,32,75,172,12.0,Yes,Mumbai,Sales Manager,High
15,44,80,176,16.5,Yes,Surat,Operations Head,High
4,38,90,178,22.0,Yes,Pune,Consultant,High


In [6]:
#will apply feature engineering so create copy of data set 
df_feat=df.copy()


In [14]:
#FEATURE 1 :BMI
df_feat["bmi"]=df_feat["weight"]/(df_feat["height"]**2)

In [8]:
#FEATURE 2:AGE GROUP
def age_group(age):
    if age<25:
        return "young"
    elif age<45:
        return "adult"
    elif age<60:
        return "middle_aged"
    return "senior"

In [9]:
df_feat["age_group"]=df_feat["age"].apply(age_group)

In [15]:
 #FEATURE 3 LIFESTYLE
def life_style_risk(row):
    if row["smoker"] and row["bmi"]>30:
        return "high"
    elif row["smoker"] or row["bmi"]>27:
        return "medium"
    else:
        return "low"
#Haan, bilkul! Jab aapko ek se zyada columns (yaani pure row) ka data use
# karke koi naya feature nikalna hota hai, toh Pandas mein yahi standard tareeka
# hai.   

In [16]:
df_feat["lifestyle_risk"] = df_feat.apply(life_style_risk , axis=1)


In [18]:
df["city"]

0          Delhi
1         Mumbai
2      Bangalore
3      Hyderabad
4           Pune
5        Chennai
6         Jaipur
7        Kolkata
8      Ahmedabad
9          Noida
10       Gurgaon
11       Lucknow
12    Chandigarh
13        Indore
14        Bhopal
15         Surat
16        Nagpur
17         Patna
18    Coimbatore
19        Ranchi
Name: city, dtype: str

In [19]:
tier_1=["Delhi" ,"Mumbai"," Bangalore"," Hyderabad"," Pune"," Chennai"]
tier_2=["Ranchi","Kolkata","Surat"]


In [20]:
#FEATURE 4:CITY_TIER
def city_tier(city):
    if city in tier_1:
        return 1
    elif city in tier_2:
        return 2
    else:
        return 3

In [21]:
df_feat["city_tier"]=df_feat["city"].apply(city_tier)

In [22]:
df_feat.drop(columns=["age","weight","height","smoker","city"])


Unnamed: 0,income_lpa,occupation,insurance_premium_category,BMI,age_group,bmi,lifestyle_risk,city_tier
0,6.5,Software Engineer,Low,0.002204,adult,0.002204,medium,1
1,12.0,Sales Manager,High,0.002535,adult,0.002535,medium,1
2,18.5,Business Owner,Medium,0.002905,middle_aged,0.002905,medium,3
3,8.2,Teacher,Low,0.002353,adult,0.002353,medium,3
4,22.0,Consultant,High,0.002841,adult,0.002841,medium,3
5,10.5,Driver,High,0.00332,middle_aged,0.00332,medium,3
6,5.8,Graphic Designer,Low,0.002096,adult,0.002096,medium,3
7,15.0,Bank Manager,Medium,0.002514,adult,0.002514,medium,2
8,9.5,Marketing Executive,Medium,0.002416,adult,0.002416,medium,3
9,11.2,Data Analyst,Low,0.002462,adult,0.002462,medium,3


In [25]:
#select features and target
x=df_feat[["bmi","age_group","lifestyle_risk","city_tier","income_lpa","occupation"]]
y=df_feat["insurance_premium_category"]


In [26]:
x

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
0,0.002204,adult,medium,1,6.5,Software Engineer
1,0.002535,adult,medium,1,12.0,Sales Manager
2,0.002905,middle_aged,medium,3,18.5,Business Owner
3,0.002353,adult,medium,3,8.2,Teacher
4,0.002841,adult,medium,3,22.0,Consultant
5,0.00332,middle_aged,medium,3,10.5,Driver
6,0.002096,adult,medium,3,5.8,Graphic Designer
7,0.002514,adult,medium,2,15.0,Bank Manager
8,0.002416,adult,medium,3,9.5,Marketing Executive
9,0.002462,adult,medium,3,11.2,Data Analyst


In [27]:
y

0        Low
1       High
2     Medium
3        Low
4       High
5       High
6        Low
7     Medium
8     Medium
9        Low
10      High
11       Low
12      High
13    Medium
14       Low
15      High
16       Low
17      High
18    Medium
19       Low
Name: insurance_premium_category, dtype: str

In [28]:
#categorial_features and numeric features
categorical_features=["age_group","lifestyle_risk","city_tier","occupation"]
numeric_features=["bmi","income_lpa"]


In [38]:
#Create column transformer for OHE
preprocessor =ColumnTransformer(
    transformers=[
        ("cat",OneHotEncoder(handle_unknown='ignore'),categorical_features),
        ("num","passthrough",numeric_features)
    ]
)

In [39]:
#Create a pipleline with preprocessing and random forest classifier
pipeline=Pipeline(steps=[
    ("preprocessor" , preprocessor),
    ("classifier",RandomForestClassifier(random_state=42))
])

In [40]:
#split data and train model
x_train , x_test ,y_train , y_test=train_test_split(x,y,test_size=0.2,random_state=1)
pipeline.fit(x_train , y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [41]:
#split and train model

y_pred = pipeline.predict(x_test)
accuracy_score(y_test ,y_pred)

0.75

ValueError: Cannot take a larger sample than population when 'replace=False'