# Creating the model 

# Import the libraries

In [199]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import numpy as np

In [200]:
data = pd.read_csv("insurance.csv")
data.head(5)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
0,67,119.8,1.56,2.92,False,Jaipur,retired,High
1,36,101.1,1.83,34.28,False,Chennai,freelancer,Low
2,39,56.8,1.64,36.64,False,Indore,freelancer,Low
3,22,109.4,1.55,3.34,True,Mumbai,student,Medium
4,69,62.2,1.6,3.94,True,Indore,retired,High


In [201]:
data_cp = data.copy()

In [202]:
data_cp['bmi'] = data_cp['height']/(data_cp['weight']**2)

In [203]:
def age_group(age):
    if age < 18:
        return "Kids"
    elif age > 18 and age < 30:
        return "Young"
    elif age > 30 and age < 60:
        return "Middle Age"
    else:
        return "Senior"

In [204]:
data_cp['Age_Group'] = data_cp['age'].apply(age_group)

In [205]:
def lifestyle(row):
    if row['smoker'] and row['bmi'] > 30:
        return "High"
    elif row['smoker'] and row['bmi'] < 30:
        return "Medium"
    else:
        return "Low"

In [206]:
data_cp['Lifestyle'] = data_cp.apply(lifestyle, axis=1)

In [207]:
data_cp.head(4)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,bmi,Age_Group,Lifestyle
0,67,119.8,1.56,2.92,False,Jaipur,retired,High,0.000109,Senior,Low
1,36,101.1,1.83,34.28,False,Chennai,freelancer,Low,0.000179,Middle Age,Low
2,39,56.8,1.64,36.64,False,Indore,freelancer,Low,0.000508,Middle Age,Low
3,22,109.4,1.55,3.34,True,Mumbai,student,Medium,0.00013,Young,Medium


In [208]:
tier_1_cities = ["Mumbai", "Delhi", "Bangalore", "Chennai", "Kolkata", "Hyderabad", "Pune"]
tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi", "Visakhapatnam", "Coimbatore",
    "Bhopal", "Nagpur", "Vadodara", "Surat", "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi",
    "Agra", "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana", "Nashik",
    "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum", "Salem", "Vijayawada", "Tiruchirappalli",
    "Bhavnagar", "Gwalior", "Dhanbad", "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal",
    "Kolhapur", "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri"
]
     

In [209]:
# Feature 4: City Tier
def city_tier(city):
    if city in tier_1_cities:
        return 1
    elif city in tier_2_cities:
        return 2
    else:
        return 3
     

In [210]:
data_cp["city_tier"] = data_cp["city"].apply(city_tier)

In [211]:
data_cp.head(2)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,bmi,Age_Group,Lifestyle,city_tier
0,67,119.8,1.56,2.92,False,Jaipur,retired,High,0.000109,Senior,Low,2
1,36,101.1,1.83,34.28,False,Chennai,freelancer,Low,0.000179,Middle Age,Low,1


In [212]:
dataset = data_cp.drop(['age','weight','height'],axis=1)
dataset.head(2)

Unnamed: 0,income_lpa,smoker,city,occupation,insurance_premium_category,bmi,Age_Group,Lifestyle,city_tier
0,2.92,False,Jaipur,retired,High,0.000109,Senior,Low,2
1,34.28,False,Chennai,freelancer,Low,0.000179,Middle Age,Low,1


In [213]:
# Selecting features and target
X = dataset.drop(['insurance_premium_category','city','smoker'], axis=1)
y = dataset['insurance_premium_category']

In [214]:
X.head(3)

Unnamed: 0,income_lpa,occupation,bmi,Age_Group,Lifestyle,city_tier
0,2.92,retired,0.000109,Senior,Low,2
1,34.28,freelancer,0.000179,Middle Age,Low,1
2,36.64,freelancer,0.000508,Middle Age,Low,2


In [215]:
# print(y.head(2))
# print(pd.DataFrame(y.head(2)))

In [216]:
X.head(2)

Unnamed: 0,income_lpa,occupation,bmi,Age_Group,Lifestyle,city_tier
0,2.92,retired,0.000109,Senior,Low,2
1,34.28,freelancer,0.000179,Middle Age,Low,1


In [217]:
categorical_features = ['Age_Group','Lifestyle','city_tier','occupation']
numerical_features = ['income_lpa','bmi']

In [218]:
# Transformer
preprocessing = ColumnTransformer(
    transformers=[
        ('cat',OneHotEncoder(handle_unknown='ignore'),categorical_features),
        ('num','passthrough',numerical_features)
    ]
)

In [219]:
# Creating a pipeline
pipeline = Pipeline(steps=[
    ('preprocessing',preprocessing),
    ('classifier',RandomForestClassifier(random_state=42))
])

In [220]:
# train and test split the data
X_train, X_test,y_train, y_test = train_test_split(X,y,random_state=42, test_size=0.20)
pipeline.fit(X_train,y_train)

In [221]:
#predict and evaluation
y_pred = pipeline.predict(X_test)
accuracy_score(y_test,y_pred)

0.7

In [222]:
# Evaluation Metrics
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Check if model supports probabilities
if hasattr(pipeline, 'predict_proba'):
    y_proba = pipeline.predict_proba(X_test)
    print("\nClass Probabilities (First 5 samples):")
    print(y_proba[:5])
else:
    print("Model doesn't support probability predictions")

Accuracy: 0.70

Classification Report:
              precision    recall  f1-score   support

        High       0.89      0.67      0.76        12
         Low       0.75      1.00      0.86         3
      Medium       0.43      0.60      0.50         5

    accuracy                           0.70        20
   macro avg       0.69      0.76      0.71        20
weighted avg       0.75      0.70      0.71        20


Confusion Matrix:
[[8 0 4]
 [0 3 0]
 [1 1 3]]

Class Probabilities (First 5 samples):
[[0.63 0.01 0.36]
 [0.1  0.48 0.42]
 [0.85 0.01 0.14]
 [0.04 0.41 0.55]
 [0.62 0.02 0.36]]


In [223]:
import pickle

model_name = "smoker_model.pkl"
with open(model_name,'wb') as file:
    pickle.dump(pipeline, file)