In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Location of the training CSV. The default is the original author's local
# path; set the INSURANCE_CSV environment variable to point at the file on any
# other machine instead of editing this cell.
DATA_PATH = os.environ.get(
    "INSURANCE_CSV",
    "A:\\Aniket_Scidentai\\MLOPS\\FastAPI_Tutorials\\insurance.csv",
)
df = pd.read_csv(DATA_PATH)

In [3]:
# Peek at the first five raw rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
0,67,119.8,1.56,2.92,False,Jaipur,retired,High
1,36,101.1,1.83,34.28,False,Chennai,freelancer,Low
2,39,56.8,1.64,36.64,False,Indore,freelancer,Low
3,22,109.4,1.55,3.34,True,Mumbai,student,Medium
4,69,62.2,1.6,3.94,True,Indore,retired,High


In [4]:
# Work on an independent copy: the feature-engineering cells below add columns
# to df_temp, and a plain `df_temp = df` alias would add them to df as well.
df_temp = df.copy()

In [5]:
# Feature 1: body-mass index = weight / height squared.
# NOTE(review): presumably weight is in kg and height in metres - confirm against the data source.
df_temp["BMI"] = df_temp["weight"] / df_temp["height"] ** 2

In [6]:
def age_group(age):
    """Bucket a numeric age into one of four age-band labels.

    Bands (lower bound inclusive, upper bound exclusive):
        age < 25        -> "young"
        25 <= age < 45  -> "Adult"
        45 <= age < 60  -> "Middle_Aged"
        otherwise       -> "senior"

    The label spelling and casing are kept exactly as-is because the labels
    become category values in the training data.
    """
    # Each guard only needs the upper bound: reaching it means every earlier
    # (smaller) bound has already been ruled out.
    if age < 25:
        return "young"
    if age < 45:
        return "Adult"
    if age < 60:
        return "Middle_Aged"
    return "senior"

In [7]:
# Feature 2: label every row with its age band, element by element.
df_temp["age_group"] = df_temp["age"].map(age_group)

In [8]:
# Feature 3: Lifestyle Risk
def lifestyle_risk(row):
    """Classify one record as "high", "medium" or "low" lifestyle risk.

    `row` is any mapping-like object exposing "smoker" and "BMI" keys
    (e.g. a DataFrame row).

    Rules, checked in order:
        smoker and BMI > 30  -> "high"
        smoker or  BMI > 27  -> "medium"
        otherwise            -> "low"
    """
    smokes = row["smoker"]
    bmi = row["BMI"]

    if smokes and bmi > 30:
        return "high"
    if smokes or bmi > 27:
        return "medium"
    return "low"

In [9]:
# Row-wise apply: lifestyle_risk needs both the "smoker" and "BMI" values of each record.
df_temp["lifestyle_risk"] = df_temp.apply(lifestyle_risk, axis="columns")

In [10]:
# Cities that city_tier maps to tier 1.
tier_1_cities = [
    "Mumbai",
    "Delhi",
    "Bangalore",
    "Chennai",
    "Kolkata",
    "Hyderabad",
    "Pune",
]

# Cities that city_tier maps to tier 2; any city in neither list falls to tier 3.
tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi",
    "Visakhapatnam", "Coimbatore", "Bhopal", "Nagpur", "Vadodara", "Surat",
    "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi", "Agra",
    "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana",
    "Nashik", "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum",
    "Salem", "Vijayawada", "Tiruchirappalli", "Bhavnagar", "Gwalior", "Dhanbad",
    "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal", "Kolhapur",
    "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri",
]

In [11]:
def city_tier(city):
    """Return the tier number (1, 2 or 3) for a city name.

    A city listed in `tier_1_cities` is tier 1, one listed in `tier_2_cities`
    is tier 2, and anything else is tier 3. Matching is an exact membership
    test, so spelling and letter case must match the list entries.
    """
    if city in tier_1_cities:
        return 1
    return 2 if city in tier_2_cities else 3

In [12]:
# Feature 4: replace the raw city name with its tier number, element by element.
df_temp["city_tier"] = df_temp["city"].map(city_tier)

In [13]:
# Preview five rows of the modelling columns. Selecting the columns by name
# already leaves out age/weight/height/smoker/city, so no separate drop is
# needed; the fixed random_state makes the preview identical on every run.
df_temp[
    [
        'income_lpa',
        'occupation',
        'BMI',
        'age_group',
        'lifestyle_risk',
        'city_tier',
        'insurance_premium_category',
    ]
].sample(5, random_state=42)
     

Unnamed: 0,income_lpa,occupation,BMI,age_group,lifestyle_risk,city_tier,insurance_premium_category
61,24.05,unemployed,36.281179,Adult,high,2,High
79,30.0,government_job,22.723537,Adult,medium,2,Low
53,30.0,government_job,29.598247,Adult,medium,1,Medium
27,34.33,private_job,35.159702,Middle_Aged,medium,2,Medium
43,1.56,retired,29.308163,senior,medium,1,Medium


In [14]:
# Feature matrix: the engineered columns plus income and occupation.
X = df_temp.loc[
    :,
    ["BMI", "age_group", "lifestyle_risk", "city_tier", "income_lpa", "occupation"],
]
# Target: the premium category label to predict.
y = df_temp.loc[:, "insurance_premium_category"]

In [15]:
# Columns that get one-hot encoded (city_tier is an integer code but is
# listed here, so it is encoded as a category rather than used as a number).
categorical_features = [
    "age_group",
    "lifestyle_risk",
    "occupation",
    "city_tier",
]
# Columns that are handed to the model unchanged.
numeric_features = [
    "BMI",
    "income_lpa",
]

In [16]:
# Column-wise preprocessing: one-hot encode the categorical columns
# (handle_unknown="ignore" is set for categories not seen during fit) and
# pass the numeric columns through unchanged.
preprocessor = ColumnTransformer(
    [
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numeric_features),
    ]
)

In [17]:
# Single estimator that runs the preprocessing and then the random forest,
# so the same transformations are applied at fit time and at predict time.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)

In [18]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)
# Fit preprocessing + classifier on the training rows only.
pipeline.fit(X_train, y_train)

In [19]:
# Score the held-out rows and report the fraction predicted correctly.
y_pred = pipeline.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9

In [20]:
# Show five held-out rows; the fixed random_state makes the same rows appear on every run.
X_test.sample(5, random_state=42)

Unnamed: 0,BMI,age_group,lifestyle_risk,city_tier,income_lpa,occupation
81,31.866055,Adult,high,2,22.19,freelancer
36,21.713266,senior,low,1,0.53,retired
51,38.827923,Middle_Aged,high,2,28.95,private_job
93,23.199416,young,low,2,1.28,student
56,42.414152,young,high,1,2.86,student


In [21]:
# Save the trained pipeline using pickle (pickle is imported in the top import cell).
# NOTE: unpickling executes code from the file, so model.pkl must only ever be
# loaded from a trusted source.
pickle_model_path = "model.pkl"
with open(pickle_model_path, "wb") as f:
    pickle.dump(pipeline, f)

In [22]:
# List the distinct occupation values present in the data.
df_temp.loc[:, "occupation"].unique()

array(['retired', 'freelancer', 'student', 'government_job',
       'business_owner', 'unemployed', 'private_job'], dtype=object)