In [1]:
import pandas as pd
import numpy as np

# Define parameters
SEED = 42  # fixed seed so the synthetic dataset is identical on every run
n_rows = 1000
output_path = 'insurance_pakistan.csv'
pakistan_cities = ['Karachi', 'Lahore', 'Faisalabad', 'Rawalpindi', 'Multan', 'Hyderabad', 'Quetta', 'Peshawar', 'Sialkot', 'Sargodha', 'Bahawalpur', 'Larkana']
occupations = ['student', 'retired', 'freelancer', 'government_job', 'business_owner', 'private_job', 'unemployed']
premium_categories = ['Low', 'Medium', 'High']

# 'student' and 'retired' are decided purely by age; every other occupation is
# drawn at random for people in between.
age_based_occupations = ['student', 'retired']
working_occupations = [occ for occ in occupations if occ not in age_based_occupations]

# One seeded generator drives every random draw below.
rng = np.random.default_rng(SEED)

# Generate the independent columns
age = rng.integers(18, 76, n_rows)  # upper bound is exclusive -> ages 18..75
weight = np.clip(rng.normal(84, 21, n_rows), 50, 120)  # normal draw clipped to [50, 120]
height = np.clip(rng.normal(1.71, 0.11, n_rows), 1.5, 1.9)  # normal draw clipped to [1.5, 1.9]
smoker = rng.choice([True, False], n_rows, p=[0.43, 0.57])
city = rng.choice(pakistan_cities, n_rows)
# NOTE: the label is drawn independently of every other column, so no model
# trained on this data can be expected to beat chance.
premium_category = rng.choice(premium_categories, n_rows, p=[0.34, 0.33, 0.33])

# Assign occupations based on age (vectorised; no per-row Python loop)
occupation = np.where(
    age < 25,
    'student',
    np.where(age > 60, 'retired', rng.choice(working_occupations, n_rows)),
)

# Assign income based on occupation: students/retirees get a low range, others a high one
has_low_income = np.isin(occupation, age_based_occupations)
income_lpa = np.where(
    has_low_income,
    rng.uniform(0.5, 4, n_rows),
    rng.uniform(5, 50, n_rows),
)

# Key order here fixes the column order of the CSV.
data = {
    'age': age,
    'weight': weight,
    'height': height,
    'income_lpa': income_lpa,
    'smoker': smoker,
    'city': city,
    'occupation': occupation,
    'insurance_premium_category': premium_category,
}

# Create DataFrame and save to CSV
df = pd.DataFrame(data)
df.to_csv(output_path, index=False)
print(f"CSV file '{output_path}' has been created with {n_rows} rows.")

CSV file 'insurance_pakistan.csv' has been created with 1000 rows.


In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

In [3]:
# Load the dataset from the same relative path the generation cell writes to.
# The previous absolute '/content/...' path only resolves on hosts that have a
# /content directory, which made the notebook fail elsewhere.
df = pd.read_csv('insurance_pakistan.csv')

In [4]:
# Show five randomly chosen rows as a quick sanity check of the loaded data.
# No random_state is passed, so the rows shown differ between runs.
df.sample(5)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
909,59,113.707645,1.844261,46.29643,False,Sargodha,business_owner,Medium
218,45,116.962488,1.699201,42.678424,False,Multan,government_job,Medium
163,69,68.104765,1.675817,2.717972,False,Sargodha,retired,Low
231,38,112.336906,1.9,48.704197,False,Hyderabad,freelancer,Low
326,59,81.11735,1.670333,28.88921,False,Larkana,government_job,High


In [5]:
# List the distinct occupation labels present in the data.
df['occupation'].unique()

array(['student', 'retired', 'government_job', 'freelancer',
       'private_job', 'business_owner', 'unemployed'], dtype=object)

In [6]:
# Work on a copy so the engineered columns added below do not modify `df`.
df_feat = df.copy()

In [7]:
# Feature 1: BMI = weight / height^2
# (presumably weight in kg and height in metres — the value ranges are consistent with that)
df_feat["bmi"] = df_feat["weight"].div(df_feat["height"].pow(2))

In [8]:
# Feature 2: Age Group
def age_group(age):
    """Map an age to a coarse age-bracket label.

    Brackets use exclusive upper bounds: below 25 is "young", below 45 is
    "adult", below 60 is "middle_aged", and everything else is "senior".
    """
    brackets = ((25, "young"), (45, "adult"), (60, "middle_aged"))
    for upper_bound, label in brackets:
        if age < upper_bound:
            return label
    # Not below any bound.
    return "senior"

In [9]:
# Label every row with its age bracket (element-wise over the age column).
df_feat["age_group"] = df_feat["age"].map(age_group)

In [10]:
# Feature 3: Lifestyle Risk
def lifestyle_risk(row):
    """Classify a record's lifestyle risk from its "smoker" and "bmi" fields.

    Returns "high" for a smoker with BMI above 30, "medium" for any other
    smoker or for a non-smoker with BMI above 27, and "low" otherwise.
    `row` only needs to support lookup by those two keys.
    """
    if row["smoker"]:
        # A smoker is never "low"; BMI decides between the upper two levels.
        return "high" if row["bmi"] > 30 else "medium"
    return "medium" if row["bmi"] > 27 else "low"

In [11]:
# axis=1 passes each row to lifestyle_risk, which reads that row's "smoker" and "bmi"
# values; the "bmi" column must therefore already exist on df_feat.
df_feat["lifestyle_risk"] = df_feat.apply(lifestyle_risk, axis=1)

In [12]:
# City lists consumed by city_tier(): membership in the first list yields tier 1,
# membership in the second yields tier 2, and any other city is treated as tier 3.
tier_1_cities = ["Karachi", "Lahore", "Faisalabad", "Rawalpindi", "Islamabad", "Hyderabad", "Peshawar"]
tier_2_cities = [
    "Multan", "Quetta", "Sialkot", "Gujranwala", "Sargodha", "Bahawalpur", "Sukkur", "Larkana",
    "Sheikhupura", "Jhang", "Rahim Yar Khan", "Gujrat", "Mardan", "Kasur", "Dera Ghazi Khan",
    "Sahiwal", "Nawabshah", "Okara", "Mirpur Khas", "Chiniot", "Sadiqabad", "Burewala",
    "Jacobabad", "Kohat", "Khanewal", "Dera Ismail Khan", "Muzaffargarh", "Abbottabad",
    "Turbat", "Khuzdar", "Vihari", "Hafizabad", "Narowal", "Tando Allahyar", "Mandi Bahauddin",
    "Jhelum", "Khanpur", "Pakpattan", "Tando Muhammad Khan", "Haripur", "Shikarpur", "Khairpur",
    "Chakwal", "Nowshera", "Charsadda", "Mianwali", "Kandhkot", "Kamoke"
]

In [13]:
# Feature 4: City Tier
def city_tier(city):
    """Return the tier number (1, 2 or 3) for a city name.

    The name is looked up, by exact match, first in `tier_1_cities` and then
    in `tier_2_cities`; a city found in neither list is tier 3.
    """
    for tier, members in ((1, tier_1_cities), (2, tier_2_cities)):
        if city in members:
            return tier
    return 3

In [14]:
# Translate every city name into its tier number (element-wise over the city column).
df_feat["city_tier"] = df_feat["city"].map(city_tier)

In [15]:
# Preview five random rows of the engineered features plus the target.
# Selecting the columns explicitly already leaves out the raw inputs
# (age, weight, height, smoker, city), so no separate drop() is needed.
df_feat[['income_lpa', 'occupation', 'bmi', 'age_group', 'lifestyle_risk', 'city_tier', 'insurance_premium_category']].sample(5)

Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle_risk,city_tier,insurance_premium_category
849,18.736746,freelancer,36.903818,adult,high,2,Low
807,47.370619,government_job,28.839614,middle_aged,medium,2,High
225,11.945535,government_job,29.735413,middle_aged,medium,2,Medium
705,16.54713,unemployed,25.967731,adult,medium,2,Low
964,41.39218,government_job,19.671276,middle_aged,low,2,Medium


In [16]:
# Select features and target
# X holds the four engineered features plus income and occupation; the raw
# age / weight / height / smoker / city columns are intentionally left out.
X = df_feat[["bmi", "age_group", "lifestyle_risk", "city_tier", "income_lpa", "occupation"]]
# y is the premium category label the classifier is trained to predict.
y = df_feat["insurance_premium_category"]

In [17]:
# Display the feature matrix to confirm the selected columns.
X

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
0,31.275655,young,medium,2,2.988284,student
1,18.960516,young,low,2,1.233317,student
2,22.289149,senior,medium,1,1.247922,retired
3,23.469528,adult,low,1,5.976539,government_job
4,29.285135,young,medium,2,3.955342,student
...,...,...,...,...,...,...
995,33.587718,adult,medium,2,25.137765,unemployed
996,24.728469,adult,medium,2,6.193568,private_job
997,24.252574,senior,low,2,3.965630,retired
998,24.128185,senior,medium,1,3.720533,retired


In [18]:
# Display the target labels.
y

Unnamed: 0,insurance_premium_category
0,High
1,Low
2,Medium
3,Low
4,Medium
...,...
995,Medium
996,Low
997,Low
998,Low


In [19]:
# Define categorical and numeric features
# city_tier holds integers (1/2/3) but is listed as categorical, so it is
# one-hot encoded rather than passed through as a number.
categorical_features = ["age_group", "lifestyle_risk", "occupation", "city_tier"]
# These columns are passed to the model unchanged.
numeric_features = ["bmi", "income_lpa"]

In [20]:
# Create column transformer for OHE
# handle_unknown="ignore" encodes a category that was not seen during fit as
# all zeros instead of raising at predict time. This matters once the model is
# served: city_tier() can return 3, yet none of the generated cities produce
# it, so the encoder never sees that value while training.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        # Numeric columns go to the classifier as-is.
        ("num", "passthrough", numeric_features)
    ]
)

In [21]:
# Chain the column preprocessing and a random forest into one estimator, so the
# same transformations are applied at fit time and at predict time.
# random_state=42 pins the forest's randomness.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)

In [22]:
# Split data and train model
# Hold out 20% of the rows for evaluation; random_state=1 pins the shuffle so
# the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# Fit the preprocessing step and the classifier on the training portion only.
pipeline.fit(X_train, y_train)

In [23]:
# Predict and evaluate
y_pred = pipeline.predict(X_test)
# NOTE: the recorded accuracy (0.305) is roughly chance level for three classes.
# That is expected for this dataset: the generation cell draws
# insurance_premium_category at random, independently of every feature.
accuracy_score(y_test, y_pred)

0.305

In [24]:
# Show five random held-out rows, e.g. to use as example inputs for the model.
X_test.sample(5)

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
238,29.676975,senior,medium,1,3.929874,retired
971,31.55561,senior,high,1,2.623852,retired
721,17.548017,adult,low,1,33.233519,unemployed
133,25.762694,adult,medium,1,12.881075,private_job
228,40.131082,adult,medium,2,49.71064,business_owner


In [25]:
import pickle

# Persist the fitted pipeline (preprocessing + classifier) so it can be reloaded
# for inference without retraining.
pickle_model_path = "model.pkl"
with open(pickle_model_path, mode="wb") as model_file:
    pickle.dump(pipeline, model_file)