In [None]:
import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pickle
import numpy as np 

In [43]:
# Load the insurance dataset (expects medical_insurance.csv in the working directory).
dataset = pd.read_csv('medical_insurance.csv')

# Simulate a smoker column (10% smokers, 90% non-smokers).
# A seeded generator keeps the simulated labels -- and every score computed
# from them further down -- identical across "Restart Kernel -> Run All".
# NOTE(review): the labels are drawn independently of every other column, so
# this feature cannot carry real signal about the prediction target.
smoker_rng = np.random.default_rng(42)
dataset['smoker'] = smoker_rng.choice(['yes', 'no'], size=len(dataset), p=[0.1, 0.9])

In [44]:
# Preview the first rows, which now include the simulated smoker column.
head_preview = dataset.head()
print("Original Insurance Dataset Head with Simulated Smoker Column:")
print(head_preview)


Original Insurance Dataset Head with Simulated Smoker Column:
   age  gender   bmi  children discount_eligibility     region  expenses  \
0   19  female  27.9         0                  yes  southwest  16884.92   
1   18    male  33.8         1                   no  southeast   1725.55   
2   28    male  33.0         3                   no  southeast   4449.46   
3   33    male  22.7         0                   no  northwest  21984.47   
4   32    male  28.9         0                   no  northwest   3866.86   

    premium smoker  
0  168.8492     no  
1   17.2555     no  
2   44.4946     no  
3  439.6894     no  
4   77.3372     no  


DATA PREPROCESSING

In [45]:
# Preprocessing for charges prediction.
# The rest of the notebook refers to 'sex' and 'charges', so the raw
# 'gender' and 'expenses' columns are renamed accordingly.
column_aliases = {'gender': 'sex', 'expenses': 'charges'}
dataset.rename(columns=column_aliases, inplace=True)

Encoding

In [46]:
# Integer-encode 'sex'. The encoder is fitted on the full label vocabulary
# instead of on the column itself, so the mapping does not depend on which
# values happen to appear in the data.
le_gender = LabelEncoder()
le_gender.fit(["male", "female"])
sex_codes = le_gender.transform(dataset["sex"])
dataset["sex"] = sex_codes


In [47]:
# Integer-encode 'smoker', fitting on the fixed vocabulary so the mapping is
# the same regardless of which labels are present in the column.
le_smoker = LabelEncoder()
le_smoker.fit(["yes", "no"])
smoker_codes = le_smoker.transform(dataset["smoker"])
dataset["smoker"] = smoker_codes


In [48]:
# Persist both fitted encoders so the same label-to-integer mappings can be
# reloaded outside this notebook.
for encoder_path, encoder in (
    ("label_encoder_gender.pkl", le_gender),
    ("label_encoder_smoker.pkl", le_smoker),
):
    with open(encoder_path, "wb") as f:
        pickle.dump(encoder, f)

In [49]:
# One-hot encode 'region' into integer 0/1 columns. drop_first=True omits the
# first category, which is then represented by all region_* columns being 0.
dataset = pd.get_dummies(
    dataset,
    columns=['region'],
    drop_first=True,
    dtype=int,
)

In [50]:
# Count missing values per column.
null_counts = dataset.isnull().sum()
print(null_counts)

age                     0
sex                     0
bmi                     0
children                0
discount_eligibility    0
charges                 0
premium                 0
smoker                  0
region_northwest        0
region_southeast        0
region_southwest        0
dtype: int64


#check correlation

Splitting data into input and output

The processed medical_insurance.csv data is split into features (X) and target (y), where 'charges' is the target variable.

In [51]:

# Target (y) for charges prediction.
y = dataset["charges"]

# Features (x): every column except the target itself, 'discount_eligibility'
# and 'premium'.
non_feature_columns = ["charges", "discount_eligibility", "premium"]
x = dataset.drop(columns=non_feature_columns)

splitting data for training and testing

 The data is then further divided into training and testing sets (80% for training, 20% for testing). 

In [52]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

model training

A Linear Regression model is chosen and trained on the training data. The model's performance is then evaluated by checking its R² score (coefficient of determination) on both the training and test sets.

In [53]:
# Fit an ordinary least-squares regression of charges on the training features.
# The bare fit(...) expression is left last so the notebook displays the
# fitted estimator.
lr=LinearRegression()
lr.fit(x_train,y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


now check score

In [54]:
# LinearRegression.score returns the coefficient of determination (R^2);
# it is printed below as a percentage.
train_r2 = lr.score(x_train, y_train)
test_r2 = lr.score(x_test, y_test)

print("\nLinear Regression Model Scores:")
print(f"Train Score: {train_r2*100:.2f}%")
print(f"Test Score: {test_r2*100:.2f}%")



Linear Regression Model Scores:
Train Score: 11.61%
Test Score: 16.16%




DIABETES RISK PREDICTION

In [55]:
# Load the diabetes health-indicators data (expects the CSV in the working
# directory) and preview the first rows.
diabetes_csv = 'diabetes_binary_health_indicators_BRFSS2015.csv'
df_diabetes = pd.read_csv(diabetes_csv)
print(df_diabetes.head())

   Diabetes_binary  HighBP  HighChol  CholCheck   BMI  Smoker  Stroke  \
0              0.0     1.0       1.0        1.0  40.0     1.0     0.0   
1              0.0     0.0       0.0        0.0  25.0     1.0     0.0   
2              0.0     1.0       1.0        1.0  28.0     0.0     0.0   
3              0.0     1.0       0.0        1.0  27.0     0.0     0.0   
4              0.0     1.0       1.0        1.0  24.0     0.0     0.0   

   HeartDiseaseorAttack  PhysActivity  Fruits  ...  AnyHealthcare  \
0                   0.0           0.0     0.0  ...            1.0   
1                   0.0           1.0     0.0  ...            0.0   
2                   0.0           0.0     1.0  ...            1.0   
3                   0.0           1.0     1.0  ...            1.0   
4                   0.0           1.0     1.0  ...            1.0   

   NoDocbcCost  GenHlth  MentHlth  PhysHlth  DiffWalk  Sex   Age  Education  \
0          0.0      5.0      18.0      15.0       1.0  0.0   9.0   

In [56]:
# Count missing values per column; as the cell's last expression, the
# resulting Series is displayed by the notebook.
df_diabetes.isnull().sum()

Diabetes_binary         0
HighBP                  0
HighChol                0
CholCheck               0
BMI                     0
Smoker                  0
Stroke                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
AnyHealthcare           0
NoDocbcCost             0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
Education               0
Income                  0
dtype: int64

In [57]:
# Target (y): the binary diabetes indicator. Features (X): every other column.
y_diabetes = df_diabetes["Diabetes_binary"]
X_diabetes = df_diabetes.drop(columns="Diabetes_binary")

# Remember the feature names in training order so other frames can be
# aligned to the same column layout before prediction.
model_features_diabetes = list(X_diabetes.columns)

Split Data into Train and Test Sets

In [58]:
# Use 80% of the rows for training and hold out 20% for testing; the fixed
# random_state makes the split repeatable.
diabetes_split = train_test_split(
    X_diabetes,
    y_diabetes,
    test_size=0.2,
    random_state=42,
)
X_train_d, X_test_d, y_train_d, y_test_d = diabetes_split


train the classifier

In [59]:
# Train a Random Forest classifier on the diabetes training split.

# random_state is fixed so the fitted forest is reproducible. The bare
# fit(...) expression is left last so the notebook displays the estimator.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_d, y_train_d)


0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


Check Accuracy

In [60]:

# Score the classifier on the held-out test split.
y_pred_d = rf_model.predict(X_test_d)
accuracy = accuracy_score(y_true=y_test_d, y_pred=y_pred_d)

print(f"\nDiabetes Prediction Model Accuracy: {accuracy:.4f}")



Diabetes Prediction Model Accuracy: 0.8599


Let’s now:

Use the diabetes model to predict if a user has high health risk

Convert that prediction into a label: Low, Medium, or High



In [61]:
# Predict on the entire diabetes dataset.
# NOTE: X_diabetes contains the rows the model was trained on as well as the
# held-out rows, so these predictions are not an out-of-sample estimate.
diabetes_predictions = rf_model.predict(X_diabetes)

# Add predictions to the original dataframe as a new column.
df_diabetes["predicted_diabetes"] = diabetes_predictions

# Earlier two-level draft of the risk mapping, kept for reference only; it is
# not executed, and the risk level is assigned by custom_risk instead.
# def map_risk(diabetes):
#     if diabetes == 1:
#         return "High"
#     else:
#         return "Low"

# df_diabetes["risk_level"] = df_diabetes["predicted_diabetes"].apply(map_risk)

# # Preview the result
# df_diabetes[["predicted_diabetes", "risk_level"]].head()


Explanation:
You use your trained model to make predictions on all users

Then, based on prediction:

If 1 → user has diabetes → High risk

If 0 → user doesn't have diabetes → Low risk

🟡 (Later, we can add Medium risk using BMI, smoking, or age if you want!)



We’ll consider:

BMI: high BMI increases risk

Smoking: smoking increases risk

Heart disease or stroke: very high risk

Diabetes: high risk

In [62]:
def map_age_group(code):
    """Collapse a numeric age-category code into a coarse age group.

    With the 5-year age bins this notebook uses to build the Age code,
    the groups correspond roughly to ages up to 49, 50-64, and 65 and over.

    Args:
        code: Numeric age-category code (int or float).

    Returns:
        "young" for codes up to and including 6, "middle" for codes above 6
        and up to and including 9, and "senior" for anything else. A NaN code
        fails both comparisons and therefore also yields "senior".
    """
    if code <= 6:
        return "young"
    # Only an upper bound is tested here: the previous `7 <= code <= 9` check
    # left a gap, so a code strictly between 6 and 7 was labelled "senior".
    if code <= 9:
        return "middle"
    return "senior"

def custom_risk(row):
    """Assign a rule-based risk label to one record.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing "Age",
            "HeartDiseaseorAttack", "Stroke", "predicted_diabetes", "BMI",
            "Smoker" and "HighBP".

    Returns:
        "High", "Medium" or "Low". Rules are checked in priority order and
        the first one that matches decides the label.
    """
    bracket = map_age_group(row["Age"])

    # Any heart disease/attack or stroke flag is High on its own.
    if row["HeartDiseaseorAttack"] == 1 or row["Stroke"] == 1:
        return "High"

    # Predicted diabetes combined with BMI above 30 or smoking is High.
    if row["predicted_diabetes"] == 1 and (row["BMI"] > 30 or row["Smoker"] == 1):
        return "High"

    # The senior age group is High regardless of the other fields.
    if bracket == "senior":
        return "High"

    # Any single moderate factor gives Medium; none of them gives Low.
    has_moderate_factor = (
        row["BMI"] > 30
        or row["Smoker"] == 1
        or row["HighBP"] == 1
        or bracket == "middle"
    )
    return "Medium" if has_moderate_factor else "Low"

# Label every diabetes record with the rule-based risk level (row-wise apply).
df_diabetes["risk_level"] = df_diabetes.apply(custom_risk, axis="columns")

# Show the inputs behind the label next to the label itself.
risk_inputs_preview = ["Age", "BMI", "Smoker", "HeartDiseaseorAttack", "Stroke", "risk_level"]
print("\nDiabetes Data with Predicted Risk Level:")
print(df_diabetes[risk_inputs_preview].head(10))






Diabetes Data with Predicted Risk Level:
    Age   BMI  Smoker  HeartDiseaseorAttack  Stroke risk_level
0   9.0  40.0     1.0                   0.0     0.0     Medium
1   7.0  25.0     1.0                   0.0     0.0     Medium
2   9.0  28.0     0.0                   0.0     0.0     Medium
3  11.0  27.0     0.0                   0.0     0.0       High
4  11.0  24.0     0.0                   0.0     0.0       High
5  10.0  25.0     1.0                   0.0     0.0       High
6   9.0  30.0     1.0                   0.0     0.0     Medium
7  11.0  25.0     1.0                   0.0     0.0       High
8   9.0  30.0     1.0                   1.0     0.0       High
9   8.0  24.0     0.0                   0.0     0.0     Medium


In [63]:
# --- Merge Risk Info with Insurance Data ---
# Work on a copy of the insurance data and give it the column names the
# diabetes model and the risk rules expect.
insurance_for_risk = dataset.copy()

# Mirror the already-encoded smoker, sex and bmi columns under the
# capitalised names used by the diabetes dataset.
for diabetes_name, insurance_name in (("Smoker", "smoker"), ("Sex", "sex"), ("BMI", "bmi")):
    insurance_for_risk[diabetes_name] = insurance_for_risk[insurance_name]

# Derive an 'Age' category code (1-13) from the age in years, using
# right-inclusive bins: up to 24 -> 1, 25-29 -> 2, ..., above 79 -> 13.
age_bin_edges = [0, 24, 29, 34, 39, 44, 49, 54, 59, 64, 69, 74, 79, 200]
age_codes = list(range(1, 14))
insurance_for_risk["Age"] = pd.cut(
    insurance_for_risk["age"],
    bins=age_bin_edges,
    labels=age_codes,
).astype(float)


Add missing dummy features for prediction
Your diabetes model may expect features like:

HighBP, HeartDiseaseorAttack, Stroke, PhysActivity, etc.

➡️ Since these are not in the insurance dataset, we’ll set them to 0 as placeholder values:

In [64]:
# The diabetes classifier (rf_model) was trained on columns the insurance
# data does not have. Add each missing one as a constant 0.0 placeholder.
# NOTE(review): 0.0 may lie outside the values some of these columns take in
# the diabetes data -- confirm the placeholders are acceptable for prediction.
placeholder_features = [
    "HighBP", "HighChol", "CholCheck", "PhysActivity", "Fruits", "Veggies",
    "HvyAlcoholConsump", "AnyHealthcare", "NoDocbcCost", "GenHlth",
    "MentHlth", "PhysHlth", "DiffWalk", "Education", "Income", "Stroke",
    "HeartDiseaseorAttack",
]
for col in placeholder_features:
    if col in insurance_for_risk.columns:
        continue  # never overwrite a column that already exists
    insurance_for_risk[col] = 0.0

In [65]:
# Select the model's features in the order used at training time; the target
# name is filtered out in case it was accidentally included in the list.
X_ins_risk_features = [
    name for name in model_features_diabetes if name != "Diabetes_binary"
]
X_ins = insurance_for_risk[X_ins_risk_features]

# Predict diabetes for every insurance row and store it beside the data.
ins_diabetes_pred = rf_model.predict(X_ins)
insurance_for_risk["predicted_diabetes"] = ins_diabetes_pred


In [70]:
# Label every insurance row with the rule-based risk level.
insurance_for_risk["risk_level"] = insurance_for_risk.apply(custom_risk, axis=1)

# Preview the result; the simulated 'smoker' column is part of the display.
risk_preview_columns = ["age", "bmi", "smoker", "discount_eligibility", "risk_level", "charges"]
print("\nInsurance Data with Predicted Risk Level (First 10 rows):")
print(insurance_for_risk[risk_preview_columns].head(10))

# --- Save Models and Encoders ---
# Files are written in the order listed. The two label-encoder files are
# written again here with the same encoder objects that were pickled earlier.
artifacts = (
    ("charges_model.pkl", lr),
    ("diabetes_model.pkl", rf_model),
    ("label_encoder_gender.pkl", le_gender),
    ("label_encoder_smoker.pkl", le_smoker),
)
for artifact_path, artifact in artifacts:
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)


Insurance Data with Predicted Risk Level (First 10 rows):
   age   bmi  smoker discount_eligibility risk_level   charges
0   19  27.9       0                  yes        Low  16884.92
1   18  33.8       0                   no     Medium   1725.55
2   28  33.0       0                   no     Medium   4449.46
3   33  22.7       0                   no        Low  21984.47
4   32  28.9       0                   no        Low   3866.86
5   31  25.7       0                   no        Low   3756.62
6   46  33.4       0                   no     Medium   8240.59
7   37  27.7       0                   no        Low   7281.51
8   37  29.8       1                   no     Medium   6406.41
9   60  25.8       0                   no     Medium  28923.14
