In [8]:
import numpy as np
import pandas as pd
df=pd.read_csv("../data/insurance_featured.csv",index_col=0)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northwest,region_southeast,region_southwest,age_group,bmi_category
0,19,1,27.9,0,1,16884.924,0,0,1,Teen,Overweight
1,18,0,33.77,1,0,1725.5523,0,1,0,Teen,Obese
2,28,0,33.0,3,0,4449.462,0,1,0,Adults,Obese
3,33,0,22.705,0,0,21984.47061,1,0,0,Adults,Normal
4,32,0,28.88,0,0,3866.8552,1,0,0,Adults,Overweight


In [9]:
df.shape

(1338, 11)

In [10]:
df2=df.drop(columns=["age_group","bmi_category"])
df2

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northwest,region_southeast,region_southwest
0,19,1,27.900,0,1,16884.92400,0,0,1
1,18,0,33.770,1,0,1725.55230,0,1,0
2,28,0,33.000,3,0,4449.46200,0,1,0
3,33,0,22.705,0,0,21984.47061,1,0,0
4,32,0,28.880,0,0,3866.85520,1,0,0
...,...,...,...,...,...,...,...,...,...
1333,50,0,30.970,3,0,10600.54830,1,0,0
1334,18,1,31.920,0,0,2205.98080,0,0,0
1335,18,1,36.850,0,0,1629.83350,0,1,0
1336,21,1,25.800,0,0,2007.94500,0,0,1


In [11]:
df2.shape

(1338, 9)

In [12]:
#Splitting dataset
from sklearn.model_selection import train_test_split
X=df2.drop(columns=["charges"],axis=1)
y=df2["charges"]
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [13]:
from sklearn.linear_model import LinearRegression
model=LinearRegression()
model.fit(X_train,y_train)
model.predict(X_test)
model.score(X_test,y_test)

0.7835929767120724

In [14]:
#Cross Validation
from sklearn.model_selection import ShuffleSplit,cross_val_score
cv=ShuffleSplit(n_splits=25,test_size=400,random_state=42)
cross_val_score(LinearRegression(),X,y,cv=cv)


array([0.76940723, 0.73319114, 0.81311399, 0.71151473, 0.71816065,
       0.72957842, 0.73432713, 0.7620016 , 0.74143638, 0.74006818,
       0.76860883, 0.73365802, 0.75527986, 0.75388332, 0.74711542,
       0.78626785, 0.73993178, 0.76533203, 0.74345081, 0.77339916,
       0.72290605, 0.7629774 , 0.76408117, 0.78057445, 0.75375651])

In [15]:
from sklearn.model_selection import GridSearchCV,ShuffleSplit
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
def find_model_gridsearchcv(X,y):
    algo={
        'decision_tree':{
            'model':DecisionTreeRegressor(random_state=42),
            'params':{
                'criterion':['squared_error','friedman_mse','absolute_error'],
                'splitter':['best','random']
            }
        },
        'random_forest':{
            'model':RandomForestRegressor(random_state=42),
            'params':{
                'criterion':['squared_error','friedman_mse','absolute_error'],
                'n_estimators':[400,450]}
        },
        'xgboost':{
            'model':XGBRegressor(random_state=42,eval_metric="rmse"),
            'params':{
                'objective':["reg:squarederror","reg:absoluteerror"],
                'n_estimators':[45,50],
                'learning_rate':[0.05,0.1,0.15,0.2]
            }
        }
    }
    scores=[]
    cv=ShuffleSplit(n_splits=5,test_size=15,random_state=42)
    for algo_name,config in algo.items():
        gs=GridSearchCV(config['model'],config['params'],cv=cv,n_jobs=-1,scoring='r2')
        gs.fit(X,y)
        scores.append({
            'model':algo_name,
            'best_score':gs.best_score_,
            "best_params":gs.best_params_
        })
    return pd.DataFrame(scores,columns=["model","best_score","best_params"])
df3 = find_model_gridsearchcv(X,y)
for i, row in df3.iterrows():
    print("\nModel:", row['model'])
    print("Best Score:", row['best_score'])
    print("Best Params:", row['best_params'])


Model: decision_tree
Best Score: 0.7803686929573632
Best Params: {'criterion': 'absolute_error', 'splitter': 'best'}

Model: random_forest
Best Score: 0.820322748727353
Best Params: {'criterion': 'friedman_mse', 'n_estimators': 450}

Model: xgboost
Best Score: 0.8743951213734107
Best Params: {'learning_rate': 0.2, 'n_estimators': 45, 'objective': 'reg:absoluteerror'}


In [16]:
from xgboost import XGBRegressor
model_reg=XGBRegressor(learning_rate=0.1,n_estimators=110,objective="reg:absoluteerror",random_state=42,eval_metric="rmse")
model_reg.fit(X_train,y_train)
model_reg.predict(X_test)
model_reg.score(X_test,y_test)

0.863540608306141

In [29]:
import numpy as np
import pandas as pd

def predict_price(age, sex, bmi, children, smoker, location):
    # start with zero row matching training columns
    x = np.zeros(len(X.columns))

    # create a Series so you can align values to column names directly
    input_data = {
        "age": age,
        "sex": 1 if sex.lower() in ["female", "f", "1"] else 0,
        "bmi": bmi,
        "children": children,
        "smoker": 1 if smoker.lower() in ["yes", "y", "1"] else 0,
    }

    # fill in the one-hot location column (e.g., region_southwest = 1)
    loc_col = f"region_{location.lower()}"
    if loc_col in X.columns:
        input_data[loc_col] = 1

    # turn into pandas DataFrame with same columns as training
    df_input = pd.DataFrame([input_data], columns=X.columns).fillna(0)

    return model_reg.predict(df_input)[0]


In [33]:
predict_price(age=35, sex="female", bmi=30, children=0, smoker="yes", location="northeast")

22921.768

In [34]:
predict_price(age=35, sex="male", bmi=30, children=0, smoker="yes", location="northeast")

22889.453

In [35]:
predict_price(age=35, sex="female", bmi=50, children=0, smoker="yes", location="northeast")

43310.96

In [36]:
predict_price(age=35, sex="male", bmi=50, children=0, smoker="yes", location="northeast")

41489.293

In [37]:
predict_price(age=35, sex="female", bmi=100, children=0, smoker="yes", location="northeast")

43310.96

In [38]:
predict_price(age=35, sex="male", bmi=100, children=0, smoker="yes", location="northeast")

41489.293

In [39]:
predict_price(age=35, sex="female", bmi=30, children=0, smoker="no", location="northeast")

6557.858

In [48]:
predict_price(age=80, sex="female", bmi=90, children=5, smoker="yes", location="northeast")

38463.65

In [49]:
import joblib
joblib.dump(model_reg,"insurance_model.pkl")