In [None]:
import numpy as np
import pandas as pd

# Data Processing

In [None]:
# Read the insurance dataset from the CSV file and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="/content/insurance.csv")
df.head(n=5)

Unnamed: 0,Age,Diabetes,BloodPressureProblems,AnyTransplants,AnyChronicDiseases,Height,Weight,KnownAllergies,HistoryOfCancerInFamily,NumberOfMajorSurgeries,PremiumPrice
0,45,0,0,0,0,155,57,0,0,0,25000
1,60,1,0,0,0,180,73,0,0,0,29000
2,36,1,1,0,0,158,59,0,0,1,23000
3,52,1,1,0,1,183,93,0,0,2,28000
4,38,0,0,0,1,166,88,0,0,1,23000


In [None]:
# Handling missing values: count the null entries in every column.
df.isnull().sum()

Unnamed: 0,0
Age,0
Diabetes,0
BloodPressureProblems,0
AnyTransplants,0
AnyChronicDiseases,0
Height,0
Weight,0
KnownAllergies,0
HistoryOfCancerInFamily,0
NumberOfMajorSurgeries,0


The dataset does not contain any missing values.

In [None]:
# Feature engineering

# BMI = Weight / (Height / 100)^2.
# NOTE(review): presumably Height is in cm and Weight in kg; the units are not
# stated in the data, so confirm against the dataset description.
df["BMI"] = df["Weight"].div(df["Height"].div(100).pow(2))

In [None]:
# Preview the first five rows to confirm the new BMI column is present.
df.head(n=5)

Unnamed: 0,Age,Diabetes,BloodPressureProblems,AnyTransplants,AnyChronicDiseases,Height,Weight,KnownAllergies,HistoryOfCancerInFamily,NumberOfMajorSurgeries,PremiumPrice,BMI
0,45,0,0,0,0,155,57,0,0,0,25000,23.725286
1,60,1,0,0,0,180,73,0,0,0,29000,22.530864
2,36,1,1,0,0,158,59,0,0,1,23000,23.634033
3,52,1,1,0,1,183,93,0,0,2,28000,27.770313
4,38,0,0,0,1,166,88,0,0,1,23000,31.934969


In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the BMI column.
df["BMI"].describe()

Unnamed: 0,BMI
count,986.0
mean,27.460709
std,5.878671
min,15.156281
25%,23.393392
50%,27.156602
75%,30.75987
max,50.0


In [None]:
#Feature Scaling

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

numerical_cols = ["BMI","Weight","Height","Age"]

# Fit the scaler on the training rows only. Fitting it on the full dataset
# would let the test rows influence the mean/std used for preprocessing
# (data leakage) and make the reported test metrics optimistic.
#
# The row positions are split with the SAME test_size and random_state that
# the model-selection cell uses for train_test_split(X, y, ...). For a fixed
# number of rows and a fixed seed the shuffle is identical, so these are
# exactly the rows that end up in X_train. Keep the two calls in sync.
train_rows, _ = train_test_split(np.arange(len(df)), test_size=0.20, random_state=42)

scaler = StandardScaler()
scaler.fit(df.iloc[train_rows][numerical_cols])

# Apply the training-set statistics to every row (train and test alike).
df[numerical_cols] = scaler.transform(df[numerical_cols])

df.head()

Unnamed: 0,Age,Diabetes,BloodPressureProblems,AnyTransplants,AnyChronicDiseases,Height,Weight,KnownAllergies,HistoryOfCancerInFamily,NumberOfMajorSurgeries,PremiumPrice,BMI
0,0.233197,0,0,0,0,-1.306105,-1.39925,0,0,0,25000,-0.635742
1,1.307981,1,0,0,0,1.170852,-0.277062,0,0,0,29000,-0.839024
2,-0.411674,1,1,0,0,-1.00887,-1.258976,0,0,1,23000,-0.651273
3,0.734763,1,1,0,1,1.468086,1.125674,0,0,2,28000,0.052692
4,-0.268369,0,0,0,1,-0.216244,0.77499,0,0,1,23000,0.761487


# 2. Model Selection

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [None]:
# Separate the target (PremiumPrice) from the predictors, then hold out 20% of
# the rows for testing with a fixed seed so the split is reproducible.
y = df["PremiumPrice"]
X = df.drop(columns=["PremiumPrice"])

X_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Linear Regression

# Fit on the training split and predict the held-out rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(x_test)

# Error metrics on the test split.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print('Linear model Metrics\n')
print("MSE:", mse)
print("MAE:", mae)
print("R2_score:", r2)

Linear model Metrics

MSE: 12210927.792979369
MAE: 2586.181710020061
R2_score: 0.7136461439649742


In [None]:
# Tree-based model: single decision tree

from sklearn.tree import DecisionTreeRegressor

# Fixed seed so the fitted tree is reproducible between runs.
dt_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
dt_y_pred = dt_model.predict(x_test)

# Error metrics on the test split.
dt_r2 = r2_score(y_test, dt_y_pred)
dt_mae = mean_absolute_error(y_test, dt_y_pred)
dt_mse = mean_squared_error(y_test, dt_y_pred)

print('Decision Tree Metrics\n')
print('MSE:', dt_mse)
print('MAE:', dt_mae)
print('R2_score:', dt_r2)

Decision Tree Metrics

MSE: 16065656.565656565
MAE: 1287.878787878788
R2_score: 0.6232503553124596


In [None]:
# Random Forest (default hyperparameters)

from sklearn.ensemble import RandomForestRegressor

# Fixed seed so the fitted ensemble is reproducible between runs.
rf_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)
rf_y_pred = rf_model.predict(x_test)

# Error metrics on the test split.
rf_r2 = r2_score(y_test, rf_y_pred)
rf_mae = mean_absolute_error(y_test, rf_y_pred)
rf_mse = mean_squared_error(y_test, rf_y_pred)

print('Random Forest Metrics\n')
print('MSE:', rf_mse)
print('MAE:', rf_mae)
print('R2_score:', rf_r2)

Random Forest Metrics

MSE: 4583632.828282828
MAE: 1025.5050505050506
R2_score: 0.8925109576209139


In [None]:
# Gradient Boosting

from sklearn.ensemble import GradientBoostingRegressor

# Fixed seed so the fitted ensemble is reproducible between runs.
gb_model = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)
gb_y_pred = gb_model.predict(x_test)

# Error metrics on the test split.
gb_r2 = r2_score(y_test, gb_y_pred)
gb_mae = mean_absolute_error(y_test, gb_y_pred)
gb_mse = mean_squared_error(y_test, gb_y_pred)

print('Gradient Boosting Metrics\n')
print('MSE:', gb_mse)
print('MAE:', gb_mae)
print('R2_score:', gb_r2)

Gradient Boosting Metrics

MSE: 5725274.915596395
MAE: 1525.3346483957273
R2_score: 0.8657387401894037


# Summary

Linear Regression:


* MSE: 12210927.792979369
* MAE: 2586.181710020061
* R2_score: 0.7136461439649742


Decision Tree Regression:



*  MSE: 16065656.565656565
* MAE: 1287.878787878788
* R2_score: 0.6232



Random Forest Regression:

* MSE: 4583632.828282828
* MAE: 1025.5050505050506
* R2_score: 0.8925



Gradient Boosting Regression:
* MSE: 5725274.915596395
* MAE: 1525.3346
* R2_score: 0.8657


**The Random Forest model showed the best performance: it has the lowest MSE and MAE and the highest R2 score of the four models.**






# Hyperparameter Tuning for Random Forest

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the random forest.
# The library defaults (max_features=1.0, i.e. all features, and
# max_depth=None, i.e. unlimited depth) are part of the grid on purpose:
# without them the search cannot select the baseline configuration, and the
# "optimized" model can end up worse than the untuned forest (previously
# R2 0.833 tuned vs 0.893 baseline on the test split).
param_grid = {
    'n_estimators': [100, 200],
    'max_features': [1.0, 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# 3-fold cross-validated exhaustive search on the training split only;
# candidates are ranked by (negated) mean squared error, all cores are used.
grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=42), param_grid=param_grid,
                           cv=3, n_jobs=-1, scoring='neg_mean_squared_error')

grid_search.fit(X_train, y_train)

# best_estimator_ is the winning configuration refitted on the full training split.
best_rf_model = grid_search.best_estimator_

# NOTE: the rf_* names below overwrite the baseline random-forest results
# computed in the earlier Random Forest cell.
rf_y_pred = best_rf_model.predict(x_test)

rf_mse = mean_squared_error(y_test, rf_y_pred)
rf_mae = mean_absolute_error(y_test, rf_y_pred)
rf_r2 = r2_score(y_test, rf_y_pred)

print('Optimized Random Forest\n')
print('MSE:', rf_mse)
print('MAE:', rf_mae)
print('R2:', rf_r2)

  _data = np.array(data, dtype=dtype, copy=copy,


Optimized Random Forest

MSE: 7101745.549242424
MAE: 1814.229797979798
R2: 0.833459647204337


In [None]:
import pickle

# Persist the tuned random forest.
with open('model.pkl', 'wb') as f:
    pickle.dump(best_rf_model, f)

# The model was trained on standardized BMI/Weight/Height/Age values, so the
# fitted scaler has to be shipped with it: at inference time raw inputs must
# go through this same scaler before being passed to the model.
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)