In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
import matplotlib
# Global plotting defaults: 8x6-inch figures with the background grid disabled.
mpl.rcParams['figure.figsize'] = (8, 6)
mpl.rcParams['axes.grid'] = False
# Display every column of a DataFrame instead of eliding the middle ones.
# (The previous `pd.get_option(...)` call only read the setting and changed nothing.)
pd.set_option("display.max_columns", None)

In [None]:
# Load the insurance dataset from the Kaggle input mount into `dataframe`.
dataframe = pd.read_csv('/kaggle/input/insurance/insurance.csv')

In [None]:
# Work on a copy so that later edits to `df` do not modify the frame read from disk.
df = dataframe.copy()

In [None]:
# Report the dataset dimensions: number of columns first, then number of rows.
n_rows, n_cols = df.shape
print("Total Features : ", n_cols)
print("Total Data     : ", n_rows)

In [None]:
# Show the column names as a plain Python list.
column_names = list(df.columns)
print("Featues : ", column_names)

In [None]:
# Print the per-column dtype and non-null count summary.
df.info()

**We have 3 string features (sex, smoker, region) and 4 numeric features.**

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

**We have data for both young and old people. The number of children ranges from 0 to 5, and the data suggests that a large part of the population most probably has 1 child.**

In [None]:
# Draw the boolean null mask as a heatmap; any missing cell would stand out
# as a contrasting mark against the uniform background.
null_mask = df.isnull()
axis = sns.heatmap(null_mask, cmap='viridis', cbar=False)
axis.set_title('Heatmap for Null Values', fontsize=18)
axis.set_xlabel('Features', fontsize=14)
axis.set_ylabel('Count', fontsize=14)
plt.show()

**We don't have any null values in our dataset**

In [None]:
# Peek at the first two rows.
df.head(2)

In [None]:
# Histogram of the target column `charges` with a kernel-density overlay.
axis = sns.histplot(data=df, x='charges', kde=True)
axis.set_title('Distribution of the Expense variable.', fontsize=18)
axis.set_xlabel('Medical Cahrges', fontsize=14)
axis.set_ylabel('Count', fontsize=14)
plt.show()

**Most of the expenses lie between 0 and 15,000; it is mostly the severe cases that require a large amount of money to treat.**

In [None]:
# Collect the names of all object-dtype columns; these are treated as categorical.
cat_feat = [name for name, dtype in df.dtypes.items() if dtype == 'O']
print("Categorical Features : ", cat_feat)

**Categorical Features**

In [None]:
# List the distinct values found in each categorical column.
for col in cat_feat:
    distinct_values = df[col].unique()
    print("Unique variables : ", distinct_values)

**Unique values in our categorical features**

In [None]:
# One count plot per categorical column, with each bar annotated by its
# share of all rows (in percent) at the bar's vertical midpoint.
for col in cat_feat:
    axis = sns.countplot(data=df, x=col, palette='tab10')
    for bar in axis.patches:
        bar_height = bar.get_height()
        share = np.round(bar_height / len(df) * 100, 1)
        axis.text(
            x=bar.get_x() + bar.get_width() / 2,
            y=bar_height / 2,
            s=f"{share}%",
            ha='center',
            size=25,
            rotation=0,
            weight='bold',
            color='black',
        )
    plt.xlabel(col, size=14)
    plt.ylabel('Count', size=14)
    plt.xticks(rotation=90)
    plt.show()

**We have Data :-**
* Gender is split approximately 50% : 50%.
* Smokers and non-smokers are in the ratio of approximately 20% : 80%.
* Almost equivalent cases from all the four regions.

In [None]:
# Every column whose dtype is not object is treated as numerical.
num_feat = [name for name, dtype in df.dtypes.items() if dtype != 'O']
print("Numerical Features : ", num_feat)

In [None]:
# One 20-bin histogram (with KDE overlay) per numerical column; every bar is
# annotated with its share of all rows (in percent) at the bar's midpoint.
for col in num_feat:
    plt.figure(figsize=(18, 8))
    axis = sns.histplot(data=df, x=col, bins=20, kde=True, palette='rocket')
    for bar in axis.patches:
        bar_height = bar.get_height()
        share = np.round(bar_height / len(df) * 100, 1)
        axis.text(
            x=bar.get_x() + bar.get_width() / 2,
            y=bar_height / 2,
            s=f"{share}%",
            ha='center',
            size=15,
            rotation=0,
            weight='bold',
            color='black',
        )
    plt.xlabel(col, size=14)
    plt.ylabel('Count', size=14)
    plt.xticks(rotation=90)
    plt.show()

**From the above Histogram**
* Age is roughly uniformly distributed, so most probably it will not affect our dependent variable, but we will check its correlation with the dependent variable later.
* BMI forms a nice bell curve, meaning the data is approximately normally distributed.
* Children data is right-skewed (most values are low, with a tail towards 5); if it turns out to be significant for our dependent variable, we will transform it towards a normal distribution.
* Charges is our dependent variable, which we are going to predict.

In [None]:
# Scatter plot of charges against age on a 12x8-inch figure.
plt.figure(figsize=(12, 8))
axis = sns.scatterplot(data=df, x='age', y='charges')
axis.set_xlabel('age', fontsize=14)
axis.set_ylabel('Charges', fontsize=14)
plt.show()

**Medical charges tend to increase with age, but we also have some cases where people of low age have large medical expenses.**

In [None]:
# Scatter plot of charges against BMI on a 12x8-inch figure.
plt.figure(figsize=(12, 8))
axis = sns.scatterplot(data=df, x='bmi', y='charges')
axis.set_xlabel('BMI', fontsize=14)
axis.set_ylabel('Charges', fontsize=14)
plt.show()

**We can see that charges for people with mid-range BMI are quite a bit higher than for people with low or high BMI, and of course we still have some exceptions.**

In [None]:
# Box plot of charges for each categorical feature, stacked vertically.
# The grid size follows len(cat_feat) instead of a hard-coded 3, so no feature
# is silently dropped by zip() if the list changes; squeeze=False keeps `ax`
# a 2-D array even when there is only one feature, so .flatten() always works.
n_cat = len(cat_feat)
fig, ax = plt.subplots(n_cat, 1, figsize=(6, 4 * n_cat), squeeze=False)
for var, subplot in zip(cat_feat, ax.flatten()):
    sns.boxplot(x=var, y='charges', data=df, ax=subplot)
fig.tight_layout()  # keep x-labels from overlapping the panel below
plt.show()

**From the above subplots, Boxplots We can find:-**
* Average charges for males are higher than for females.
* Charges for smokers are much higher than for non-smokers.
* The average charges for the southeast region are higher than for all the other regions, and we can also conclude that the southwest region may have fewer cases or only minor cases.


In [None]:
# Peek at the first three rows before encoding the categorical columns.
df.head(3)

In [None]:
# Binary-encode sex: female -> 0, male -> 1.
# An explicit mapping plus astype(int) avoids relying on replace() to downcast
# the object column to integers, which recent pandas versions deprecate.
# Values that are not keys of the mapping pass through unchanged, so re-running
# this cell on already-encoded data is a no-op; anything that cannot be cast to
# int makes astype raise instead of leaving a mixed-type column behind.
sex_codes = {'female': 0, 'male': 1}
df['sex'] = df['sex'].map(lambda value: sex_codes.get(value, value)).astype(int)

In [None]:
# Binary-encode smoker: no -> 0, yes -> 1.
# An explicit mapping plus astype(int) avoids relying on replace() to downcast
# the object column to integers, which recent pandas versions deprecate.
# Values that are not keys of the mapping pass through unchanged, so re-running
# this cell on already-encoded data is a no-op; anything that cannot be cast to
# int makes astype raise instead of leaving a mixed-type column behind.
smoker_codes = {'no': 0, 'yes': 1}
df['smoker'] = df['smoker'].map(lambda value: smoker_codes.get(value, value)).astype(int)

In [None]:
# One-hot encode `region`: one indicator column per distinct region value.
# NOTE(review): all levels are kept (no drop_first), so the indicator columns
# always sum to 1 — confirm this is intended for the linear model.
region = pd.get_dummies(df['region'])
# Display the encoded frame as the cell output.
region

In [None]:
# Append the region indicator columns to `df` column-wise.
# Re-running this cell appends the same columns a second time.
df = pd.concat([df,region],axis=1)

In [None]:
# Columns to remove: the original string `region` column, now replaced by its indicator columns.
drop_columns = ['region']

In [None]:
# Remove the columns listed in `drop_columns` from the working frame.
df = df.drop(columns=drop_columns)

In [None]:
# Peek at the first two rows of the encoded frame.
df.head(2)

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of `df`.
plt.figure(figsize=(10, 8))
corr_matrix = df.corr()
axis = sns.heatmap(corr_matrix, annot=True, cbar=True, cmap='coolwarm')
axis.set_title('correlation Heatmap', fontsize=18)
plt.show()

In [None]:
# Split the frame into the target (`charges`) and the feature matrix (everything else).
y = df['charges']
X = df.drop(columns=['charges'])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=101)

In [None]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [None]:
# Feature selector built around a seeded Lasso (alpha=0.05); selection is based on the fitted model.
selector = SelectFromModel(Lasso(alpha=0.05,random_state=101))

In [None]:
# Fit the selector (and its inner Lasso) on the training split only.
selector.fit(X_train,y_train)

In [None]:
# Boolean mask over the feature columns showing which ones the selector keeps.
# The mask is only displayed here; the models below are trained on all of X_train.
selector.get_support()

## **Linear Regression**

In [None]:
# Ordinary least-squares baseline: fit on the training split, predict the test split.
lr_regressor = LinearRegression().fit(X_train, y_train)
lr_y = lr_regressor.predict(X_test)

In [None]:
# Distribution of the test-set residuals (prediction minus actual charge).
# sns.distplot is deprecated and removed in recent seaborn releases; histplot
# with kde=True and stat='density' draws the equivalent density histogram.
lr_residuals = lr_y - y_test
axis = sns.histplot(lr_residuals, kde=True, stat='density')
axis.set_title('Linear Regression residuals', size=18)
axis.set_xlabel('Predicted - actual charges', size=14)
plt.show()

**Performance Metrics for Linear Regression**

In [None]:
# R^2 on both splits plus MAE / MSE / RMSE on the test split.
# Metric arguments follow scikit-learn's (y_true, y_pred) order; the values are
# unchanged for these symmetric metrics, but the order now matches the API and
# the summary table. MSE is computed once and reused for the RMSE.
lr_mse = mean_squared_error(y_test, lr_y)
print('Coefficient o R^2 <-- on train data : {}'.format(lr_regressor.score(X_train,y_train)))
print('Coefficient o R^2 <-- on test data : {}'.format(lr_regressor.score(X_test,y_test)))
print("Mean absolute Error : ",mean_absolute_error(y_test, lr_y))
print("Mean squared Error : ",lr_mse)
print("Root Mean Squared Error : ",np.sqrt(lr_mse))

## **SVR**

In [None]:
# Support-vector regressor with default hyper-parameters.
# NOTE(review): the features are passed in unscaled; SVR is usually sensitive
# to feature/target scale — confirm whether standardisation was intended.
svr_regressor = SVR().fit(X_train, y_train)
svr_y = svr_regressor.predict(X_test)

In [None]:
# Distribution of the test-set residuals (prediction minus actual charge).
# sns.distplot is deprecated and removed in recent seaborn releases; histplot
# with kde=True and stat='density' draws the equivalent density histogram.
svr_residuals = svr_y - y_test
axis = sns.histplot(svr_residuals, kde=True, stat='density')
axis.set_title('SVR residuals', size=18)
axis.set_xlabel('Predicted - actual charges', size=14)
plt.show()

**Performance Metrics for SVR**

In [None]:
# R^2 on both splits plus MAE / MSE / RMSE on the test split.
# Metric arguments follow scikit-learn's (y_true, y_pred) order; the values are
# unchanged for these symmetric metrics, but the order now matches the API and
# the summary table. MSE is computed once and reused for the RMSE.
svr_mse = mean_squared_error(y_test, svr_y)
print('Coefficient o R^2 <-- on train data : {}'.format(svr_regressor.score(X_train,y_train)))
print('Coefficient o R^2 <-- on test data : {}'.format(svr_regressor.score(X_test,y_test)))
print("Mean absolute Error : ",mean_absolute_error(y_test, svr_y))
print("Mean squared Error : ",svr_mse)
print("Root Mean Squared Error : ",np.sqrt(svr_mse))

## **K Nearest Neighbour**

In [None]:
# k-nearest-neighbours regressor with default hyper-parameters.
# NOTE(review): distances are computed on unscaled features — confirm whether
# standardisation was intended.
knn_regressor = KNeighborsRegressor().fit(X_train, y_train)
knn_y = knn_regressor.predict(X_test)

In [None]:
# Distribution of the test-set residuals (prediction minus actual charge).
# sns.distplot is deprecated and removed in recent seaborn releases; histplot
# with kde=True and stat='density' draws the equivalent density histogram.
knn_residuals = knn_y - y_test
axis = sns.histplot(knn_residuals, kde=True, stat='density')
axis.set_title('K Nearest Neighbour residuals', size=18)
axis.set_xlabel('Predicted - actual charges', size=14)
plt.show()

**Performance Metrics for KNN**

In [None]:
# R^2 on both splits plus MAE / MSE / RMSE on the test split.
# Metric arguments follow scikit-learn's (y_true, y_pred) order; the values are
# unchanged for these symmetric metrics, but the order now matches the API and
# the summary table. MSE is computed once and reused for the RMSE.
knn_mse = mean_squared_error(y_test, knn_y)
print('Coefficient o R^2 <-- on train data : {}'.format(knn_regressor.score(X_train,y_train)))
print('Coefficient o R^2 <-- on test data : {}'.format(knn_regressor.score(X_test,y_test)))
print("Mean absolute Error : ",mean_absolute_error(y_test, knn_y))
print("Mean squared Error : ",knn_mse)
print("Root Mean Squared Error : ",np.sqrt(knn_mse))

## **Decision Tree**

In [None]:
# Decision-tree regressor with default hyper-parameters.
# random_state is fixed (same seed as the train/test split) so the fitted tree,
# and therefore the reported metrics, are reproducible across runs.
dt_regressor = DecisionTreeRegressor(random_state=101)
dt_regressor.fit(X_train,y_train)
dt_y = dt_regressor.predict(X_test)

In [None]:
# Distribution of the test-set residuals (prediction minus actual charge).
# sns.distplot is deprecated and removed in recent seaborn releases; histplot
# with kde=True and stat='density' draws the equivalent density histogram.
dt_residuals = dt_y - y_test
axis = sns.histplot(dt_residuals, kde=True, stat='density')
axis.set_title('Decision Tree residuals', size=18)
axis.set_xlabel('Predicted - actual charges', size=14)
plt.show()

**Performance Metrics for Decision Tree**

In [None]:
# R^2 on both splits plus MAE / MSE / RMSE on the test split.
# Metric arguments follow scikit-learn's (y_true, y_pred) order; the values are
# unchanged for these symmetric metrics, but the order now matches the API and
# the summary table. MSE is computed once and reused for the RMSE.
dt_mse = mean_squared_error(y_test, dt_y)
print('Coefficient o R^2 <-- on train data : {}'.format(dt_regressor.score(X_train,y_train)))
print('Coefficient o R^2 <-- on test data : {}'.format(dt_regressor.score(X_test,y_test)))
print("Mean absolute Error : ",mean_absolute_error(y_test, dt_y))
print("Mean squared Error : ",dt_mse)
print("Root Mean Squared Error : ",np.sqrt(dt_mse))

## **Random Forest**

In [None]:
# Random-forest regressor with default hyper-parameters.
# random_state is fixed (same seed as the train/test split) so the bootstrap
# samples, and therefore the reported metrics, are reproducible across runs.
rf_regressor = RandomForestRegressor(random_state=101)
rf_regressor.fit(X_train,y_train)
rf_y = rf_regressor.predict(X_test)

In [None]:
# Distribution of the test-set residuals (prediction minus actual charge).
# sns.distplot is deprecated and removed in recent seaborn releases; histplot
# with kde=True and stat='density' draws the equivalent density histogram.
rf_residuals = rf_y - y_test
axis = sns.histplot(rf_residuals, kde=True, stat='density')
axis.set_title('Random Forest residuals', size=18)
axis.set_xlabel('Predicted - actual charges', size=14)
plt.show()

**Performance Metrics for Random Forest**

In [None]:
# R^2 on both splits plus MAE / MSE / RMSE on the test split.
# Metric arguments follow scikit-learn's (y_true, y_pred) order; the values are
# unchanged for these symmetric metrics, but the order now matches the API and
# the summary table. MSE is computed once and reused for the RMSE.
rf_mse = mean_squared_error(y_test, rf_y)
print('Coefficient o R^2 <-- on train data : {}'.format(rf_regressor.score(X_train,y_train)))
print('Coefficient o R^2 <-- on test data : {}'.format(rf_regressor.score(X_test,y_test)))
print("Mean absolute Error : ",mean_absolute_error(y_test, rf_y))
print("Mean squared Error : ",rf_mse)
print("Root Mean Squared Error : ",np.sqrt(rf_mse))

## **XGBoost Regressor**

In [None]:
# Gradient-boosted trees (XGBoost) with default hyper-parameters.
xgb_regressor = XGBRegressor().fit(X_train, y_train)
xgb_y = xgb_regressor.predict(X_test)

In [None]:
# Distribution of the test-set residuals (prediction minus actual charge).
# sns.distplot is deprecated and removed in recent seaborn releases; histplot
# with kde=True and stat='density' draws the equivalent density histogram.
xgb_residuals = xgb_y - y_test
axis = sns.histplot(xgb_residuals, kde=True, stat='density')
axis.set_title('XGBoost residuals', size=18)
axis.set_xlabel('Predicted - actual charges', size=14)
plt.show()

**Performance Metrics for XGBoost Regressor**

In [None]:
# R^2 on both splits plus MAE / MSE / RMSE on the test split.
# Metric arguments follow scikit-learn's (y_true, y_pred) order; the values are
# unchanged for these symmetric metrics, but the order now matches the API and
# the summary table. MSE is computed once and reused for the RMSE.
xgb_mse = mean_squared_error(y_test, xgb_y)
print('Coefficient o R^2 <-- on train data : {}'.format(xgb_regressor.score(X_train,y_train)))
print('Coefficient o R^2 <-- on test data : {}'.format(xgb_regressor.score(X_test,y_test)))
print("Mean absolute Error : ",mean_absolute_error(y_test, xgb_y))
print("Mean squared Error : ",xgb_mse)
print("Root Mean Squared Error : ",np.sqrt(xgb_mse))

## **Adaboost**

In [None]:
# AdaBoost regressor with default hyper-parameters.
# random_state is fixed (same seed as the train/test split) so the boosting
# rounds, and therefore the reported metrics, are reproducible across runs.
adab_regressor = AdaBoostRegressor(random_state=101)
adab_regressor.fit(X_train,y_train)
adb_y = adab_regressor.predict(X_test)

In [None]:
# Distribution of the test-set residuals (prediction minus actual charge).
# sns.distplot is deprecated and removed in recent seaborn releases; histplot
# with kde=True and stat='density' draws the equivalent density histogram.
adb_residuals = adb_y - y_test
axis = sns.histplot(adb_residuals, kde=True, stat='density')
axis.set_title('AdaBoost residuals', size=18)
axis.set_xlabel('Predicted - actual charges', size=14)
plt.show()

**Performance Metrics for Adaboost**

In [None]:
# R^2 on both splits plus MAE / MSE / RMSE on the test split.
# Metric arguments follow scikit-learn's (y_true, y_pred) order; the values are
# unchanged for these symmetric metrics, but the order now matches the API and
# the summary table. MSE is computed once and reused for the RMSE.
adb_mse = mean_squared_error(y_test, adb_y)
print('Coefficient o R^2 <-- on train data : {}'.format(adab_regressor.score(X_train,y_train)))
print('Coefficient o R^2 <-- on test data : {}'.format(adab_regressor.score(X_test,y_test)))
print("Mean absolute Error : ",mean_absolute_error(y_test, adb_y))
print("Mean squared Error : ",adb_mse)
print("Root Mean Squared Error : ",np.sqrt(adb_mse))

# **Performance Metrics**

In [None]:
# Row labels of the per-model metric frames. The third entry used to read
# "Root Mean Absolute Error", but the value stored there is the plain MAE.
METRIC_LABELS = ["R2 Score Train", "R2 Score Test", "Mean Absolute Error",
                 "Mean Square Error", "Root Mean Square Error"]
# Metrics that are fractions and therefore sensible to print as percentages.
R2_LABELS = ("R2 Score Train", "R2 Score Test")


def _metrics_frame(model, predictions, column_name):
    """Return a one-column DataFrame of evaluation metrics for a fitted model.

    Parameters
    ----------
    model : fitted estimator exposing ``score(X, y)``.
    predictions : the model's predictions for ``X_test``.
    column_name : label used as the column header.

    Returns
    -------
    pandas.DataFrame indexed by METRIC_LABELS: R^2 on the train and test
    splits, then MAE, MSE and RMSE on the test split.
    """
    mse = mean_squared_error(y_test, predictions)
    values = [
        model.score(X_train, y_train),
        model.score(X_test, y_test),
        mean_absolute_error(y_test, predictions),
        mse,
        np.sqrt(mse),
    ]
    return pd.DataFrame(data=values, columns=[column_name], index=METRIC_LABELS)


lr_df = _metrics_frame(lr_regressor, lr_y, 'Linear Regression')
svr_df = _metrics_frame(svr_regressor, svr_y, 'Support Vector Regressor')
knn_df = _metrics_frame(knn_regressor, knn_y, 'K Nearest Neighbour')
dt_df = _metrics_frame(dt_regressor, dt_y, 'Decision Tree')
rf_df = _metrics_frame(rf_regressor, rf_y, 'Random Forest Score')
xgb_df = _metrics_frame(xgb_regressor, xgb_y, 'XGBoost')
adb_df = _metrics_frame(adab_regressor, adb_y, 'AdaBoost')

# One column per model, one row per metric, rounded to three decimals.
df_models = pd.concat([lr_df,svr_df,knn_df,dt_df,rf_df,xgb_df,adb_df], axis=1).round(3)
colors = ["bisque","ivory","sandybrown","steelblue","lightsalmon"]
colormap = matplotlib.colors.LinearSegmentedColormap.from_list("", colors)

background_color = "white"

# Models as rows, metrics as columns.
comparison_table = df_models.T
# Cell text is formatted per metric: R^2 values as percentages, error metrics
# as plain numbers. A single ".1%" format would render an MAE of a few
# thousand as a meaningless six-digit percentage.
annotation_text = comparison_table.apply(
    lambda col: col.map(("{:.1%}" if col.name in R2_LABELS else "{:,.1f}").format)
)

fig = plt.figure(figsize=(18,26)) # create figure
gs = fig.add_gridspec(4, 2)
gs.update(wspace=0.1, hspace=0.5)
ax0 = fig.add_subplot(gs[0, :])

# fmt="" because the annotations are already strings. The colour scale is
# left at 0..0.95, so only the R^2 columns show colour variation; the error
# columns all saturate at the top of the colormap.
sns.heatmap(comparison_table, cmap=colormap, annot=annotation_text.to_numpy(), fmt="",
            vmin=0, vmax=0.95, linewidths=2.5, cbar=False, ax=ax0, annot_kws={"fontsize":16})
fig.patch.set_facecolor(background_color) # figure background color
ax0.set_facecolor(background_color)

ax0.text(0,-0.5,'Model Comparison',fontsize=20,fontweight='bold',fontfamily='serif')
plt.show()