In [86]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# for data visualizations
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

# Plot defaults: wide figures and the FiveThirtyEight look for every chart below
plt.rcParams.update({'figure.figsize': (16, 5)})
plt.style.use('fivethirtyeight')

# Input datasets are mounted read-only under "../input/" on Kaggle;
# print every file found there so the CSV path used below can be verified
import os

for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Reading and Understanding the Data**

In [87]:
# Load the insurance dataset from the Kaggle input mount and preview the first rows
csv_path = '/kaggle/input/medical-insurance/Insurance.csv'
df = pd.read_csv(csv_path)
df.head()

In [88]:
# Number of missing values in each column
df.isnull().sum()

In [89]:
# Column dtypes, non-null counts and memory usage of the loaded frame
df.info()

In [90]:
# Summary statistics of the numeric columns, shaded so larger values stand out
numeric_summary = df.describe()
numeric_summary.style.background_gradient(cmap='Greens')

**Univariate Analysis**

In [91]:
import warnings
warnings.filterwarnings('ignore')

# Category counts are computed once and reused for both wedge sizes and labels
smoker_counts = df['smoker'].value_counts()
region_counts = df['region'].value_counts()

# One row of three panels; explicit axes avoid relying on pyplot's "current axes"
fig, (ax_smoker, ax_children, ax_region) = plt.subplots(1, 3)

# Titles use a slightly negative y so they sit underneath the pies
ax_smoker.set_title('Smokers', y=-0.01, fontsize=15)
ax_smoker.pie(smoker_counts.values,
              labels=smoker_counts.index,
              colors=['silver', 'black'],
              startangle=90,
              shadow=True,
              explode=[0.1, 0])

# Pass the column as `x=`: recent seaborn releases no longer accept it positionally
sns.countplot(x=df['children'], palette='magma', ax=ax_children)
ax_children.grid()

ax_region.set_title('Regions', y=-0.01, fontsize=15)
ax_region.pie(region_counts.values,
              labels=region_counts.index,
              colors=['gold', 'silver', 'grey', 'black'],
              startangle=90,
              shadow=True,
              explode=[0.1, 0, 0, 0])

fig.suptitle('Distribution of Smoker, Children and Regions', fontsize=20)
# Call show(); the bare name `plt.show` only echoes the function object
plt.show()

In [92]:
# Histogram with a KDE overlay for each numeric column.
# `sns.distplot` is deprecated, so `sns.histplot` is used with density scaling
# and a KDE curve, which is the replacement seaborn recommends.
fig, axes = plt.subplots(1, 3)

# (column, colour, x-axis label) for each panel, left to right
panels = (
    ('age', 'black', 'Age'),
    ('bmi', 'green', 'BMI'),
    ('charges', 'red', 'Charges'),
)
for ax, (column, shade, label) in zip(axes, panels):
    sns.histplot(df[column], kde=True, stat='density', kde_kws={'cut': 3},
                 color=shade, ax=ax)
    ax.set_xlabel(label)
    ax.grid()

fig.suptitle('Distribution of Age, BMI, and Expenses', fontsize=15)
plt.show()

**Bivariate Analysis**

In [93]:
import plotly.express as px

# Charges against age with an OLS trend line; the violin on the y margin
# shows the overall distribution of charges
age_vs_charges = px.scatter(
    df,
    x='age',
    y='charges',
    marginal_y='violin',
    trendline='ols',
)
age_vs_charges

In [94]:
# Side-by-side box plots of charges grouped by number of children and by smoker status
fig, (ax_children, ax_smoker) = plt.subplots(1, 2)

# x/y are passed as keywords: seaborn 0.12+ rejects positional data vectors
sns.boxplot(x=df['children'], y=df['charges'], ax=ax_children)
sns.boxplot(x=df['smoker'], y=df['charges'], ax=ax_smoker)

fig.suptitle('Impact of Smoking and Childrens on Expenses', fontsize=20)
plt.show()

In [95]:
# Charges against BMI; marker size encodes age and colour encodes smoker status
bmi_vs_charges = px.scatter(
    df,
    x="bmi",
    y="charges",
    color="smoker",
    size="age",
    size_max=15,
    hover_name="charges",
)
bmi_vs_charges

In [96]:
# Minimum, mean and maximum charges for each region, shaded by magnitude
charges_by_region = df[['charges', 'region']].groupby(['region'])
region_summary = charges_by_region.agg(['min', 'mean', 'max'])
region_summary.style.background_gradient(cmap='Wistia')

In [97]:
# Rows per smoker status (index) and region (columns); the cell value is the
# count of non-missing 'age' entries in that group
smoker_region_counts = pd.pivot_table(
    df,
    values=["age"],
    index=['smoker'],
    columns=['region'],
    aggfunc='count',
)
smoker_region_counts


In [98]:
# Encode the categorical columns as integers, overwriting them in `df`.
# NOTE: not idempotent - `.map` turns values missing from the dict into NaN,
# so run this cell only once per freshly loaded `df`.
sex_codes = {'male': 2, 'female': 1}
smoker_codes = {'yes': 2, 'no': 1}
region_codes = {'southeast': 1, 'southwest': 2, 'northeast': 3, 'northwest': 4}

df['sex'] = df['sex'].map(sex_codes)
df['smoker'] = df['smoker'].map(smoker_codes)
df['region'] = df['region'].replace(region_codes)

In [99]:
# Show how many rows fall into each encoded category
for heading, column in (('Region: ', 'region'), ('sex: ', 'sex'), ('Smoker: ', 'smoker')):
    print(heading)
    print('-----------')
    print(df[column].value_counts())


In [100]:

# Same smoker-by-region count table as before, now on the integer-encoded columns
encoded_counts = pd.pivot_table(
    df,
    values=["age"],
    index=['smoker'],
    columns=['region'],
    aggfunc='count',
)
encoded_counts

In [102]:
# Predictors are every column except the target; the target is `charges`
x = df.drop(columns=['charges'])
y = df['charges']

print(y.shape)
print(x.columns)

**Data Splitting**

In [103]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

# Shapes in the order: train features, test features, train target, test target
for part in (x_train, x_test, y_train, y_test):
    print(part.shape)

**Standardization**

In [104]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply that same scaling to
# both splits so the test rows do not influence the fitted statistics
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

# **Linear Regression**

In [105]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the scaled training data and predict the hold-out set
model1 = LinearRegression().fit(x_train, y_train)
y_pred1 = model1.predict(x_test)

In [106]:
from sklearn.metrics import r2_score, mean_squared_error

# Hold-out error of the linear model; RMSE is in the same units as `charges`
mse = mean_squared_error(y_test, y_pred1)
rmse = np.sqrt(mse)
print("RMSE Score :", rmse)

# Store the result under its own name: assigning it to `r2_score` would replace
# the imported function and break any later call that does not re-import it
r2 = r2_score(y_test, y_pred1)
print("R2 Score :", r2)

# **Random Forest Regressor**

In [107]:
from sklearn.ensemble import RandomForestRegressor

# Fix the seed (same value as the train/test split) so the forest, and therefore
# the scores printed below, are identical on every Restart & Run All
model2 = RandomForestRegressor(random_state=0)
model2.fit(x_train, y_train)

y_pred2 = model2.predict(x_test)

# lets check the Model accuracy
from sklearn.metrics import r2_score, mean_squared_error

mse = mean_squared_error(y_test, y_pred2)
rmse = np.sqrt(mse)
print("RMSE Score :", rmse)

# Do not assign the result to `r2_score`: that would overwrite the imported function
r2 = r2_score(y_test, y_pred2)
print("R2 Score :", r2)

# **Gradient Boosting**

In [108]:
from sklearn.ensemble import GradientBoostingRegressor

# Fix the seed (same value as the train/test split) so the boosted model, and
# therefore the scores printed below, are identical on every Restart & Run All
model3 = GradientBoostingRegressor(random_state=0)
model3.fit(x_train, y_train)

y_pred3 = model3.predict(x_test)

# lets check the Model accuracy
from sklearn.metrics import r2_score, mean_squared_error

mse = mean_squared_error(y_test, y_pred3)
rmse = np.sqrt(mse)
print("RMSE Score :", rmse)

# Do not assign the result to `r2_score`: that would overwrite the imported function
r2 = r2_score(y_test, y_pred3)
print("R2 Score :", r2)

In [109]:
from sklearn.metrics import r2_score, mean_squared_error

# Simple ensemble: unweighted mean of the three models' hold-out predictions
avg_model = (y_pred1 + y_pred2 + y_pred3)/3
mse = mean_squared_error(y_test, avg_model)
rmse = np.sqrt(mse)
print("RMSE Score :", rmse)

# Do not assign the result to `r2_score`: that would overwrite the imported function
r2 = r2_score(y_test, avg_model)
print("R2 Score :", r2)

In [110]:
# Weighted-average ensemble of the three models' hold-out predictions:
#   50% gradient boosting, 30% random forest, 20% linear regression
weight_avg_model = 0.2*y_pred1 + 0.3*y_pred2 + 0.5*y_pred3

# lets check the Model accuracy
from sklearn.metrics import r2_score, mean_squared_error

mse = mean_squared_error(y_test, weight_avg_model)
rmse = np.sqrt(mse)
print("RMSE Score :", rmse)

# Do not assign the result to `r2_score`: that would overwrite the imported function
r2 = r2_score(y_test, weight_avg_model)
print("R2 Score :", r2)

In [111]:
from sklearn.model_selection import cross_val_score

# 6-fold cross-validation of the gradient boosting model on the full,
# unscaled feature table; one score is printed per fold
scores = cross_val_score(estimator=model3, X=x, y=y, cv=6)
print(scores)

**Comparison of the models' R² scores**

In [113]:
from sklearn.metrics import r2_score

# Compute each model's hold-out R2 from its predictions instead of hard-coding
# the numbers, so the chart always matches the models fitted above
labels = np.array(['Linear Regression', 'Random Forest', 'Gradient Boosting'])
model_r2 = np.array([r2_score(y_test, pred) for pred in (y_pred1, y_pred2, y_pred3)])

# Sort ascending and reorder the labels with the same index so every bar keeps
# its own name; one colour per bar
index = np.argsort(model_r2)
color = plt.cm.rainbow(np.linspace(0, 1, len(index)))

plt.bar(range(len(index)), model_r2[index], color = color)
plt.xticks(range(len(index)), labels[index], rotation = 90)
plt.title('Comparison of r2 Score', fontsize = 15)
plt.show()

Among the regression algorithms compared (*Linear Regression*, *Random Forest*, and *Gradient Boosting*), the **Gradient Boosting** algorithm achieves the highest R² score, about 0.89 (89%).