In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly import tools
from plotly.subplots import make_subplots

In [None]:
# Read the insurance CSV from the Kaggle input directory and preview the first rows.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/insurance/insurance.csv',
)
df.head(n=5)

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

In [None]:
# Print the column names, dtypes and non-null counts of the frame.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the frame's columns.
df.describe()

In [None]:
# Count missing entries per column (isnull is the pandas alias of isna).
df.isnull().sum()

# Univariate Analysis

In [None]:
# Density-normalised histogram of age with a KDE overlay.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is its replacement.
sns.histplot(data=df, x='age', kde=True, stat='density')
plt.title('Age Distribution')
plt.show()

In [None]:
# Interactive histogram of age using 2-year-wide bins.
fig = go.Figure(
    data=go.Histogram(
        x=df['age'],
        xbins={'size': 2},
        opacity=1,
    )
)

# Titles, hidden grid lines, bar spacing, theme and canvas size.
fig.update_layout(
    title_text='Age Distribution',
    xaxis_title='Age',
    yaxis_title='Count',
    xaxis_showgrid=False,
    yaxis_showgrid=False,
    bargap=0.5,
    template='ggplot2',
    height=500,
    width=900,
)
fig.show()

In [None]:
def _count_bar(column):
    """Return a bar trace of the value counts of ``df[column]``.

    The bar heights are the raw counts; the text labels carry each category's
    share of all rows, in percent. The counts are computed once per column.
    """
    counts = df[column].value_counts()
    return go.Bar(x=counts.index, y=counts, text=counts / len(df[column]) * 100)


# One trace per categorical/discrete column.
trace0 = _count_bar('sex')
trace1 = _count_bar('smoker')
trace2 = _count_bar('region')
trace3 = _count_bar('children')

# 2x2 grid of bar charts, one panel per column.
fig = make_subplots(
    rows=2,
    cols=2,
    specs=[[{'type': 'bar'}, {'type': 'bar'}],
           [{'type': 'bar'}, {'type': 'bar'}]],
    subplot_titles=['Male  VS  Female', 'Smoker  VS NoN-Smoker', 'Regions', 'Children'],
)

# add_trace(row=..., col=...) replaces the deprecated append_trace.
fig.add_trace(trace0, row=1, col=1)
fig.add_trace(trace1, row=1, col=2)
fig.add_trace(trace2, row=2, col=1)
fig.add_trace(trace3, row=2, col=2)

# The same colour list is applied to every panel; the text shows the percentage share.
fig.update_traces(
    marker_color=['lightblue', 'lightpink', 'aquamarine', 'silver', 'snow', 'slategray'],
    textposition='outside',
    texttemplate='%{text:.4s}%',
)
fig.update_layout(height=1000, width=1200, title='Univariate Analysis')

fig.show()

In [None]:
# Density-normalised histogram of BMI with a KDE overlay.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is its replacement.
sns.histplot(data=df, x='bmi', kde=True, stat='density')
plt.title('BMI Distribution')
plt.show()

In [None]:
# Interactive histogram of BMI using bins of width 2.
fig = go.Figure(
    data=go.Histogram(
        x=df['bmi'],
        xbins={'size': 2},
        opacity=1,
    )
)

# Titles, hidden grid lines, theme, bar spacing and canvas size.
fig.update_layout(
    title_text='BMI Distribution',
    xaxis_title='BMI',
    yaxis_title='Count',
    xaxis_showgrid=False,
    yaxis_showgrid=False,
    template='seaborn',
    bargap=0.5,
    height=500,
    width=600,
)

fig.show()

In [None]:
# Density-normalised histogram of charges with a KDE overlay.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is its replacement.
sns.histplot(data=df, x='charges', kde=True, stat='density')
plt.title('Charges Distribution')
plt.show()

In [None]:
# Interactive histogram of the charges column, requesting up to 200 bins.
fig = px.histogram(
    data_frame=df,
    x='charges',
    nbins=200,
    title='Charges',
)
fig.show()

# Bivariate Analysis

In [None]:
# One box of charges per distinct age value.
fig = px.box(
    data_frame=df,
    x='age',
    y='charges',
    title='Medical Charges According To Age',
)
fig.show()

In [None]:
# Mean charges per age, ordered from the most to the least expensive age.
# NOTE: despite its name, `ax` holds the aggregated DataFrame (kept for compatibility).
ax = df.groupby('age')['charges'].mean().reset_index().sort_values(by='charges', ascending=False)

# Plot charges against age explicitly. Without x/y, pandas draws bars for BOTH
# columns against the positional index, so the x axis would not show ages.
ax.plot(kind='bar', x='age', y='charges', figsize=(15,7), color='lightgreen', legend=False)
plt.xlabel('Age', fontsize=12)
plt.ylabel('Average Charges', fontsize=12)
plt.title('Average Charges According To Age', fontsize=17)
plt.show()

In [None]:
# Box plot of charges split by sex.
# Passing the frame plus column names (as the other box plots do) labels the axes
# with the column names instead of the generic "x"/"y".
fig = px.box(df, x='sex', y='charges', height=500, width=700,
             title='Medical Charges Of Male VS Female')
fig.show()

In [None]:
# Mean charges per sex.
gen = df.groupby('sex')['charges'].mean().reset_index()

# Pass the aggregated frame so the axes are labelled with the column names
# rather than the generic "x"/"y".
fig = px.bar(gen, x='sex', y='charges', height=500, width=600,
             title='Average Medical Charges of Male and Female')
# Colours are assigned in row order of `gen`.
fig.update_traces(marker_color=['lightpink','lightblue'])
fig.show()

In [None]:
# Box plot of charges for each number of children.
fig = px.box(
    data_frame=df,
    x='children',
    y='charges',
)
fig.show()

In [None]:
# Mean charges per number of children, highest average first.
child = df.groupby('children')['charges'].mean().reset_index().sort_values(by='charges',ascending=False)

# Pass the aggregated frame so the axes are labelled with the column names
# rather than the generic "x"/"y".
fig = px.bar(child, x='children', y='charges', height=500, width=700,
             title='Average Medical Charges With Children')
# Colours follow the row order of `child` (descending average charges).
fig.update_traces(marker_color=['teal','darkcyan','cadetblue','lightseagreen','darkturquoise','mediumturquoise'])
fig.show()

In [None]:
# Box plot of charges for smokers versus non-smokers.
fig = px.box(
    data_frame=df,
    x='smoker',
    y='charges',
    title='Medical Charges Of Smoker VS NoN-Smoker',
    height=500,
    width=600,
)
fig.show()

In [None]:
# Mean charges per smoker category.
smoker = df.groupby('smoker')['charges'].mean().reset_index()

# Pass the aggregated frame so the axes are labelled with the column names
# rather than the generic "x"/"y".
fig = px.bar(smoker, x='smoker', y='charges', height=500, width=600,
             title='Average Medical Charges OF Smoker VS NoN-Smoker')
# Colour name without the stray trailing space; colours follow the row order of `smoker`.
fig.update_traces(marker_color=['gainsboro','gray'])
# Render explicitly, as every other plotly cell in this notebook does.
fig.show()

In [None]:
# Box plot of charges for each region.
fig = px.box(
    data_frame=df,
    x='region',
    y='charges',
    height=500,
    width=700,
    title='Medical Charges In US Regions',
)
fig.show()

In [None]:
# Mean charges per region.
regions = df.groupby('region')['charges'].mean().reset_index()

# Pass the aggregated frame so the axes are labelled with the column names
# rather than the generic "x"/"y".
fig = px.bar(regions, x='region', y='charges', height=500, width=700,
             title='Average Medical Charges In US Regions')
# Colours follow the row order of `regions`; any surplus colours are unused.
fig.update_traces(marker_color=['darkorange','orange','gold','orangered','tomato'])
fig.show()

In [None]:
# Scatter of charges against BMI, one point per row of the frame.
fig = px.scatter(
    data_frame=df,
    x='bmi',
    y='charges',
    height=500,
    width=700,
    title='Medical Charges Of Different BMI',
)
fig.show()

In [None]:
# Mean charges for each distinct BMI value.
avg_bmi = df.groupby('bmi')['charges'].mean().reset_index()

# Pass the aggregated frame so the axes and colour bar are labelled with the
# column names rather than the generic "x"/"y"/"color"; also add a title.
fig = px.scatter(avg_bmi, x='bmi', y='charges', color='charges',
                 title='Average Medical Charges Per BMI Value')
fig.show()

In [None]:
# Names of every column whose dtype is neither int64 nor float64.
_non_numeric = df.select_dtypes(exclude=['int64', 'float64'])
categorical = _non_numeric.columns
categorical

# Data Preprocessing

In [None]:
# Replace each categorical column in place with integer codes.
# pd.factorize numbers the values in order of first appearance.
for column in categorical:
    codes, _uniques = pd.factorize(df[column])
    df[column] = codes

In [None]:
# Preview the first three rows; once the factorize cell has run, the
# categorical columns should appear here as integer codes.
df.head(3)

In [None]:
# Features: every column except the target.
# `columns=` replaces the positional axis argument (`df.drop('charges', 1)`),
# which raises a TypeError on pandas >= 2.0.
x = df.drop(columns='charges')
# Target: the charges column.
y = df['charges']

In [None]:
# Sanity check: x and y should report the same number of rows.
print(x.shape)
print(y.shape)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=101,
)

# Pipeline

In [None]:
# Each model is wrapped in a pipeline that standardises the features first,
# so the scaler is fitted on the training data only.
lin = make_pipeline(StandardScaler(), LinearRegression())

laso = make_pipeline(StandardScaler(), Lasso())

# The tree-based models are randomised; seeding them (same seed as the
# train/test split) makes the reported scores reproducible across runs.
dt = make_pipeline(StandardScaler(), DecisionTreeRegressor(random_state=101))

rfr = make_pipeline(StandardScaler(), RandomForestRegressor(random_state=101))


# Linear Regressor 

In [None]:
# Fit the linear-regression pipeline on the training split.
lin.fit(x_train, y_train)

# Pipeline.score delegates to the regressor's score, i.e. R^2 on the held-out split.
print('Score of Linear Regression Model is: ',lin.score(x_test, y_test))

y_pred=lin.predict(x_test)

# Arguments follow the documented (y_true, y_pred) order; MAE is symmetric,
# so the value is the same as before.
print('MAE Of Linear Regression Model is: ',mean_absolute_error(y_test, y_pred))

# Lasso Regressor

In [None]:
# Fit the Lasso pipeline on the training split.
laso.fit(x_train, y_train)

# Pipeline.score delegates to the regressor's score, i.e. R^2 on the held-out split.
print('Score of Lasso Regression Model is: ', laso.score(x_test, y_test))

y_pred1 = laso.predict(x_test)

# Arguments follow the documented (y_true, y_pred) order; MAE is symmetric,
# so the value is the same as before.
print('MAE of Lasso Regression Model is: ', mean_absolute_error(y_test, y_pred1))

# DecisionTree Regressor

In [None]:
# Fit the decision-tree pipeline on the training split.
dt.fit(x_train, y_train)

# Pipeline.score delegates to the regressor's score, i.e. R^2 on the held-out split.
print('Score Of DecisionTree Regressor Model is: ', dt.score(x_test, y_test))

y_pred2 =dt.predict(x_test)

# Arguments follow the documented (y_true, y_pred) order; MAE is symmetric,
# so the value is the same as before.
print('MAE of DecisionTree Regressor Model is: ', mean_absolute_error(y_test, y_pred2))

# RandomForest Regressor

In [None]:
# Fit the random-forest pipeline on the training split.
rfr.fit(x_train, y_train)

# Score and predict with `rfr`. The original used `dt` here, so this cell
# reported the decision-tree metrics under the random-forest label.
print('Score Of RandomForestRegressor Model is: ', rfr.score(x_test, y_test))

y_pred3 = rfr.predict(x_test)

# Arguments follow the documented (y_true, y_pred) order.
print('MAE of RandomForest Regressor Model is: ', mean_absolute_error(y_test, y_pred3))