In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
import seaborn as sns
# Read the insurance CSV into the working DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer='../input/insurance/insurance.csv')

In [None]:
# Preview the first five rows of the frame.
df.head(n=5)

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Summary statistics with the quartiles spelled out explicitly
# (these are the same percentiles describe() reports by default).
df.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Swap the text labels of the two binary columns for integer codes:
# 'male' / 'yes' become 1, 'female' / 'no' become 2.
for column, codes in (
    ('sex', {'male': 1, 'female': 2}),
    ('smoker', {'yes': 1, 'no': 2}),
):
    df[column] = df[column].replace(codes)

**Correlation Heatmap**

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns.
plt.figure(figsize=(16, 9))
# Correlate numeric columns only. Any column that is still text (presumably
# `region` in this dataset -- it is never encoded in this notebook; confirm)
# makes DataFrame.corr() raise on pandas >= 2.0, whereas older pandas silently
# dropped such columns. Selecting numeric dtypes up front gives the same
# matrix on every pandas version.
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, cmap='crest', linewidths=.5, annot=True)

**Distribution of Numeric Variables**

In [None]:
# Four stacked panels, one horizontal box plot per column; axes are not
# shared, so each panel keeps its own scale.
fig, axes = plt.subplots(nrows=4, ncols=1, figsize=(16, 9), sharex=False, sharey=False)
age_ax, bmi_ax, children_ax, charges_ax = axes
sns.boxplot(data=df, x='age', palette='crest', ax=age_ax)
sns.boxplot(data=df, x='bmi', palette='crest', ax=bmi_ax)
sns.boxplot(data=df, x='children', palette='crest', ax=children_ax)
sns.boxplot(data=df, x='charges', palette='crest', ax=charges_ax)

**Relation of Age and Charges by Smoker**

In [None]:
# Charges plotted against age, with points coloured by the smoker column.
plt.figure(figsize=(16, 9))
sns.scatterplot(data=df, x='age', y='charges', hue='smoker')

**Relation of BMI and Charges by Smoker**

In [None]:
# Charges plotted against BMI, with points coloured by the smoker column.
plt.figure(figsize=(16, 9))
sns.scatterplot(data=df, x='bmi', y='charges', hue='smoker')

**Average Charges by Number of Children**

In [None]:
# One bar per value of `children`, bar height taken from `charges`;
# ci=None switches the error bars off.
plt.figure(figsize=(16, 9))
sns.barplot(data=df, x='children', y='charges', palette='crest', ci=None)

**Average Charges by Smoker**

In [None]:
# One bar per value of `smoker`, bar height taken from `charges`;
# ci=None switches the error bars off.
plt.figure(figsize=(16, 9))
sns.barplot(data=df, x='smoker', y='charges', palette='crest', ci=None)

**Linear Regression**

In [None]:
# Fit a linear regression of `charges` on five feature columns.
x = df[['age', 'sex', 'bmi', 'children', 'smoker']]
y = df['charges']

# Hold out 20% of the rows for evaluation. The split is seeded so the scores
# and plots in the following cells are identical on every Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)
reg = LinearRegression().fit(x_train, y_train)

# x_train / y_train are deliberately left as the pandas objects returned by the
# split. Reshaping them after the fit has no effect on the model, and folding
# the 1-D target into 5 columns would misalign it with the feature rows (and
# raise whenever the number of training rows is not a multiple of 5).

In [None]:
# Score the fitted linear model on the held-out rows.
reg.score(X=x_test, y=y_test)

In [None]:
# Correlation-coefficient matrix between the model's predictions and the
# actual held-out target values.
np.corrcoef(x=reg.predict(x_test), y=y_test)

In [None]:
# Predicted vs. actual charges for the linear model on the held-out rows.
plt.figure(figsize=(16, 9))
# x / y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so the positional form raises TypeError there. Keywords work
# on older releases as well.
ax = sns.scatterplot(x=reg.predict(x_test), y=y_test)
# Label both axes explicitly so the figure can be read on its own.
ax.set(xlabel='predicted charges', ylabel='actual charges');

**Regression Tree**

In [None]:
# Fit a decision-tree regressor on a fresh 80/20 split of the same x / y.
# The split is seeded as well as the tree: with only the tree's random_state
# fixed, the unseeded split still changed the score on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)
regressor = DecisionTreeRegressor(random_state=0).fit(x_train, y_train)

In [None]:
# Score the fitted tree on the held-out rows.
regressor.score(X=x_test, y=y_test)

In [None]:
# Correlation-coefficient matrix between the tree's predictions and the
# actual held-out target values.
np.corrcoef(x=regressor.predict(x_test), y=y_test)

In [None]:
# Predicted vs. actual charges for the regression tree on the held-out rows.
plt.figure(figsize=(16, 9))
# x / y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so the positional form raises TypeError there. Keywords work
# on older releases as well.
ax = sns.scatterplot(x=regressor.predict(x_test), y=y_test)
# Label both axes explicitly so the figure can be read on its own.
ax.set(xlabel='predicted charges', ylabel='actual charges');