In [None]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from scipy import stats
from scipy.stats import norm, skew

# Data Description

**Age**: age of primary beneficiary

<br>**Sex**: Gender of the insurance contractor (female or male)

<br>**BMI**: Body mass index, an objective measure of body weight relative to height, computed as weight divided by height squared (kg / m ^ 2). It indicates whether a person's weight is relatively high or low for their height; the ideal range is 18.5 to 24.9

<br>**Children**: Number of children covered by health insurance / Number of dependents

<br>**Smoker**: Smoking status of the beneficiary

<br>**Region**: The beneficiary's residential area in the US: northeast, southeast, southwest, or northwest.

<br>**Charges**: Individual medical costs billed by health insurance

In [None]:
# Load the raw insurance records; the path is relative, so it resolves
# against the notebook's working directory.
insurance = pd.read_csv(
    filepath_or_buffer="../input/insurance.csv",
)
# Preview the first five rows (the default for head()).
insurance.head()

Now let us look at the dimensions of the dataset.

In [None]:
# (number of rows, number of columns) of the loaded frame
insurance.shape

In [None]:
# Inspect the dtype of every column; the columns that are not int64/float64
# are the ones dummy-encoded further down in this notebook.
insurance.dtypes

In [None]:
# Print a per-column summary (dtype and non-null count) to check for missing values
insurance.info()

Here we can see that there are no missing values in the dataset, as all the variables are non-null.

In [None]:
# Pearson correlation between the numeric columns.
# The frame also contains non-numeric columns (they are dummy-encoded later in
# this notebook). Since pandas 2.0, DataFrame.corr() no longer drops such
# columns silently and raises instead, so restrict the input to numeric
# dtypes explicitly. This form works on older pandas versions as well.
cor = insurance.select_dtypes(include="number").corr(method="pearson")
cor

In [None]:
# Annotated heat map of the correlation matrix; the colour scale is
# clipped to the range [-0.2, 0.8].
sns.heatmap(
    data=cor,
    vmin=-0.2,
    vmax=0.8,
    cmap="YlGnBu",
    annot=True,
    square=True,
)

In [None]:
# Setting the plot size
fig, axis = plt.subplots(figsize=(7, 7))

# Distribution of the billed charges.
# sns.distplot() is deprecated; sns.histplot() is its replacement.
#   * stat="density" keeps the y-axis normalised the way distplot drew it
#   * kde=True overlays the Gaussian kernel density estimate
#   * ax=axis draws on the axes created above instead of relying on
#     pyplot's implicit "current axes"
sns.histplot(insurance['charges'], bins=50, kde=True, stat="density", ax=axis)

In [None]:
from scipy import stats

# Joint view of age vs. charges: a regression fit in the centre panel and
# histograms with a KDE overlay on the margins.
# Current seaborn requires x/y to be passed by keyword, has deprecated
# distplot(), and has removed JointGrid.annotate(). The Pearson statistic
# is therefore computed here and written onto the joint axes explicitly.
g = sns.JointGrid(x=insurance['age'], y=insurance['charges'])
g = g.plot_joint(sns.regplot)
g = g.plot_marginals(sns.histplot, kde=True)

# Correlation coefficient and its two-sided p-value
pearson_r, p_value = stats.pearsonr(insurance['age'], insurance['charges'])
g.ax_joint.annotate(
    f"pearsonr = {pearson_r:.2g}; p = {p_value:.2g}",
    xy=(0.05, 0.95),            # top-left corner of the joint axes
    xycoords="axes fraction",
    ha="left",
    va="top",
);

In [None]:
# Names of the dtypes treated as numeric (int64 and float64)
num_cols=['int64','float64']
# Keep only the columns whose dtype is listed in num_cols
numcols_only=insurance.select_dtypes(include=num_cols)
# Display the numeric-only frame
numcols_only

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the numeric columns: first learn the per-column statistics,
# then apply them to the same frame.
# NOTE(review): the input still contains the target column 'charges', and
# the resulting array is not referenced by any later cell visible here.
# Confirm whether the model was meant to be trained on the scaled features.
scaler = StandardScaler()
scaler.fit(numcols_only)
num_std = scaler.transform(numcols_only)


In [None]:
# First five rows (the default for head()) of the numeric-only frame
numcols_only.head()

## Dummy encoding of categorical variables

In [None]:
# Every column whose dtype is not listed in num_cols is treated as categorical
insurance_catcol = insurance.select_dtypes(exclude=num_cols)
# Names of the categorical columns, as an array
np.asarray(insurance_catcol.columns)

In [None]:
# Dummy-encode the categorical columns; drop_first=True omits the first
# level of each column, which then acts as the reference category.
insurance_cat_dummies = pd.get_dummies(
    data=insurance_catcol,
    drop_first=True,
)
insurance_cat_dummies.head(n=3)

In [None]:
# Place the numeric columns and the dummy columns side by side in one frame
merged = pd.concat(
    objs=[numcols_only, insurance_cat_dummies],
    axis="columns",
)

merged.head(n=5)

In [None]:
# (rows, columns) of the combined numeric + dummy frame
merged.shape

In [None]:
from sklearn.model_selection import train_test_split

# Response: the billed charges. Features: every remaining column.
Y = merged['charges']
X = merged.drop(columns='charges')

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=1,
)

In [None]:
# Peek at the first three rows of the feature matrix
X.head(n=3)

In [None]:
# Print the shape of each split, in the order X_train, Y_train, X_test, Y_test
for split in (X_train, Y_train, X_test, Y_test):
    print(split.shape)

In [None]:
from sklearn.linear_model import SGDRegressor
from sklearn import datasets, linear_model, metrics
from sklearn.metrics import r2_score

# Ordinary least-squares regression trained on the training split.
# fit() returns the estimator itself, so construction and training are chained.
reg = linear_model.LinearRegression().fit(X_train, Y_train)

# Predicted charges for the held-out rows
Y_hat = reg.predict(X_test)

# Coefficient of determination on the test split; the closer to 1, the better
print(r2_score(y_true=Y_test, y_pred=Y_hat))

In [None]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the predictions on the test split.
# The squared error is symmetric in its two arguments, so naming them in the
# documented (y_true, y_pred) order leaves the value unchanged.
mse = mean_squared_error(y_true=Y_test, y_pred=Y_hat)
rmse = np.sqrt(mse)
print(rmse)

In [None]:
# Scatter of predicted against actual charges for the test split, drawn
# through the explicit figure/axes interface.
fig_pred, ax_pred = plt.subplots(figsize=(8, 8))
ax_pred.scatter(Y_test, Y_hat)
ax_pred.set_xlabel('Actual value of Charges')
ax_pred.set_ylabel('Predicted values of charges')
fig_pred.tight_layout()