In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the insurance dataset from CSV and preview the first rows
health_data = pd.read_csv('../input/insurance/insurance.csv')
health_data.head()

In [None]:
# Summary statistics of the dataset, as a first sanity check of value ranges
health_data.describe()

In [None]:
# Column data types; the non-numeric columns are encoded later, before modelling
health_data.dtypes

In [None]:
# Number of missing values in each column
health_data.isna().sum()

In [None]:
# (number of rows, number of columns)
health_data.shape

### Step 1 : EDA 

**Checking distribution of Total Charges**

EDA for Smokers:

In [None]:
# Histogram with KDE overlay of the raw charges
plt.figure(figsize=(12, 8))
sns.set(style='whitegrid')
charges_ax = sns.distplot(health_data['charges'], kde=True)
charges_ax.set_title('Total Charges Distribution')

The distribution of total charges is right-skewed. Applying a log transform (base 10 in the next cell) brings it closer to a normal distribution.

In [None]:
# Same histogram/KDE, but on the base-10 logarithm of the charges
plt.figure(figsize=(12, 8))
sns.set(style='whitegrid')
log_charges = np.log10(health_data['charges'])
log_charges_ax = sns.distplot(log_charges, kde=True, color='g')
log_charges_ax.set_title('Total Charges Distribution - After Applying Log')

After the log transform, the data is concentrated around the center and is much less skewed.

In [None]:
# Side-by-side charge distributions: smokers (left, red) vs non-smokers (right, green)
f = plt.figure(figsize=(12, 8))

smoker_charges = health_data.loc[health_data.smoker == 'yes', "charges"]
non_smoker_charges = health_data.loc[health_data.smoker == 'no', "charges"]

# Left panel
ax = f.add_subplot(1, 2, 1)
sns.distplot(smoker_charges, color='r', ax=ax)
ax.set_title("Distribution Charges for Smokers")

# Right panel
ax = f.add_subplot(1, 2, 2)
sns.distplot(non_smoker_charges, color='g', ax=ax)
ax.set_title("Distribution Charges for Non-Smokers")


From the plots it is evident that smokers spend more money on health care than non-smokers.

In [None]:
# Number of smokers and non-smokers, split by sex
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))
ax = sns.countplot(data=health_data, x='smoker', hue='sex', palette='cool', ax=ax)

There are more male smokers than female smokers. Since smokers incur higher charges, this suggests that the average treatment cost for men is likely to be higher than for women.

Let's have a look at the distribution of the data using age as the parameter.

In [None]:
# Histogram with KDE overlay of patient ages
plt.figure(figsize=(10, 5))
plt.title("Age Distribution")
ages = health_data["age"]
ax = sns.distplot(ages, color='r')

Age ranges from 18 (minimum) to 65 (maximum).
Next, let's check the effect of smoking on patients between the ages of 18 and 20.

In [None]:
# Smoker / non-smoker counts by sex for patients aged 18 to 20 (both bounds inclusive)
young_patients = health_data[health_data.age.between(18, 20)]
sns.catplot(x="smoker", kind="count", hue='sex', palette="rainbow", data=young_patients)
plt.title("The number of smokers and non-smokers (18-20 years old)")

There are smokers in the age of 18-20 as well.

Next, we need to check the effect of smoking on the health treatment cost for patients aged 18-20.

In [None]:
# Charges grouped by smoking status, restricted to patients aged 18 to 20 (inclusive)
plt.figure(figsize=(10, 5))
plt.title('Charges for 18-20 Age Patients who smoke')
is_18_to_20 = (health_data.age >= 18) & (health_data.age <= 20)
sns.boxplot(x='charges', y='smoker', data=health_data[is_18_to_20])

The boxplot shows that patients between the ages of 18 and 20 who smoke spend a lot more than patients of the same age who don't smoke.

There are some outliers among the non-smokers; this may be an indicator that these patients have serious (possibly terminal) health issues.

EDA using BMI:

In [None]:
# Histogram with KDE overlay of BMI values
plt.figure(figsize=(10, 5))
plt.title("BMI distribution")
bmi_values = health_data["bmi"]
ax = sns.distplot(bmi_values, color='g')

A BMI of less than 18.5 means that a person is underweight. A BMI of between 18.5 and 24.9 is ideal. A BMI of between 25 and 29.9 is overweight. A BMI over 30 indicates obesity.

Here the average BMI is about 30, which is at the threshold of obesity.

In [None]:
# Charge distribution for the higher-BMI group (BMI of 25 or more)
plt.figure(figsize=(12, 5))
plt.title("Distribution of charges for patients with BMI greater than 25")
high_bmi_charges = health_data.loc[health_data.bmi >= 25, 'charges']
ax = sns.distplot(high_bmi_charges, color='m')

In [None]:
# Charge distribution for the lower-BMI group (BMI of 25 or less).
# The title used to repeat "greater than 25" even though the filter below
# selects bmi <= 25, which made this plot indistinguishable by title from
# the higher-BMI plot.
plt.figure(figsize=(12, 5))
plt.title("Distribution of charges for patients with BMI of 25 or less")
ax = sns.distplot(health_data[(health_data.bmi <= 25)]['charges'], color='m')

Patients with BMI > 25 spend more on Health care than the patients with BMI < 25.

Let's compare all the variables: Age, BMI, Smoking, Children

In [None]:
# Charges against age with a separate linear fit for each smoker group
ax = sns.lmplot(
    data=health_data,
    x='age',
    y='charges',
    hue='smoker',
)

In [None]:
# Charges against BMI with a separate linear fit for each smoker group
ax = sns.lmplot(
    data=health_data,
    x='bmi',
    y='charges',
    hue='smoker',
)

In [None]:
# Charges against number of children with a separate linear fit for each smoker group
ax = sns.lmplot(
    data=health_data,
    x='children',
    y='charges',
    hue='smoker',
)

This confirms that smokers spend more money on health care.

Also, the cost of treatment increases as age and BMI increase.

### Step 2 : Encoding the categorical variables:

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace each categorical column with integer codes.
# One encoder instance is refit per column; after the loop it remains fitted
# on the last column ('region').
le = LabelEncoder()

for categorical_column in ('sex', 'smoker', 'region'):
    # Fit on the distinct labels of the column, then overwrite the column
    # in place with its encoded values.
    le.fit(health_data[categorical_column].drop_duplicates())
    health_data[categorical_column] = le.transform(health_data[categorical_column])


In [None]:
# Re-check the column dtypes now that sex, smoker and region have been label-encoded
health_data.dtypes

In [None]:
# Pairwise correlations between the columns, annotated with the coefficient values
plt.figure(figsize=(10, 8))
correlation_matrix = health_data.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='YlGnBu')

There are no high correlations, except between charges and the "smoker" column.

### Step 3 : Model Building

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
# Target is the charge; features are every remaining column
y = health_data['charges']
X = health_data.drop(columns=['charges'])

# Hold out 30% of the rows for testing; the seed is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=42
)

**1.Linear Regression Model**

In [None]:
# Fit a plain linear regression on the training split
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_model = lr  # fit() returns the estimator itself, so both names point at the fitted model

# Predictions for the training rows and for the held-out rows
y_train_pred = lr_model.predict(X_train)
y_test_pred = lr_model.predict(X_test)

In [None]:
# R^2 (coefficient of determination) on the test split; for a regressor,
# score() is not a classification accuracy
print(lr_model.score(X_test, y_test))

We are getting an R² score of about 0.76 on the test set (for a regressor, `score` returns R², not classification accuracy). We can try to improve this using other algorithms.

**2. Linear Regression Using Polynomial Features**

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score,mean_squared_error

In [None]:
# Expand the feature matrix with degree-2 polynomial features.
# Despite its name, `polynomial_model` holds the transformed feature matrix,
# not an estimator.
polynomial = PolynomialFeatures(degree=2)
polynomial_model = polynomial.fit_transform(X)

# NOTE: this rebinds X_train / X_test / y_train / y_test, so every cell run
# after this one sees the polynomial-feature split instead of the original one.
X_train, X_test, y_train, y_test = train_test_split(
    polynomial_model, y, train_size=0.7, random_state=42
)

# Second linear regression, this time on the polynomial features
lr_model_2 = LinearRegression()
lr_model_2.fit(X_train, y_train)

# Predictions for the training rows and for the held-out rows
y_train_pred = lr_model_2.predict(X_train)
y_test_pred = lr_model_2.predict(X_test)

In [None]:
# Test-split score of the polynomial-feature linear regression
print(lr_model_2.score(X_test, y_test))

In [None]:
# Pair each held-out target with the polynomial model's prediction for later inspection
actual_vs_predicted = {
    'Actual Values': y_test,
    'Predicted Values': y_test_pred,
}
polynomial_LR_model = pd.DataFrame(actual_vs_predicted)

We are up to an R² score of about 0.86 on the test set, which is pretty good.

**3. Random Forest Regressor**

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest with 100 trees, a fixed seed, and all CPU cores.
# The split criterion is left at its default (squared error). The explicit
# criterion='mse' spelling was deprecated in scikit-learn 1.0 in favour of
# 'squared_error' and removed in 1.2, so passing it raises on current
# versions; omitting the argument behaves the same on old and new releases.
rfe = RandomForestRegressor(n_estimators=100,
                            random_state=42,
                            n_jobs=-1)

In [None]:
# Fit the forest. When the notebook is run top to bottom, X_train / X_test
# here are the polynomial-feature splits created for the second linear model,
# not the original feature columns.
rfe.fit(X_train, y_train)

# predict train data
y_pred_train = rfe.predict(X_train)

# predict test data 
y_test_pred = rfe.predict(X_test)


In [None]:
# Compare the fit on the rows the forest was trained on with the held-out rows
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_test_pred)
print('R2 train data: %.3f, R2 test data: %.3f' % (r2_train, r2_test))

Random Forest gives an R² score of about 0.85 on the test set.

### Linear Regression using Polynomial Features gives a test R² score of about 0.86, which is the highest amongst the three trained models.

### Hence, we will consider this model for the predictions.

In [None]:
# First ten actual-vs-predicted pairs from the polynomial-feature linear regression
polynomial_LR_model.head(10)