In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.linear_model import LinearRegression # building our linear regression model to predict health insurance costs
from sklearn.model_selection import train_test_split # to split our data into a training set and test set to see how well our model performs
from matplotlib import pyplot as plt # data visualization
import seaborn as sns # enhanced data visualization


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the insurance input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input/insurance')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Exploratory Data Analysis
First, load the data from a CSV file and get a feel for what it looks like by looking at the first and last rows.

In [None]:
# Path of the dataset inside the Kaggle input directory
insurance_csv_path = '/kaggle/input/insurance/insurance.csv'
df = pd.read_csv(insurance_csv_path)

In [None]:
# Give the 'children' column a clearer name. Rebind `df` to the renamed frame
# rather than using inplace=True, so the cell reads as an explicit
# "input frame -> output frame" step and can be chained with other methods.
df = df.rename(columns={'children': 'num_children'})
df.head()

In [None]:
# Show the last rows of the frame, complementing the head() view above.
df.tail()

In [None]:
# Print each column's dtype and non-null count to spot missing values and string columns.
df.info()

In [None]:
# Summary statistics (count, mean, spread, quartiles) for the numeric columns.
df.describe()

# Data Cleaning
Make a copy of the original dataframe. This will allow us to clean the data without affecting the original, raw data.

In [None]:
# Independent copy: the encoding steps applied to df2 do not modify df.
df2 = df.copy()

Convert string categories into coded numerical variables.

In [None]:
# Replace the two string-valued binary columns with integer category codes.
# NOTE(review): codes follow the sorted category order, so presumably
# female -> 0 / male -> 1 and no -> 0 / yes -> 1 - confirm against the data.
for binary_column in ['sex', 'smoker']:
    df2[binary_column] = df2[binary_column].astype('category').cat.codes

df2.head()

The only string variable left is 'region'. We can convert it to numerical variables with the `get_dummies` function in the pandas library. This one-hot encodes the region categories, so that each category becomes its own binary variable, as shown below.

In [None]:
# One-hot encode 'region': each region value becomes its own indicator column
# named region_<value>; all other columns of df2 are carried over unchanged.
# NOTE(review): every indicator column is kept (no drop_first), so together with
# the regression intercept they are perfectly collinear. Predictions are
# unaffected, but the individual region coefficients are not uniquely
# determined - confirm this is acceptable before interpreting them.
df3 = pd.get_dummies(df2, columns=['region'], prefix='region')
df3.head()

# Data Visualization

In [None]:
# Scatter + fitted regression line of charges against each quantitative
# variable: age, bmi, num_children

# one row of three panels that share the 'charges' y-axis
fig, axes = plt.subplots(1, 3, sharey=True, figsize=(20,5))

numeric_columns = ['age', 'bmi', 'num_children']
line_colors = ['blue', 'orange', 'purple']

# draw one panel per variable, each in its own color
for ax, column, line_color in zip(axes, numeric_columns, line_colors):
    sns.regplot(ax=ax, data=df, x=df[column], y=df['charges'], scatter_kws={'alpha':0.2}, color=line_color)
    ax.set_title(column + ' vs charges')
    # pad the x-range a little so points at the extremes are not clipped
    ax.set_xlim(df[column].min() - .5, df[column].max() + 1)

# keep the 'charges' y-axis label only on the left-most panel
for ax in axes.flat:
    ax.label_outer()

plt.tight_layout()
plt.show()

In [None]:
# Bar charts of charges for each categorical variable: sex, smoker, region
fig, axes = plt.subplots(1, 3, sharey=True, figsize=(20,5))

categorical_columns = ['sex', 'smoker', 'region']
bar_colors = ['blue', 'orange', 'purple']

for ax, column, bar_color in zip(axes, categorical_columns, bar_colors):
    # NOTE(review): errcolor/errwidth are deprecated in seaborn >= 0.13 in favour
    # of err_kws - check the installed seaborn version before migrating.
    sns.barplot(ax=ax, data=df, x=df[column], y=df['charges'], color=bar_color, edgecolor='black', linewidth=1.2, errcolor='green', errwidth=4)
    ax.set_title('charges by ' + column)

# keep the 'charges' y-axis label only on the left-most panel
for ax in axes.flat:
    ax.label_outer()

plt.tight_layout()
plt.show()

In [None]:
# Re-display the fully encoded frame (df3) that the regression is built from.
df3.head()

# Perform Multiple Linear Regression
Separate our dataframe out into independent variables (X) and dependent variable (y). Then split the data into a training set and a test set before fitting the model and running it on test data to evaluate performance.

In [None]:
# Predictors: demographics, smoking status and the four region indicator columns.
# (Drop the region_* names from this list to fit a model without region.)
feature_columns = [
    'age', 'sex', 'bmi', 'num_children', 'smoker',
    'region_northeast', 'region_northwest', 'region_southeast', 'region_southwest',
]
X = df3[feature_columns]

# Target: the insurance charges we want to predict.
y = df3['charges']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=7,
)

In [None]:
# Fit ordinary least squares on the training split; fit() returns the estimator
# itself, so it can be bound directly and then displayed as the cell output.
model = LinearRegression().fit(X_train, y_train)
model


In [None]:
# Take the variable names from the frame the model was actually fitted on, so
# the labels always line up one-to-one with model.coef_ (a hand-written list
# can silently fall out of sync with the fitted columns).
independent_variables = list(X_train.columns)
print(independent_variables)
print('Intercept: \n', model.intercept_)
print('Coefficients: \n', model.coef_)

# Show each coefficient next to the variable it belongs to.
for variable_name, coefficient in zip(independent_variables, model.coef_):
    print('  {}: {:.2f}'.format(variable_name, coefficient))

# Results
The score below is the coefficient of determination ($R^2$) of the model on the held-out test set. It indicates that over 76% of the variability in our variable of interest (charges) can be explained by our independent variables (age, sex, bmi, number of children, smoker status, and region).

In [None]:
# Coefficient of determination (R^2) of the fitted model on the held-out test split.
print(model.score(X_test, y_test))

# Apply the model to new data
Using the model on new patients, we can attempt to predict their health insurance costs. Below are the results for patients who are identical except for smoking status.

In [None]:
#predict charges for two new individuals, patient 0 and patient 1
# Two hypothetical patients (rows 0 and 1) with identical profiles except for
# the 'smoker' flag. Column names and order match the training features.
new_patients = pd.DataFrame(
    {
        'age': [29, 29],
        'sex': [1, 1],
        'bmi': [22, 22],
        'num_children': [1, 1],
        'smoker': [0, 1],
        'region_northeast': [0, 0],
        'region_northwest': [0, 0],
        'region_southeast': [1, 1],
        'region_southwest': [0, 0],
    }
)
new_patients

# Analysis
We can see below that smoking alone increased charges more than 10x. The non-smoker's predicted cost of 2,365 is much smaller than the smoker's predicted cost of 25,905.

In [None]:
# Predicted charges for the rows of new_patients, using the fitted model.
predictions = model.predict(new_patients)
print(predictions)

In [None]:
def calculate_insurance_cost_difference(predictions):
    """Print the first two predicted costs and return the gap between them.

    Patients are labelled by their 0-based position in ``predictions``
    (patient 0, patient 1), matching the numbering used elsewhere in the
    notebook.

    Parameters
    ----------
    predictions : sequence of float
        Predicted insurance charges. Only the first two entries are compared.

    Returns
    -------
    float
        Absolute difference between the first two predictions, rounded to
        two decimal places.

    Raises
    ------
    IndexError
        If fewer than two predictions are supplied. The exception type is the
        same one the unchecked indexing used to raise.
    """
    if len(predictions) < 2:
        # Fail up front with a clear message instead of printing partial output first.
        raise IndexError(
            "need at least two predictions to compare, got " + str(len(predictions))
        )

    first_cost = round(predictions[0], 2)
    second_cost = round(predictions[1], 2)
    # Compute the difference once so the printed and returned values cannot diverge.
    difference = round(abs(predictions[1] - predictions[0]), 2)

    print("patient 0 cost: " + str(first_cost))
    print("patient 1 cost: " + str(second_cost))
    print("The difference in insurance cost between " + str(first_cost) + " and " + str(second_cost) + " is " + str(difference) + " dollars.")
    return difference

In [None]:
# Prints the two predicted costs; the returned absolute difference is shown as the cell output.
calculate_insurance_cost_difference(predictions)

In [None]:
# Report each prediction as a dollar amount with two decimals, numbering
# patients by their 0-based position in `predictions`.
for patient_index, predicted_cost in enumerate(predictions):
    print('The estimated insurance cost for patient {} is ${:.2f}.'.format(patient_index, predicted_cost))