In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline

In [None]:
# Location of the insurance dataset, relative to the notebook's working directory
src = r'../input/insurance.csv'
# Read the CSV into the DataFrame that the later cells analyse
data = pd.read_csv(filepath_or_buffer = src)

# Basic data information

In [None]:
# Print a summary of the loaded frame (columns, non-null counts, dtypes)
data.info()

There are no missing values, so no imputation or row removal is needed.

# Exploratory Data Analysis

## Charges by age and sex

In [None]:
# Dashed grid lines on a white background; set before the axes are created
sns.set_style("whitegrid", {'grid.linestyle': '--'})
fig, ax = plt.subplots(figsize = (10, 6))
# One point per record: age on x, charges on y, coloured by sex
sns.scatterplot(data = data, x = "age", y = "charges", hue = "sex", ax = ax)
ax.set_xlabel("Age")
ax.set_ylabel("Charges")
ax.set_title("Distribution of charges by age and sex")

The charges definitely do increase with respect to age. There is no clear difference in charges for male vs female.

## Charges by smoker and BMI

In [None]:
# Scatter of charges against BMI, coloured by smoking status.
# x/y are passed by keyword because recent seaborn releases no longer accept them
# positionally, and the colours are driven by hue/palette (a `cmap` argument has
# no effect on a scatter plot that is not given per-point colour values).
smokers = data["smoker"].unique()
colors = ["tab:red", "tab:green"]
# Pair each smoker category with a colour in order of first appearance in the data
smoker_palette = dict(zip(smokers, colors))
fig, ax = plt.subplots(figsize = (10, 6))
# A single call with hue lets seaborn build a correctly labelled legend itself
sns.scatterplot(x = "bmi", y = "charges", hue = "smoker", hue_order = list(smokers),
                palette = smoker_palette, data = data, ax = ax)
ax.set_xlabel("BMI")
ax.set_ylabel("Charges")
ax.set_title("Distribution of charges by BMI and smoker")

Although BMI is used to measure an individual's health risk, it is not as important a feature as knowing whether the individual is a smoker or a non-smoker.

Smokers tend to incur much higher charges than non-smokers. When the BMI of a smoker goes beyond 30, the charges increase to a minimum of 30000. Non-smokers with BMI > 30 generally incur charges below 30000.

## Charges by region and sex

In [None]:
# Box plot of charges for each region, with one box per sex
fig, ax = plt.subplots(figsize = (10, 5))
sns.boxplot(data = data, x = "region", y = "charges", hue = "sex", ax = ax)

Across the southwest, northwest and northeast regions, the charges are rather similar. However, individuals from the southeast have a wider range of charges for both sexes. Therefore, each individual will be categorized as either from the southeast region or not.

## Charges by age and smoker

In [None]:
# Dashed grid lines on a white background; set before the figure is created
sns.set_style("whitegrid", {'grid.linestyle': '--'})
plt.figure(figsize = (10,6))
# Same age/charges scatter as before, but coloured by smoking status
sns.scatterplot(x = "age", y = "charges", data = data, hue = "smoker")
plt.xlabel("Age")
plt.ylabel("Charges")
# Title names the hue actually plotted (smoker, not sex)
plt.title("Distribution of charges by age and smoker")

Smokers are generally charged a much higher rate. Charges above 30000 are usually from smokers, and charges below 15000 are generally from non-smokers. Anything in between could be from either a smoker or a non-smoker.

## Charges by smoker and number of children

In [None]:
# Box plot of charges per number of children, split by smoking status
fig, ax = plt.subplots(figsize = (10, 8))
sns.boxplot(data = data, x = "children", y = "charges", hue = "smoker", ax = ax)
ax.set_title("Distribution of charges by number of children")

There seems to be a charges threshold. Smokers generally make claims of at least approximately 15000 regardless of the number of children, whereas non-smokers make claims of at most about 20000 (typically below 15000).

Smoking is definitely an important feature to take note of in modelling.

## Correlation between features

In [None]:
# Correlate only the numeric columns. The text columns (sex, smoker, region) are
# encoded later in the notebook, and newer pandas versions raise on non-numeric
# columns in corr() instead of silently skipping them.
numeric_data = data.select_dtypes(include = "number")
sns.heatmap(numeric_data.corr(), annot = True)

There is no strong correlation between the different numerical features.

# Modelling

## Pre-processing

In [None]:
# Show the first rows to check the raw values before encoding the text columns
data.head()

In [None]:
# Encode the text features as numbers (modifies `data` in place)
# smoker: yes -> 1, no -> 0
data["smoker"] = data["smoker"].replace({"yes": 1, "no": 0})
# sex: male -> 1, female -> 0
data["sex"] = data["sex"].replace({"male": 1, "female": 0})
# Binary flag: 1 for the southeast region, 0 for every other region
data["region_southeast"] = data["region"].eq("southeast").astype("int64")

## Linear Regression Model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
# Data are split into training and test data.
# The target is `charges`; the raw `region` text column is dropped from the features.
y_data = data["charges"]
x_data = data.drop(["charges","region"], axis = 1)
# A fixed random_state makes the split, and therefore the coefficients and metrics
# displayed below, identical on every Restart & Run All
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.25, random_state = 42)

# Model is trained on the training split and then used on the held-out test split
model1 = LinearRegression()
model1.fit(x_train, y_train)
y_pred = model1.predict(x_test)

# Coefficients and intercept of linear regression model extracted:
# one row per feature, with the intercept appended as an extra row
model_coef = pd.DataFrame(data = model1.coef_, index = x_train.columns)
model_coef.loc["intercept", 0] = model1.intercept_
display(model_coef)

# Model's performance on the test split: R2 and root-mean-squared error
model_performance = pd.DataFrame(data = [r2_score(y_test, y_pred), np.sqrt(mean_squared_error(y_test, y_pred))],
                                 index = ["R2","RMSE"])
display(model_performance)

Smoking is a huge factor as compared to other features. Just by being a smoker, the charge increases by approximately 25,000 regardless of age. Thereafter, the charges increase with age, bmi (higher health risk) and children (insurance is increasingly important with dependents).

In [None]:
# Residual = actual - predicted, computed on the test split:
#   residual > 0  ->  the actual charge is above the predicted charge
#   residual < 0  ->  the actual charge is below the predicted charge
residual = y_test - y_pred
fig, ax = plt.subplots()
ax.scatter(y_test, residual)
ax.set_title("Residual vs actual charges")
ax.set_xlabel("Actual charges")
ax.set_ylabel("Residual")

The model works well for charges below 15,000, where the residual is within 5,000. However, the model breaks down at higher charges (> 15,000), where the residual is very significant. The model could over-predict the actual charge by 10,000 or more, or under-predict the actual charge by at least 5,000.