# 1. Import libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# 2. Import and read data

In [None]:
# Load the insurance CSV from the Kaggle input directory into a dataframe
# and preview its first rows.
data = pd.read_csv("../input/insurance/insurance.csv")
data.head()

In [None]:
# Report the (rows, columns) dimensions of the dataframe.
print("Shape of dataframe: ", data.shape)

# 3. Data description

- age: Age of primary beneficiary
- sex: Insurance contractor gender
- bmi: Body mass index
- children: Number of children covered by health insurance / Number of dependents
- smoker: Smoking
- region: The beneficiary's residential area in the US
- charges: Individual medical costs billed by health insurance

# 4. Check missing values, data types and summary statistics

In [None]:
# Print a concise summary of the dataframe (columns, non-null counts, dtypes).
data.info()

In [None]:
# Missing data: number of null entries in each column

data.isnull().sum()

Hooray, no missing values!

In [None]:
# Data types: dtype of each column

data.dtypes

In [None]:
# Data type value counts: number of columns per dtype

data.dtypes.value_counts()

We have 3 categorical variables and 4 numerical variables.

In [None]:
# Summary statistics, transposed so that each row describes one variable

data.describe().transpose()

# 5. Feature engineering

Feature engineering is the process of creating new features using existing features in the dataset. In this section, I will create 3 new features:

- age_category
- weight_condition
- dependent

In [None]:
# Age category: youth is [18, 36), adults is [36, 55], seniors is above 55.
# Ages below 18 match no rule and are left unassigned.

age = data['age']
age_rules = {
    'youth': (age >= 18) & (age < 36),
    'adults': (age >= 36) & (age <= 55),
    'seniors': age > 55,
}
for category, in_range in age_rules.items():
    data.loc[in_range, 'age_category'] = category
data['age_category'].value_counts()

In [None]:
# Weight condition: label each row by BMI band
# (< 18.5, [18.5, 25), [25, 30), >= 30).

bmi = data['bmi']
bmi_rules = [
    ('underweight', bmi < 18.5),
    ('normal weight', (bmi >= 18.5) & (bmi < 25)),
    ('overweight', (bmi >= 25) & (bmi < 30)),
    ('obese', bmi >= 30),
]
for condition, in_band in bmi_rules:
    data.loc[in_band, 'weight_condition'] = condition
data['weight_condition'].value_counts()

In [None]:
# Dependent: 'yes' when at least one child is covered, 'no' when there are none.

children = data['children']
for flag, matches in (('yes', children > 0), ('no', children == 0)):
    data.loc[matches, 'dependent'] = flag
data['dependent'].value_counts()

In [None]:
# Preview the dataframe, now including the engineered columns.
data.head()

In [None]:
# Missing data: re-check null counts, including the engineered columns
# (a null there would mean a row matched none of the rules).

data.isnull().sum()

In [None]:
# Data types: dtype of each column after feature engineering

data.dtypes

In [None]:
# Data type value counts: number of columns per dtype after feature engineering

data.dtypes.value_counts()

After feature engineering, we now have 3 additional categorical variables, bringing the total to 6 categorical variables as well as 4 numerical variables.

# 6. Visualise distribution

Categorical variables

Recall the categorical variables in our dataset include:
- sex
- smoker
- region
- age_category
- weight_condition
- dependent

In [None]:
# Number of rows for each value of the sex column.
data['sex'].value_counts()

In [None]:
# Bar chart of the number of rows per sex.
# The column is passed by keyword: recent seaborn releases treat the first
# positional argument as `data`, so a positional Series is no longer read as x.
plt.figure(figsize = (7, 6))
sns.countplot(x = 'sex', data = data)
plt.title('Sex Value Counts')

There is almost an equal distribution between male and female in the dataset.

In [None]:
# Number of rows for each value of the smoker column.
data['smoker'].value_counts()

In [None]:
# Bar chart of the number of rows per smoking status.
# The column is passed by keyword: recent seaborn releases treat the first
# positional argument as `data`, so a positional Series is no longer read as x.
plt.figure(figsize = (7, 6))
sns.countplot(x = 'smoker', data = data)
plt.title('Smoker Value Counts')

There are more non-smokers than there are smokers in the dataset.

In [None]:
# Number of rows for each value of the region column.
data['region'].value_counts()

In [None]:
# Countplot with percentage: each region bar is labelled with its share of all rows

total = len(data)
plt.figure(figsize = (10, 6))
plt.title('Region Value Counts')
ax = sns.countplot(x = 'region', data = data)
for bar in ax.patches:
    bar_height = bar.get_height()
    # Centre the label horizontally and place it just above the bar.
    label_xy = (bar.get_x() + bar.get_width() / 2, bar_height + 5)
    ax.annotate(f'{bar_height / total * 100:.0f}%', label_xy, ha = 'center')
plt.show()

There is almost an equal distribution between the 4 regions with southeast having slightly more people.

In [None]:
# Number of rows for each value of the age_category column.
data['age_category'].value_counts()

In [None]:
# Age-category bars, each labelled with its percentage of all rows.
total = len(data)
plt.figure(figsize = (7, 6))
plt.title('Age Category Value Counts')
ax = sns.countplot(x = 'age_category', data = data)
for patch in ax.patches:
    count = patch.get_height()
    share = '{0:.0f}%'.format(count / total * 100)
    centre_x = patch.get_x() + patch.get_width() / 2
    ax.annotate(share, (centre_x, count + 5), ha = 'center')
plt.show()

There is almost an equal distribution between youth and adults. Seniors have the least number of people.

In [None]:
# Number of rows for each value of the weight_condition column.
data['weight_condition'].value_counts()

In [None]:
# Weight-condition bars, each labelled with its percentage of all rows.
total = len(data)
plt.figure(figsize = (10, 6))
plt.title('Weight Condition Value Counts')
ax = sns.countplot(x = 'weight_condition', data = data)
for rect in ax.patches:
    left, width, height = rect.get_x(), rect.get_width(), rect.get_height()
    text = '{0:.0f}%'.format(height / total * 100)
    ax.annotate(text, (left + width / 2, height + 5), ha = 'center')
plt.show()

More than half the beneficiaries are considered obese.

In [None]:
# Number of rows for each value of the dependent column.
data['dependent'].value_counts()

In [None]:
# Dependent yes/no bars, each labelled with its percentage of all rows.
total = len(data)
plt.figure(figsize = (7, 6))
plt.title('Dependent Value Counts')
ax = sns.countplot(x = 'dependent', data = data)
for column_bar in ax.patches:
    n_rows = column_bar.get_height()
    anchor = (column_bar.get_x() + column_bar.get_width() / 2, n_rows + 5)
    ax.annotate(f'{n_rows / total * 100:.0f}%', anchor, ha = 'center')
plt.show()

The majority of the beneficiaries have children.

Catplot (formerly called factorplot) allows us to further break down a categorical variable using another categorical variable.

In [None]:
# Count of rows per sex, one panel per smoking status, coloured by age category.
# x is passed by keyword: recent seaborn releases treat the first positional
# argument of catplot as `data`, so a bare 'sex' string is not read as x.
sns.set_style('whitegrid')
sns.catplot(x = 'sex', col = 'smoker', hue = 'age_category', data = data, kind = 'count')

Here, I have divided the population up by their gender, smoking habits and age category.

As we can observe from the plot, there are more male smokers than there are female smokers. Most smokers belong to the youth and adults age categories.

In [None]:
# Count of smokers vs non-smokers, one narrow panel per region.
sns.set_style('whitegrid')
sns.catplot(x = 'smoker', col = 'region', data = data, kind = 'count', aspect = 0.6)

Southeast region has the highest number of smokers.

Numerical variable

Recall the numerical variables in our dataset include:
- age
- bmi
- children
- charges

In [None]:
# Horizontal box plot of charges, one box per smoking status.
sns.set_style('white')
plt.figure(figsize = (12, 4))
sns.boxplot(x = 'charges', y = 'smoker', data = data)
plt.title('Charges by Smoking Habits')

As expected, smokers pay a higher premium than non-smokers.

In [None]:
# Horizontal box plot of charges, one box per age category.
sns.set_style('white')
plt.figure(figsize = (12, 5))
sns.boxplot(x = 'charges', y = 'age_category', data = data)
plt.title('Charges by Age Category')

People pay more for health insurance as they get older.

In [None]:
# Density histogram of BMI with a KDE overlay; the legend reports the skewness.
# histplot replaces the deprecated distplot. stat='density' keeps the y-axis
# on the density scale that distplot used when a KDE was drawn.
sns.set_style('white')
sns.histplot(data['bmi'], kde = True, stat = 'density', label = 'Skewness: %.2f'%(data['bmi'].skew()))
plt.legend(loc = 'best')
plt.title('BMI Distribution')

BMI follows an approximately normal distribution, but the average BMI for this population is considered overweight or obese. This comes as no surprise, as the United States has one of the highest levels of obesity in the world.

In [None]:
# Report the mean BMI to two decimal places.
mean_bmi = data['bmi'].mean()
print(f"Average BMI: {mean_bmi:.2f}")

In [None]:
# Count histogram of BMI without a KDE overlay; the legend reports the skewness.
# histplot replaces the deprecated distplot(kde=False) and likewise shows counts.
sns.set_style('white')
sns.histplot(data['bmi'], label = 'Skewness: %.2f'%(data['bmi'].skew()))
plt.legend(loc = 'best')
plt.title('BMI Distribution')

In [None]:
# Overlaid KDEs of charges for male and female rows.
# `fill` replaces the deprecated `shade` argument, and the explicit
# plt.legend() is needed because current kdeplot does not draw a legend
# for `label` on its own, which would leave the two curves unidentified.
sns.set_style('white')
sns.kdeplot(data.loc[data['sex'] == 'male', 'charges'], label = 'Male', fill = True)
sns.kdeplot(data.loc[data['sex'] == 'female', 'charges'], label = 'Female', fill = True)
plt.xlabel('Charges')
plt.legend(loc = 'best')
plt.title('Charges by Gender')

Males appear to pay higher premiums than females. This could be due to the fact that there are more male smokers than there are female smokers.

In [None]:
# Overlaid KDEs of charges, one curve per age category.
# `fill` replaces the deprecated `shade` argument, and the explicit
# plt.legend() is needed because current kdeplot does not draw a legend
# for `label` on its own, which would leave the curves unidentified.
plt.figure(figsize = (10, 4))
for category in ('youth', 'adults', 'seniors'):
    sns.kdeplot(
        data.loc[data['age_category'] == category, 'charges'],
        label = category.capitalize(),
        fill = True,
    )
plt.xlabel('Charges')
plt.legend(loc = 'best')
plt.title('Charges by Age Category')

As we have seen earlier, premium level increases with age.

# Visualise relationship

In [None]:
# Pairwise correlation of the four numerical columns, drawn as an annotated heatmap.
# (The original assigned `correlation` twice in one statement; once is enough.)
correlation = data[['age', 'bmi', 'children', 'charges']].corr()
sns.heatmap(correlation, annot = True, fmt = '.2f', cmap = 'Blues')
plt.title('Correlation Between Numerical Variables')

The diagonal of a correlation heatmap is always one because it represents the correlation between a variable and itself. The diagonal can also be seen as a mirror between the bottom triangle and the top triangle. If you look closely, the two triangles contain the same set of information.

There are two ways to interpret a heatmap: by reading across the columns or by reading down the rows.

As we can see from the charges column/row, age is the most correlated feature to charges, followed by bmi and finally children.

In [None]:
# Correlation of each variable with charges, strongest first; the
# self-correlation entry is dropped.
correlation['charges'].sort_values(ascending = False).drop('charges')

In [None]:
# Bar plot of charges per region (seaborn's barplot aggregates with the mean by default).
sns.set_style('white')
plt.figure(figsize = (7, 6))
sns.barplot(x = 'region', y = 'charges', data = data)
plt.title('Charges by Region')

Southeast region pays the highest premium. This could be due to the fact that the region has the highest number of smokers which in turn drove up the average premium levels.

In [None]:
# Bar plot of charges for rows with and without dependents
# (seaborn's barplot aggregates with the mean by default).
sns.set_style('white')
plt.figure(figsize = (7, 6))
sns.barplot(x = 'dependent', y = 'charges', data = data)
plt.title('Charges by Dependent')

Policyholders that have children pay a higher premium than those without children. Not exactly sure why.

In [None]:
# Joint KDE of bmi against charges.
sns.jointplot(x = 'bmi', y = 'charges', data = data, kind = 'kde')

The darker region represents the majority of the population.

In [None]:
# Scatter of charges against bmi, coloured by smoking status.
plt.figure(figsize = (7, 6))
sns.scatterplot(x = 'bmi', y = 'charges', hue = 'smoker', data = data)

In [None]:
# Split the obese rows into smokers and non-smokers.
is_obese = data['weight_condition'] == 'obese'
smokes = data['smoker'] == 'yes'
does_not_smoke = data['smoker'] == 'no'
obese_smoker = data.loc[is_obese & smokes, :]
obese_nonsmoker = data.loc[is_obese & does_not_smoke, :]

In [None]:
# Charges against age for obese rows only, drawn as two labelled
# scatter series (smokers and non-smokers) on the same axes.
sns.set_style('white')
plt.figure(figsize = (7, 6))
sns.scatterplot(x = 'age', y = 'charges', data = obese_smoker, label = 'Obese Smoker')
sns.scatterplot(x = 'age', y = 'charges', data = obese_nonsmoker, label = 'Obese Non-Smoker')

In [None]:
# Scatter of charges against bmi with a fitted regression line.
sns.set_style('darkgrid')
plt.figure(figsize = (7, 6))
sns.regplot(x = 'bmi', y = 'charges', data = data)
plt.title('Charges Against BMI')

The regression line has a positive slope, suggesting a positive relationship between BMI and charges. The higher the BMI, the higher the premium for health insurance.

In [None]:
# Charges against age with a separate regression fit per age category.
sns.set_style('white')
sns.lmplot(x = 'age', y = 'charges', hue = 'age_category', data = data)

Here, I have plotted a lmplot of charges against age but further categorised by age categories. 

There is a positive relationship between age and charges. Senior citizens have a much steeper increase due to the higher risks of health complications at older ages. 

In [None]:
# Charges against bmi with a separate regression fit per smoking status.
sns.lmplot(x = 'bmi', y = 'charges', hue = 'smoker', data = data)

Here, I have plotted a lmplot of charges against bmi but further categorised by smoking status.

Again, we see the positive relationship between bmi and charges but a dramatically steeper hike for smokers in comparison to non-smokers. We can conclude that smoking significantly raises the premium levels for health insurance. 

In [None]:
# Swarm plot showing every row's charges, grouped by smoking status.
sns.set_style('white')
plt.figure(figsize = (7, 6))
sns.swarmplot(x = 'smoker', y = 'charges', data = data)
plt.title('Charges by Smoking Habits')

In [None]:
# Split violin plot of charges per weight condition: the two halves of each
# violin correspond to the two smoker values.
plt.figure(figsize = (10, 6))
sns.set_style('whitegrid')
sns.violinplot(x = 'weight_condition', y = 'charges', hue = 'smoker', data = data, split = True)
# Remove the left spine as well as the default top/right ones.
sns.despine(left = True)

In [None]:
# Point plot of charges per weight condition, one line per smoking status.
sns.set_style('white')
plt.figure(figsize = (7, 6))
sns.pointplot(x = 'weight_condition', y = 'charges', hue = 'smoker', data = data)

Obese smokers pay a significantly higher premium.

In [None]:
# Pairwise plots of the dataframe's variables, coloured by smoking status.
sns.pairplot(data, hue = 'smoker')

Here, we can examine how smoking correlates with the other numerical variables in our dataset. 

# Linear regression

## Data preparation

In [None]:
# Reload the raw CSV so modelling starts from the original columns only.
data = pd.read_csv("../input/insurance/insurance.csv")

# Binary-encode the two-level categorical columns.
data['sex'] = data['sex'].map({'male': 0, 'female': 1})
data['smoker'] = data['smoker'].map({'yes': 1, 'no': 0})

# Bin age into three ranges: (17, 35], (35, 55] and (55, 1000].
data['Age_range'] = pd.cut(data['age'], bins=[17, 35, 55, 1000], labels=['Young adult', 'Senior Adult', 'Elder'])

# Get the dummy variables for region, age range and number of children.
# dtype=int keeps them as 0/1 integers: recent pandas versions return bool
# columns by default, and mixing bool with float columns yields an object
# array that statsmodels OLS and the VIF computation reject.
region = pd.get_dummies(data.region, drop_first=True, dtype=int)
Age_range = pd.get_dummies(data.Age_range, drop_first=True, dtype=int)
children = pd.get_dummies(data.children, drop_first=True, prefix='children', dtype=int)

# Add the dummy columns to the insurance dataframe
data = pd.concat([region, Age_range, children, data], axis=1)

# Drop the source columns now that they are represented by dummy variables
data.drop(['region', 'Age_range', 'age', 'children'], axis=1, inplace=True)

## Splitting the Data into Training and Testing Sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. A fixed random_state makes the split
# reproducible, so the train and test sets contain the same rows on every run.
data_train, data_test = train_test_split(data, train_size = 0.7, random_state = 100)

## Rescaling the Features
#### Min-Max scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Instantiate the scaler
scaler = MinMaxScaler()

# Continuous columns to rescale; note that this includes the target, charges
num_vars=['bmi','charges']

# Fit the scaler on the training rows only and replace the columns with
# their scaled values
data_train[num_vars] = scaler.fit_transform(data_train[num_vars])
data_train.head()

Dividing into X and Y sets for the model building

In [None]:
# Divide the training data into X and y. pop() removes charges from
# data_train in place, and X_train is the same dataframe object (not a copy).
y_train = data_train.pop('charges')
X_train = data_train

### Building a Linear Model
Recursive Feature Elimination

In [None]:
# Importing RFE, LinearRegression and statsmodels
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

# Running RFE with the number of output variables equal to 8.
# n_features_to_select is passed by keyword because current scikit-learn
# rejects it as a positional argument. RFE fits its own clone of the
# estimator, so the estimator does not need to be fitted beforehand.
lm = LinearRegression()
rfe = RFE(lm, n_features_to_select=8)
rfe = rfe.fit(X_train, y_train)

# Columns kept by RFE (support_ is a boolean mask over X_train's columns).
# This name was used below without ever being defined.
col = X_train.columns[rfe.support_]

# Creating X_train dataframe with RFE selected variables
X_train_rfe = X_train[col]

# Adding a constant variable
X_train_rfe = sm.add_constant(X_train_rfe)

# Running the linear model (lm now refers to the fitted statsmodels OLS results)
lm = sm.OLS(y_train, X_train_rfe).fit()


### Checking VIF

The Variance Inflation Factor (VIF) gives a basic quantitative idea of how much the feature variables are correlated with each other. It is an extremely important parameter for testing our linear model.

In [None]:
# Remove the intercept column (B0) before measuring collinearity
X_train_rfe = X_train_rfe.drop(columns=['const'])

# VIF of every remaining feature, highest first
from statsmodels.stats.outliers_influence import variance_inflation_factor

X = X_train_rfe
vif_scores = [variance_inflation_factor(X.values, position) for position in range(X.shape[1])]
vif = pd.DataFrame({'Features': X.columns, 'VIF': vif_scores})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

Now let's drop the variable with a high p-value. We can see that children_5 has a high p-value.

In [None]:
# Drop children_5 from the feature set.
# NOTE(review): this assumes children_5 is among the RFE-selected columns;
# drop() raises a KeyError otherwise, so confirm against the RFE output.
X_train_new1 = X_train_rfe.drop(["children_5"], axis = 1)

Rebuilding model

In [None]:
# Refit OLS on the reduced feature set, with an intercept
X_train_lm1 = sm.add_constant(X_train_new1)
lm1 = sm.OLS(y_train, X_train_lm1).fit()

# Remove the intercept column (B0) again before measuring collinearity
X_train_lm1 = X_train_lm1.drop(columns=['const'])

# VIF of every remaining feature, highest first
X = X_train_lm1
n_features = X.shape[1]
vif = pd.DataFrame({
    'Features': X.columns,
    'VIF': [variance_inflation_factor(X.values, k) for k in range(n_features)],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

Let's drop children_4 due to its p-value.

In [None]:
# Drop children_4 from the feature set.
# NOTE(review): this assumes children_4 is still present in X_train_lm1;
# drop() raises a KeyError otherwise.
X_train_new2 = X_train_lm1.drop(['children_4'], axis=1)

In [None]:
# Refit OLS without children_4, with an intercept
X_train_lm2 = sm.add_constant(X_train_new2)
lm2 = sm.OLS(y_train, X_train_lm2).fit()

# Remove the intercept column (B0) again before measuring collinearity
X_train_lm2 = X_train_lm2.drop(columns=['const'])

# VIF of every remaining feature, highest first
X = X_train_lm2
feature_vifs = []
for feature_idx in range(X.shape[1]):
    feature_vifs.append(variance_inflation_factor(X.values, feature_idx))
vif = pd.DataFrame({'Features': X.columns, 'VIF': feature_vifs})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif


Let's drop children_3.

In [None]:
# Remove children_3 from the feature set
X_train_new3 = X_train_lm2.drop(columns=['children_3'])

# Refit OLS on the final feature set, with an intercept
X_train_lm3 = sm.add_constant(X_train_new3)
lm3 = sm.OLS(y_train, X_train_lm3).fit()

# Remove the intercept column (B0) again before measuring collinearity
X_train_lm3 = X_train_lm3.drop(columns=['const'])

# VIF of every remaining feature, highest first
X = X_train_lm3
design = X.values
vif = pd.DataFrame({
    'Features': X.columns,
    'VIF': [variance_inflation_factor(design, j) for j in range(X.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif


Now model is good with p-values and VIF under the acceptable range

### Making Predictions

Applying the scaling on the test sets

In [None]:
# Now let's use our model to make predictions.

# Scale the test set with the scaler fitted on the training data (transform
# only, never re-fit), then split it into target and features exactly as was
# done for the training set. X_test and y_test were used below without ever
# being defined, and the test rows were never scaled.
data_test = data_test.copy()
data_test[num_vars] = scaler.transform(data_test[num_vars])
y_test = data_test.pop('charges')
X_test = data_test

# Creating X_test_new dataframe by keeping only the final model's columns
X_test_new = X_test[X_train_new3.columns]

# Adding a constant variable. has_constant='add' forces the intercept column
# even if some test feature happens to be constant, which the default 'skip'
# would mistake for an existing intercept.
X_test_new1 = sm.add_constant(X_test_new, has_constant='add')

In [None]:
# Making predictions with the final model (lm3) on the test design matrix
y_pred = lm3.predict(X_test_new1)

### Finding R-squared and Adjusted R-Squared for Test set

In [None]:
# Evaluate R-squared on the test set. The score is stored and printed
# instead of being discarded and then re-typed by hand.
from sklearn.metrics import r2_score
r2 = r2_score(y_test, y_pred)
print(r2)

# Adjusted R^2
# adj r2 = 1 - (1 - R2) * (n - 1) / (n - p - 1)

# n = sample size, p = number of independent variables in the fitted model.
# p is taken from X_test_new (the final model's columns), not from X_test,
# which still contains the features that were dropped.
n = X_test_new.shape[0]
p = X_test_new.shape[1]

Adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
print(Adj_r2)

### Model Evaluation

In [None]:
# Plotting y_test and y_pred to understand the spread.
# A single figure is created: the original opened an empty figure, then a
# second one for the scatter, so the suptitle ended up on the empty figure.
fig = plt.figure(figsize=(15, 8))
plt.scatter(y_test, y_pred, color='blue')
fig.suptitle('y_test vs y_pred', fontsize=20)              # Plot heading
plt.xlabel('y_test', fontsize=18)                          # X-label
plt.ylabel('y_pred', fontsize=16)                          # Y-label
plt.show()


In [None]:
# Regression plot of predicted against actual test values: blue points,
# red fitted line, with a 68% confidence band (ci=68).
plt.figure(figsize=(14,8))
sns.regplot(x=y_test, y=y_pred, ci=68, fit_reg=True,scatter_kws={"color": "blue"}, line_kws={"color": "red"})

plt.title('y_test vs y_pred', fontsize=20)              # Plot heading 
plt.xlabel('y_test', fontsize=18)                          # X-label
plt.ylabel('y_pred', fontsize=16)                          # Y-label
plt.show()


### Final Result Comparison between Train model and Test:

Train R^2 : 0.723

Train Adjusted R^2 : 0.722

Test R^2: 0.762

Test Adjusted R^2: 0.749

Difference in R^2 between train and test: 3.9%

Difference in adjusted R^2 between Train and test: 2.7 % which is less than 5%



### Results:

- From the regression analysis, we find that region and gender do not make a significant difference to charges.

- Age, BMI, number of children and smoking are the factors that drive the charges.

- Smoking seems to have the most influence on the medical charges.