In [None]:
#Importing Libraries
import os
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import shap

In [None]:
# Load SHAP's JavaScript into the notebook; the interactive force plots
# rendered further down need it to display.
shap.initjs()

In [None]:
# Use matplotlib's 'Solarize_Light2' style sheet for the matplotlib/seaborn figures below.
plt.style.use('Solarize_Light2')

# Reading Data

In [None]:
# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

In [None]:
# Load the insurance dataset from the Kaggle input mount and preview the first rows.
data = pd.read_csv('/kaggle/input/insurance/insurance.csv')
data.head()

In [None]:
# Summary of columns, dtypes and non-null counts.
data.info()

# EDA: Exploratory Data Analysis

## Features Correlation

In [None]:
# Correlation heatmap of the numeric features.
# The categorical columns (sex, smoker, region) are only label-encoded later in
# the notebook, and pandas >= 2.0 raises when DataFrame.corr() meets non-numeric
# columns, so the frame is restricted to numeric dtypes first.
numeric_data = data.select_dtypes(include='number')

plt.figure(figsize=(10,5))
heatmap = sns.heatmap(numeric_data.corr(), annot=True, fmt=".1f")
heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=12)
heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=12)
plt.title('Correlation Matrix', fontsize=18)
plt.show()

## Age

In [None]:
# Age histogram with a KDE overlay.
# sns.distplot is deprecated (and removed in recent seaborn releases); histplot
# with kde=True and stat='density' is its documented replacement and keeps the
# density-normalised y-axis that distplot produced.
fig, ax = plt.subplots(figsize=(12, 5))
sns.histplot(data['age'], kde=True, stat='density', ax=ax)
ax.set_title('Age Distribution')
plt.show()

In [None]:
# Age distribution per sex, split by smoking status.
fig, ax = plt.subplots(figsize=(15, 5))
sns.violinplot(data=data, x='sex', y='age', hue='smoker', ax=ax)
ax.set_title("Age Distributions Violinplot")
plt.show()

In [None]:
# Age distribution per region.
fig, ax = plt.subplots(figsize=(15, 5))
sns.violinplot(data=data, x='region', y='age', ax=ax)
ax.set_title('Age Distributions Violinplot')
plt.show()

Based on the previous plots, patients' ages are fairly evenly (roughly uniformly) distributed.

## Sex

In [None]:
# Count of patients per sex and smoking status, one panel per region.
# catplot is a figure-level function that creates its own figure, so a preceding
# plt.figure(...) call only produces a stray empty figure and its figsize is
# never applied. The title and margins are set on the grid's own figure instead.
g = sns.catplot(data=data, x='sex', hue='smoker', col='region', kind='count', col_wrap=2)
g.fig.suptitle('Patients Sex Distribution "" Countplots', fontsize=15)
g.fig.subplots_adjust(top=0.9)  # leave room for the suptitle above the panels
plt.show()

As the previous countplots show, male and female patients are equally represented. Non-smokers, however, form a clear majority, and this holds for both males and females and in every region (almost 7/8 of the data is about non-smoking patients).

## Body Mass Index (BMI)

In [None]:
# Overall BMI histogram with a marginal box plot.
fig = px.histogram(data, x='bmi', marginal='box', title='BMI Distribution')
fig.show()

Most of the patients' BMI is around 30, which is the average index.

In [None]:
# BMI histogram coloured by sex, with one marginal box plot per sex.
fig = px.histogram(
    data,
    x='bmi',
    color='sex',
    marginal='box',
    title='BMI Distribution Over Patients Sex',
)
fig.show()

The BMI distribution is similar for male and female patients.

In [None]:
# BMI histogram coloured by region, with one marginal box plot per region.
fig = px.histogram(
    data,
    x='bmi',
    color='region',
    marginal='box',
    title='BMI Distribution Over Patients Region',
)
fig.show()

The southeast has higher BMI values than the 3 other regions that have similar BMI distributions.

In [None]:
# BMI histogram coloured by smoking status, with one marginal box plot per group.
fig = px.histogram(
    data,
    x='bmi',
    color='smoker',
    marginal='box',
    title='BMI Distribution Following Smoking Activity',
)
fig.show()

## Number of Children

In [None]:
# Number of patients for each number of children.
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(data=data, x='children', ax=ax)
ax.set_title('Number Of Children Countplot')
plt.show()

We can observe that few patients have many children (3 or more), while the majority have no children or at most 2.

In [None]:
# Number-of-children counts, grouped by region.
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(data=data, x='region', hue='children', ax=ax)
ax.set_title('Number of Children Distribution Over Regions')
plt.show()

Similar distributions over different Regions.

## Smoker

In [None]:
# Number of smokers versus non-smokers.
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(data=data, x='smoker', ax=ax)
ax.set_title("Smokers' Countplot")
plt.show()

In [None]:
# Age box plots per smoking status and sex, one panel per region.
# catplot is figure-level and builds its own figure, so the plt.figure(...) call
# that used to precede it only emitted an empty figure. The title and margins
# are applied to the grid's own figure instead.
g = sns.catplot(data=data,
                x='smoker',
                y='age',
                hue='sex',
                col='region',
                kind='box',
                col_wrap=2)
g.fig.suptitle('Smokers Age Distributions', fontsize=15)
g.fig.subplots_adjust(top=0.9)  # leave room for the suptitle above the panels
plt.show()

## Region

In [None]:
# Number of patients per region.
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(data=data, x='region', ax=ax)
ax.set_title("Regions' Countplot")
plt.show()

The southeast region has slightly more samples than the other regions, which are equally represented.

## Charges (target variable)

In [None]:
# Histogram of the target (charges) with a marginal box plot.
fig = px.histogram(data, x='charges', marginal='box', title='Charges Distribution')
fig.show()

In [None]:
# Density contours of charges against age: one facet per sex, coloured by
# smoking status, with marginal histograms on both axes.
fig = px.density_contour(
    data,
    x="age",
    y="charges",
    facet_col="sex",
    color='smoker',
    marginal_x="histogram",
    marginal_y="histogram",
)
fig.show()

From the previous plot we can observe that:

- The majority of charges values are concentrated around 10k.
- The higher the age, the higher the charges.
- Smokers have more charges than non-smokers.
- The charges distribution of male and female are similar.

In [None]:
# Joint KDE of charges against BMI.
# jointplot is figure-level and creates its own figure (sized through `height`),
# so the plt.figure(...) call that used to precede it only produced an extra
# empty figure and has been dropped.
g = sns.jointplot(data=data, x='bmi', y='charges', kind='kde', color='g', height=9)
g.fig.suptitle('Charges as function of BMI', fontsize=15)
g.fig.subplots_adjust(top=0.95)  # leave room for the suptitle
plt.show()

The majority of data is centered around (BMI=30, Charges=10k).

In [None]:
# Charges against BMI, coloured by smoking status; marker size also encodes charges.
fig = px.scatter(
    data,
    x="bmi",
    y="charges",
    color="smoker",
    size='charges',
)
fig.update_layout(title_text='charges = f(bmi)')
fig.show()

The highest BMI values correspond to the highest charges and, as we noticed in a previous plot, smokers are likely to have the highest charges.

In [None]:
# One box (with all points overlaid) of charges per number of children.
fig = go.Figure()

# unique() returns values in order of first appearance, which scatters the
# numeric categories along the x-axis; sort them so the boxes read in
# increasing number of children.
for n_children in sorted(data.children.unique()):
    children_charges = data.loc[data.children == n_children, 'charges']
    fig.add_trace(go.Box(
        y=children_charges.values,
        name=str(n_children),
        boxpoints='all',
        jitter=0.5,
        whiskerwidth=0.2,
        marker_size=2,
        line_width=1)
    )
fig.update_layout(title_text="Charges Distribution Over Possible Numbers of Children")
fig.show()

In [None]:
# One box (with all points overlaid) of charges per region.
fig = go.Figure()

# Styling shared by every box trace.
box_style = dict(boxpoints='all', jitter=0.5, whiskerwidth=0.2, marker_size=2, line_width=1)

for region in data.region.unique():
    region_charges = data.loc[data.region == region, 'charges'].values
    fig.add_trace(go.Box(y=region_charges, name=region, **box_style))

fig.update_layout(title_text="Charges Distribution Over Regions")
fig.show()

## Features Engineering

Body Mass Index classes:

- `18.5 to 24.9`: Normal Weight.
- `25 to 29.9`: Overweight.
- `30 to 34.9`: Obesity Class 1.
- `35 to 39.9`: Obesity Class 2.
- `>= 40` : Obesity Class 3.

In [None]:
# Work on a copy so the engineered columns added below do not modify `data`.
new_data = data.copy()

In [None]:
# Bin BMI into weight classes.
# Previously every BMI below 25 was labelled 'Normal', which also swallowed
# patients below 18.5 even though the normal band is 18.5-24.9; those rows now
# get their own 'Underweight' label. Rows with a missing BMI stay unlabelled (NaN).
bmi = new_data['bmi']
bmi_bands = {
    'Underweight': bmi < 18.5,
    'Normal': (bmi >= 18.5) & (bmi < 25),
    'Overweight': (bmi >= 25) & (bmi < 30),
    'Class 1': (bmi >= 30) & (bmi < 35),
    'Class 2': (bmi >= 35) & (bmi < 40),
    'Class 3': bmi >= 40,
}
for bmi_label, in_band in bmi_bands.items():
    new_data.loc[in_band, 'bmi_class'] = bmi_label

Age Classes:

- `18 to 30`: Young
- `31 to 50`: Adult
- `> 50`: Old

In [None]:
# Bin age into three classes; ages below 18 (if any) stay unlabelled (NaN).
age = new_data['age']
age_bands = {
    'Young': (age >= 18) & (age < 31),
    'Adult': (age >= 31) & (age < 51),
    'Old': age >= 51,
}
for age_label, in_band in age_bands.items():
    new_data.loc[in_band, 'age_class'] = age_label

In [None]:
# Preview the frame with the new bmi_class / age_class columns.
new_data.head()

In [None]:
# Share of patients in each BMI class ('age' is used only as a row counter).
bmi_classes_data = new_data.groupby('bmi_class').count()
bmi_class_sizes = bmi_classes_data['age']
fig = px.pie(
    names=bmi_class_sizes.index,
    values=bmi_class_sizes.values,
    title='BMI Classes Distribution',
)
fig.show()

The majority of patients suffer from obesity (class 1 + class 2 + class 3 ~ 53%).

In [None]:
# Share of patients in each age class ('age' is used only as a row counter).
age_classes_data = new_data.groupby('age_class').count()
age_class_sizes = age_classes_data['age']
fig = px.pie(
    names=age_class_sizes.index,
    values=age_class_sizes.values,
    title='Age Classes Distribution',
)
fig.show()

The majority of patients are adults.

In [None]:
# Charges histogram coloured by BMI class, with one marginal box plot per class.
fig = px.histogram(
    new_data,
    x='charges',
    color='bmi_class',
    marginal='box',
    title='Charges Distribution Over BMI Classes',
)
fig.show()

We can notice that the 3 Obesity Classes have higher values of charges.

In [None]:
# Charges histogram coloured by age class, with one marginal box plot per class.
fig = px.histogram(
    new_data,
    x='charges',
    color='age_class',
    marginal='box',
    title='Charges Distribution Over Age Classes',
)
fig.show()

The previous plot shows that old patients have higher medical charges than adult and young patients.

# Regression: XGBOOST

## Label Encoding

In [None]:
# Label-encode the categorical columns in place.
# A single LabelEncoder was previously refit on each column in turn, which
# discarded every category -> integer mapping except the last one. A separate
# fitted encoder is now kept per column in `encoders`, so encoded values can be
# decoded later (e.g. encoders['region'].inverse_transform(...)) when reading
# the model explanations. `encoder` still ends up bound to the encoder fitted
# on 'sex', as before.
categorical_columns = ['bmi_class', 'age_class', 'smoker', 'region', 'sex']
encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    new_data[column] = encoder.fit_transform(new_data[column])
    encoders[column] = encoder

## Log Transformation 

As we clearly saw in the data analysis, the charges distribution is skewed. For this reason we'll apply a log transformation and train our model(s) on the new target values and retransform predictions later.

In [None]:
# Natural log of the target; np.exp maps values on this scale back to charges.
new_data['log_charges'] = np.log(new_data['charges'])

In [None]:
# Raw charges (top) versus log-transformed charges (bottom).
# sns.distplot is deprecated (and removed in recent seaborn releases); histplot
# with kde=True and stat='density' is its documented replacement and keeps the
# histogram + KDE, density-normalised look.
figure = plt.figure(figsize=(15,5))
ax1 = figure.add_subplot(2,1,1)
g1 = sns.histplot(new_data['charges'], kde=True, stat='density', ax=ax1)
ax2 = figure.add_subplot(2,1,2)
g2 = sns.histplot(new_data['log_charges'], kde=True, stat='density', ax=ax2)
figure.suptitle('Log Transformation of Charges Distribution', fontsize=15)
figure.tight_layout()
plt.subplots_adjust(top=0.9)  # leave room for the suptitle after tight_layout
plt.show()

We can visualize the transformation result with plotly too.

In [None]:
# Same before/after comparison with plotly: one histogram (plus marginal box
# plot) for the raw target and one for its log transform.
charges_views = [
    ('charges', 'Initial Charges Distribution'),
    ('log_charges', 'Log Transformation of Charges Distribution'),
]
for column, title in charges_views:
    fig = px.histogram(new_data, x=column, marginal='box', title=title)
    fig.show()

## XGBOOST Model Creation

In [None]:
# Features are every column except the two target variants; the model is
# trained on the log-scale target, and the raw charges are kept alongside.
target_columns = ['charges', 'log_charges']
X = new_data.drop(columns=target_columns)
y = new_data['log_charges']
ground_truth = new_data['charges']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=44)

In [None]:
# Hyper-parameters of the gradient-boosted regressor, kept together so they are
# easy to inspect and tweak.
xgb_params = dict(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=10,
    colsample_bytree=0.85,
    reg_lambda=1,
)
model = xgb.XGBRegressor(**xgb_params)

In [None]:
# Train on the training split; the target here is the log of charges.
model.fit(X_train, y_train)

## Features Importances

### Built-in Feature Importance


How the importance is calculated: either "weight", "gain", or "cover"

* "weight" is the number of times a feature appears in a tree (default importance type).

* "gain" is the average gain of splits which use the feature.

* "cover" is the average coverage of splits which use the feature, where coverage is defined as the number of samples affected by the split.

In [None]:
# Built-in XGBoost feature importance, measured by 'cover'.
# The default axis label and title are generic (older xgboost releases label the
# axis "F score"), which does not say that 'cover' is what is plotted, so both
# are set explicitly.
fig, ax = plt.subplots(figsize=(13,5))
xgb.plot_importance(model,
                    ax=ax,
                    importance_type='cover',
                    xlabel='Cover',
                    title='Feature importance (cover)')
plt.show()

### SHAP: SHapley Additive exPlanations

SHAP is a game theoretic approach to explain the output of any machine learning model. It connects optimal credit allocation with local explanations using the classic Shapley values from game theory and their related extensions. [SHAP Github](https://github.com/slundberg/shap)

In [None]:
# SHAP explainer for tree models, built around the fitted XGBoost regressor.
importance_explainer = shap.TreeExplainer(model)

In [None]:
# SHAP values for the training samples; indexed below as [sample, feature].
shap_values = importance_explainer.shap_values(X_train)

**- Summary Plot**

In [None]:
# Bar summary of SHAP feature importance over the training set.
shap.summary_plot(shap_values, X_train, plot_type="bar")

The SHAP feature-importance barplot indicates that "smoker" is the most important feature, together with "age", while "bmi_class" is the least important one.
In addition to the previous plot, we can have this more detailed one too:

In [None]:
# Default (detailed) SHAP summary plot for the training set.
shap.summary_plot(shap_values, X_train)

- The goal from this density scatter plot is to identify the impact of each feature on the model's output. Features are ordered following the sum of SHAP value magnitudes across all samples.
- The x-axis shows the SHAP values (impact on the model output), the y-axis lists the features, and the color encodes the feature value (red indicates high values while blue indicates low values of the feature).
- From this plot we can notice that high values of the "smoker" feature (ones, since it's a binary feature) have an important impact on the model output, but for fewer samples in comparison with low values of the same feature, which affect more samples.
- We can also observe that the "age" feature influences the model output, with a uniform distribution, through both its high and low values.
- As another example, the "bmi" feature has less importance, and only its high values significantly influence the model output.

**- Dependence Plot**

SHAP dependence plots show the effect of a single feature across the whole dataset and are only defined in regions of the input space supported by data.

In [None]:
# One SHAP dependence plot per feature column of the training set.
for feature in X_train.columns:
    shap.dependence_plot(feature, shap_values, X_train)

**- A single Prediction Impact**

In [None]:
# Force plot for the first training sample: how its feature values move the
# prediction away from the explainer's expected (base) value.
shap.force_plot(importance_explainer.expected_value, shap_values[0,:], X_train.iloc[0,:])

- The previous plot shows us 6 features (from a total of 8) each contributing to push the model output from the base value (the average model output over the training dataset we passed) towards 8.72.
- Features shown in red are influencing the model output by pushing the label higher while the ones shown in blue are pushing the value lower.

In [None]:
# Feature values and log-scale target of the sample explained above.
X_train.iloc[0,:], y_train.iloc[0]

We can visualize the effect of more, or even all, of the samples at once.

In [None]:
# Force plot covering all training samples at once.
shap.force_plot(importance_explainer.expected_value, shap_values, X_train)

*NB: We can change the parameters (the scroll menu) near the X-Axis and y-axis to have detailed explanations by feature over different samples.*

## Predictions

In [None]:
# Predictions on the held-out split; these are on the log-charges scale the model was trained on.
predictions = model.predict(X_test)

In [None]:
# RMSE on the log scale the model was trained on. Arguments follow scikit-learn's
# (y_true, y_pred) convention; they were previously swapped, which is harmless
# for MSE but misleading to read.
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f'RMSE: {np.round(rmse, 3)}')

# The value above is in log units. Map both the targets and the predictions back
# with exp() to also report the error in the original charge units.
rmse_charges = np.sqrt(mean_squared_error(np.exp(y_test), np.exp(predictions)))
print(f'RMSE (charges, original scale): {np.round(rmse_charges, 3)}')

In [None]:
# Predicted versus true log-charges, plotted by position in the test split.
fig, ax = plt.subplots(figsize=(20, 6))
ax.plot(np.arange(len(predictions)), predictions, label='Predictions')
ax.plot(np.arange(len(y_test)), y_test, color='r', label='Ground Truth')
ax.set_ylabel('Log Charges')
ax.legend()
plt.show()