# Import libraries

In [None]:
# Data wrangling and data visualisation 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from pdpbox import pdp, get_dataset, info_plots
import shap

# Import and read data 

In [None]:
# Load the insurance CSV into a DataFrame and preview its first rows
csv_path = "../input/insurance/insurance.csv"
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Report the dataset dimensions; DataFrame.shape is a (rows, columns) tuple
print("Data shape: ", data.shape)

In [None]:
# Count the missing entries in each column
# (the notebook's author noted that this dataset has none)
data.isna().sum()

# Feature Importance

In [None]:
# Encode the two binary categorical columns as 0/1 integers
binary_codes = {
    'sex': {'female': 1, 'male': 0},
    'smoker': {'yes': 1, 'no': 0},
}
for column_name, codes in binary_codes.items():
    data[column_name] = data[column_name].map(codes)

# One-hot encode the categorical columns that are still non-numeric
data = pd.get_dummies(data)

# Separate the target ('charges') from the predictors
y = data['charges']
X = data.drop(columns = 'charges')

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 42,
)
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"y_val shape: {y_val.shape}")

# Fit a random forest on the training split
rf = RandomForestRegressor(random_state = 42)
rf.fit(X_train, y_train)

# Forest-level importances, their spread across the individual trees,
# and the feature positions ordered from most to least important
importances = rf.feature_importances_
per_tree_importances = [tree.feature_importances_ for tree in rf.estimators_]
std = np.std(per_tree_importances, axis = 0)
indices = np.argsort(importances)[::-1]

In [None]:
# Print the features ranked by importance (1 = most important), rounded to 3 decimals
for rank, column_position in enumerate(indices, start = 1):
    feature_name = X_train.columns[column_position]
    score = round(importances[column_position], 3)
    print(f"{rank}. {feature_name} ({score})")

In [None]:
# Bar chart of the importances in ranked order; the error bars are the
# standard deviation of each feature's importance across the forest's trees
n_features = X_train.shape[1]
bar_positions = range(n_features)
ranked_labels = X_train.columns[indices]

plt.figure(figsize = (10, 5))
plt.bar(bar_positions, importances[indices], yerr = std[indices])
plt.xticks(bar_positions, ranked_labels, rotation = 90)
plt.title("Feature importances")
plt.show()

# Permutation Importance

In [None]:
# Permutation importance of the fitted forest, measured on the validation split
perm = PermutationImportance(rf, random_state = 42)
perm.fit(X_val, y_val)

# Display the resulting weights labelled with the validation column names
val_feature_names = X_val.columns.tolist()
eli5.show_weights(perm, feature_names = val_feature_names)

# Partial Dependence Plots

In [None]:
# Partial dependence of the forest's prediction on 'bmi', evaluated on the validation split
bmi_feature = 'bmi'
pdp_bmi = pdp.pdp_isolate(
    model = rf,
    dataset = X_val,
    model_features = X_train.columns,
    feature = bmi_feature,
)

# Draw the partial dependence curve
pdp.pdp_plot(pdp_bmi, bmi_feature)
plt.show()

In [None]:
# Partial dependence of the forest's prediction on 'age', evaluated on the validation split
age_feature = 'age'
pdp_age = pdp.pdp_isolate(
    model = rf,
    dataset = X_val,
    model_features = X_train.columns,
    feature = age_feature,
)

# Draw the partial dependence curve
pdp.pdp_plot(pdp_age, age_feature)
plt.show()

# 2D Partial Dependence Plots

In [None]:
# The pair of features whose joint effect on the prediction is plotted
features_to_plot = ['bmi', 'age']

# Two-way partial dependence of the forest's prediction, evaluated on the validation split
inter = pdp.pdp_interact(model = rf, dataset = X_val, model_features = X_train.columns, features = features_to_plot)

# feature_names labels the two plot axes, so it must name the interacting pair;
# passing every model column would label the axes with the first two columns of X instead
pdp.pdp_interact_plot(pdp_interact_out = inter, feature_names = features_to_plot, plot_type = 'contour')
plt.show()

# SHAP Values

In [None]:
# Build a tree explainer on the fitted forest and compute SHAP values
# for every row of the validation split
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_val)

# Summarise the SHAP values of all features across the validation rows
shap.summary_plot(shap_values, features = X_val)

In [None]:
# SHAP dependence plot for 'bmi' over the validation rows, with 'age' as the
# interaction feature (used by SHAP to colour the points)
shap.dependence_plot('bmi', shap_values, X_val, interaction_index = 'age')

In [None]:
# Load SHAP's JavaScript so the interactive force plots can render in the notebook
shap.initjs()
def shap_plot(index):
    """Return a SHAP force plot for the validation row(s) selected by ``index``.

    Args:
        index: Positional selector into ``X_val`` -- an integer for a single
            row, or anything else ``DataFrame.iloc`` accepts (slice, list of
            positions) for several rows.

    Returns:
        The object produced by ``shap.force_plot`` for the selected row(s),
        using the explainer's expected value as the base value.
    """
    explainer = shap.TreeExplainer(rf)
    selected = X_val.iloc[index]
    # Explain only the requested rows rather than the whole validation set on every call.
    if selected.ndim == 1:
        # A single position yields a Series; re-select it as a one-row frame so the
        # explainer sees the original column dtypes, then unwrap the single result row.
        row_values = explainer.shap_values(X_val.iloc[[index]])[0]
    else:
        row_values = explainer.shap_values(selected)
    return shap.force_plot(explainer.expected_value, row_values, selected)

In [None]:
# Show the first three validation rows, i.e. the rows passed to shap_plot in the following cells
X_val[:3]

In [None]:
# Force plot for the validation row at position 0
shap_plot(0)

In [None]:
# Force plot for the validation row at position 1
shap_plot(1)

In [None]:
# Force plot for the validation row at position 2
shap_plot(2)