In [None]:
!pip install optuna

In [None]:
import shap
import optuna
from sklearn import set_config
from xgboost import XGBRegressor
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score

# Dataset

In [None]:
# configure scikit-learn so that transformers output pandas objects
set_config(transform_output="pandas")

In [None]:
# load the California housing data; as_frame=True requests pandas objects
# (the returned container's .data, .target and .feature_names are used below)

data = fetch_california_housing(as_frame=True)

In [None]:
data.keys()

In [None]:
# feature matrix (X) and regression target (y) taken from the loaded dataset

X = data.data
y = data.target

In [None]:
# column names of the features (used later to label the SHAP importances)

feature_names = data.feature_names

feature_names

In [None]:
X.head()

In [None]:
y.head()

In [None]:
X.shape

In [None]:
# hold out 20% of the rows for testing; the fixed seed keeps the split repeatable

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# report the size of each split
print("Shape of Training data: ", X_train.shape)
print("Shape of Testing data: ", X_test.shape)

# Model

In [None]:
# baseline XGBoost regressor with hand-picked hyperparameters
xgb_model = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=200,
    learning_rate=0.2,
    max_depth=7,
    gamma=0.05,
    reg_lambda=50,
    n_jobs=-1,
    random_state=42,
)

In [None]:
# fit the training data
xgb_model.fit(X_train, y_train)

In [None]:
# calculate the predictions, then RMSE and R2, one split at a time

# training split
y_pred_train = xgb_model.predict(X_train)
rmse_train = root_mean_squared_error(y_train, y_pred_train)
r2_train = r2_score(y_train, y_pred_train)

# test split
y_pred_test = xgb_model.predict(X_test)
rmse_test = root_mean_squared_error(y_test, y_pred_test)
r2_test = r2_score(y_test, y_pred_test)

# report both splits side by side
print(f"Train RMSE: {rmse_train:.4f}, R2: {r2_train:.4f}")
print(f"Test RMSE: {rmse_test:.4f}, R2: {r2_test:.4f}")

# HP Tuning

In [None]:
# tune the model

# Carve a validation set out of the training data. Trials are scored on it so
# that the test set is never used for model selection and the final test
# metrics remain an honest estimate of generalization.
X_fit, X_valid, y_fit, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)


def objective(trial):
    """Optuna objective: validation R2 of an XGBRegressor for one trial.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample learning_rate, n_estimators, max_depth, reg_lambda,
        gamma and subsample from the ranges listed below.

    Returns
    -------
    float
        R2 score of the fitted model on the held-out validation split.
    """
    params = {
        # hyperparameters searched by the study
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 1, 100),
        'gamma': trial.suggest_float('gamma', 0, 2),
        "subsample": trial.suggest_float('subsample', 0.7, 1.0),
        # fixed settings shared by every trial
        "n_jobs": -1,
        'objective': 'reg:squarederror',
        'random_state': 42
    }

    # build the model with the sampled parameters
    model = XGBRegressor(**params)

    # fit on the reduced training split only
    model.fit(X_fit, y_fit)

    # score on the validation split; the test data is not touched here
    y_pred = model.predict(X_valid)
    return r2_score(y_valid, y_pred)

In [None]:
# create a study
# A seeded sampler makes the sequence of suggested hyperparameters repeatable.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Trials run sequentially (n_jobs=1): each XGBoost fit inside the objective
# already uses all cores (n_jobs=-1), so parallel trials would oversubscribe
# the CPU, and concurrent trials make a seeded search non-reproducible.
study.optimize(func=objective, n_trials=50, show_progress_bar=True, n_jobs=1)

In [None]:
# get the best parameters
study.best_params

In [None]:
# get the best value
study.best_value

In [None]:
# get the best estimator
# study.best_params only contains the values suggested by the trial. The fixed
# settings used inside the objective (objective, random_state, n_jobs) are
# re-applied here so the refit model uses the same configuration that was
# evaluated during tuning (the seed matters because subsample can be < 1).
best_model = XGBRegressor(
    **study.best_params,
    objective='reg:squarederror',
    random_state=42,
    n_jobs=-1,
)

best_model.fit(X_train, y_train)

In [None]:
# metrics of the tuned model on both splits

# training split: predictions, then RMSE and R2
y_pred_train = best_model.predict(X_train)
rmse_train, r2_train = (
    root_mean_squared_error(y_train, y_pred_train),
    r2_score(y_train, y_pred_train),
)

# test split: predictions, then RMSE and R2
y_pred_test = best_model.predict(X_test)
rmse_test, r2_test = (
    root_mean_squared_error(y_test, y_pred_test),
    r2_score(y_test, y_pred_test),
)

print(f"Train RMSE: {rmse_train:.4f}, R2: {r2_train:.4f}")
print(f"Test RMSE: {rmse_test:.4f}, R2: {r2_test:.4f}")

# SHAP

In [None]:
best_model

In [None]:
# create the tree explainer for the tuned model
# the training features are passed as the explainer's `data` argument
# NOTE(review): X_train is the full training set; SHAP's documentation suggests
# a background sample of roughly 100-1000 rows is usually enough — confirm
# whether subsampling would be acceptable here to reduce runtime

explainer = shap.TreeExplainer(model=best_model,
                               data=X_train)

In [None]:
# expected value
explainer.expected_value

In [None]:
# calculate the shap values for all test data

shap_values = explainer(X_test)

In [None]:
shap_values.shape

In [None]:
len(feature_names)

In [None]:
feature_names

In [None]:
shap_values[0:5]

In [None]:
X_test.head(1).values

# GLOBAL PLOTS

In [None]:
shap_values

In [None]:
shap_values.shape

In [None]:
# bar plot to plot feature importance

shap.plots.bar(shap_values)

In [None]:
shap_values.abs.mean(axis=0)

In [None]:
shap_values.abs.mean(axis=0).values

In [None]:
dict(zip(feature_names,shap_values.abs.mean(axis=0).values))

In [None]:
# mean absolute SHAP value per feature, stored as plain Python floats keyed by name
mean_abs_shap = shap_values.abs.mean(axis=0).values
feature_importances = dict(zip(feature_names, mean_abs_shap.tolist()))

In [None]:
feature_importances

In [None]:
sorted(feature_importances, key=feature_importances.get, reverse=True)

In [None]:
# heatmap plot

shap.plots.heatmap(shap_values)

In [None]:
# summary plot

shap.plots.beeswarm(shap_values)

In [None]:
# scatter plot (Dependence Plot)
shap.plots.scatter(shap_values[:, "MedInc"])

In [None]:
shap_values[:, "Latitude"]

In [None]:
shap.plots.scatter(shap_values[:, "AveOccup"])

In [None]:
shap.plots.scatter(shap_values[:, "HouseAge"])

In [None]:
# scatter plot with interaction

shap.plots.scatter(shap_values[:, "HouseAge"], color=shap_values)

# LOCAL PLOTS

In [None]:
# pick one test row to explain; the fixed seed selects the same row on every
# run, so the local plots below are reproducible after Restart & Run All
test_case = X_test.sample(1, random_state=42)

test_case

In [None]:
explained_row = explainer(test_case)

explained_row

In [None]:
explained_row[0,:].shape

In [None]:
explained_row.shape

In [None]:
# waterfall plot

shap.plots.waterfall(explained_row[0])

In [None]:
# force plot

shap.plots.initjs()

shap.plots.force(explained_row[0])

In [None]:
# bar plot for local explanation

shap.plots.bar(explained_row[0])

# LIME

In [None]:
# test case

test_case

In [None]:
# flatten the one-row frame into a 1D array (passed to LIME as data_row below)

test_case_lime = test_case.to_numpy().flatten()

In [None]:
!pip install lime

In [None]:
from lime.lime_tabular import LimeTabularExplainer

In [None]:
X_train.values.shape

In [None]:
# feature names as a plain list, in the column order of the training frame

feature_names = list(X_train.columns)

In [None]:
# make a LIME explainer
# random_state fixes the perturbation sampling so that explanations are
# reproducible from run to run

lime_explainer = LimeTabularExplainer(
    training_data=X_train.values,
    mode="regression",
    feature_names=feature_names,
    random_state=42
)

In [None]:
# get the lime explanations

lime_explanation = lime_explainer.explain_instance(
    data_row=test_case_lime,
    predict_fn=xgb_model.predict
)

In [None]:
# lime explanations --> list of values

lime_explanation.as_list()

In [None]:
# lime explanations --> pyplot figure

lime_explanation.as_pyplot_figure()

In [None]:
# lime explanations --> as html

from IPython.core.display import HTML

In [None]:
xgb_model.predict(test_case)

In [None]:
# display explanation

display(HTML(lime_explanation.as_html()))
