# Resources

1. [Explainable AI (XAI) with SHAP -Multi-Class Classification Problem](https://towardsdatascience.com/explainable-ai-xai-with-shap-multi-class-classification-problem-64dd30f97cea)
2. [Nested k-fold cross-validation](https://machinelearningmastery.com/nested-cross-validation-for-machine-learning-with-python/)

In [None]:
import numpy as np
import pandas as pd
import shap
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
from sklearn.utils import resample

# Load SHAP's JavaScript visualisation code into the notebook so that
# interactive plots (such as force plots) can render.
shap.initjs()

# SHAP and binary classification

In [None]:
 data = load_breast_cancer()
X = pd.DataFrame(data['data'], columns=data['feature_names'])
y = pd.Series(data['target'])

In [None]:
# Display the feature (column) names of X.
X.columns

In [None]:
# Show how many samples carry each target value.
y.value_counts()

In [None]:
# Relabel target value 0 with the string 'class_0'.
y = y.replace(to_replace=0, value='class_0')

In [None]:
# Relabel target value 1 with the string 'class_1'.
y = y.replace(to_replace=1, value='class_1')

In [None]:
# Display the per-label counts of y again (now with the string labels).
y.value_counts()

In [None]:
# Hold out 25% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=12)

# Fit a 100-tree random forest on the training portion. The forest is seeded too,
# so the fitted model (and every SHAP value derived from it) is identical on each run.
rf = RandomForestClassifier(n_estimators=100, random_state=12)
rf.fit(X_train, y_train)

In [None]:
# explain the model's predictions using SHAP
# X_train is handed to the explainer as its background data, and SHAP values
# are then computed for every row of X_train.
# NOTE(review): later cells index shap_values[0] / shap_values[1], i.e. they assume
# one array per class is returned -- confirm for the installed shap version.
explainer = shap.Explainer(rf,X_train)
shap_values = explainer.shap_values(X_train)

## SHAP Summary Plot

In [None]:
# visualize the general contribution.
# Bar chart of overall feature importance. shap_values was computed on X_train,
# so X_train (not the full X) is passed as the feature matrix to keep rows aligned.
shap.summary_plot(shap_values, X_train.values, max_display=30, plot_type="bar", feature_names=X.columns)

In [None]:
#class_0 -- this plots index 0 of shap_values.
# NOTE(review): the label previously read "class_1"; index 0 presumably follows
# rf.classes_ order, where 'class_0' sorts first -- verify with rf.classes_.
shap.summary_plot(shap_values[0],X_train.values, feature_names = X.columns,max_display=30)

In [None]:
#class_1 -- this plots index 1 of shap_values.
# NOTE(review): the label previously read "class_0"; index 1 presumably follows
# rf.classes_ order, where 'class_1' sorts second -- verify with rf.classes_.
shap.summary_plot(shap_values[1], X_train.values, feature_names = X.columns,max_display=30)

## SHAP Dependence Plot

In [None]:
# Dependence plot for the feature at column index 0, using the index-0 SHAP values,
# with "worst perimeter" as the interaction (colouring) feature.
shap.dependence_plot(0,shap_values[0], X_train.values, interaction_index="worst perimeter",feature_names=X.columns)

In [None]:
# Same dependence plot (feature at column index 0, interaction with "worst perimeter"),
# this time using the index-1 SHAP values.
shap.dependence_plot(0,shap_values[1], X_train.values,  interaction_index="worst perimeter", feature_names=X.columns)

## SHAP Force plot

In [None]:
# Force plot for a single training row (positional index i), built from the
# explainer's expected value and the SHAP values of output index 0.
i=1
shap.force_plot(explainer.expected_value[0], shap_values[0][i], X_train.values[i], feature_names = X.columns)

## SHAP waterfall plot

In [None]:
# Waterfall plot for one training row (positional index `row`), built from the
# SHAP values and the expected value of output index 1.
row = 200
row_explanation = shap.Explanation(
    values=shap_values[1][row],
    base_values=explainer.expected_value[1],
    data=X_train.iloc[row],
    feature_names=X_train.columns.tolist(),
)
shap.waterfall_plot(row_explanation)

# Nested K-Fold Cross-Validation

In [None]:
# Print the built-in documentation of make_classification.
help(make_classification)

In [None]:
# Print the built-in documentation of RandomizedSearchCV.
help(RandomizedSearchCV)

In [None]:
# Synthetic two-class dataset: 1000 samples, 20 features
# (10 informative + 10 redundant), generated with a fixed seed.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=10,
    n_redundant=10,
    n_classes=2,
    random_state=1,
)
# Wrap features and labels in DataFrames for the splitting code that follows.
x_frame = pd.DataFrame(data=X)
y_frame = pd.DataFrame(data=y)

In [None]:
# Show how many samples fall into each class of the synthetic labels.
y_frame.value_counts()

In [None]:
# Display the column labels of the synthetic feature frame.
x_frame.columns

In [None]:
# Display the synthetic feature frame itself.
x_frame

In [None]:
# Outer splitter: 10 stratified, shuffled folds (used to hold out a test fold).
cv_outer = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
# Inner splitter: 3 stratified, shuffled folds (passed to the hyper-parameter search).
cv_inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)

In [None]:
# Nested cross-validation for a random forest: every outer fold holds out a test
# set, and RandomizedSearchCV tunes the hyper-parameters on the remaining data
# using the inner splitter.

# Candidate hyper-parameter values; identical for every fold, so built once.
space = {
    'n_estimators': [10, 100, 500],
    'max_features': [2, 4, 6, 8],
}
outer_results = []
for train_ix, test_ix in cv_outer.split(x_frame, y_frame):
    # split data
    X_train, X_test = x_frame.iloc[train_ix, :], x_frame.iloc[test_ix, :]
    y_train, y_test = y_frame.iloc[train_ix], y_frame.iloc[test_ix]
    model = RandomForestClassifier(random_state=1)
    # random_state makes the sampled candidate configurations repeatable, in line
    # with the seeded model and CV splitters.
    # refit=True: refit on the whole training data after the search ends.
    search = RandomizedSearchCV(model, space, scoring='f1', cv=cv_inner, refit=True,
                                random_state=1)
    result = search.fit(X_train, y_train.values.ravel())
    best_model = result.best_estimator_
    yhat = best_model.predict(X_test)
    f1score = f1_score(y_test, yhat)
    outer_results.append(f1score)
    # f1_score is measured on the held-out outer fold; est is the best inner-CV score.
    print('inner test: f1_score=%.3f, est=%.3f, cfg=%s' % (f1score, result.best_score_, result.best_params_))
# summarize the estimated performance of the model
print('outer test: f1-score mean: %.3f (std: %.3f)' % (np.mean(outer_results), np.std(outer_results)))
    

# Bootstrap Splitting

In [None]:
# Build 10 bootstrap splits. Each entry of Boot_splits is
# [X_inbag, Y_inbag, X_outbag, Y_outbag]: the rows drawn with replacement and
# the rows that were never drawn.
Boot_splits = []
n_rows = len(x_frame)
for i in range(10):
    # Draw n_rows rows with replacement; seeding with i gives a different but
    # repeatable sample for every split.
    X_inbag, Y_inbag = resample(x_frame, y_frame, replace=True, n_samples=n_rows, random_state=i)

    # Out-of-bag rows: index labels that never appear in the in-bag sample.
    # A boolean mask is used instead of treating labels as positions, so the
    # selection stays correct even if the index is not 0..n-1.
    oob_mask = ~x_frame.index.isin(X_inbag.index)

    X_outbag = x_frame.loc[oob_mask]
    Y_outbag = y_frame.loc[oob_mask]

    # add them all to one list
    Boot_splits.append([X_inbag, Y_inbag, X_outbag, Y_outbag])

In [None]:
# Display the in-bag features of the first bootstrap split.
Boot_splits[0][0]

In [None]:
# Display the in-bag labels of the first bootstrap split.
Boot_splits[0][1]

In [None]:
# 3 stratified, shuffled folds with a fixed seed, passed to the
# hyper-parameter search in the bootstrap evaluation.
SKF = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)

In [None]:
# Bootstrap evaluation of a random forest: for every bootstrap split, tune the
# hyper-parameters on the in-bag rows (3-fold CV via SKF) and score the refitted
# best model on the out-of-bag rows.

# Candidate hyper-parameter values; identical for every split, so built once.
space = {
    'n_estimators': [10, 100, 500],
    'max_features': [2, 4, 6, 8],
}
outer_results = []
for i, (X_inbag, Y_inbag, X_outbag, Y_outbag) in enumerate(Boot_splits):
    print('Iteration --> '+str(i))
    model = RandomForestClassifier(random_state=1)
    # random_state makes the sampled candidate configurations repeatable.
    # refit=True: refit on the whole in-bag data after the search ends.
    search = RandomizedSearchCV(model, space, scoring='f1', cv=SKF, refit=True,
                                random_state=1)
    result = search.fit(X_inbag, Y_inbag.values.ravel())
    best_model = result.best_estimator_
    yhat = best_model.predict(X_outbag)
    f1score = f1_score(Y_outbag, yhat)
    outer_results.append(f1score)
    # f1_score is measured on the out-of-bag rows; est is the best CV score on the in-bag rows.
    print('each bootstrap : f1_score=%.3f, est=%.3f, cfg=%s' % (f1score, result.best_score_, result.best_params_))
# summarize the estimated performance of the model
print('average bootstraps: f1-score mean: %.3f (std: %.3f)' % (np.mean(outer_results), np.std(outer_results)))
   
    

# [XGBoost hyper-parameters](https://xgboost.readthedocs.io/en/latest/parameter.html)

## [Example 1](https://towardsdatascience.com/xgboost-fine-tune-and-optimize-your-model-23d996fab663)
## [Example 2](https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/)

In [None]:
# Candidate hyper-parameter values for the XGBoost random search.
# Defined as a real dict (not a string) so that the search cell can use it.
XGBoost_params = {
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 500, 1000],
    'colsample_bytree': [0.3, 0.7],
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100],
    'reg_lambda': [0, 0.001, 0.005, 0.01, 0.05],
}

In [None]:
#Example Using Nested XGBoost
# Same nested cross-validation scheme as for the random forest, with an XGBoost
# classifier and the XGBoost_params search space.

# Start from an empty list so the summary below covers only the XGBoost folds,
# not scores left over from earlier cells.
outer_results = []
space = XGBoost_params
for train_ix, test_ix in cv_outer.split(x_frame, y_frame):
    # split data
    X_train, X_test = x_frame.iloc[train_ix, :], x_frame.iloc[test_ix, :]
    y_train, y_test = y_frame.iloc[train_ix], y_frame.iloc[test_ix]
    model = XGBClassifier(random_state=1, objective='binary:hinge', use_label_encoder=False)
    # random_state makes the sampled candidate configurations repeatable.
    # refit=True: refit on the whole training data after the search ends.
    search = RandomizedSearchCV(model, space, scoring='f1', cv=cv_inner, refit=True,
                                random_state=1)
    result = search.fit(X_train, y_train.values.ravel())
    best_model = result.best_estimator_
    yhat = best_model.predict(X_test)
    f1score = f1_score(y_test, yhat)
    outer_results.append(f1score)
    # f1_score is measured on the held-out outer fold; est is the best inner-CV score.
    print('inner test: f1_score=%.3f, est=%.3f, cfg=%s' % (f1score, result.best_score_, result.best_params_))
# summarize the estimated performance of the model
print('outer test: f1-score mean: %.3f (std: %.3f)' % (np.mean(outer_results), np.std(outer_results)))
    