In [None]:
%run models.ipynb
%run datasets.ipynb

In [None]:
import signal
import time

import dalex as dx
import numpy as np
import pandas as pd
import shap
from matplotlib import pyplot as plt

In [None]:
# Build the two models under comparison through the create_model helper.
# NOTE(review): create_model is not defined in this notebook; it presumably
# comes from the `%run models.ipynb` cell — confirm.
svm = create_model('svm', kernel='linear')
xgboost = create_model('xgboost')

## Porównanie zbieżności kernel shap i unbiased kernel shap

In [None]:
# Train/test splits of the 'house_price' dataset, one per model. The two calls
# differ only in train_size (200 for the SVM split, 500 for the XGBoost split).
# NOTE(review): whether both test splits contain the same rows depends on
# get_test_train, which is not visible here — confirm before comparing models.
X_train_svm, X_test_svm, y_train_svm, y_test_svm = get_test_train('house_price', train_size=200)
X_train_xgboost, X_test_xgboost, y_train_xgboost, y_test_xgboost = get_test_train('house_price', train_size=500)

In [None]:
# Fit the SVM on its training split; the trailing expression shows the model's
# score on the SVM test split as the cell output.
# NOTE(review): the metric behind .score() depends on what create_model
# returns — presumably R^2 for a regressor; confirm.
svm.fit(X_train_svm, y_train_svm)
svm.score(X_test_svm, y_test_svm)

In [None]:
# Fit XGBoost on its training split; the trailing expression shows the model's
# score on the XGBoost test split as the cell output.
# NOTE(review): the metric behind .score() depends on what create_model
# returns — presumably R^2 for a regressor; confirm.
xgboost.fit(X_train_xgboost, y_train_xgboost)
xgboost.score(X_test_xgboost, y_test_xgboost)

In [None]:
def model_predicting_method(m, d):
    """Return the predictions of model ``m`` for the data ``d``.

    Thin adapter with the ``(model, data)`` call signature; in this notebook
    it is handed to ``dx.Explainer`` as ``predict_function``.

    Parameters
    ----------
    m : object
        Any object exposing a ``predict(data)`` method.
    d : object
        Data forwarded unchanged to ``m.predict``.

    Returns
    -------
    Whatever ``m.predict(d)`` returns.
    """
    return m.predict(d)

In [None]:
# dalex explainers: each wraps a fitted model together with its training split.
explainer_svm = dx.Explainer(
    svm,
    X_train_svm,
    y_train_svm,
    predict_function=model_predicting_method,
    label="SVM_house_price",
)
explainer_xgboost = dx.Explainer(
    xgboost,
    X_train_xgboost,
    y_train_xgboost,
    predict_function=model_predicting_method,
    label="XGBoost_house_price",
)

# shap KernelExplainer counterparts, built from each model's predict method
# and the same training split.
kernel_explainer_svm = shap.KernelExplainer(svm.predict, X_train_svm)
kernel_explainer_xgboost = shap.KernelExplainer(xgboost.predict, X_train_xgboost)

In [None]:
# Dimensions of the experiment; they define the shape of every
# (N, nb_of_test_samples, nb_of_features) contributions array below.
N = 500                               # repetitions of each explanation
nb_of_test_samples = 20               # leading test rows that get explained
nb_of_features = X_test_svm.shape[1]  # one entry per feature column

def get_explanatinos(explainer, contributions, n_samples, model, is_dalex=True,
                     *, X_test=None, verbose=True):
    """Fill ``contributions`` in place with repeated Shapley-value estimates.

    For every repetition ``i`` and test row ``k`` the explainer is asked to
    explain the single-row frame ``X_test.iloc[k:k+1]`` and the resulting
    per-feature values are stored in ``contributions[i, k, :]``.

    Parameters
    ----------
    explainer : object
        With ``is_dalex=True`` it must provide
        ``predict_parts(row, type="unbiased_kernel_shap", n_samples=...)``
        whose ``.result['contribution']`` holds the values; otherwise it must
        provide ``shap_values(row, nsamples=...)``.
    contributions : numpy.ndarray
        Pre-allocated array of shape ``(repetitions, test rows, features)``.
        Its first two dimensions define how many repetitions and test rows
        are processed; it is modified in place.
    n_samples : int
        Sampling budget forwarded to the explainer.
    model : str
        ``"svm"`` selects the global ``X_test_svm``; any other value selects
        the global ``X_test_xgboost``. Ignored when ``X_test`` is given.
    is_dalex : bool, default True
        Chooses between the ``predict_parts`` and ``shap_values`` code paths.
    X_test : pandas.DataFrame, optional
        Explicit test frame to explain instead of the notebook globals.
    verbose : bool, default True
        Print the ``(repetition, row)`` pair before each explanation.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``contributions`` asks for more test rows than ``X_test`` has.
    """
    if X_test is None:
        # Fall back to the notebook-level test split of the requested model.
        X_test = X_test_svm if model == "svm" else X_test_xgboost

    if is_dalex:
        def estimate(row):
            explanation = explainer.predict_parts(row, type="unbiased_kernel_shap",
                                                  n_samples=n_samples)
            return explanation.result['contribution'].to_numpy()
    else:
        def estimate(row):
            # Flatten so a (1, n_features) result fits the 1-D target slot.
            return np.ravel(explainer.shap_values(row, nsamples=n_samples))

    # Take the loop bounds from the array being filled rather than from
    # globals, so the function cannot get out of sync with its output buffer.
    n_repeats, n_rows = contributions.shape[:2]
    if n_rows > len(X_test):
        raise ValueError(
            "contributions expects {} test rows but X_test has only {}".format(
                n_rows, len(X_test)))

    for i in range(n_repeats):
        for k in range(n_rows):
            if verbose:
                print(i, k)
            contributions[i, k, :] = estimate(X_test.iloc[k:k + 1])

In [None]:
# dalex "unbiased_kernel_shap" estimates for XGBoost; the buffer is filled in
# place, with 380 forwarded as the explainer's n_samples budget.
contributions_xgboost = np.zeros(shape=(N, nb_of_test_samples, nb_of_features))
get_explanatinos(
    explainer=explainer_xgboost,
    contributions=contributions_xgboost,
    n_samples=380,
    model="xgboost",
)

In [None]:
# shap KernelExplainer estimates for XGBoost; the buffer is filled in place,
# with 20 forwarded as shap_values' nsamples budget.
kernel_contributions_xgboost = np.zeros(shape=(N, nb_of_test_samples, nb_of_features))
get_explanatinos(
    explainer=kernel_explainer_xgboost,
    contributions=kernel_contributions_xgboost,
    n_samples=20,
    model="xgboost",
    is_dalex=False,
)

In [None]:
# dalex "unbiased_kernel_shap" estimates for the SVM; the buffer is filled in
# place, with 165 forwarded as the explainer's n_samples budget.
contributions_svm = np.zeros(shape=(N, nb_of_test_samples, nb_of_features))
get_explanatinos(
    explainer=explainer_svm,
    contributions=contributions_svm,
    n_samples=165,
    model="svm",
)

In [None]:
# shap KernelExplainer estimates for the SVM; the buffer is filled in place,
# with 50 forwarded as shap_values' nsamples budget.
kernel_contributions_svm = np.zeros(shape=(N, nb_of_test_samples, nb_of_features))
get_explanatinos(
    explainer=kernel_explainer_svm,
    contributions=kernel_contributions_svm,
    n_samples=50,
    model="svm",
    is_dalex=False,
)

In [None]:
# Standard deviation over the repetitions (axis 0), then the mean over the
# test samples, leaving one value per feature for the SVM / dalex estimates.
std_svm = contributions_svm.std(axis=0)
std_svm_mean = std_svm.mean(axis=0)
std_svm_mean

In [None]:
# Standard deviation over the repetitions (axis 0), then the mean over the
# test samples, leaving one value per feature for the SVM / shap estimates.
std_svm_kernel = kernel_contributions_svm.std(axis=0)
std_svm_kernel_mean = std_svm_kernel.mean(axis=0)
std_svm_kernel_mean

In [None]:
# Standard deviation over the repetitions (axis 0), then the mean over the
# test samples, leaving one value per feature for the XGBoost / dalex estimates.
std_xgboost = contributions_xgboost.std(axis=0)
std_xgboost_mean = std_xgboost.mean(axis=0)
std_xgboost_mean

In [None]:
# Standard deviation over the repetitions (axis 0), then the mean over the
# test samples, leaving one value per feature for the XGBoost / shap estimates.
std_xgboost_kernel = kernel_contributions_xgboost.std(axis=0)
std_xgboost_kernel_mean = std_xgboost_kernel.mean(axis=0)
std_xgboost_kernel_mean

In [None]:
# Grouped bar chart: per-feature mean standard deviation of the two XGBoost
# estimators, side by side.

# Tick labels come from the XGBoost test frame because the plotted values were
# computed from XGBoost explanations; the feature count is derived rather than
# hard-coded so the x positions and title follow the data.
feature_names = list(X_test_xgboost.columns)
n_features = len(feature_names)
x_axis = np.arange(n_features)
width = 0.4

fig, ax = plt.subplots(figsize=(30, 10))
ax.bar(x_axis - width / 2, std_xgboost_mean, width, color='tab:red', label='unbiased kernel shap')
ax.bar(x_axis + width / 2, std_xgboost_kernel_mean, width, color='tab:blue', label='kernel shap')
ax.set_title(
    'Mean standard deviation of estimated shapley values for xgboost per feature '
    'for {} feature regression'.format(n_features),
    fontsize=30,
)
ax.set_xlabel("")
ax.set_xticks(x_axis)
ax.set_xticklabels(feature_names, fontsize=20)
ax.tick_params(axis='y', labelsize=20)
ax.grid(False)
ax.tick_params(bottom=False, left=True)
# Trailing semicolon keeps the Legend repr out of the cell output.
ax.legend(frameon=False, fontsize=20);

In [None]:
def save_parquet(array, name, feature_names=None):
    """Flatten a 3-D array of estimates into a table and write it as parquet.

    ``array`` is indexed as ``[repetition, test sample, feature]``. Every
    ``(repetition, test sample)`` pair becomes one row that holds the feature
    values followed by the two indices, stored in the columns ``'N'`` and
    ``'test_sample_idx'``. Rows are ordered by repetition first, then by test
    sample.

    The table is assembled in one vectorised step instead of appending one
    row at a time: ``DataFrame.append`` no longer exists in pandas 2.x and
    growing a frame inside a loop is quadratic.

    Parameters
    ----------
    array : array-like, 3-dimensional
        Estimates with shape ``(repetitions, test samples, features)``.
    name : str or path-like
        Destination forwarded to ``DataFrame.to_parquet``.
    feature_names : sequence of str, optional
        Names of the feature columns. Defaults to the columns of the global
        ``X_test_xgboost`` frame.

    Raises
    ------
    ValueError
        If ``array`` is not 3-dimensional, or if the number of features does
        not match the number of feature names.
    """
    if feature_names is None:
        feature_names = X_test_xgboost.columns
    columns = list(feature_names) + ['N', 'test_sample_idx']

    values = np.asarray(array)
    if values.ndim != 3:
        raise ValueError(
            "expected a 3-dimensional array, got {} dimension(s)".format(values.ndim))

    n_repeats, n_rows, n_features = values.shape
    # Index grids in C order line up with the row-major reshape below.
    repeat_idx, row_idx = np.indices((n_repeats, n_rows))
    table = np.column_stack((
        values.reshape(n_repeats * n_rows, n_features),
        repeat_idx.ravel(),
        row_idx.ravel(),
    ))

    df = pd.DataFrame(table, columns=columns)
    df.to_parquet(name)

In [None]:
# Write every contributions array to its own parquet file, in a fixed order.
for _estimates, _target in (
    (kernel_contributions_svm, "../estimates/house_price_kernel_svm_compare.parquet"),
    (contributions_svm, "../estimates/house_price_uks_svm_compare.parquet"),
    (kernel_contributions_xgboost, "../estimates/house_price_kernel_xgboost_compare.parquet"),
    (contributions_xgboost, "../estimates/house_price_uks_xgboost_compare.parquet"),
):
    save_parquet(_estimates, _target)