In [1]:
%run models.ipynb
%run datasets.ipynb

In [2]:
from pathlib import Path

import dalex as dx
import numpy as np
import pandas as pd
import shap
from matplotlib import pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Instantiate the two models compared in this notebook.
# `create_model` comes from one of the notebooks executed with %run above.
svm = create_model("svm", kernel="linear")
xgboost = create_model("xgboost")

## Szacowanie ceny mieszkań

In [4]:
# Load the house-price dataset split; train_size is 300 (the original spelled it as 299+1).
X_train, X_test, y_train, y_test = get_test_train("house_price", train_size=300)

In [None]:
def model_predicting_method(m, d):
    """Return the predictions of model ``m`` for the data ``d``.

    Passed to ``dx.Explainer`` as its ``predict_function``.
    """
    return m.predict(d)

In [None]:
# Fit the SVM on the training split, then evaluate it on the held-out split.
# The trailing expression is what the cell displays.
# NOTE(review): the metric returned by `score` depends on what `create_model` builds — confirm.
svm.fit(X_train, y_train)
svm.score(X_test, y_test)

In [None]:
# Fit the XGBoost model on the training split, then evaluate it on the held-out split.
# The trailing expression is what the cell displays.
# NOTE(review): the metric returned by `score` depends on what `create_model` builds — confirm.
xgboost.fit(X_train, y_train)
xgboost.score(X_test, y_test)

In [None]:
# Wrap each model in a dalex explainer built on the training data.
# Both explainers share the same predict function and differ only in model and label.
explainer_svm = dx.Explainer(
    svm,
    X_train,
    y_train,
    predict_function=model_predicting_method,
    label="SVM_house_price",
)
explainer_xgboost = dx.Explainer(
    xgboost,
    X_train,
    y_train,
    predict_function=model_predicting_method,
    label="XGBoost_house_price",
)

In [None]:
# --- Experiment configuration -------------------------------------------------
N = 500  # number of repetitions of every estimate
nb_of_features = len(X_test.columns)

nb_of_columns_sampled = [30, 70, 100, 150, 200, 300, 350, 400, 450, 500]  # values passed as n_samples
nb_of_test_samples = 5  # the first rows of X_test are the observations being explained

# Result arrays indexed as [n_samples setting, repetition, test observation, feature].
contributions_xgboost = np.zeros((len(nb_of_columns_sampled), N, nb_of_test_samples, nb_of_features))
contributions_svm = np.zeros((len(nb_of_columns_sampled), N, nb_of_test_samples, nb_of_features))

# Slice the explained observations once instead of on every inner iteration.
test_rows = [X_test.iloc[k:k+1] for k in range(nb_of_test_samples)]
explainer_runs = [
    (explainer_xgboost, contributions_xgboost),
    (explainer_svm, contributions_svm),
]

# --- Estimation ----------------------------------------------------------------
for j, samples in enumerate(nb_of_columns_sampled):
    # One progress line per sampling budget; printing every (j, i, k) triple
    # would flood the notebook with tens of thousands of lines.
    print(f"n_samples={samples} ({j + 1}/{len(nb_of_columns_sampled)})")

    for i in range(N):
        for k, test_row in enumerate(test_rows):
            for explainer, contributions in explainer_runs:
                explain = explainer.predict_parts(test_row, type="unbiased_kernel_shap", n_samples=samples)

                # NOTE(review): assumes the rows of explain.result are in X_test
                # column order, one row per feature — confirm for this explanation type.
                contributions[j, i, k, :] = explain.result['contribution'].to_numpy()

In [None]:
# Average the estimates over the repetition axis (axis 1), per model.
avg_xgboost = contributions_xgboost.mean(axis=1)
avg_svm = contributions_svm.mean(axis=1)

# Standard deviation over repetitions for test observation 0 and feature 0,
# giving one value per n_samples setting.
std_xgboost_0_0 = contributions_xgboost[:, :, 0, 0].std(axis=1)
std_svm_0_0 = contributions_svm[:, :, 0, 0].std(axis=1)

In [None]:
# Reference explainers used as the ground truth for the estimates above.
exact_explainer_svm = shap.explainers.Exact(svm.predict, X_train)
exact_explainer_xgboost = shap.explainers.Exact(xgboost.predict, X_train)

# exact_shap_result[0] holds the SVM values, exact_shap_result[1] the XGBoost values;
# each entry has one row per explained observation and one column per feature.
exact_shap_result = []
for exact_explainer in (exact_explainer_svm, exact_explainer_xgboost):
    values_per_observation = np.zeros((nb_of_test_samples, nb_of_features))
    for row_idx in range(nb_of_test_samples):
        observation = X_test.iloc[row_idx:row_idx + 1]
        values_per_observation[row_idx, :] = exact_explainer(observation).values

    exact_shap_result.append(values_per_observation)

In [None]:
# Relative error per (n_samples setting, test observation): the summed absolute
# deviation of the averaged estimate from the exact values, divided by the
# absolute sum of the exact values for that observation.
svm_abs_deviation = np.abs(avg_svm - exact_shap_result[0]).sum(axis=2)
svm_error = svm_abs_deviation / np.abs(exact_shap_result[0].sum(axis=1))
mean_svm_error = svm_error.mean(axis=1)
std_svm_error = svm_error.std(axis=1)

xgboost_abs_deviation = np.abs(avg_xgboost - exact_shap_result[1]).sum(axis=2)
xgboost_error = xgboost_abs_deviation / np.abs(exact_shap_result[1].sum(axis=1))
mean_xgboost_error = xgboost_error.mean(axis=1)
std_xgboost_error = xgboost_error.std(axis=1)

In [None]:
# Mean relative error of the SVM estimates per n_samples setting, with the
# standard deviation across test observations drawn as error bars.
fig, ax = plt.subplots(figsize=(10,5))

ax.bar(nb_of_columns_sampled, mean_svm_error)
ax.errorbar(nb_of_columns_sampled, mean_svm_error, yerr=std_svm_error, fmt="o")
# The data plotted here is the SVM error, so the title must say SVM.
ax.set_title('Mean unbiased kernel shap error for SVM', fontsize=20)
ax.set_xlabel('Number of n_samples', fontsize=15)
ax.set_ylabel('Mean relative error', fontsize=15)
ax.tick_params(axis='both', labelsize=13)
ax.grid(False)
ax.tick_params(bottom=False, left=True)

In [None]:
# Mean relative error of the XGBoost estimates per n_samples setting, with the
# standard deviation across test observations drawn as error bars.
fig, ax = plt.subplots(figsize=(10,5))

ax.scatter(nb_of_columns_sampled, mean_xgboost_error)
ax.errorbar(nb_of_columns_sampled, mean_xgboost_error, yerr=std_xgboost_error, fmt="o")
# The data plotted here is the XGBoost error, so the title must say xgboost.
ax.set_title('Mean unbiased kernel shap error for xgboost', fontsize=20)
ax.set_xlabel('Number of n_samples', fontsize=15)
ax.set_ylabel('Mean relative error', fontsize=15)
ax.tick_params(axis='both', labelsize=13)
ax.grid(False)
ax.tick_params(bottom=False, left=True)

In [None]:
# Side-by-side bars comparing, per n_samples setting, the standard deviation of
# the estimate for test observation 0 / feature 0 between the two models.
fig, ax = plt.subplots(figsize=(30,10))

bar_width = 10
sample_positions = np.asarray(nb_of_columns_sampled)
# SVM bars sit half a bar to the left of each setting, XGBoost bars half a bar to the right.
plt.bar(sample_positions - bar_width / 2, std_svm_0_0, bar_width, color='tab:red', label="SVM")
plt.bar(sample_positions + bar_width / 2, std_xgboost_0_0, bar_width, color='tab:blue', label="xgboost")
plt.title("Standard deviation for random sample for 'bedrooms' input", fontsize=30)
plt.xlabel('Number of n_samples', fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
ax.tick_params(bottom=False, left=True)
ax.grid(False)

plt.legend(frameon=False, fontsize=20)

In [None]:
# Flatten the SVM contributions array into one row per
# (n_samples setting, repetition, test observation).
#
# The frame is built in a single step: DataFrame.append was removed in
# pandas 2.0, and growing a frame row by row is quadratic. Avoiding the nested
# loops also stops the loop variable from overwriting the global repetition count N.
svm_feature_values = contributions_svm.reshape(-1, contributions_svm.shape[-1])
# Index triples in the same C order as the reshape above.
# Columns: setting index, repetition index, test observation index.
svm_index_columns = np.indices(contributions_svm.shape[:3]).reshape(3, -1).T

# 'n_samples' stores the index into nb_of_columns_sampled (not the n_samples value itself),
# matching the layout written previously; hstack stores the indices as floats.
df_svm = pd.DataFrame(
    np.hstack([svm_feature_values, svm_index_columns]),
    columns=list(X_test.columns) + ['n_samples', 'N', 'test_sample_idx'],
)

In [None]:
# Flatten the XGBoost contributions array into one row per
# (n_samples setting, repetition, test observation).
#
# The frame is built in a single step: DataFrame.append was removed in
# pandas 2.0, and growing a frame row by row is quadratic. Avoiding the nested
# loops also stops the loop variable from overwriting the global repetition count N.
xgboost_feature_values = contributions_xgboost.reshape(-1, contributions_xgboost.shape[-1])
# Index triples in the same C order as the reshape above.
# Columns: setting index, repetition index, test observation index.
xgboost_index_columns = np.indices(contributions_xgboost.shape[:3]).reshape(3, -1).T

# 'n_samples' stores the index into nb_of_columns_sampled (not the n_samples value itself),
# matching the layout written previously; hstack stores the indices as floats.
df_xgboost = pd.DataFrame(
    np.hstack([xgboost_feature_values, xgboost_index_columns]),
    columns=list(X_test.columns) + ['n_samples', 'N', 'test_sample_idx'],
)

In [None]:
# Persist the XGBoost estimates. The output directory is created first so a
# missing folder cannot make the write fail after the long-running estimation.
xgboost_output_path = Path("../estimates/house_price_xgboost_uks.parquet")
xgboost_output_path.parent.mkdir(parents=True, exist_ok=True)
df_xgboost.to_parquet(xgboost_output_path)

In [None]:
# Persist the SVM estimates. The output directory is created first so a
# missing folder cannot make the write fail after the long-running estimation.
svm_output_path = Path("../estimates/house_price_svm_uks.parquet")
svm_output_path.parent.mkdir(parents=True, exist_ok=True)
df_svm.to_parquet(svm_output_path)