# Explaining Model Behaviour and Feature Importance

In [1]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import shap
import joblib

from sklearn.svm import SVC
from time import time
import os
os.chdir("/home/ronja/GDELT_GKG")

import warnings
warnings.filterwarnings('ignore')

In [2]:
# data
# Suffix selecting which variant of the train/val/test CSV files is read.
dataset_extension = "_mbfc_allbias_extrafeatures"

# Read the three splits; every frame is indexed by its "outlet" column so
# that rows can later be looked up by outlet name.
train = pd.read_csv("data/train{}.csv".format(dataset_extension)).set_index("outlet")
val = pd.read_csv("data/val{}.csv".format(dataset_extension)).set_index("outlet")
test = pd.read_csv("data/test{}.csv".format(dataset_extension)).set_index("outlet")

# split each dataset into X (every column except "lean") and y (the "lean" column)
X_train, y_train = train.drop(columns="lean"), train["lean"]
X_val, y_val = val.drop(columns="lean"), val["lean"]
X_test, y_test = test.drop(columns="lean"), test["lean"]

# combine all sets so we can visualise the explanation for ANY outlet
X = pd.concat([X_train, X_val, X_test])
y = pd.concat([y_train, y_val, y_test])

In [None]:
# get a model: load the saved model object from disk with joblib
model_path = "results/best_SVC_model.sav"
model = joblib.load(model_path)

## Part 1. SHAP

Decision plots: https://shap.readthedocs.io/en/latest/example_notebooks/api_examples/plots/decision_plot.html?highlight=force#Show-a-large-number-of-feature-effects-clearly

Heatmap plots: https://shap.readthedocs.io/en/latest/example_notebooks/api_examples/plots/heatmap.html

Beeswarm plots (violin): https://shap.readthedocs.io/en/latest/example_notebooks/api_examples/plots/beeswarm.html

In [None]:
# Initialize shap's JavaScript support for visualizing outputs in the notebook.
# NOTE(review): presumably only the interactive plot types need this; the
# plots in this notebook are drawn via matplotlib — confirm before removing.
shap.initjs()

In [None]:
# Build the explainer once, up front, so the per-outlet plotting calls can
# reuse it instead of constructing a new one each time.
# It wraps model.predict and uses the training set (X_train) as its data.
# NOTE(review): KernelExplainer is given the full X_train here; if explaining
# is slow, check the shap docs on summarising the background data — confirm.
explainer = shap.KernelExplainer(model.predict, X_train)

# helper that draws the SHAP decision plot of one single outlet
def decision_plot_for_outlet(outlet_name, model, explainer, title):
    """Print an outlet's true and predicted leaning and draw its decision plot.

    Reads the module-level ``X`` (features) and ``y`` (labels), both looked
    up by outlet name.

    Parameters
    ----------
    outlet_name
        Index label of the outlet's row in ``X`` and ``y``.
    model
        Object whose ``predict`` is called on the outlet's feature row.
    explainer
        Object providing ``shap_values`` and ``expected_value``.
    title
        Text placed above the plot.
    """
    # class code -> human readable leaning
    leaning_names = {0:"Left", 1:"Left lean",2:"Least biased", 3:"Right lean",4:"Right"}

    print("The outlet is: ", outlet_name)
    true_leaning = y.loc[outlet_name]
    print("Political leaning is: ", leaning_names[true_leaning])

    # The same feature row feeds the prediction, the SHAP values and the plot.
    outlet_row = X.loc[outlet_name, :]

    # predict expects a 2-D input, hence the reshape to a single-row array
    predicted_leaning = model.predict(outlet_row.values.reshape(1, -1))[0]
    print("Political leaning predicted as: ", leaning_names[predicted_leaning])

    # explain the outlet in question
    outlet_shap_values = explainer.shap_values(outlet_row)

    # show=0 keeps the figure open so the title can be added before showing it
    shap.decision_plot(
        explainer.expected_value,
        outlet_shap_values,
        outlet_row,
        show=0,
    )
    plt.title(title)
    plt.show()

### Get decision plot for a single outlet

In [None]:
# examples to examine
# (the bare strings below are no-op expressions that only serve as a list of
#  candidate outlet names; the trailing comments give the split they are in)
"cnn.com" # test
"foxnews.com" # test 
"breitbart.com" # test
"theguardian.com" # test
"dailymail.co.uk" # test

decision_plot_for_outlet(
    "cnn.com",
    model=model,
    explainer=explainer,
    title="Decision Plot for CNN",
)

In [None]:
# decision plot for Breitbart, using the shared model and explainer
decision_plot_for_outlet(
    outlet_name="breitbart.com",
    model=model,
    explainer=explainer,
    title="Decision Plot for Breitbart",
)

In [None]:
# decision plot for the Guardian, using the shared model and explainer
decision_plot_for_outlet(
    outlet_name="theguardian.com",
    model=model,
    explainer=explainer,
    title="Decision Plot for the Guardian",
)

In [None]:
# decision plot for the Daily Mail, using the shared model and explainer
decision_plot_for_outlet(
    outlet_name="dailymail.co.uk",
    model=model,
    explainer=explainer,
    title="Decision Plot for Daily Mail",
)

In [None]:
#decision_plot_for_outlet("foxnews.com",model=model,explainer=explainer, title="Decision Plot for Fox News")

### Get decision plot for a class

In [None]:
# Row positions (within the combined y) of the outlets of each class.
# The class codes 0-4 follow the order of the names listed here.
class_names_in_code_order = ("left", "left lean", "least biased", "right lean", "right")
class_indexes = {
    class_name: np.where(y == class_code)[0]
    for class_code, class_name in enumerate(class_names_in_code_order)
}

# let's just get the first 10
selection = class_indexes["left lean"][:10]

In [None]:
def explain_multiple_outlets(selection, X, explainer,title=""):
    """Draw one SHAP decision plot for several outlets, highlighting misclassified ones.

    Relies on the module-level ``model`` (for predictions) and ``y`` (for the
    true labels).

    Parameters
    ----------
    selection
        Integer row positions into ``X`` (and the module-level ``y``) of the
        outlets to explain.
    X
        Feature table the positions refer to.
    explainer
        Object providing ``shap_values`` and ``expected_value``.
    title
        Text placed above the plot.

    Returns
    -------
    The element-wise result of comparing the predictions with the true
    labels: True where the predicted label differs from the label in ``y``.
    """
    # slice once and reuse the same rows for SHAP values, prediction and plot
    selected_rows = X.iloc[selection, :]
    shap_values = explainer.shap_values(selected_rows)

    y_pred = model.predict(selected_rows)
    # `selection` holds row positions, so the true labels have to be taken
    # with .iloc. Plain y[selection] on the outlet-name index only works via
    # pandas' deprecated "integer keys as positions" fallback.
    misclassified = y_pred != y.iloc[selection]

    # show=0 keeps the figure open so the title can be added before showing it
    shap.decision_plot(
        explainer.expected_value,
        shap_values,
        selected_rows,
        highlight=misclassified,
        show=0,
    )
    plt.title(title)
    plt.show()
    return misclassified

In [None]:
# first ten "left lean" outlets (row positions into X)
selection = class_indexes["left lean"][:10]
misclassified = explain_multiple_outlets(
    selection=selection,
    X=X,
    explainer=explainer,
    title="Feature Importance of Left Lean Outlets",
)


In [None]:
# first ten "least biased" outlets (row positions into X)
selection = class_indexes["least biased"][:10]
misclassified = explain_multiple_outlets(
    selection=selection,
    X=X,
    explainer=explainer,
    title="Feature Importance of Least Biased Outlets",
)


In [None]:
# first ten "right lean" outlets (row positions into X)
selection = class_indexes["right lean"][:10]
misclassified = explain_multiple_outlets(
    selection=selection,
    X=X,
    explainer=explainer,
    title="Feature Importance of Right Lean Outlets",
)


In [None]:
# first ten "right" outlets (row positions into X)
selection = class_indexes["right"][:10]
misclassified = explain_multiple_outlets(
    selection=selection,
    X=X,
    explainer=explainer,
    title="Feature Importance of Right Outlets",
)


In [None]:
# keep only the entries flagged True, i.e. the misclassified outlets of the
# most recent explain_multiple_outlets call
misclassified[misclassified]

In [None]:
# let's plot the one misclassified outlet specifically
# (hard-coded; it could instead be read from the result above with
#  misclassified[misclassified == True].index[0])
misclassified_outlet = "torontosun.com"
# the title previously read "Misclassidied" (typo)
decision_plot_for_outlet(
    misclassified_outlet,
    model=model,
    explainer=explainer,
    title="Misclassified",
)

### Make plot for all classes
https://github.com/Rakeshsuku/Medium-Blog/blob/master/Kernel_SHAP.ipynb

In [None]:
import warnings

warnings.filterwarnings('ignore')

# row positions (within the combined y) of the outlets of each class;
# class codes 0-4 follow the order of the names listed here
class_indexes = {
    class_name: np.where(y == class_code)[0]
    for class_code, class_name in enumerate(
        ("left", "left lean", "least biased", "right lean", "right")
    )
}

# small sample for the per-class plots: the first five outlets of every
# class, concatenated in class order
small_selection_of_all_classes = np.concatenate(
    [positions[:5] for positions in class_indexes.values()]
)

# this explainer wraps predict_proba (instead of predict), with X_train as data
class_explainer = shap.KernelExplainer(model.predict_proba, data=X_train)
# get shap values for the subset
shap_values = class_explainer.shap_values(X.iloc[small_selection_of_all_classes])

In [None]:
# get shap plot for all classes
# show=0 keeps the figure open so the title and x-limits can be set before showing
summary_plot_options = dict(
    shap_values=shap_values,
    features=X.iloc[small_selection_of_all_classes],
    class_names=["Left","L-lean","Center","R-lean","Right"],
    max_display=10,
    plot_size=0.3,
    show=0,
)
shap.summary_plot(**summary_plot_options)
plt.title("Feature Impacts per Political Bias Class")
plt.xlim(0, 0.15)
plt.show()

In [None]:
# get shap plot for all classes, this time requesting the "violin" plot type
# NOTE(review): shap_values comes from the predict_proba explainer; confirm
# that the installed shap version accepts plot_type="violin" for this kind of
# multi-class input.
# show=0 keeps the figure open so the title and x-limits can be set before showing
violin_plot_options = dict(
    shap_values=shap_values,
    features=X.iloc[small_selection_of_all_classes],
    class_names=["Left","L-lean","Center","R-lean","Right"],
    max_display=10,
    plot_size=0.3,
    plot_type="violin",
    show=0,
)
shap.summary_plot(**violin_plot_options)
plt.title("Feature Impacts per Political Bias Class")
plt.xlim(0, 0.15)
plt.show()

## Explainability with Feature Permutation

In [None]:
from sklearn.inspection import permutation_importance

# calculate permutation importance for test data
# (uses `model`, the estimator loaded earlier in this notebook; the name `clf`
#  used here before is never defined)
result_test = permutation_importance(
    model, X_test, y_test, n_repeats=20, random_state=42, n_jobs=2
)

# argsort is ascending: after reordering, the features with the highest mean
# importance sit in the LAST columns
sorted_importances_idx_test = result_test.importances_mean.argsort()

# one column per feature, one row per permutation repeat
importances_test = pd.DataFrame(
    result_test.importances[sorted_importances_idx_test].T,
    columns=X_test.columns[sorted_importances_idx_test],
)

# Plot the 10 most important features. With the ascending order above these
# are the last 10 columns (the first 10 would be the least important ones).
# Drawing on an explicit axes avoids the empty extra figure that plt.figure()
# leaves behind when pandas opens its own figure for the box plot.
fig, ax = plt.subplots()
importances_test.iloc[:, -10:].plot.box(vert=False, whis=10, ax=ax)
ax.set_title("Permutation Importances (test set)")
ax.axvline(x=0, color="k", linestyle="--")
ax.set_xlabel("Decrease in accuracy score")