# Graduates Admission - Explainability

In this notebook, we will try to explain the model itself as well as the predictions given by the model.


## Imports

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns


## Global Explanations

As we used a linear regression model, plotting its feature importance is very easy.

#### Load data

In [None]:
# Read the admission dataset from its CSV file and preview the first rows.
admission_csv = './data/admission_data-v2.csv'
data = pd.read_csv(admission_csv)
data.head()


### Linear Regression model from Model Registry

In [None]:
import warnings

import mlflow

# Suppress every warning category from this point on.
warnings.filterwarnings("ignore")

# Registry coordinates of the model to explain. `version` is not part of the
# URI built below; the model is addressed by name and stage only.
model_name = 'Graduate Admission Predictor'
version = 1
stage = 'production'

# Point the MLflow client at the tracking server.
mlflow.set_tracking_uri('http://localhost:5000')

# Load model
model_uri = 'models:/{}/{}'.format(model_name, stage)
model_from_registry = mlflow.sklearn.load_model(model_uri)
model_from_registry


#### Feature importance

In [None]:
# Coefficients of the last step of the registered model (indexed with [-1];
# presumably the estimator at the end of an sklearn Pipeline - confirm).
coefficients = model_from_registry[-1].coef_

# One bar per input column, i.e. every column except the target.
feature_columns = data.drop(columns=['Chance of Admit']).columns

# Explicit figure/axes interface so the plot carries its own title and labels
# and can be read without the surrounding notebook text.
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(feature_columns, coefficients)
ax.set_title('Linear regression coefficients per feature')
ax.set_xlabel('Feature')
ax.set_ylabel('Coefficient')
ax.tick_params(axis='x', rotation=45)
plt.show()


### Random Forest model

#### Train the model

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Encode Gender as M -> 1, F -> 0 on a new frame, leaving `data` untouched.
# The encoded column is assigned explicitly instead of calling an in-place
# replace on a column selection, which does not write back to the frame under
# pandas Copy-on-Write. `pd.to_numeric` guarantees a numeric dtype whether or
# not `replace` downcasts the object column itself.
data_for_shap = data.assign(
    Gender=pd.to_numeric(data['Gender'].replace({'M': 1, 'F': 0}))
)

# Features are every column except the target.
X = data_for_shap.drop(columns=['Chance of Admit'])
y = data_for_shap['Chance of Admit']

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Seed the forest as well, so the reported error and the explanations computed
# from `rf` in later cells are the same on every run.
rf = RandomForestRegressor(random_state=1).fit(X_train, y_train)

rmse = np.sqrt(np.mean((rf.predict(X_test) - y_test) ** 2))
print(f"Root mean squared test error = {rmse}")


#### Get global explanation

In [None]:
import shap

shap.initjs()

# Summarise the training set into 10 k-means centroids and use that summary as
# the background dataset for the kernel explainer, as the Keras cell does.
X_train_summary = shap.kmeans(X_train, 10)
rf_ex = shap.KernelExplainer(rf.predict, X_train_summary)

# SHAP values for every test row, then the global summary plot.
rf_shap_values = rf_ex.shap_values(X_test)
shap.summary_plot(rf_shap_values, X_test)


### Deep learning model

Install TensorFlow by running the following command at the conda prompt:

*pip install tensorflow*

Restart the kernel before running the following cell.

In [None]:
import warnings

from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import MeanSquaredError as mse_loss
from tensorflow.keras.metrics import MeanSquaredError as mse_metric
from tensorflow.keras.models import Sequential

warnings.filterwarnings('ignore')

# define the keras model: a stack of fully connected layers (12-10-8-6 ReLU
# units) ending in a single linear output unit
n_features = X_train.shape[1]
keras_model = Sequential([
    Dense(12, input_shape=(n_features,), activation='relu'),
    Dense(10, activation='relu'),
    Dense(8, activation='relu'),
    Dense(6, activation='relu'),
    Dense(1, activation='linear'),
])

# Mean squared error is used both as the loss and as the reported metric.
keras_model.compile(loss=mse_loss(), optimizer='adam', metrics=[mse_metric()])
keras_model.fit(X_train, y_train, epochs=10, batch_size=10)


In [None]:
import shap

shap.initjs()

# Summarise the training set into 10 k-means centroids for use as background.
X_train_summary = shap.kmeans(X_train, 10)
ex = shap.KernelExplainer(keras_model, X_train_summary)
shap_values = ex.shap_values(X_test)

# Pick the SHAP values of the model's single output. Depending on the SHAP
# version, `shap_values` is either a list with one (n_samples, n_features)
# array per output or a single array with a trailing output axis; indexing
# `[0]` on the latter would select the first sample, not the first output.
if isinstance(shap_values, list):
    output_shap_values = shap_values[0]
else:
    output_shap_values = np.asarray(shap_values)
    if output_shap_values.ndim == 3:
        output_shap_values = output_shap_values[:, :, 0]

shap.summary_plot(output_shap_values, plot_type='bar', feature_names=X_train.columns)


## Local explanation with LIME

In [None]:
import lime
from lime.lime_tabular import LimeTabularExplainer
import warnings

# Gender is encoded as M -> 1, F -> 0 for both the training and production data.
gender_encoding = {'M': 1, 'F': 0}

# Build the encoded frame with an explicit column assignment: an in-place
# replace on a column selection does not write back to the frame under pandas
# Copy-on-Write. `pd.to_numeric` guarantees a numeric dtype either way.
data_for_lime = data.assign(
    Gender=pd.to_numeric(data['Gender'].replace(gender_encoding))
)
X = data_for_lime.drop(columns=['Chance of Admit'])
y = data_for_lime['Chance of Admit']

# Categorical column
gender_feature_index = X.columns.get_loc('Gender')
gender_value_mapping = {0: 'F', 1: 'M'}
feature_names = list(X.columns.values)

_le = LimeTabularExplainer(X.values,
                           mode='regression',
                           feature_names=feature_names,
                           categorical_features=[gender_feature_index],
                           categorical_names={gender_feature_index: gender_value_mapping},
                           class_names=['Chance of Admit'],
                           kernel_width=3,
                           feature_selection='highest_weights')

# Explain the last two rows of the production data with the random forest.
testing_data = pd.read_csv('./data/admission_data-production.csv')
testing_data['Gender'] = pd.to_numeric(testing_data['Gender'].replace(gender_encoding))
for _, row in testing_data.tail(2).iterrows():
    # LIME indexes the instance by position, so hand it a plain float array
    # rather than a label-indexed Series; the same array feeds the prediction.
    instance = row.to_numpy(dtype=float)
    exp = _le.explain_instance(instance, rf.predict, num_features=3)
    print('-' * 100)
    print(row)
    print(f'Chance of Admit: {rf.predict([instance])[0]}')
    print(exp.as_list())


## Explainability using alibi

Let's use the Random Forest model that we trained earlier.

Install alibi (with its TensorFlow extras) by running the following command at the conda prompt:

*pip install alibi[tensorflow]==0.7.0*

**Note**: This command downloads about 540 MB of libraries and takes a significant amount of time to install.

Restart the kernel before running the following cell.

#### Accumulated Local Effects

In [None]:
from alibi.explainers import ALE, plot_ale

# Compute Accumulated Local Effects of the random forest over all rows of X.
ale_feature_names = list(X.columns.values)
ale = ALE(rf.predict, feature_names=ale_feature_names)
exp = ale.explain(X.values)

# Size the two-column grid from the number of features instead of hard-coding
# 4 x 2 panels, so the cell still works if the feature set changes. With eight
# features this gives the same 4 x 2 grid and 15 x 14 figure as before.
n_cols = 2
n_rows = (len(ale_feature_names) + n_cols - 1) // n_cols
fig, axes_grid = plt.subplots(nrows=n_rows, ncols=n_cols,
                              figsize=(15, 3.5 * n_rows), squeeze=False)
axes = axes_grid.ravel()

# One ALE panel per feature.
for i, f in enumerate(ale_feature_names):
    plot_ale(exp, features=[f], line_kw={'label': 'Chance of Admit'}, ax=axes[i])
    axes[i].set_title(f'ALE plot for feature - {f}')

# Hide the spare panel left over when the feature count is odd.
for unused_ax in axes[len(ale_feature_names):]:
    unused_ax.set_visible(False)

plt.tight_layout(h_pad=4.0, w_pad=4.0)
plt.show()