In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import os
import sys
import altair as alt
import xgboost as xgb
import shap
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

sys.path.insert(0, '../src/visualization/')
import visualize as vis

In [None]:
# Read the processed CSV (path is relative to the notebook's directory) into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='../data/processed/CriticalPath_Data_EM_Confidential_lessNoise.csv',
)

## XGBoost Model to see which features most impact enrollment.

In [None]:
# Feature matrix: every float/bool/int column except the target ('Enrolled'),
# 'Admission_status' and the 'Unnamed: 0' column; missing values are replaced
# by the sentinel -999.
X = (
    df
    .drop(columns=['Enrolled', 'Admission_status', 'Unnamed: 0'])
    .select_dtypes(include=[float, bool, int])
    .fillna(value=-999)
)

# Target vector, with missing values replaced by the same -999 sentinel.
# NOTE(review): any row with a missing 'Enrolled' value therefore carries the
# label -999 when Y is passed to the classifier — confirm this is intended.
Y = df['Enrolled'].fillna(value=-999)

In [None]:
# Fit an XGBoost classifier with default hyperparameters on the full dataset
# (no train/test split is made in this cell).
model = xgb.XGBClassifier()
model.fit(X=X, y=Y)

# Print the fitted model's feature-importance array.
print(model.feature_importances_)

## Plot feature importance

In [None]:
# `vis` is the project module imported from ../src/visualization/ (visualize.py).
# NOTE(review): my_plot_importance presumably draws the fitted model's feature
# importances at the given figure size — confirm against the helper's source.
# The trailing ';' suppresses the cell's textual repr output.
vis.my_plot_importance(model,figsize=(6,10));

## Plot Shapley values.

In [None]:
# Load SHAP's JavaScript so interactive plots can render in the notebook.
shap.initjs()

# Explain the model's predictions using SHAP values.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

# The return type of `shap_values` depends on the model and the SHAP version:
#   * a list of (n_samples, n_features) arrays, one per model output;
#   * a single 2-D array for single-output (e.g. binary) models;
#   * a single 3-D (n_samples, n_features, n_outputs) array for multi-output
#     models on newer SHAP releases.
# Normalise to the list form so that `shap_values[0]` in later cells is always
# a full (n_samples, n_features) matrix rather than the first sample's row.
if not isinstance(shap_values, list):
    shap_values = np.asarray(shap_values)
    if shap_values.ndim == 3:
        shap_values = [shap_values[:, :, k] for k in range(shap_values.shape[2])]
    else:
        shap_values = [shap_values]

In [None]:
# Dot summary of the 10 highest-ranked features, using the first entry of
# `shap_values`.
# NOTE(review): confirm `shap_values` is a per-output sequence here, so that
# [0] yields an (n_samples, n_features) matrix.
shap.summary_plot(
    shap_values[0],
    features=X,
    max_display=10,
    plot_type='dot',
)

## Plot this as a bar chart.

In [None]:
# Bar-chart summary of the 10 highest-ranked features, using the first entry
# of `shap_values`.
# NOTE(review): confirm `shap_values` is a per-output sequence here, so that
# [0] yields an (n_samples, n_features) matrix.
shap.summary_plot(
    shap_values[0],
    features=X,
    max_display=10,
    plot_type='bar',
)

## Individual features.

#### X axis is feature value
#### Y axis is the associated Shapley value (output impact)

#### Color (red/blue) shows the value of a second feature that may interact with the plotted feature

In [None]:
# Draw one SHAP dependence plot for each of the first five feature columns
# (in the column order of X), printing the column name before each plot.
for feature in X.columns[:5].tolist():
    print(feature)
    shap.dependence_plot(feature, shap_values[0], X)