# Plotting Results
This notebook contains the code used to generate plots for the technical report, including feature importance results and visualizations of model performance. 

In [2]:
import pandas as pd
import joblib
import altair as alt
import glob
import sys
import os
import numpy as np

sys.path.append(os.path.abspath(".."))
from src.models.feat_selection import ImportanceFeatureSelector

  from .autonotebook import tqdm as notebook_tqdm


## Feature Importance

### Permutation Importance

In [3]:
# Build a 2x2 grid of grouped bar charts (one panel per threshold) comparing the
# permutation feature importance of the three fitted models, with 95% CI whiskers.
perm_plots = []
model_order = ['Logistic Regression', 'Random Forest', 'Gradient Boosting']

# NOTE(review): dividing the importance std by sqrt(5) converts it to a standard
# error; 5 is presumably the number of permutation repeats used when the
# importances were computed (scikit-learn's default n_repeats) — confirm.
n_repeats = 5
z_95 = 1.96  # two-sided 95% normal quantile used for the confidence whiskers


def _overlay_position(feat_order, model_order):
    """Return fresh x/xOffset encodings that align an overlay mark with its grouped bar."""
    return {
        'x': alt.X('feature:N', sort=feat_order),
        'xOffset': alt.XOffset('model:N', sort=model_order),
    }


def _ci_cap(data, y_field, feat_order, model_order):
    """Return a short horizontal black tick drawn at `y_field` for every bar."""
    return alt.Chart(data).mark_tick(
        color='black',
        thickness=1,
        width=6
    ).encode(
        y=f'{y_field}:Q',
        **_overlay_position(feat_order, model_order)
    )


for threshold in [50, 60, 70, 80]:
    # joblib.load unpickles the files, so they must be trusted project artifacts.
    gbm_perm = joblib.load(f'../models/{threshold}/fitted_gradient_boosting_permute.joblib')
    rf_perm = joblib.load(f'../models/{threshold}/fitted_random_forest_permute.joblib')
    lr_perm = joblib.load(f'../models/{threshold}/fitted_logistic_regression_permute.joblib')
    fitted = [
        ('Gradient Boosting', gbm_perm),
        ('Random Forest', rf_perm),
        ('Logistic Regression', lr_perm),
    ]

    # Features are labelled and ordered on the x-axis by the gradient-boosting
    # selector's plot_data index.
    # NOTE(review): this assumes values['importances_mean'] of every model is
    # ordered like gbm_perm.plot_data.index — confirm against ImportanceFeatureSelector.
    feat_order = gbm_perm.plot_data.index.to_list()

    means = [np.asarray(selector.values['importances_mean']) for _, selector in fitted]
    stds = [np.asarray(selector.values['importances_std']) for _, selector in fitted]

    # One model label per importance value; the count comes from the data instead
    # of a hard-coded 15 so the cell keeps working if the feature set changes.
    model_labels = []
    for (model_name, _), model_means in zip(fitted, means):
        model_labels.extend([model_name] * len(model_means))

    permutation_df = pd.DataFrame({
        'mean': np.concatenate(means),
        # Despite the column name this holds the standard error of the mean.
        'std': np.concatenate(stds) / np.sqrt(n_repeats),
        'model': model_labels,
        'feature': feat_order * len(fitted)
    })
    permutation_df['ci_lower'] = permutation_df['mean'] - z_95 * permutation_df['std']
    permutation_df['ci_upper'] = permutation_df['mean'] + z_95 * permutation_df['std']

    # Grouped bars: one group per feature, one bar per model.
    bars = alt.Chart(permutation_df).mark_bar().encode(
        x=alt.X('feature:N', sort=feat_order, title='', axis=alt.Axis(labelAngle=45)),
        xOffset=alt.XOffset('model:N', sort=model_order),
        y=alt.Y('mean:Q', title='', scale=alt.Scale(domain=[None, 0.25])),
        color=alt.Color('model:N', title='Model', sort=model_order),
    )

    # Vertical whisker spanning the confidence interval of each bar.
    error_bars = alt.Chart(permutation_df).mark_rule().encode(
        y='ci_lower:Q',
        y2='ci_upper:Q',
        color=alt.value('black'),
        **_overlay_position(feat_order, model_order)
    )

    # Horizontal caps at both ends of each whisker.
    cap_top = _ci_cap(permutation_df, 'ci_upper', feat_order, model_order)
    cap_bottom = _ci_cap(permutation_df, 'ci_lower', feat_order, model_order)

    permutation_plot = (bars + error_bars + cap_bottom + cap_top).properties(
        title=f'{threshold}%',
        width=450
    )
    perm_plots.append(permutation_plot)

# Text-only chart used as the overall figure title.
title = alt.Chart({'values': [{}]}).mark_text(
    text='Permutation Feature Importance',
    fontSize=20,
    font='Helvetica',
    dy=-10
).encode().properties(
    width=225,
    height=2
)

# Grid layout: left column = panels 0 and 2, right column = panels 1 and 3.
# Top-row panels (0, 1): remove x-axis tick labels but keep ticks and gridlines.
for panel in (0, 1):
    perm_plots[panel] = perm_plots[panel].encode(
        x=alt.X('feature:N', axis=alt.Axis(labels=False, title=None))
    )

# Right-column panels (1, 3): remove y-axis tick labels but keep ticks and gridlines.
for panel in (1, 3):
    perm_plots[panel] = perm_plots[panel].encode(
        y=alt.Y('mean:Q', axis=alt.Axis(labels=False, title=None))
    )

left_column = perm_plots[0] & perm_plots[2]
right_column = perm_plots[1] & perm_plots[3]
perm_importance_plot = alt.vconcat(title, left_column | right_column).configure_view(stroke=None)

perm_importance_plot

In [4]:
# Export the combined permutation-importance figure as a PNG (ppi=500).
perm_importance_plot.save(
    '../img/feature_importance_permutation.png',
    ppi=500,
)

### SHAP Importance

In [5]:
# Build a 2x2 grid of grouped bar charts (one panel per threshold) comparing the
# SHAP-based feature importance of the three fitted models.
shap_plots = []
model_order = ['Logistic Regression', 'Random Forest', 'Gradient Boosting']
for threshold in [50, 60, 70, 80]:
    # joblib.load unpickles the files, so they must be trusted project artifacts.
    gbm_shap = joblib.load(f'../models/{threshold}/fitted_gradient_boosting_shap.joblib')
    rf_shap = joblib.load(f'../models/{threshold}/fitted_random_forest_shap.joblib')
    lr_shap = joblib.load(f'../models/{threshold}/fitted_logistic_regression_shap.joblib')
    fitted = [
        ('Gradient Boosting', gbm_shap),
        ('Random Forest', rf_shap),
        ('Logistic Regression', lr_shap),
    ]

    # Label every model's values with that model's own plot_data index. Reusing
    # the gradient-boosting index for all three (and assuming exactly 15
    # features) would mislabel bars whenever a model's plot_data is ordered
    # differently; when the orders agree the resulting frame is unchanged.
    shap_df = pd.concat(
        [
            pd.DataFrame({
                'mean': np.asarray(selector.plot_data),
                'model': model_name,
                'feature': selector.plot_data.index.to_list()
            })
            for model_name, selector in fitted
        ],
        ignore_index=True
    )

    # The x-axis follows the gradient-boosting selector's feature order.
    feat_order = gbm_shap.plot_data.index.to_list()

    # Grouped bars: one group per feature, one bar per model.
    bars = alt.Chart(shap_df).mark_bar().encode(
        x=alt.X('feature:N', sort=feat_order, title='', axis=alt.Axis(labelAngle=45)),
        xOffset=alt.XOffset('model:N', sort=model_order),
        y=alt.Y('mean:Q', title=''),
        color=alt.Color('model:N', title='Model', sort=model_order),
    ).properties(
        title=f'{threshold}%',
        width=450
    )
    shap_plots.append(bars)

# Grid layout: left column = panels 0 and 2, right column = panels 1 and 3.
# Top-row panels (0, 1): remove x-axis tick labels but keep ticks and gridlines.
for panel in (0, 1):
    shap_plots[panel] = shap_plots[panel].encode(
        x=alt.X('feature:N', axis=alt.Axis(labels=False, title=None))
    )

# Right-column panels (1, 3): remove y-axis tick labels but keep ticks and gridlines.
for panel in (1, 3):
    shap_plots[panel] = shap_plots[panel].encode(
        y=alt.Y('mean:Q', axis=alt.Axis(labels=False, title=None))
    )

# Text-only chart used as the overall figure title.
title = alt.Chart({'values': [{}]}).mark_text(
    text='SHAP Feature Importance',
    fontSize=20,
    font='Helvetica',
    dy=-10
).encode().properties(
    width=235,
    height=2
)

left_column = shap_plots[0] & shap_plots[2]
right_column = shap_plots[1] & shap_plots[3]
shap_importance_plot = alt.vconcat(title, left_column | right_column)
shap_importance_plot


In [6]:
# Export the combined SHAP-importance figure as a PNG (ppi=500).
shap_importance_plot.save(
    '../img/feature_importance_shap.png',
    ppi=500,
)

### RFE Cross-Validation Rankings

In [7]:
# Only the 70% threshold is loaded in this section.
threshold = 70

# Load the three fitted "rfecv" artifacts for that threshold.
# NOTE(review): presumably ImportanceFeatureSelector objects holding recursive
# feature elimination cross-validation results — confirm against src.models.feat_selection.
# joblib.load unpickles the files, so they must be trusted project artifacts.
gbm_rfe = joblib.load(f'../models/{threshold}/fitted_gradient_boosting_rfecv.joblib')
rf_rfe = joblib.load(f'../models/{threshold}/fitted_random_forest_rfecv.joblib')
lr_rfe = joblib.load(f'../models/{threshold}/fitted_logistic_regression_rfecv.joblib')