In [1]:
import warnings
warnings.filterwarnings('ignore')

import pickle
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import altair as alt

import statsmodels.stats.api as sms

I'm going to show results from randomly undersampling the majority class at 3 different levels, as well as the original results (i.e. no undersampling). 

For now, I chose just one set of experiments to rerun at the different undersampling levels: the Infomap algorithm on the mu 0.2 graphs (since these experiments had a significant class imbalance). There are 5 mu 0.2 graphs, corresponding to 5 experiments. For each undersampling level, I then plot the mean (black), median (red) and confidence interval of each feature's importance across these 5 experiments.

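Each plot title gives the percentage of the data that are stable nodes (which corresponds to the amount of undersampling); for the original data this varies by graph, so the title says so instead. Under each plot I also print the random forest's mean train accuracy, along with its mean balanced accuracy, for each of the 5 experiments.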

In [38]:
# Suffix appended to each graph's results directory, one per undersampling
# level; the empty suffix selects the original (non-undersampled) run.
UNDERSAMPLE_SUFFIXES = ['', '_undersample_0_2', '_undersample_0_4', '_undersample_1_0']
# The five mu 0.2 graphs, i.e. the five experiments summarised in each plot.
GRAPH_IDS = [1, 2, 3, 4, 5]
# {0} is the graph id, {1} the undersampling suffix.
RESULTS_PATH = 'Community_Data/Infomap/Results/mu_0_2/graph_0{0}{1}/results'


def load_results(graph, suffix):
    """Load the pickled results for one graph at one undersampling level.

    Parameters
    ----------
    graph : int
        Graph id substituted into ``RESULTS_PATH``.
    suffix : str
        Undersampling suffix substituted into ``RESULTS_PATH``.

    Returns
    -------
    The unpickled object. The loop below reads its 'Stable Nodes',
    'Unstable Nodes', 'Accuracy Scores', 'Balanced Accuracy Scores' and
    'Feature Importances' entries.

    Raises
    ------
    FileNotFoundError
        If the results file for this graph/suffix does not exist.
    """
    # NOTE(review): pickle.load can execute arbitrary code while loading;
    # only point this at results files produced by our own experiments.
    with open(RESULTS_PATH.format(graph, suffix), 'rb') as f:
        return pickle.load(f)


def plot_feature_importances(importances, title):
    """Build the layered per-feature summary chart.

    Parameters
    ----------
    importances : pandas.DataFrame
        Long-format frame with a 'Metric' column (feature name) and a
        'Value' column (importance), one row per feature per graph.
    title : str
        Chart title.

    Returns
    -------
    Altair layered chart: confidence-interval error bars, with the mean
    (black) and median (red) of 'Value' drawn as points for each 'Metric'.
    """
    base = alt.Chart(importances)

    error_bars = base.mark_errorbar(extent='ci').encode(
        # zero=False lets the axis zoom in on the observed range.
        x=alt.X('Value:Q', scale=alt.Scale(zero=False)),
        y=alt.Y('Metric:N'),
    )
    mean_points = base.mark_point(filled=True, color='black').encode(
        x=alt.X('Value:Q', aggregate='mean'),
        y=alt.Y('Metric:N'),
    )
    median_points = base.mark_point(filled=True, color='red').encode(
        x=alt.X('Value:Q', aggregate='median'),
        y=alt.Y('Metric:N'),
    )

    return (error_bars + mean_points + median_points).properties(title=title)


for us in UNDERSAMPLE_SUFFIXES:
    importance_records = []  # one {'Metric', 'Value'} row per feature per graph
    props = []               # percentage of stable nodes, per graph
    accuracy = []            # mean of 'Accuracy Scores', per graph
    balanced_accuracy = []   # mean of 'Balanced Accuracy Scores', per graph

    for graph in GRAPH_IDS:
        results = load_results(graph, us)

        stable = results['Stable Nodes']
        props.append(stable * 100 / (stable + results['Unstable Nodes']))
        accuracy.append(np.mean(results['Accuracy Scores']))
        balanced_accuracy.append(np.mean(results['Balanced Accuracy Scores']))

        importance_records.extend(
            {'Metric': feat, 'Value': val}
            for feat, val in results['Feature Importances'].items()
        )

    # Build the frame once from the collected rows; the explicit columns keep
    # the schema stable even if no rows were collected.
    mu_feature_importances = pd.DataFrame(importance_records, columns=['Metric', 'Value'])

    if us == '':
        stab = 'Original (Varies by Graph)'
    else:
        # Summarise the proportion over all graphs rather than reporting only
        # the first graph's value, so the title still describes the whole plot
        # if the per-graph proportions differ slightly.
        stab = '{0:.2f}%'.format(np.mean(props))

    final_plot = plot_feature_importances(
        mu_feature_importances, 'Proportion Stable: ' + stab
    )
    final_plot.display()

    print('Random forest accuracy for each graph:', accuracy)
    print('Random forest balanced accuracy for each graph:', balanced_accuracy)

Random forest accuracy for each graph: [0.9800374999999999, 0.9553625000000001, 0.9639000000000001, 0.9812374999999999, 0.9596375000000003]
Random forest balanced accuracy for each graph: [0.6243782051282052, 0.7415472972972973, 0.6560952380952381, 0.6249935897435898, 0.8259097222222225]


Random forest accuracy for each graph: [0.8879259259259261, 0.8623157894736841, 0.8691707317073172, 0.9991111111111111, 0.9417291666666667]
Random forest balanced accuracy for each graph: [0.7820909090909093, 0.6857557997557997, 0.6789117647058824, 0.9994545454545454, 0.8975374999999999]


Random forest accuracy for each graph: [0.939875, 0.8658181818181817, 0.9818333333333333, 0.87925, 0.9171071428571431]
Random forest balanced accuracy for each graph: [0.9345636363636364, 0.8137916666666667, 0.9688571428571429, 0.8616727272727274, 0.9058624999999999]


Random forest accuracy for each graph: [0.6775555555555555, 0.644, 0.7710000000000001, 0.8888888888888892, 0.9426875]
Random forest balanced accuracy for each graph: [0.6850000000000002, 0.644, 0.7710000000000001, 0.875, 0.9426875]
