In [1]:
import warnings
warnings.filterwarnings('ignore')

import pickle
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import altair as alt

In [2]:
# Display labels for the 15 node features, one per line.
# NOTE(review): presumably these match the keys of the 'Feature Importances'
# dicts loaded in the cells below -- confirm against the results files.
feature_names = [
    'Degree',
    'Clustering Coefficient',
    'Betweenness',
    'Closeness',
    'Shortest Path',
    'Eigenvector',
    'E In',
    'E Out',
    'E In Over E Out',
    'ODF',
    'Expansion',
    'Cut Ratio',
    'Conductance',
    'Normalised Cut',
    'Triangle Participation',
]

## Infomap Final Test

Here we explore the results from the final test of the methodology before the real experiments. First, we look at plots of the entropies and the stability cutoff for each graph.

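Then, for each value of mu, we plot the permutation importance of each node feature pooled over all 20 graphs (error bars show the confidence interval, black points the mean, and red points the median), and print the random forest accuracy and balanced accuracy for each graph.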

In [3]:
# Summarise the Infomap final-test results for mu = 0.2, 0.3 and 0.4.
# For each mu: pool the permutation importances of every graph into one table,
# plot them (confidence-interval bars, mean in black, median in red), and print
# the mean accuracy / balanced accuracy recorded for each graph.
for mu in [2, 3, 4]:
    accuracy = []
    balanced_accuracy = []
    importance_metrics = []
    importance_values = []

    # Graphs 1-5 are read from 'Results', then graphs 1-15 from
    # 'Results_Additional'; both sets are handled identically, in that order.
    for results_dir, graph_ids in [('Results', range(1, 6)), ('Results_Additional', range(1, 16))]:
        for graph in graph_ids:
            # NOTE(review): the '0' before the graph number is literal, so graph 10
            # resolves to 'graph_010' -- confirm this matches the on-disk names.
            results_path = 'Community_Data/Infomap/{0}/mu_0_{1}/graph_0{2}_undersample_0_75/results'.format(
                results_dir, mu, graph)
            # NOTE: pickle.load can execute arbitrary code; only read trusted files.
            with open(results_path, 'rb') as f:
                results = pickle.load(f)

            perm_importances = results['Feature Importances']
            # One summary number per graph: the mean of its stored scores.
            accuracy.append(np.mean(results['Accuracy Scores']))
            balanced_accuracy.append(np.mean(results['Balanced Accuracy Scores']))
            # One (feature, importance) row per feature per graph.
            importance_metrics.extend(perm_importances.keys())
            importance_values.extend(perm_importances.values())

    mu_feature_importances = pd.DataFrame(data={'Metric': importance_metrics, 'Value': importance_values})

    # All three layers draw from the same long-format table.
    importance_chart = alt.Chart(mu_feature_importances)

    error_bars = importance_chart.mark_errorbar(extent='ci').encode(
        x=alt.X('Value:Q', scale=alt.Scale(zero=False)),
        y=alt.Y('Metric:N')
    )

    mean_points = importance_chart.mark_point(filled=True, color='black').encode(
        x=alt.X('Value:Q', aggregate='mean'),
        y=alt.Y('Metric:N'),
    )

    median_points = importance_chart.mark_point(filled=True, color='red').encode(
        x=alt.X('Value:Q', aggregate='median'),
        y=alt.Y('Metric:N'),
    )

    final_plot = (error_bars + mean_points + median_points).properties(
        title='Mu 0.{0}'.format(mu)
    )

    final_plot.display()

    print('Random forest accuracy for each graph:', list(np.around(accuracy, decimals=2)))
    print('')
    print('Random forest balanced accuracy for each graph:', list(np.around(balanced_accuracy, decimals=2)))

Random forest accuracy for each graph: [0.93, 0.73, 0.89, 1.0, 0.96, 1.0, 0.93, 1.0, 0.85, 0.78, 1.0, 0.98, 0.8, 0.99, 0.97, 0.91, 0.86, 0.94, 1.0, 0.93]

Random forest balanced accuracy for each graph: [0.93, 0.69, 0.85, 1.0, 0.96, 1.0, 0.93, 1.0, 0.85, 0.76, 1.0, 0.98, 0.82, 0.99, 0.97, 0.91, 0.86, 0.93, 1.0, 0.92]


Random forest accuracy for each graph: [0.87, 0.96, 0.96, 0.68, 0.91, 0.89, 0.78, 0.99, 0.95, 0.98, 0.93, 0.92, 0.96, 0.94, 0.92, 0.79, 0.97, 0.92, 0.99, 0.88]

Random forest balanced accuracy for each graph: [0.87, 0.96, 0.97, 0.65, 0.93, 0.89, 0.78, 0.99, 0.95, 0.98, 0.91, 0.92, 0.95, 0.93, 0.92, 0.75, 0.96, 0.93, 0.99, 0.9]


Random forest accuracy for each graph: [0.91, 0.92, 0.84, 0.9, 0.89, 0.84, 0.96, 0.9, 0.88, 0.94, 0.9, 0.85, 0.94, 0.93, 0.93, 0.91, 0.86, 0.87, 0.85, 0.92]

Random forest balanced accuracy for each graph: [0.91, 0.91, 0.83, 0.89, 0.88, 0.83, 0.97, 0.9, 0.88, 0.94, 0.9, 0.84, 0.95, 0.91, 0.92, 0.91, 0.86, 0.87, 0.85, 0.92]


## Louvain Final Test

Here we explore the results from the final test of the methodology before the real experiments. First, we look at plots of the entropies and the stability cutoff for each graph.

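Then, for each value of mu, we plot the permutation importance of each node feature pooled over all 20 graphs (error bars show the confidence interval, black points the mean, and red points the median), and print the random forest accuracy and balanced accuracy for each graph.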

In [4]:
# Summarise the Louvain final-test results for mu = 0.2, 0.3 and 0.4.
# For each mu: pool the permutation importances of every graph into one table,
# plot them (confidence-interval bars, mean in black, median in red), and print
# the mean accuracy / balanced accuracy recorded for each graph.
for mu in [2, 3, 4]:
    accuracy = []
    balanced_accuracy = []
    importance_metrics = []
    importance_values = []

    # Graphs 1-5 are read from 'Results', then graphs 1-15 from
    # 'Results_Additional'; both sets are handled identically, in that order.
    for results_dir, graph_ids in [('Results', range(1, 6)), ('Results_Additional', range(1, 16))]:
        for graph in graph_ids:
            # NOTE(review): the '0' before the graph number is literal, so graph 10
            # resolves to 'graph_010' -- confirm this matches the on-disk names.
            results_path = 'Community_Data/Louvain/{0}/mu_0_{1}/graph_0{2}_undersample_0_75/results'.format(
                results_dir, mu, graph)
            # NOTE: pickle.load can execute arbitrary code; only read trusted files.
            with open(results_path, 'rb') as f:
                results = pickle.load(f)

            perm_importances = results['Feature Importances']
            # One summary number per graph: the mean of its stored scores.
            accuracy.append(np.mean(results['Accuracy Scores']))
            balanced_accuracy.append(np.mean(results['Balanced Accuracy Scores']))
            # One (feature, importance) row per feature per graph.
            importance_metrics.extend(perm_importances.keys())
            importance_values.extend(perm_importances.values())

    mu_feature_importances = pd.DataFrame(data={'Metric': importance_metrics, 'Value': importance_values})

    # All three layers draw from the same long-format table.
    importance_chart = alt.Chart(mu_feature_importances)

    error_bars = importance_chart.mark_errorbar(extent='ci').encode(
        x=alt.X('Value:Q', scale=alt.Scale(zero=False)),
        y=alt.Y('Metric:N')
    )

    mean_points = importance_chart.mark_point(filled=True, color='black').encode(
        x=alt.X('Value:Q', aggregate='mean'),
        y=alt.Y('Metric:N'),
    )

    median_points = importance_chart.mark_point(filled=True, color='red').encode(
        x=alt.X('Value:Q', aggregate='median'),
        y=alt.Y('Metric:N'),
    )

    final_plot = (error_bars + mean_points + median_points).properties(
        title='Mu 0.{0}'.format(mu)
    )

    final_plot.display()

    print('Random forest accuracy for each graph:', list(np.around(accuracy, decimals=2)))
    print('')
    print('Random forest balanced accuracy for each graph:', list(np.around(balanced_accuracy, decimals=2)))

Random forest accuracy for each graph: [0.98, 0.92, 0.85, 0.85, 0.82, 0.89, 0.93, 1.0, 0.82, 1.0, 0.81, 0.81, 0.85, 1.0, 0.83, 0.95, 0.98, 0.88, 0.84, 1.0]

Random forest balanced accuracy for each graph: [0.98, 0.93, 0.83, 0.84, 0.82, 0.88, 0.93, 1.0, 0.81, 1.0, 0.8, 0.81, 0.84, 1.0, 0.81, 0.95, 0.97, 0.88, 0.83, 1.0]


Random forest accuracy for each graph: [0.78, 0.68, 0.78, 0.73, 0.7, 0.75, 0.88, 0.72, 0.78, 0.86, 0.75, 0.73, 0.74, 0.7, 0.57, 0.8, 0.78, 0.88, 0.87, 0.79]

Random forest balanced accuracy for each graph: [0.76, 0.65, 0.78, 0.7, 0.7, 0.73, 0.86, 0.72, 0.76, 0.85, 0.74, 0.72, 0.74, 0.69, 0.56, 0.81, 0.76, 0.87, 0.87, 0.78]


Random forest accuracy for each graph: [0.96, 0.94, 1.0, 0.87, 0.92, 0.88, 0.84, 0.88, 0.91, 0.91, 0.87, 0.86, 0.91, 0.91, 0.81, 0.91, 0.8, 0.77, 0.81, 0.76]

Random forest balanced accuracy for each graph: [0.95, 0.95, 1.0, 0.87, 0.92, 0.87, 0.83, 0.87, 0.89, 0.93, 0.87, 0.85, 0.89, 0.91, 0.8, 0.89, 0.79, 0.77, 0.81, 0.74]


## LPA Final Test

Here we explore the results from the final test of the methodology before the real experiments. First, we look at plots of the entropies and the stability cutoff for each graph.

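Then, for each value of mu, we plot the permutation importance of each node feature pooled over all 20 graphs (error bars show the confidence interval, black points the mean, and red points the median), and print the random forest accuracy and balanced accuracy for each graph.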

In [5]:
# Summarise the LPA final-test results for mu = 0.2 and 0.3.
# For each mu: pool the permutation importances of every graph into one table,
# plot them (confidence-interval bars, mean in black, median in red), and print
# the mean accuracy / balanced accuracy recorded for each graph.
for mu in [2, 3]:
    accuracy = []
    balanced_accuracy = []
    importance_metrics = []
    importance_values = []

    # Graphs 1-5 are read from 'Results', then graphs 1-15 from
    # 'Results_Additional'; both sets are handled identically, in that order.
    for results_dir, graph_ids in [('Results', range(1, 6)), ('Results_Additional', range(1, 16))]:
        for graph in graph_ids:
            # NOTE(review): the '0' before the graph number is literal, so graph 10
            # resolves to 'graph_010' -- confirm this matches the on-disk names.
            results_path = 'Community_Data/LPA/{0}/mu_0_{1}/graph_0{2}_undersample_0_75/results'.format(
                results_dir, mu, graph)
            # NOTE: pickle.load can execute arbitrary code; only read trusted files.
            with open(results_path, 'rb') as f:
                results = pickle.load(f)

            perm_importances = results['Feature Importances']
            # One summary number per graph: the mean of its stored scores.
            accuracy.append(np.mean(results['Accuracy Scores']))
            balanced_accuracy.append(np.mean(results['Balanced Accuracy Scores']))
            # One (feature, importance) row per feature per graph.
            importance_metrics.extend(perm_importances.keys())
            importance_values.extend(perm_importances.values())

    mu_feature_importances = pd.DataFrame(data={'Metric': importance_metrics, 'Value': importance_values})

    # All three layers draw from the same long-format table.
    importance_chart = alt.Chart(mu_feature_importances)

    error_bars = importance_chart.mark_errorbar(extent='ci').encode(
        x=alt.X('Value:Q', scale=alt.Scale(zero=False)),
        y=alt.Y('Metric:N')
    )

    mean_points = importance_chart.mark_point(filled=True, color='black').encode(
        x=alt.X('Value:Q', aggregate='mean'),
        y=alt.Y('Metric:N'),
    )

    median_points = importance_chart.mark_point(filled=True, color='red').encode(
        x=alt.X('Value:Q', aggregate='median'),
        y=alt.Y('Metric:N'),
    )

    final_plot = (error_bars + mean_points + median_points).properties(
        title='Mu 0.{0}'.format(mu)
    )

    final_plot.display()

    print('Random forest accuracy for each graph:', list(np.around(accuracy, decimals=2)))
    print('')
    print('Random forest balanced accuracy for each graph:', list(np.around(balanced_accuracy, decimals=2)))

Random forest accuracy for each graph: [0.91, 0.79, 0.82, 0.9, 0.95, 0.94, 0.82, 0.98, 0.9, 0.85, 0.91, 0.88, 0.74, 0.73, 0.83, 0.88, 0.9, 0.87, 0.88, 0.9]

Random forest balanced accuracy for each graph: [0.92, 0.78, 0.8, 0.88, 0.95, 0.93, 0.81, 0.98, 0.89, 0.84, 0.91, 0.88, 0.74, 0.72, 0.83, 0.88, 0.9, 0.85, 0.87, 0.9]


Random forest accuracy for each graph: [0.88, 0.93, 0.96, 0.81, 0.81, 0.88, 0.76, 0.88, 0.88, 0.89, 0.89, 0.87, 0.98, 0.85, 0.83, 0.92, 0.92, 0.89, 0.81, 0.85]

Random forest balanced accuracy for each graph: [0.87, 0.92, 0.96, 0.8, 0.81, 0.87, 0.76, 0.89, 0.88, 0.88, 0.91, 0.86, 0.98, 0.84, 0.81, 0.91, 0.92, 0.88, 0.8, 0.85]
