In [5]:
import pickle
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import altair as alt

import statsmodels.stats.api as sms
from scipy.stats import shapiro

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Display names of the features, one per line for easier diffing.
feature_names = [
    'Jaccard',
    'Common Neighbours',
    'Cosine Similarity',
    'Shortest Path',
    'Max Edge Centrality',
]

## Infomap Final Test

Here we explore the results from the final test of the methodology before the real experiments. First, we look at plots of the entropies and the stability cutoff for each graph.

Then, we look at the permutation importances for each node feature.

In [11]:
# Path template for one graph's pickled results; filled in with (mu, graph).
results_path = 'Community_Data/Infomap/Results/mu_0_{0}/graph_0{1}_pair/results'

for mu in [2]:
    accuracy, balanced_accuracy = [], []
    metric_column, value_column = [], []

    for graph in range(1, 6):
        with open(results_path.format(mu, graph), 'rb') as f:
            results = pickle.load(f)

        # Mean of the stored scores, one number per graph.
        accuracy.append(np.mean(results['Accuracy Scores']))
        balanced_accuracy.append(np.mean(results['Balanced Accuracy Scores']))

        # One (feature, importance) row per feature per graph, kept in the
        # order the results dictionary lists them.
        perm_importances = results['Feature Importances']
        metric_column.extend(perm_importances.keys())
        value_column.extend(perm_importances.values())

    mu_feature_importances = pd.DataFrame(
        data={'Metric': metric_column, 'Value': value_column}
    )

    # All three layers share the same data and the same categorical y axis.
    per_metric = alt.Chart(mu_feature_importances).encode(y=alt.Y('Metric:N'))

    # Confidence-interval bars for the importance values of each feature.
    error_bars = per_metric.mark_errorbar(extent='ci').encode(
        x=alt.X('Value:Q', scale=alt.Scale(zero=False)),
    )

    # Black marker: mean importance per feature.
    mean_points = per_metric.mark_point(filled=True, color='black').encode(
        x=alt.X('Value:Q', aggregate='mean'),
    )

    # Red marker: median importance per feature.
    median_points = per_metric.mark_point(filled=True, color='red').encode(
        x=alt.X('Value:Q', aggregate='median'),
    )

    final_plot = alt.layer(error_bars, mean_points, median_points).properties(
        title='Mu 0.{0}'.format(mu)
    )
    final_plot.display()

    print('Random forest accuracy for each graph:', accuracy)
    print('Random forest balanced accuracy for each graph:', balanced_accuracy)

Random forest accuracy for each graph: [0.9992799625468165, 0.9990218360071301, 0.9986760070052542, 0.999282196969697, 0.9986202938475669]
Random forest balanced accuracy for each graph: [0.9992799625468165, 0.9990218360071301, 0.9986760070052542, 0.999282196969697, 0.9986202938475669]


In [6]:
# Collect the importance values of each metric, preserving the order in which
# the metrics first appear in the table.
shapiro_test = {}
metric_value_pairs = zip(mu_feature_importances['Metric'], mu_feature_importances['Value'])
for met, value in metric_value_pairs:
    shapiro_test.setdefault(met, []).append(value)

# Run the Shapiro-Wilk test on each metric's values and report the statistic
# and p-value, followed by a blank separator line.
for met in shapiro_test:
    val_list = shapiro_test[met]
    test_result = shapiro(val_list)
    print(met + ': Statistic =', test_result.statistic, ' P_Value =', test_result.pvalue)
    print()

Common Neighbours: Statistic = 0.6841937303543091  P_Value = 0.006496739108115435

Jaccard: Statistic = 0.9660515785217285  P_Value = 0.8493583798408508

Cosine Similarity: Statistic = 0.8034121990203857  P_Value = 0.0863676369190216

Shortest Path: Statistic = 0.9777641892433167  P_Value = 0.9223194122314453

Max Edge Centrality: Statistic = 0.8518232703208923  P_Value = 0.2003505975008011



In [9]:
# Per-graph dataset size: the sum of the 'Same Communities' and
# 'Different Communities' entries stored in each graph's results file.
dataset_sizes = []
for graph in [1, 2, 3, 4, 5]:
    # The mu value is fixed in the path (mu_0_2), so only the graph number is
    # substituted. This keeps the cell independent of any `mu` variable left
    # behind by an earlier cell.
    with open('Community_Data/Infomap/Results/mu_0_2/graph_0{0}_pair/results'.format(graph), 'rb') as f:
        results = pickle.load(f)
    dataset_sizes.append(results['Same Communities'] + results['Different Communities'])

In [10]:
# Show the per-graph totals collected in `dataset_sizes`.
dataset_sizes

[21356, 22438, 22836, 21116, 21780]