In [17]:
import pickle
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import altair as alt

import warnings
warnings.filterwarnings('ignore')

In [18]:
# Display names of the five node-pair features.
# NOTE(review): not referenced by any cell visible in this notebook section;
# presumably these match the keys of the 'Feature Importances' dicts — TODO confirm.
feature_names = [
    'Jaccard',
    'Common Neighbours',
    'Cosine Similarity',
    'Shortest Path',
    'Max Edge Centrality',
]

## Infomap Final Test

For the node-pair experiments, I want to use a subset of the data because there are so many pairs. For this reason, I sort all the pairs of nodes by their value in the coassociation matrix, and plan to take the 500 pairs with the highest values as the "same community" data, and the 500 pairs with the lowest values as the "different community" data. Before using this method, I'll check what range of values the 500 highest and the 500 lowest pairs cover.

In [35]:
# Number of pairs taken from each end of the sorted values (500 lowest / 500 highest).
N_EXTREME_PAIRS = 500

# For every Infomap graph, print the value range spanned by the N lowest pairs
# (labelled "Different Community") and by the N highest pairs (labelled "Same Community").
for mu in [2, 3, 4]:
    print('~~~~~~~~~~~~~~~~~~~~~~')
    print('Graphs for mu 0.{0} will have entropy values for:'.format(mu))
    print('~~~~~~~~~~~~~~~~~~~~~~')
    for graph in [1, 2, 3, 4, 5]:
        entrops = pd.read_csv(
            'Community_Data/Infomap/Pair_Entropies/graph_0{0}_mu_0_{1}_entropies.csv'.format(graph, mu),
            index_col=0)
        # Flatten and sort once (ascending) instead of re-sorting for every lookup.
        entropy_values = np.sort(np.asarray(entrops['Entropy']).reshape(-1))
        if entropy_values.size < N_EXTREME_PAIRS:
            raise ValueError(
                'graph {0}, mu 0.{1}: only {2} pairs available, need at least {3}'.format(
                    graph, mu, entropy_values.size, N_EXTREME_PAIRS))
        # The N lowest values are indices 0 .. N-1; the previous code read index N,
        # i.e. the (N+1)-th lowest value, which is outside the selected subset.
        lowest = entropy_values[:N_EXTREME_PAIRS]
        highest = entropy_values[-N_EXTREME_PAIRS:]
        print('Different Community: Between {0} and {1}'.format(lowest[0], lowest[-1]))
        print('Same Community: Between {0} and {1}'.format(highest[0], highest[-1]))

~~~~~~~~~~~~~~~~~~~~~~
Graphs for mu 0.2 will have entropy values for:
~~~~~~~~~~~~~~~~~~~~~~
Different Community: Between 0.0 and 0.0
Same Community: Between 1.0 and 1.0
Different Community: Between 0.0 and 0.0
Same Community: Between 1.0 and 1.0
Different Community: Between 0.0 and 0.0
Same Community: Between 1.0 and 1.0
Different Community: Between 0.0 and 0.0
Same Community: Between 1.0 and 1.0
Different Community: Between 0.0 and 0.0
Same Community: Between 1.0 and 1.0
~~~~~~~~~~~~~~~~~~~~~~
Graphs for mu 0.3 will have entropy values for:
~~~~~~~~~~~~~~~~~~~~~~
Different Community: Between 0.0 and 0.0
Same Community: Between 1.0 and 1.0
Different Community: Between 0.0 and 0.0
Same Community: Between 1.0 and 1.0
Different Community: Between 0.0 and 0.0
Same Community: Between 1.0 and 1.0
Different Community: Between 0.0 and 0.0
Same Community: Between 1.0 and 1.0
Different Community: Between 0.0 and 0.0
Same Community: Between 1.0 and 1.0
~~~~~~~~~~~~~~~~~~~~~~
Graphs for mu 0.4 w

Values are always exactly 0 or 1. This means we are only using pairs of nodes which are either always in the same community, or always in different communities.

In [36]:
# For each mixing parameter, pool the per-graph feature importances into one
# table, plot their spread, and report the mean accuracy scores per graph.
for mu in (2, 3, 4):
    # Column-wise accumulator: one entry per feature per graph.
    importance_columns = {'Metric': [], 'Value': []}
    accuracy, balanced_accuracy = [], []

    for graph in range(1, 6):
        results_path = 'Community_Data/Infomap/Results/mu_0_{0}/graph_0{1}_pair/results'.format(mu, graph)
        # NOTE: unpickling can execute arbitrary code; only load trusted result files.
        with open(results_path, 'rb') as f:
            results = pickle.load(f)

        perm_importances = results['Feature Importances']
        accuracy.append(np.mean(results['Accuracy Scores']))
        balanced_accuracy.append(np.mean(results['Balanced Accuracy Scores']))
        importance_columns['Metric'].extend(perm_importances.keys())
        importance_columns['Value'].extend(perm_importances.values())

    mu_feature_importances = pd.DataFrame(data=importance_columns)

    # All three layers draw from the same table, so start from one base chart.
    base = alt.Chart(mu_feature_importances)

    # extent='ci' requests confidence-interval error bars around the mean.
    error_bars = base.mark_errorbar(extent='ci').encode(
        x=alt.X('Value:Q', scale=alt.Scale(zero=False)),
        y=alt.Y('Metric:N'),
    )

    # Black marker = mean importance, red marker = median importance.
    mean_points, median_points = (
        base.mark_point(filled=True, color=colour).encode(
            x=alt.X('Value:Q', aggregate=statistic),
            y=alt.Y('Metric:N'),
        )
        for statistic, colour in (('mean', 'black'), ('median', 'red'))
    )

    final_plot = (error_bars + mean_points + median_points).properties(
        title='Mu 0.{0}'.format(mu)
    )
    final_plot.display()

    print('Random forest accuracy for each graph:', accuracy)
    print('Random forest balanced accuracy for each graph:', balanced_accuracy)

Random forest accuracy for each graph: [1.0, 0.9937500000000002, 0.999625, 1.0, 1.0]
Random forest balanced accuracy for each graph: [1.0, 0.9937500000000002, 0.999625, 1.0, 1.0]


Random forest accuracy for each graph: [0.9872125, 0.9979750000000001, 0.9941125000000002, 0.9981000000000001, 0.9806374999999998]
Random forest balanced accuracy for each graph: [0.9872125, 0.9979750000000001, 0.9941125000000002, 0.9981000000000001, 0.9806374999999998]


Random forest accuracy for each graph: [0.9558875, 0.9503499999999999, 0.9507249999999999, 0.9320750000000001, 0.916875]
Random forest balanced accuracy for each graph: [0.9558875, 0.9503499999999999, 0.9507249999999999, 0.9320750000000001, 0.916875]


## Louvain Final Test

In [37]:
# Number of pairs taken from each end of the sorted values (500 lowest / 500 highest).
N_EXTREME_PAIRS = 500

# For every Louvain graph, print the value range spanned by the N lowest pairs
# (labelled "Different Community") and by the N highest pairs (labelled "Same Community").
for mu in [2, 3, 4]:
    print('~~~~~~~~~~~~~~~~~~~~~~')
    print('Graphs for mu 0.{0} will have entropy values for:'.format(mu))
    print('~~~~~~~~~~~~~~~~~~~~~~')
    for graph in [1, 2, 3, 4, 5]:
        entrops = pd.read_csv(
            'Community_Data/Louvain/Pair_Entropies/graph_0{0}_mu_0_{1}_entropies.csv'.format(graph, mu),
            index_col=0)
        # Flatten and sort once (ascending) instead of re-sorting for every lookup.
        entropy_values = np.sort(np.asarray(entrops['Entropy']).reshape(-1))
        if entropy_values.size < N_EXTREME_PAIRS:
            raise ValueError(
                'graph {0}, mu 0.{1}: only {2} pairs available, need at least {3}'.format(
                    graph, mu, entropy_values.size, N_EXTREME_PAIRS))
        # The N lowest values are indices 0 .. N-1; the previous code read index N,
        # i.e. the (N+1)-th lowest value, which is outside the selected subset.
        lowest = entropy_values[:N_EXTREME_PAIRS]
        highest = entropy_values[-N_EXTREME_PAIRS:]
        print('Different Community: Between {0} and {1}'.format(lowest[0], lowest[-1]))
        print('Same Community: Between {0} and {1}'.format(highest[0], highest[-1]))

~~~~~~~~~~~~~~~~~~~~~~
Graphs for mu 0.2 will have entropy values for:
~~~~~~~~~~~~~~~~~~~~~~
Different Community: Between 0.0 and 0.0
Same Community: Between 1.0 and 1.0
Different Community: Between 0.0 and 0.0
Same Community: Between 1.0 and 1.0
Different Community: Between 0.0 and 0.0
Same Community: Between 1.0 and 1.0
Different Community: Between 0.0 and 0.0
Same Community: Between 1.0 and 1.0
Different Community: Between 0.0 and 0.0
Same Community: Between 1.0 and 1.0
~~~~~~~~~~~~~~~~~~~~~~
Graphs for mu 0.3 will have entropy values for:
~~~~~~~~~~~~~~~~~~~~~~
Different Community: Between 0.0 and 0.0
Same Community: Between 1.0 and 1.0
Different Community: Between 0.0 and 0.0
Same Community: Between 1.0 and 1.0
Different Community: Between 0.0 and 0.0
Same Community: Between 1.0 and 1.0
Different Community: Between 0.0 and 0.0
Same Community: Between 1.0 and 1.0
Different Community: Between 0.0 and 0.0
Same Community: Between 1.0 and 1.0
~~~~~~~~~~~~~~~~~~~~~~
Graphs for mu 0.4 w

In [38]:
# For each mixing parameter, pool the per-graph feature importances into one
# table, plot their spread, and report the mean accuracy scores per graph.
for mu in (2, 3, 4):
    # Column-wise accumulator: one entry per feature per graph.
    importance_columns = {'Metric': [], 'Value': []}
    accuracy, balanced_accuracy = [], []

    for graph in range(1, 6):
        results_path = 'Community_Data/Louvain/Results/mu_0_{0}/graph_0{1}_pair/results'.format(mu, graph)
        # NOTE: unpickling can execute arbitrary code; only load trusted result files.
        with open(results_path, 'rb') as f:
            results = pickle.load(f)

        perm_importances = results['Feature Importances']
        accuracy.append(np.mean(results['Accuracy Scores']))
        balanced_accuracy.append(np.mean(results['Balanced Accuracy Scores']))
        importance_columns['Metric'].extend(perm_importances.keys())
        importance_columns['Value'].extend(perm_importances.values())

    mu_feature_importances = pd.DataFrame(data=importance_columns)

    # All three layers draw from the same table, so start from one base chart.
    base = alt.Chart(mu_feature_importances)

    # extent='ci' requests confidence-interval error bars around the mean.
    error_bars = base.mark_errorbar(extent='ci').encode(
        x=alt.X('Value:Q', scale=alt.Scale(zero=False)),
        y=alt.Y('Metric:N'),
    )

    # Black marker = mean importance, red marker = median importance.
    mean_points, median_points = (
        base.mark_point(filled=True, color=colour).encode(
            x=alt.X('Value:Q', aggregate=statistic),
            y=alt.Y('Metric:N'),
        )
        for statistic, colour in (('mean', 'black'), ('median', 'red'))
    )

    final_plot = (error_bars + mean_points + median_points).properties(
        title='Mu 0.{0}'.format(mu)
    )
    final_plot.display()

    print('Random forest accuracy for each graph:', accuracy)
    print('Random forest balanced accuracy for each graph:', balanced_accuracy)

Random forest accuracy for each graph: [0.9989250000000001, 0.9936875000000003, 0.9875000000000003, 0.9937500000000002, 0.9937250000000003]
Random forest balanced accuracy for each graph: [0.9989250000000001, 0.9936875000000003, 0.9875000000000003, 0.9937500000000002, 0.9937250000000003]


Random forest accuracy for each graph: [0.9855000000000003, 0.9809374999999998, 0.9937500000000002, 0.9960500000000002, 0.96875]
Random forest balanced accuracy for each graph: [0.9855000000000003, 0.9809374999999998, 0.9937500000000002, 0.9960500000000002, 0.96875]


Random forest accuracy for each graph: [0.9128124999999998, 0.9478999999999999, 0.9580875000000002, 0.9435999999999997, 0.9441624999999999]
Random forest balanced accuracy for each graph: [0.9128124999999998, 0.9478999999999999, 0.9580875000000002, 0.9435999999999997, 0.9441624999999999]


## LPA Final Test

In [39]:
# Number of pairs taken from each end of the sorted values (500 lowest / 500 highest).
N_EXTREME_PAIRS = 500

# For every LPA graph, print the value range spanned by the N lowest pairs
# (labelled "Different Community") and by the N highest pairs (labelled "Same Community").
for mu in [2, 3, 4]:
    print('~~~~~~~~~~~~~~~~~~~~~~')
    print('Graphs for mu 0.{0} will have entropy values for:'.format(mu))
    print('~~~~~~~~~~~~~~~~~~~~~~')
    for graph in [1, 2, 3, 4, 5]:
        entrops = pd.read_csv(
            'Community_Data/LPA/Pair_Entropies/graph_0{0}_mu_0_{1}_entropies.csv'.format(graph, mu),
            index_col=0)
        # Flatten and sort once (ascending) instead of re-sorting for every lookup.
        entropy_values = np.sort(np.asarray(entrops['Entropy']).reshape(-1))
        if entropy_values.size < N_EXTREME_PAIRS:
            raise ValueError(
                'graph {0}, mu 0.{1}: only {2} pairs available, need at least {3}'.format(
                    graph, mu, entropy_values.size, N_EXTREME_PAIRS))
        # The N lowest values are indices 0 .. N-1; the previous code read index N,
        # i.e. the (N+1)-th lowest value, which is outside the selected subset.
        lowest = entropy_values[:N_EXTREME_PAIRS]
        highest = entropy_values[-N_EXTREME_PAIRS:]
        print('Different Community: Between {0} and {1}'.format(lowest[0], lowest[-1]))
        print('Same Community: Between {0} and {1}'.format(highest[0], highest[-1]))

~~~~~~~~~~~~~~~~~~~~~~
Graphs for mu 0.2 will have entropy values for:
~~~~~~~~~~~~~~~~~~~~~~
Different Community: Between 0.0 and 0.0
Same Community: Between 1.0 and 1.0
Different Community: Between 0.0 and 0.0
Same Community: Between 1.0 and 1.0
Different Community: Between 0.0 and 0.0
Same Community: Between 1.0 and 1.0
Different Community: Between 0.0 and 0.0
Same Community: Between 1.0 and 1.0
Different Community: Between 0.0 and 0.0
Same Community: Between 1.0 and 1.0
~~~~~~~~~~~~~~~~~~~~~~
Graphs for mu 0.3 will have entropy values for:
~~~~~~~~~~~~~~~~~~~~~~
Different Community: Between 0.0 and 0.0
Same Community: Between 1.0 and 1.0
Different Community: Between 0.0 and 0.0
Same Community: Between 1.0 and 1.0
Different Community: Between 0.0 and 0.0
Same Community: Between 1.0 and 1.0
Different Community: Between 0.0 and 0.0
Same Community: Between 1.0 and 1.0
Different Community: Between 0.0 and 0.0
Same Community: Between 1.0 and 1.0
~~~~~~~~~~~~~~~~~~~~~~
Graphs for mu 0.4 w

For LPA at mu 0.4, all but one of the graphs have NO pairs of nodes with a value in the coassociation matrix below 0.5. It doesn't make much sense to classify pairs with a value higher than 0.5 as "different community", because they have been clustered into the same community over half of the time. Therefore I left mu 0.4 out of the LPA experiments.

From further investigation, the reason for this is that LPA sorts the majority of nodes into one giant community at mu 0.4, only putting a small proportion of the nodes into other communities with each run. Thus every pair of nodes ends up being in the same community a majority of the time.

In [40]:
# For each mixing parameter, pool the per-graph feature importances into one
# table, plot their spread, and report the mean accuracy scores per graph.
# Only mu 0.2 and 0.3 are iterated here (mu 0.4 is not loaded for LPA).
for mu in (2, 3):
    # Column-wise accumulator: one entry per feature per graph.
    importance_columns = {'Metric': [], 'Value': []}
    accuracy, balanced_accuracy = [], []

    for graph in range(1, 6):
        results_path = 'Community_Data/LPA/Results/mu_0_{0}/graph_0{1}_pair/results'.format(mu, graph)
        # NOTE: unpickling can execute arbitrary code; only load trusted result files.
        with open(results_path, 'rb') as f:
            results = pickle.load(f)

        perm_importances = results['Feature Importances']
        accuracy.append(np.mean(results['Accuracy Scores']))
        balanced_accuracy.append(np.mean(results['Balanced Accuracy Scores']))
        importance_columns['Metric'].extend(perm_importances.keys())
        importance_columns['Value'].extend(perm_importances.values())

    mu_feature_importances = pd.DataFrame(data=importance_columns)

    # All three layers draw from the same table, so start from one base chart.
    base = alt.Chart(mu_feature_importances)

    # extent='ci' requests confidence-interval error bars around the mean.
    error_bars = base.mark_errorbar(extent='ci').encode(
        x=alt.X('Value:Q', scale=alt.Scale(zero=False)),
        y=alt.Y('Metric:N'),
    )

    # Black marker = mean importance, red marker = median importance.
    mean_points, median_points = (
        base.mark_point(filled=True, color=colour).encode(
            x=alt.X('Value:Q', aggregate=statistic),
            y=alt.Y('Metric:N'),
        )
        for statistic, colour in (('mean', 'black'), ('median', 'red'))
    )

    final_plot = (error_bars + mean_points + median_points).properties(
        title='Mu 0.{0}'.format(mu)
    )
    final_plot.display()

    print('Random forest accuracy for each graph:', accuracy)
    print('Random forest balanced accuracy for each graph:', balanced_accuracy)

Random forest accuracy for each graph: [0.99985, 1.0, 1.0, 1.0, 1.0]
Random forest balanced accuracy for each graph: [0.99985, 1.0, 1.0, 1.0, 1.0]


Random forest accuracy for each graph: [0.9829124999999999, 1.0, 0.9875000000000003, 0.9938375000000004, 0.9937500000000002]
Random forest balanced accuracy for each graph: [0.9829124999999999, 1.0, 0.9875000000000003, 0.9938375000000004, 0.9937500000000002]
