In [8]:
import warnings
warnings.filterwarnings('ignore')

import pickle
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import altair as alt

In [9]:
# Display names of the five features used in the pair experiments.
feature_names = [
    'Jaccard',
    'Common Neighbours',
    'Cosine Similarity',
    'Shortest Path',
    'Max Edge Centrality',
]

## Infomap Final Test

For the node-pair experiments, I want to use a subset of the data because there are so many pairs. To do this, I sort all pairs of nodes by their value in the coassociation matrix and plan to take the 500 pairs with the highest values as the "same community" data and the 500 pairs with the lowest values as the "different community" data. Before using this method, I'll check what range the 500 highest and 500 lowest values fall in.

In [10]:
# Number of pairs taken from each end of the sorted values to form the
# "different community" (lowest) and "same community" (highest) classes.
n_per_class = 500

# The per-graph CSVs are split across two folders: five graphs in the first,
# fifteen further graphs in the "_Additional" folder.
entropy_sources = [
    ('Community_Data/Infomap/Pair_Entropies', range(1, 6)),
    ('Community_Data/Infomap/Pair_Entropies_Additional', range(1, 16)),
]

for mu in [2, 3, 4]:
    print('~~~~~~~~~~~~~~~~~~~~~~')
    print('Graphs for mu 0.{0} will have entropy values:'.format(mu))
    print('~~~~~~~~~~~~~~~~~~~~~~')
    diff_vals = []
    same_vals = []
    for folder, graphs in entropy_sources:
        for graph in graphs:
            entrops = pd.read_csv('{0}/graph_0{1}_mu_0_{2}_entropies.csv'.format(folder, graph, mu),
                                  index_col=0)
            # Sort once (ascending) and read both cut-offs from the same array.
            ordered = np.sort(np.array(entrops['Entropy']).reshape(-1))
            # The n lowest values occupy indices 0..n-1, so their upper bound
            # is at index n-1 (index n would be the first value NOT selected).
            diff_vals.append(ordered[n_per_class - 1])
            # The n highest values occupy indices -n..-1; their lower bound is at -n.
            same_vals.append(ordered[-n_per_class])
    print('Less than (or = to) the following for data points in the "different community" class, by graph:')
    print(diff_vals, '\n')
    print('Greater than (or = to) the following for data points in the "same community" class, by graph:')
    print(same_vals)

~~~~~~~~~~~~~~~~~~~~~~
Graphs for mu 0.2 will have entropy values:
~~~~~~~~~~~~~~~~~~~~~~
Less than (or = to) the following for data points in the "different community" class, by graph:
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] 

Greater than (or = to) the following for data points in the "same community" class, by graph:
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
~~~~~~~~~~~~~~~~~~~~~~
Graphs for mu 0.3 will have entropy values:
~~~~~~~~~~~~~~~~~~~~~~
Less than (or = to) the following for data points in the "different community" class, by graph:
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] 

Greater than (or = to) the following for data points in the "same community" class, by graph:
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
~~~~~~~~~~~~~~~~~~~~~~
Graphs fo

The cut-off values are always exactly 0 or 1. This means we are only using pairs of nodes that are either always in the same community or always in different communities.

In [11]:
# The pickled results are split across two folders: five graphs in the first,
# fifteen further graphs in the "_Additional" folder.
result_sources = [
    ('Community_Data/Infomap/Results', range(1, 6)),
    ('Community_Data/Infomap/Results_Additional', range(1, 16)),
]

for mu in [2, 3, 4]:
    accuracy = []
    balanced_accuracy = []
    # Long-format records: one (feature name, importance value) row per feature per graph.
    importance_records = {'Metric': [], 'Value': []}
    for folder, graphs in result_sources:
        for graph in graphs:
            results_path = '{0}/mu_0_{1}/graph_0{2}_pair/results'.format(folder, mu, graph)
            # NOTE: pickle.load can execute arbitrary code; only load results
            # files that were produced by this project.
            with open(results_path, 'rb') as f:
                results = pickle.load(f)
            # Average the stored scores to get one number per graph.
            accuracy.append(np.mean(results['Accuracy Scores']))
            balanced_accuracy.append(np.mean(results['Balanced Accuracy Scores']))
            for feat, val in results['Feature Importances'].items():
                importance_records['Metric'].append(feat)
                importance_records['Value'].append(val)

    mu_feature_importances = pd.DataFrame(data=importance_records)

    # Confidence-interval bar per feature across all graphs.
    error_bars = alt.Chart(mu_feature_importances).mark_errorbar(extent='ci').encode(
        x=alt.X('Value:Q', scale=alt.Scale(zero=False)),
        y=alt.Y('Metric:N')
    )

    # Black point: mean importance per feature.
    mean_points = alt.Chart(mu_feature_importances).mark_point(filled=True, color='black').encode(
        x=alt.X('Value:Q', aggregate='mean'),
        y=alt.Y('Metric:N'),
    )

    # Red point: median importance per feature.
    median_points = alt.Chart(mu_feature_importances).mark_point(filled=True, color='red').encode(
        x=alt.X('Value:Q', aggregate='median'),
        y=alt.Y('Metric:N'),
    )

    final_plot = (error_bars + mean_points + median_points).properties(
        title='Mu 0.{0}'.format(mu)
    )

    final_plot.display()

    print('Random forest accuracy for each graph:', list(np.around(accuracy, decimals=2)))
    print('')
    # Report the balanced scores here (previously this line re-printed `accuracy`).
    print('Random forest balanced accuracy for each graph:', list(np.around(balanced_accuracy, decimals=2)))

Random forest accuracy for each graph: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.99, 0.99, 0.99, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]

Random forest balanced accuracy for each graph: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.99, 0.99, 0.99, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]


Random forest accuracy for each graph: [0.99, 0.97, 0.99, 0.99, 0.99, 1.0, 0.99, 0.99, 0.99, 0.98, 0.99, 0.99, 0.98, 0.98, 1.0, 0.98, 1.0, 0.98, 0.99, 0.98]

Random forest balanced accuracy for each graph: [0.99, 0.97, 0.99, 0.99, 0.99, 1.0, 0.99, 0.99, 0.99, 0.98, 0.99, 0.99, 0.98, 0.98, 1.0, 0.98, 1.0, 0.98, 0.99, 0.98]


Random forest accuracy for each graph: [0.95, 0.92, 0.94, 0.93, 0.97, 0.94, 0.97, 0.97, 0.98, 0.98, 0.92, 0.92, 0.96, 0.95, 0.97, 0.92, 0.95, 0.94, 0.96, 0.94]

Random forest balanced accuracy for each graph: [0.95, 0.92, 0.94, 0.93, 0.97, 0.94, 0.97, 0.97, 0.98, 0.98, 0.92, 0.92, 0.96, 0.95, 0.97, 0.92, 0.95, 0.94, 0.96, 0.94]


## Louvain Final Test

In [12]:
# Number of pairs taken from each end of the sorted values to form the
# "different community" (lowest) and "same community" (highest) classes.
n_per_class = 500

# The per-graph CSVs are split across two folders: five graphs in the first,
# fifteen further graphs in the "_Additional" folder.
entropy_sources = [
    ('Community_Data/Louvain/Pair_Entropies', range(1, 6)),
    ('Community_Data/Louvain/Pair_Entropies_Additional', range(1, 16)),
]

for mu in [2, 3, 4]:
    print('~~~~~~~~~~~~~~~~~~~~~~')
    print('Graphs for mu 0.{0} will have entropy values:'.format(mu))
    print('~~~~~~~~~~~~~~~~~~~~~~')
    diff_vals = []
    same_vals = []
    for folder, graphs in entropy_sources:
        for graph in graphs:
            entrops = pd.read_csv('{0}/graph_0{1}_mu_0_{2}_entropies.csv'.format(folder, graph, mu),
                                  index_col=0)
            # Sort once (ascending) and read both cut-offs from the same array.
            ordered = np.sort(np.array(entrops['Entropy']).reshape(-1))
            # The n lowest values occupy indices 0..n-1, so their upper bound
            # is at index n-1 (index n would be the first value NOT selected).
            diff_vals.append(ordered[n_per_class - 1])
            # The n highest values occupy indices -n..-1; their lower bound is at -n.
            same_vals.append(ordered[-n_per_class])
    print('Less than (or = to) the following for data points in the "different community" class, by graph:')
    print(diff_vals, '\n')
    print('Greater than (or = to) the following for data points in the "same community" class, by graph:')
    print(same_vals)

~~~~~~~~~~~~~~~~~~~~~~
Graphs for mu 0.2 will have entropy values:
~~~~~~~~~~~~~~~~~~~~~~
Less than (or = to) the following for data points in the "different community" class, by graph:
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] 

Greater than (or = to) the following for data points in the "same community" class, by graph:
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
~~~~~~~~~~~~~~~~~~~~~~
Graphs for mu 0.3 will have entropy values:
~~~~~~~~~~~~~~~~~~~~~~
Less than (or = to) the following for data points in the "different community" class, by graph:
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] 

Greater than (or = to) the following for data points in the "same community" class, by graph:
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
~~~~~~~~~~~~~~~~~~~~~~
Graphs fo

In [13]:
# The pickled results are split across two folders: five graphs in the first,
# fifteen further graphs in the "_Additional" folder.
result_sources = [
    ('Community_Data/Louvain/Results', range(1, 6)),
    ('Community_Data/Louvain/Results_Additional', range(1, 16)),
]

for mu in [2, 3, 4]:
    accuracy = []
    balanced_accuracy = []
    # Long-format records: one (feature name, importance value) row per feature per graph.
    importance_records = {'Metric': [], 'Value': []}
    for folder, graphs in result_sources:
        for graph in graphs:
            results_path = '{0}/mu_0_{1}/graph_0{2}_pair/results'.format(folder, mu, graph)
            # NOTE: pickle.load can execute arbitrary code; only load results
            # files that were produced by this project.
            with open(results_path, 'rb') as f:
                results = pickle.load(f)
            # Average the stored scores to get one number per graph.
            accuracy.append(np.mean(results['Accuracy Scores']))
            balanced_accuracy.append(np.mean(results['Balanced Accuracy Scores']))
            for feat, val in results['Feature Importances'].items():
                importance_records['Metric'].append(feat)
                importance_records['Value'].append(val)

    mu_feature_importances = pd.DataFrame(data=importance_records)

    # Confidence-interval bar per feature across all graphs.
    error_bars = alt.Chart(mu_feature_importances).mark_errorbar(extent='ci').encode(
        x=alt.X('Value:Q', scale=alt.Scale(zero=False)),
        y=alt.Y('Metric:N')
    )

    # Black point: mean importance per feature.
    mean_points = alt.Chart(mu_feature_importances).mark_point(filled=True, color='black').encode(
        x=alt.X('Value:Q', aggregate='mean'),
        y=alt.Y('Metric:N'),
    )

    # Red point: median importance per feature.
    median_points = alt.Chart(mu_feature_importances).mark_point(filled=True, color='red').encode(
        x=alt.X('Value:Q', aggregate='median'),
        y=alt.Y('Metric:N'),
    )

    final_plot = (error_bars + mean_points + median_points).properties(
        title='Mu 0.{0}'.format(mu)
    )

    final_plot.display()

    print('Random forest accuracy for each graph:', list(np.around(accuracy, decimals=2)))
    print('')
    # Report the balanced scores here (previously this line re-printed `accuracy`).
    print('Random forest balanced accuracy for each graph:', list(np.around(balanced_accuracy, decimals=2)))

Random forest accuracy for each graph: [1.0, 0.99, 0.98, 1.0, 0.97, 1.0, 1.0, 1.0, 0.99, 1.0, 1.0, 1.0, 1.0, 0.99, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]

Random forest balanced accuracy for each graph: [1.0, 0.99, 0.98, 1.0, 0.97, 1.0, 1.0, 1.0, 0.99, 1.0, 1.0, 1.0, 1.0, 0.99, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]


Random forest accuracy for each graph: [0.99, 0.99, 1.0, 0.99, 0.99, 0.97, 1.0, 0.98, 0.97, 1.0, 0.96, 1.0, 0.98, 0.98, 0.99, 0.99, 0.98, 1.0, 0.99, 0.99]

Random forest balanced accuracy for each graph: [0.99, 0.99, 1.0, 0.99, 0.99, 0.97, 1.0, 0.98, 0.97, 1.0, 0.96, 1.0, 0.98, 0.98, 0.99, 0.99, 0.98, 1.0, 0.99, 0.99]


Random forest accuracy for each graph: [0.91, 0.95, 0.96, 0.94, 0.94, 0.97, 0.98, 0.98, 0.98, 0.97, 0.96, 0.96, 0.95, 0.96, 0.95, 0.95, 0.91, 0.93, 0.93, 0.96]

Random forest balanced accuracy for each graph: [0.91, 0.95, 0.96, 0.94, 0.94, 0.97, 0.98, 0.98, 0.98, 0.97, 0.96, 0.96, 0.95, 0.96, 0.95, 0.95, 0.91, 0.93, 0.93, 0.96]


## LPA Final Test

In [14]:
# Number of pairs taken from each end of the sorted values to form the
# "different community" (lowest) and "same community" (highest) classes.
n_per_class = 500

# The per-graph CSVs are split across two folders: five graphs in the first,
# fifteen further graphs in the "_Additional" folder.
entropy_sources = [
    ('Community_Data/LPA/Pair_Entropies', range(1, 6)),
    ('Community_Data/LPA/Pair_Entropies_Additional', range(1, 16)),
]

for mu in [2, 3, 4]:
    print('~~~~~~~~~~~~~~~~~~~~~~')
    print('Graphs for mu 0.{0} will have entropy values:'.format(mu))
    print('~~~~~~~~~~~~~~~~~~~~~~')
    diff_vals = []
    same_vals = []
    for folder, graphs in entropy_sources:
        for graph in graphs:
            entrops = pd.read_csv('{0}/graph_0{1}_mu_0_{2}_entropies.csv'.format(folder, graph, mu),
                                  index_col=0)
            # Sort once (ascending) and read both cut-offs from the same array.
            ordered = np.sort(np.array(entrops['Entropy']).reshape(-1))
            # The n lowest values occupy indices 0..n-1, so their upper bound
            # is at index n-1 (index n would be the first value NOT selected).
            diff_vals.append(ordered[n_per_class - 1])
            # The n highest values occupy indices -n..-1; their lower bound is at -n.
            same_vals.append(ordered[-n_per_class])
    print('Less than (or = to) the following for data points in the "different community" class, by graph:')
    print(diff_vals, '\n')
    print('Greater than (or = to) the following for data points in the "same community" class, by graph:')
    print(same_vals)

~~~~~~~~~~~~~~~~~~~~~~
Graphs for mu 0.2 will have entropy values:
~~~~~~~~~~~~~~~~~~~~~~
Less than (or = to) the following for data points in the "different community" class, by graph:
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] 

Greater than (or = to) the following for data points in the "same community" class, by graph:
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
~~~~~~~~~~~~~~~~~~~~~~
Graphs for mu 0.3 will have entropy values:
~~~~~~~~~~~~~~~~~~~~~~
Less than (or = to) the following for data points in the "different community" class, by graph:
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] 

Greater than (or = to) the following for data points in the "same community" class, by graph:
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
~~~~~~~~~~~~~~~~~~~~~~
Graphs fo

For LPA at mu 0.4, all but one of the graphs have NO pairs of nodes with a value in the coassociation matrix below 0.5. It doesn't make much sense to classify pairs with a value higher than 0.5 as "different community", because they have been clustered into the same community over half of the time. Therefore I left mu 0.4 out of the LPA experiments.

From further investigation, the reason for this is that LPA sorts the majority of nodes into one giant community at mu 0.4, only putting a small proportion of the nodes into other communities with each run. Thus every pair of nodes ends up being in the same community a majority of the time.

In [18]:
# The pickled results are split across two folders: five graphs in the first,
# fifteen further graphs in the "_Additional" folder.
result_sources = [
    ('Community_Data/LPA/Results', range(1, 6)),
    ('Community_Data/LPA/Results_Additional', range(1, 16)),
]

# Only mu 0.2 and 0.3 are iterated for LPA; this loop does not cover mu 0.4.
for mu in [2, 3]:
    accuracy = []
    balanced_accuracy = []
    # Long-format records: one (feature name, importance value) row per feature per graph.
    importance_records = {'Metric': [], 'Value': []}
    for folder, graphs in result_sources:
        for graph in graphs:
            results_path = '{0}/mu_0_{1}/graph_0{2}_pair/results'.format(folder, mu, graph)
            # NOTE: pickle.load can execute arbitrary code; only load results
            # files that were produced by this project.
            with open(results_path, 'rb') as f:
                results = pickle.load(f)
            # Average the stored scores to get one number per graph.
            accuracy.append(np.mean(results['Accuracy Scores']))
            balanced_accuracy.append(np.mean(results['Balanced Accuracy Scores']))
            for feat, val in results['Feature Importances'].items():
                importance_records['Metric'].append(feat)
                importance_records['Value'].append(val)

    mu_feature_importances = pd.DataFrame(data=importance_records)

    # Confidence-interval bar per feature across all graphs.
    error_bars = alt.Chart(mu_feature_importances).mark_errorbar(extent='ci').encode(
        x=alt.X('Value:Q', scale=alt.Scale(zero=False)),
        y=alt.Y('Metric:N')
    )

    # Black point: mean importance per feature.
    mean_points = alt.Chart(mu_feature_importances).mark_point(filled=True, color='black').encode(
        x=alt.X('Value:Q', aggregate='mean'),
        y=alt.Y('Metric:N'),
    )

    # Red point: median importance per feature.
    median_points = alt.Chart(mu_feature_importances).mark_point(filled=True, color='red').encode(
        x=alt.X('Value:Q', aggregate='median'),
        y=alt.Y('Metric:N'),
    )

    final_plot = (error_bars + mean_points + median_points).properties(
        title='Mu 0.{0}'.format(mu)
    )

    final_plot.display()

    print('Random forest accuracy for each graph:', list(np.around(accuracy, decimals=2)))
    print('')
    # Report the balanced scores here (previously this line re-printed `accuracy`).
    print('Random forest balanced accuracy for each graph:', list(np.around(balanced_accuracy, decimals=2)))

Random forest accuracy for each graph: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.99, 1.0, 1.0, 0.99, 1.0, 1.0, 0.99, 1.0, 1.0, 1.0, 0.99, 1.0, 1.0]

Random forest balanced accuracy for each graph: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.99, 1.0, 1.0, 0.99, 1.0, 1.0, 0.99, 1.0, 1.0, 1.0, 0.99, 1.0, 1.0]


Random forest accuracy for each graph: [1.0, 0.99, 1.0, 0.98, 0.99, 0.99, 1.0, 0.99, 0.99, 0.99, 0.99, 0.97, 1.0, 0.98, 1.0, 0.99, 1.0, 1.0, 1.0, 1.0]

Random forest balanced accuracy for each graph: [1.0, 0.99, 1.0, 0.98, 0.99, 0.99, 1.0, 0.99, 0.99, 0.99, 0.99, 0.97, 1.0, 0.98, 1.0, 0.99, 1.0, 1.0, 1.0, 1.0]
