In [1]:
import os
import csv
import time
import platform
import datetime
import pandas as pd
import networkx as nx
from graph_partitioning import GraphPartitioning, utils

# Column labels for a metrics table. Not referenced by any cell visible in this notebook.
cols = ["WASTE", "CUT RATIO", "EDGES CUT", "TOTAL COMM VOLUME", "Qds", "CONDUCTANCE", "MAXPERM", "RBSE", "NMI", "FSCORE", "FSCORE RELABEL IMPROVEMENT", "LONELINESS"]

# IPython magic: capture the current working directory. Every path below is
# built relative to it, so the notebook must be started from the project root.
pwd = %pwd


# Root directory that holds one sub-directory per centrality/ordering combination.
ORDERED_ARRIVALS_DIR = os.path.join(pwd, "data", "ideal_node_ordering", "ordered_centralities")

# When True, the experiment loop further down is skipped and only previously
# saved scores are analysed.
analysisOnly = True


# [] 15 rankings - minimal binning
# [] 


# Parametrized config: values containing "#networkID#" / "#correctedness#"
# are templates whose placeholders are substituted before use.
parametrized_config = {
    "DATA_FILENAME": os.path.join(pwd, "data", "ideal_node_ordering", "edgelist", "nn#networkID#.txt"),
    "OUTPUT_DIRECTORY": os.path.join(pwd, "output", "ideal_node_ordering"),

    # Set which algorithm is run for the PREDICTION MODEL.
    # Originally documented as either 'FENNEL' or 'SCOTCH'.
    # NOTE(review): this notebook sets 'PATOH' — confirm it is accepted here.
    "PREDICTION_MODEL_ALGORITHM": "PATOH",

    # Alternatively, read input file for prediction model.
    # Set to empty to generate prediction model using algorithm value above.
    "PREDICTION_MODEL": "",

    
    # Algorithm selected by the experiment loop in this notebook:
    # 'FENNEL', 'SCOTCH' or 'PATOH'.
    "PARTITIONER_ALGORITHM": "PATOH",

    # File containing simulated arrivals. This is used in simulating nodes
    # arriving at the shelter. Nodes represented by line number; value of
    # 1 represents a node as arrived; value of 0 represents the node as not
    # arrived or needing a shelter.
    "SIMULATED_ARRIVAL_FILE": os.path.join(pwd,
                                           "data",
                                           "predition_model_tests",
                                           "dataset_1_shift_rotate",
                                           "simulated_arrival_list",
                                           "percentage_of_prediction_correct_#correctedness#",
                                           "arrival_#correctedness#_#networkID#.txt"
                                          ),
    
    # File containing the prediction of a node arriving. This is different to the
    # simulated arrivals, the values in this file are known before the disaster.
    "PREDICTION_LIST_FILE": os.path.join(pwd,
                                         "data",
                                         "predition_model_tests",
                                         "dataset_1_shift_rotate",
                                         "prediction_list",
                                         "prediction_#networkID#.txt"
                                        ),

    # File containing the geographic location of each node, in "x,y" format.
    "POPULATION_LOCATION_FILE": os.path.join(pwd,
                                             "data",
                                             "predition_model_tests",
                                             "coordinates",
                                             "coordinates_#networkID#.txt"
                                            ),

    # Number of shelters
    "num_partitions": 4,

    # The number of iterations when making prediction model
    "num_iterations": 12,

    # Percentage of prediction model to use before discarding
    # When set to 0, prediction model is discarded, useful for one-shot
    "prediction_model_cut_off": 0.0,

    # Alpha value used in one-shot (when restream_batches set to 1)
    "one_shot_alpha": 0.5,

    # NOTE(review): presumably toggles whether "one_shot_alpha" above is
    # applied — confirm against the GraphPartitioning implementation.
    "use_one_shot_alpha" : False,

    # Number of arrivals to batch before recalculating alpha and restreaming.
    # When set to 1, one-shot is used with alpha value from above
    "restream_batches": 50,

    # When the batch size is reached: if set to True, each node is assigned
    # individually as first in first out. If set to False, the entire batch
    # is processed and emptied before working on the next batch.
    "sliding_window": False,

    # Create virtual nodes based on prediction model
    "use_virtual_nodes": False,

    # Virtual nodes: edge weight
    "virtual_edge_weight": 1.0,

    # Loneliness score parameter. Used when scoring a partition by how many
    # lonely nodes exist.
    "loneliness_score_param": 1.2,
    
    
    # NOTE(review): presumably enables metric computation inside
    # GraphPartitioning — not read by any cell visible here; confirm.
    "compute_metrics_enabled": True,

    ####
    # GRAPH MODIFICATION FUNCTIONS

    # Also enables the edge calculation function.
    "graph_modification_functions": True,

    # If set, the node weight is set to 100 if the node arrives at the shelter,
    # otherwise the node is removed from the graph.
    "alter_arrived_node_weight_to_100": False,

    # Uses generalized additive models from R to generate prediction of nodes not
    # arrived. This sets the node weight on unarrived nodes to the prediction
    # given by a GAM.
    # Needs POPULATION_LOCATION_FILE to be set.
    "alter_node_weight_to_gam_prediction": False,

    # The value of 'k' used in the GAM will be the number of nodes arrived until
    # it reaches this max value.
    "gam_k_value": 100,

    # Alter the edge weight for nodes that haven't arrived. This is a way to
    # de-emphasise the prediction model for the unknown nodes.
    "prediction_model_emphasis": 1.0,
    
    # This applies the prediction_list_file node weights onto the nodes in the graph
    # when the prediction model is being computed and then removes the weights
    # for the cutoff and batch arrival modes
    "apply_prediction_model_weights": True,
    
    # Path to the scotch shared library (bundled dylib on macOS, system-wide
    # install otherwise).
    "SCOTCH_LIB_PATH": os.path.join(pwd, "libs/scotch/macOS/libscotch.dylib")
    if 'Darwin' in platform.system()
    else "/usr/local/lib/libscotch.so",
    
    # Path to the PaToH shared library (bundled for both macOS and Linux).
    "PATOH_LIB_PATH": os.path.join(pwd, "libs/patoh/lib/macOS/libpatoh.dylib")
    if 'Darwin' in platform.system()
    else os.path.join(pwd, "libs/patoh/lib/linux/libpatoh.so"),
    
    # NOTE(review): presumably the number of PaToH runs per partitioning
    # call — not read by any cell visible here; confirm.
    "PATOH_ITERATIONS": 5,
        
    # Expansion modes: 'avg_node_weight', 'total_node_weight', 'smallest_node_weight'
    # 'largest_node_weight'
    # add '_squared' or '_sqrt' at the end of any of the above for ^2 or sqrt(weight)
    # e.g. 'avg_node_weight_squared'
    "PATOH_HYPEREDGE_EXPANSION_MODE": 'no_expansion',
    
    # Edge Expansion: average, total, minimum, maximum, product, product_squared, sqrt_product
    "EDGE_EXPANSION_MODE" : 'total',
    
    # Whether nodes should be reordered using a centrality metric for optimal node assignments in batch mode
    # This is specific to FENNEL and at the moment Leverage Centrality is used to compute new node orders
    "FENNEL_NODE_REORDERING_ENABLED": False,
    
    # Whether the Friend of a Friend scoring system is active during FENNEL partitioning.
    # FOAF employs information about a node's friends to determine the best partition when
    # this node arrives at a shelter and no shelter has friends already arrived
    "FENNEL_FRIEND_OF_A_FRIEND_ENABLED": False,
    
    # Alters how much information to print. Keep it at 1 for this notebook.
    # 0 - will print nothing, useful for batch operations.
    # 1 - prints basic information on assignments and operations.
    # 2 - prints more information as it batches arrivals.
    # NOTE(review): the guidance above says 1, but the value set here is 0 —
    # confirm which one is intended.
    "verbose": 0
}

#gp = GraphPartitioning(config)

# Optional: shuffle the order of nodes arriving
# Arrival order should not be shuffled if using GAM to alter node weights
#random.shuffle(gp.arrival_order)

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# load the centralities files
import scipy.stats as sstats
centralities = {}

class CentralitiesExperiment:
    """Files and scores belonging to one centrality-ordering experiment directory.

    The directory name is split at its last underscore: the trailing token is
    the arrival ordering (e.g. ``HL``, ``LH``, ``random``) and everything before
    it, with underscores turned into spaces, is the centrality name.

    Attributes:
        dirName: name of the experiment directory.
        dataPath: directory holding one ``*_<index>.txt`` arrival file per experiment.
        outputPath: ``OUTPUT_DIRECTORY/<dirName>`` from ``parametrized_config``.
        experimentFileNames / experimentFilePaths: arrival files (without and
            with directory + ``.txt``), sorted by their numeric suffix.
        scores: one list of metric values per experiment.
        avgScores, varScores, stdScores, skewnessScores, modeScores:
            per-metric statistics filled in by ``computeStatsScore``.
    """

    # scores.txt starts with this many summary rows:
    # average, variance, std, mode, skewness (see saveScores).
    STATS_HEADER_LINES = 5

    def __init__(self, dirName, dataPath):
        self.dirName = dirName
        self.dataPath = dataPath
        self.outputPath = os.path.join(parametrized_config["OUTPUT_DIRECTORY"], dirName)

        # Strip only the trailing "_<ordering>" suffix. A blanket str.replace
        # would also remove the same token from the middle of the name.
        name, separator, orderType = dirName.rpartition("_")
        if not separator:
            # No underscore at all: the whole name doubles as the ordering.
            name = dirName
        self.orderType = orderType
        self.centralityType = name.replace("_", " ")

        self.experimentFileNames = []
        self.experimentFilePaths = []

        self.scores = []
        self.avgScores = []
        self.varScores = []
        self.stdScores = []
        self.skewnessScores = []
        self.modeScores = []

        self.totalScore = 0.0

    def computeStatsScore(self):
        """Recompute the per-metric statistics over ``self.scores``.

        Uses scipy's ``tmean``/``tvar``/``tstd`` (sample statistics), ``skew``
        and ``mode``. Each mode entry is stored as the string
        ``"<modal value>:<count>"``. Does nothing when there are no scores.
        """
        # Imported locally so the class does not rely on `%pylab` having
        # injected `np` into the notebook namespace.
        import numpy as np

        if len(self.scores) == 0:
            return

        self.avgScores = []
        self.varScores = []
        self.stdScores = []
        self.skewnessScores = []
        self.modeScores = []

        # Transpose: one column of values per metric.
        for column in zip(*self.scores):
            data = np.array([float(val) for val in column])
            self.avgScores.append(sstats.tmean(data))
            self.varScores.append(sstats.tvar(data))
            self.stdScores.append(sstats.tstd(data))
            self.skewnessScores.append(sstats.skew(data))

            # Older SciPy returns length-1 arrays from mode(), newer versions
            # return scalars; atleast_1d accepts both.
            mode = sstats.mode(data)
            modalValue = np.atleast_1d(mode[0])[0]
            modalCount = np.atleast_1d(mode[1])[0]
            self.modeScores.append(str(modalValue) + ":" + str(modalCount))

    def _computeStatsScore(self):
        """Legacy pure-Python statistics: mean, population variance and std.

        Unlike ``computeStatsScore`` the variance is divided by ``n`` rather
        than ``n - 1`` and neither mode nor skewness is computed.
        """
        if len(self.scores) == 0:
            return

        numScores = len(self.scores)
        numMetrics = len(self.scores[0])

        self.avgScores = [0.0] * numMetrics
        self.varScores = [0.0] * numMetrics
        self.stdScores = []

        for score in self.scores:
            for i, val in enumerate(score):
                self.avgScores[i] = self.avgScores[i] + float(val)

        for i, total in enumerate(self.avgScores):
            self.avgScores[i] = total / numScores

        # Accumulate squared deviations from the mean, then normalise.
        for score in self.scores:
            for i, val in enumerate(score):
                self.varScores[i] = self.varScores[i] + (float(val) - self.avgScores[i]) ** 2

        for i, total in enumerate(self.varScores):
            self.varScores[i] = total / numScores
            self.stdScores.append(self.varScores[i] ** 0.5)

    def printScoreline(self, scoreline):
        """Print the first seven values of ``scoreline`` as one tab-separated row."""
        print("{0:.5f}\t{1:.10f}\t{2}\t{3}\t{4}\t{5}\t{6}".format(*scoreline[:7]))

    def saveScores(self):
        """Write ``scores.txt`` into ``outputPath``.

        Layout: five summary rows (average, variance, std, mode, skewness)
        followed by one comma-separated row per experiment.
        """
        # exist_ok tolerates a pre-existing directory but still surfaces real
        # failures (e.g. permissions) instead of silently swallowing them.
        os.makedirs(self.outputPath, exist_ok=True)
        fName = os.path.join(self.outputPath, "scores.txt")
        with open(fName, 'w') as f:
            f.write(self.scoreStr(self.avgScores) + "\n")
            f.write(self.scoreStr(self.varScores) + "\n")
            f.write(self.scoreStr(self.stdScores) + "\n")
            f.write(self.scoreStr(self.modeScores) + "\n")
            f.write(self.scoreStr(self.skewnessScores) + "\n")

            for score in self.scores:
                f.write(self.scoreStr(score) + "\n")

    def loadScores(self):
        """Reload ``self.scores`` from ``scores.txt``, skipping the summary rows.

        Blank lines after the summary rows (e.g. a trailing newline added by an
        editor) are ignored rather than failing the float conversion.
        """
        self.scores = []
        fName = os.path.join(self.outputPath, "scores.txt")
        with open(fName, 'r') as f:
            for lineNumber, line in enumerate(f):
                if lineNumber < self.STATS_HEADER_LINES:
                    continue
                line = line.strip()
                if not line:
                    continue
                self.scores.append([float(part) for part in line.split(',')])

    def scoreStr(self, score):
        """Return the values of ``score`` joined by commas."""
        return ",".join(str(val) for val in score)

    def centrality(self):
        """Return the human-readable centrality name."""
        return self.centralityType

    def ordering(self):
        """Return the ordering token taken from the directory name."""
        return self.orderType

    def metadata(self):
        """Return a one-line description of this experiment set."""
        return self.centralityType + " centrality, " + self.orderType + " ordering n." + str(self.numExperiments()) + " experiments"

    def loadExperimentFiles(self):
        """Discover the ``.txt`` arrival files under ``dataPath`` and sort them.

        Safe to call repeatedly: previously discovered files are discarded first.
        """
        self.experimentFileNames = []
        self.experimentFilePaths = []
        for root, dirs, files in os.walk(self.dataPath):
            for file in files:
                if file.endswith(".txt"):
                    self.experimentFileNames.append(file.split(".txt")[0])
        # Sort this instance, not whatever a global named `centrality` points at.
        self.sortExperiments()

    def sortExperiments(self):
        """Order the experiment files by the integer after their last underscore.

        Rebuilds both ``experimentFileNames`` and ``experimentFilePaths``. Files
        sharing the same index collapse to the last one seen. Raises
        ``ValueError`` when a file name does not end in ``_<integer>``.
        """
        indeces = {}
        for f in self.experimentFileNames:
            indeces[int(f.split("_")[-1])] = f

        self.experimentFileNames = []
        self.experimentFilePaths = []

        for idx in sorted(indeces.keys()):
            fn = indeces[idx]
            self.experimentFileNames.append(fn)
            self.experimentFilePaths.append(os.path.join(self.dataPath, fn) + ".txt")

    def numExperiments(self):
        """Return the number of discovered experiment files."""
        return len(self.experimentFilePaths)

    def getDataExperimentPath(self, experimentNumber):
        '''Return the arrival-file path for experiment 0..n-1, or "" if out of range.'''
        if 0 <= experimentNumber < len(self.experimentFilePaths):
            return self.experimentFilePaths[experimentNumber]
        return ""

    def getOutputExperimentPath(self, experimentNumber):
        '''Return the ``<name>_out.txt`` output path for experiment 0..n-1, or "" if out of range.'''
        if 0 <= experimentNumber < len(self.experimentFileNames):
            experiment = self.experimentFileNames[experimentNumber].split(".txt")
            outFile = experiment[0] + "_out.txt"
            return os.path.join(self.outputPath, outFile)
        return ""

    def print(self):
        """Print the data directory followed by every experiment name and path."""
        print(self.dataPath)
        for i, f in enumerate(self.experimentFileNames):
            print(i, " ", f)
            print(self.experimentFilePaths[i])

# One edge-list file per network; network IDs run from 1 to 40.
dataFiles = [
    parametrized_config['DATA_FILENAME'].replace("#networkID#", str(networkID))
    for networkID in range(1, 41)
]

# Every immediate sub-directory of ORDERED_ARRIVALS_DIR is one experiment set.
# Only the top level is scanned: deeper directories would otherwise be joined
# onto the root path and point at locations that do not exist.
# The global name `centrality` is kept on purpose because other code in this
# notebook reads and rebinds it.
for root, dirs, files in os.walk(ORDERED_ARRIVALS_DIR):
    for directory in dirs:
        centrality = CentralitiesExperiment(directory, os.path.join(root, directory))
        centrality.loadExperimentFiles()
        centralities[directory] = centrality
        print(centrality.metadata())
    break


AA centrality, random ordering n.40 experiments
Alpha centrality, HL ordering n.40 experiments
Alpha centrality, LH ordering n.40 experiments
Average distance centrality, HL ordering n.40 experiments
Average distance centrality, LH ordering n.40 experiments
Barycenter centrality centrality, HL ordering n.40 experiments
Barycenter centrality centrality, LH ordering n.40 experiments
Betweenness centrality, HL ordering n.40 experiments
Betweenness centrality, LH ordering n.40 experiments
BottleNeck centrality centrality, HL ordering n.40 experiments
BottleNeck centrality centrality, LH ordering n.40 experiments
Bridging centrality centrality, HL ordering n.40 experiments
Bridging centrality centrality, LH ordering n.40 experiments
Centroid centrality centrality, HL ordering n.40 experiments
Centroid centrality centrality, LH ordering n.40 experiments
Closeness Freeman centrality, HL ordering n.40 experiments
Closeness Freeman centrality, LH ordering n.40 experiments
Closeness VariantLator

In [3]:
# run the experiments here
import pyximport
pyximport.install()
from graph_partitioning import fennel as fnl
from graph_partitioning import scotch_partitioner as sctch
from graph_partitioning import patoh_partitioner as ptoh
from graph_partitioning import GraphPartitioning, utils

# Partitioner instances shared by the experiment loop further down; which one
# is used is selected through parametrized_config['PARTITIONER_ALGORITHM'].
# NOTE(review): 0.5 is presumably the FENNEL alpha (it matches "one_shot_alpha"
# in the config) — confirm against FennelPartitioner's signature.
fennel = fnl.FennelPartitioner(0.5)
# SCOTCH and PaToH are loaded from the shared libraries configured above.
scotch = sctch.ScotchPartitioner(parametrized_config['SCOTCH_LIB_PATH'])
patoh = ptoh.PatohPartitioner(parametrized_config['PATOH_LIB_PATH'], hyperedgeExpansionMode=parametrized_config['PATOH_HYPEREDGE_EXPANSION_MODE'])

def loadGraph(edgeFile):
    """Build an undirected graph from a whitespace-separated edge list.

    Each non-blank line must start with two integer node IDs; any further
    columns are ignored. Every node and every edge gets ``weight = 1.0``.

    Args:
        edgeFile: path to the edge-list text file.

    Returns:
        networkx.Graph with nodes and edges inserted in file order.

    Raises:
        ValueError / IndexError: when a non-blank line does not start with
        two integers.
    """
    G = nx.Graph()
    with open(edgeFile, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # tolerate blank lines (e.g. a trailing newline)
                continue
            # add_edge creates missing endpoints, first n1 then n2
            G.add_edge(int(fields[0]), int(fields[1]))

    # Keyword arguments are required here: networkx 1.x takes (G, name, values)
    # while 2.x+ takes (G, values, name); both accept these keyword names.
    nx.set_node_attributes(G, name='weight', values=1.0)
    nx.set_edge_attributes(G, name='weight', values=1.0)
    return G

def loadArrivals(arrivalFile):
    """Read an arrival-order file into an int32 array.

    The file holds one integer node ID per line; blank lines are skipped.

    Args:
        arrivalFile: path to the arrival-order text file.

    Returns:
        numpy.ndarray of dtype int32, in file order.

    Raises:
        ValueError: when a non-blank line is not an integer.
    """
    with open(arrivalFile, 'r') as f:
        stripped = (line.strip() for line in f)
        arrivals = [int(entry) for entry in stripped if entry]
    return np.array(arrivals, dtype=np.int32)

def generateArray(num, value):
    """Return an int32 array of length ``num`` with every entry set to ``value``."""
    return np.full(num, value, dtype=np.int32)

def checkInputData(graph, arrivals):
    """Sanity-check a graph and its arrival list before partitioning.

    Prints a diagnostic and returns False when the graph has no nodes or no
    edges, when the arrival list is empty, when its smallest node ID is
    greater than 0, or when its largest node ID is not below the node count.

    Args:
        graph: object exposing ``number_of_nodes()`` and ``number_of_edges()``.
        arrivals: sequence of integer node IDs.

    Returns:
        True when all checks pass, False otherwise.
    """
    if graph.number_of_nodes() == 0:
        print("Error, no nodes")
        return False
    if graph.number_of_edges() == 0:
        print("Error, no edges")
        return False

    arr = np.array(arrivals)
    if arr.size == 0:
        # np.min / np.max raise ValueError on empty input; report it as a
        # validation failure like every other bad input.
        print("Error, no arrivals")
        return False

    lowest = np.min(arr)
    if lowest > 0:
        # NOTE(review): this requires node 0 to be present in the arrivals;
        # negative IDs are not rejected — confirm that is intended.
        print("Error arrival file has minimum node ID > 0:", lowest)
        return False

    highest = np.max(arr)
    if highest >= graph.number_of_nodes():
        print("Error arrival file has maximum node ID >= number_of_nodes():", highest)
        return False

    return True

def computeAlpha(graph, num_partitions):
    """Return ``edges * (num_partitions / nodes**2)`` for ``graph``.

    For directed graphs the edge count is halved first. Raises
    ``ZeroDivisionError`` when the graph has no nodes.
    """
    edge_count = graph.number_of_edges()
    if graph.is_directed():
        edge_count = edge_count * 0.5
    node_count = graph.number_of_nodes()
    partitions_per_node_pair = num_partitions / (node_count ** 2)
    return edge_count * partitions_per_node_pair
    
def printScore(graph, assignments, num_partitions, loneliness_score_param, verbose = 1):
    """Score a partitioning and return its metrics as a seven-element list.

    The list is ``[x[0], x[1], edges_cut, steps, q[0], loneliness, max_perm]``
    where ``x`` comes from ``utils.score`` and ``q`` from
    ``utils.infomapModularityComQuality``. The notebook labels these values
    waste, cut ratio, edges cut, TCV (steps), Qds, loneliness and max perm.
    NOTE(review): whether ``q[0]`` is really Qds depends on the return order
    of ``infomapModularityComQuality`` — confirm.

    When ``verbose`` is greater than 1 the same values are also printed as one
    tab-separated row.
    """
    waste_and_cut_ratio = utils.score(graph, assignments, num_partitions)
    # cut_edges is returned by base_metrics but not needed for the scoreline
    edges_cut, steps, cut_edges = utils.base_metrics(graph, assignments)
    q_qds_conductance = utils.infomapModularityComQuality(graph, assignments, num_partitions)
    loneliness = utils.loneliness_score_wavg(graph, loneliness_score_param, assignments, num_partitions)
    max_perm = utils.wavg_max_perm(graph, assignments, num_partitions)

    scoreline = [
        waste_and_cut_ratio[0],
        waste_and_cut_ratio[1],
        edges_cut,
        steps,
        q_qds_conductance[0],
        loneliness,
        max_perm,
    ]

    if verbose > 1:
        print("{0:.5f}\t\t{1:.10f}\t{2}\t\t{3}\t\t\t{4}\t{5}\t{6}".format(*scoreline))

    return scoreline


    
# Run the centralities experiment for each data point (disabled: the code is kept below as a string literal)
'''for key in list(centralities.keys()):
    if analysisOnly == True:
        break


    centrality = centralities[key]

    print("Running experiment:", centrality.metadata())

    for i in range(0, 40):
        with GraphPartitioning(parametrized_config) as gp:
            gp.verbose = 0
            gp.DATA_FILENAME = dataFiles[i]
            print(gp.DATA_FILENAME)
            
            gp.load_network()
            gp.init_partitioner()
    
            gp.arrival_order = loadArrivals(centrality.getDataExperimentPath(i))
    
            m = gp.prediction_model()
            m = gp.assign_cut_off()
            m = gp.batch_arrival()
            
            print(m)
        break

analysisOnly = True
'''

# FORMAT OF SAVED scores.txt files

# average (scores)
# variance (scores)
# std (scores)
# mode(scores)
# skewness (scores)
# this is then followed by each experiment's scores over which the stats above are computed
# waste, cut_ratio, edges_cut, TCV (steps), Qds, loneliness, max_perm

# Run every centrality experiment, unless only the analysis of previously
# saved scores was requested.
experiment_keys = [] if analysisOnly else list(centralities.keys())

# PARTITIONER_ALGORITHM value -> partitioner instance
partitioners = {'FENNEL': fennel, 'SCOTCH': scotch, 'PATOH': patoh}

for key in experiment_keys:
    centrality = centralities[key]

    print("Running experiment:", centrality.metadata())

    algorithm = parametrized_config['PARTITIONER_ALGORITHM']
    if algorithm not in partitioners:
        # Without this check every node would keep assignment -1 and the
        # resulting scores would be silently meaningless.
        raise ValueError("Unknown PARTITIONER_ALGORITHM: " + str(algorithm))
    partitioner = partitioners[algorithm]

    # Start from a clean slate so that re-running this cell does not append a
    # second copy of every score.
    centrality.scores = []

    # One experiment per network edge-list file.
    for i in range(len(dataFiles)):
        edgeFile = dataFiles[i]

        G = loadGraph(edgeFile)
        arrival_list = loadArrivals(centrality.getDataExperimentPath(i))

        # Invalid inputs are reported by checkInputData and skipped.
        if not checkInputData(G, arrival_list):
            continue

        GSub = G.subgraph(arrival_list)

        # -1 marks "not yet assigned" / "not fixed" for every node.
        assignments = generateArray(G.number_of_nodes(), -1)
        fixed = generateArray(G.number_of_nodes(), -1)

        assignments = partitioner.generate_prediction_model(GSub, parametrized_config['num_iterations'], parametrized_config['num_partitions'], assignments, fixed)

        # score contains: waste, cut_ratio, edges_cut, TCV (steps), Qds, loneliness, max_perm
        score = printScore(GSub, assignments, parametrized_config['num_partitions'], parametrized_config['loneliness_score_param'])
        centrality.scores.append(score)

    centrality.computeStatsScore()
    centrality.saveScores()
print("Finished experiments.")

Finished experiments.


In [35]:
# Analyse the results: load every centrality's saved scores, find the
# centralities with the smallest / largest average value of each metric, and
# export all averages to centrality_scores.csv.

# Order matches the columns of each saved score row.
metrics = ["WASTE", "CUT RATIO", "EDGES CUT", "TOTAL COMM VOLUME", "Qds", "LONELINESS", "MAXPERM"]

# Infinite sentinels: the first real value always replaces them, so metrics
# that are all zero, negative, or larger than 10000000 are reported correctly.
max_metric_centrality = [""] * len(metrics)
min_metric_centrality = [""] * len(metrics)
max_metric = [float("-inf")] * len(metrics)
min_metric = [float("inf")] * len(metrics)
avg_metric = [0.0] * len(metrics)
metric_sort_dataset = {}

avg_results = {}
for key in list(centralities.keys()):
    centrality = centralities[key]
    centralityCode = centrality.centralityType + ":" + centrality.orderType
    print("Experiment:", centralityCode)

    centrality.loadScores()
    centrality.computeStatsScore()
    avg_results[centralityCode] = centrality.avgScores

    centrality.printScoreline(centrality.avgScores)

    for i, metric in enumerate(centrality.avgScores):
        if max_metric[i] < metric:
            max_metric[i] = metric
            max_metric_centrality[i] = centralityCode
        if min_metric[i] > metric:
            min_metric[i] = metric
            min_metric_centrality[i] = centralityCode
        # running total; turned into the mean after the loop
        avg_metric[i] = avg_metric[i] + metric

# One header row, then one row of averages per "<centrality>:<ordering>".
with open(os.path.join(parametrized_config['OUTPUT_DIRECTORY'], "centrality_scores.csv"), 'w') as f:
    f.write(",".join(["centrality"] + metrics) + "\n")

    for key in list(avg_results.keys()):
        f.write(",".join([key] + [str(score) for score in avg_results[key]]) + "\n")

for i, avg in enumerate(avg_metric):
    avg_metric[i] = avg / len(centralities)

for i, metric in enumerate(metrics):
    print(metric, "metric")
    print("   average =", avg_metric[i])
    print("   min, max =", min_metric[i], max_metric[i])
    print("  ", min_metric_centrality[i], "||", max_metric_centrality[i])


Experiment: AA:random
0.00000	0.3642223887	90.15	95.075	0.40797893778021416	0.7452021271808	0.015190637500000001
Experiment: Alpha:HL
0.00000	0.3647740181	90.275	95.075	0.41115159179477806	0.744545604013975	0.0088029375
Experiment: Alpha:LH
0.00000	0.3658334816	90.525	95.375	0.4053231887936006	0.74230364451105	0.0037204749999999983
Experiment: Average distance:HL
0.00000	0.3645310507	90.2	95.05	0.4075215286242746	0.744011746651	0.0018389312500000012
Experiment: Average distance:LH
0.00000	0.3636613102	89.975	94.7	0.40908385545430875	0.74508042203465	0.017410981249999995
Experiment: Barycenter centrality:HL
0.00000	0.3640772634	90.125	94.9	0.4069757202140485	0.7438894779868999	0.013572856249999996
Experiment: Barycenter centrality:LH
0.00000	0.3654111274	90.425	94.625	0.406124519912213	0.7452016174462999	0.00829180625
Experiment: Betweenness:HL
0.00000	0.3631679285	89.875	94.7	0.40810236534105326	0.7457118069159749	0.006608412499999999
Experiment: Betweenness:LH
0.00000	0.3644296037	90.

In [21]:
# Extract variables for R analysis.
#
# Produces two R statements:
#   Y <- c(...)             edges-cut value of every experiment, control group first
#   g <- as.factor(c(...))  factor level of each value in Y
# Level 0 is the control group; `gkey` maps every level to its centrality so
# the labels can be decoded afterwards.

# Index (in centralities order) of the centrality used as the CONTROL for the
# R statistical tests.
# Not sure - this would probably be the Leverage_Centrality_HL which has
# technically the best modal score.
whichIDToKeepAsZero = 0 #leverage # 90 for Political_independence_index_LH # 0 for random

gkey = {}       # factor level -> centrality directory name
modeScore = {}  # ranking score -> list of centralities sharing that score

controlValues = []
controlLevels = []
otherValues = []
otherLevels = []

for i_key, key in enumerate(list(centralities.keys())):
    # Centralities ahead of the control shift up by one so the control can
    # take level 0; those after it keep their index.
    if i_key == whichIDToKeepAsZero:
        level = 0
    elif i_key < whichIDToKeepAsZero:
        level = i_key + 1
    else:
        level = i_key
    gkey[level] = key

    centrality = centralities[key]

    # Ranking score: mean edges cut. To rank by the modal edges cut instead,
    # use float(centrality.modeScores[2].split(':')[0]).
    centrality.loadScores()
    centrality.computeStatsScore()
    overallmodescore = centrality.avgScores[2]
    modeScore.setdefault(overallmodescore, []).append(key)

    for i, score in enumerate(centrality.scores):
        edges_cut = score[2]

        if level == 0:
            controlValues.append(str(edges_cut))
            controlLevels.append(str(0))
        else:
            otherValues.append(str(edges_cut))
            # Emit the same level that gkey records; the raw index would
            # disagree with gkey for every centrality ahead of the control.
            otherLevels.append(str(level))
            # NOTE(review): never fires while a centrality has at most 40
            # scores; otherwise it caps non-control groups at 41 values —
            # confirm the intended limit.
            if i == 40:
                break

y = "Y <- c(" + ",".join(controlValues + otherValues) + ")"
g = "g <- as.factor(c(" + ",".join(controlLevels + otherLevels) + "))"
print(y)
print("")
print(g)

Y <- c(87.0,89.0,80.0,77.0,94.0,92.0,78.0,90.0,82.0,66.0,86.0,82.0,85.0,96.0,87.0,80.0,86.0,75.0,83.0,88.0,94.0,86.0,75.0,94.0,92.0,89.0,84.0,68.0,102.0,80.0,77.0,99.0,86.0,79.0,95.0,77.0,98.0,77.0,77.0,79.0,87.0,89.0,80.0,77.0,94.0,92.0,78.0,90.0,82.0,66.0,86.0,82.0,85.0,96.0,87.0,80.0,86.0,75.0,83.0,88.0,94.0,86.0,75.0,94.0,92.0,89.0,84.0,68.0,102.0,80.0,77.0,99.0,86.0,79.0,95.0,77.0,98.0,77.0,77.0,79.0,87.0,89.0,80.0,77.0,94.0,92.0,78.0,90.0,82.0,66.0,86.0,87.0,89.0,80.0,77.0,94.0,92.0,78.0,90.0,82.0,66.0,86.0,82.0,85.0,96.0,87.0,89.0,87.0,89.0,80.0,77.0,94.0,92.0,78.0,90.0,82.0,66.0,86.0,82.0,85.0,96.0,87.0,80.0,86.0,75.0,83.0,88.0,94.0,86.0,75.0,94.0,92.0,89.0,84.0,68.0,102.0,80.0,77.0,99.0,86.0,79.0,95.0,77.0,98.0,77.0,77.0,79.0,74.0,83.0,79.0,82.0,103.0,107.0,80.0,91.0,85.0,74.0,80.0,85.0,83.0,90.0,68.0,92.0,88.0,85.0,91.0,79.0,87.0,98.0,86.0,86.0,89.0,99.0,83.0,81.0,96.0,81.0,73.0,88.0,87.0,82.0,83.0,88.0,94.0,76.0,86.0,100.0,74.0,86.0,71.0,86.0,75.0,91.0,91.0,76.0,87.0,90.0,73

In [22]:
# List the centralities from best (lowest) to worst ranking score, one score
# per line followed by every centrality that achieved it.
for key in sorted(modeScore.keys()):
    print(key, ', '.join(modeScore[key]))

80.9 BottleNeck_centrality_HL
82.125 Alpha_LH
82.55 MNC_centrality_HL
82.575 Eigenvector_HL, Kleinbergs_centrality_HITS_HL
82.625 Betweenness_HL, Shortest_path_betweenness_HL
82.825 Stress_centrality_HL
82.85 Communicability_betweenness_centrality_HL, Path_centrality_HL, Political_independence_index_LH
83.05 Effectiveness_centrality_HL
83.075 Network_centrality_HL
83.25 Leverage_centrality_HL
83.275 Closeness_VariantLatora_HL
83.3 MCC_centrality_HL
83.625 Lapacian_centrality_HL
83.725 Average_distance_LH
83.775 Load_centrality_HL
83.85 Flow_betweenness_centrality_HL
83.975 Dangalchev_closeness_centrality_HL, Decay_centrality_HL
84.025 Barycenter_centrality_HL, Closeness_Freeman_HL, Lin_centrality_HL, Radiality_centrality_HL, Shortest_path_closeness_HL
84.1 Centroid_centrality_HL
84.175 Community_centrality_HL
84.2 Random_walk_betweenness_HL
84.25 Cross_clique_connectivity_HL, Semi_local_centrality_HL
84.325 Network_fragmentation_GeodesicDistanceWeighted_HL
84.45 Bridging_centrality_LH


In [25]:
# Emit the R vector `k` naming the centrality behind every factor level, and
# report which level the best-scoring centrality ended up with.
quotedNames = []
for key in sorted(gkey.keys()):
    quotedNames.append('"' + gkey[key] + '"')
    if gkey[key] == 'BottleNeck_centrality_HL':
        print('target best mode centrality:', key)

k = 'k <- c(' + ','.join(quotedNames) + ')'
print(k)

target best mode centrality: 10
k <- c("Leverage_centrality_HL","AA_random","Alpha_HL","Alpha_LH","Average_distance_HL","Average_distance_LH","Barycenter_centrality_HL","Barycenter_centrality_LH","Betweenness_HL","Betweenness_LH","BottleNeck_centrality_HL","BottleNeck_centrality_LH","Bridging_centrality_HL","Bridging_centrality_LH","Centroid_centrality_HL","Centroid_centrality_LH","Closeness_Freeman_HL","Closeness_Freeman_LH","Closeness_VariantLatora_HL","Closeness_VariantLatora_LH","ClusterRank_HL","ClusterRank_LH","Communicability_betweenness_centrality_HL","Communicability_betweenness_centrality_LH","Community_centrality_HL","Community_centrality_LH","Core_decomposition_HL","Core_decomposition_LH","Cross_clique_centrality_LH","Cross_clique_connectivity_HL","Current_flow_closeness_centrality_HL","Current_flow_closeness_centrality_LH","Dangalchev_closeness_centrality_HL","Dangalchev_closeness_centrality_LH","Decay_centrality_HL","Decay_centrality_LH","Degree_centrality_HL","Degree_cen

In [26]:
# Extract variables for R analysis, using the best-scoring centrality as control.
#
# Produces two R statements:
#   Y_best <- c(...)             edges-cut value of every experiment, control group first
#   g_best <- as.factor(c(...))  factor level of each value in Y_best
# Level 0 is the control group; `gkey` maps every level to its centrality so
# the labels can be decoded afterwards.

# Index (in centralities order) of the centrality used as the CONTROL for the
# R statistical tests.
whichIDToKeepAsZero = 10

gkey = {}       # factor level -> centrality directory name
modeScore = {}  # ranking score -> list of centralities sharing that score

controlValues = []
controlLevels = []
otherValues = []
otherLevels = []

for i_key, key in enumerate(list(centralities.keys())):
    # Centralities ahead of the control shift up by one so the control can
    # take level 0; those after it keep their index.
    if i_key == whichIDToKeepAsZero:
        level = 0
    elif i_key < whichIDToKeepAsZero:
        level = i_key + 1
    else:
        level = i_key
    gkey[level] = key

    centrality = centralities[key]

    # Ranking score: mean edges cut. To rank by the modal edges cut instead,
    # use float(centrality.modeScores[2].split(':')[0]).
    # Scores and statistics are NOT reloaded here: this relies on
    # loadScores() / computeStatsScore() having been run on every centrality
    # earlier in the session.
    overallmodescore = centrality.avgScores[2]
    modeScore.setdefault(overallmodescore, []).append(key)

    for i, score in enumerate(centrality.scores):
        edges_cut = score[2]

        if level == 0:
            controlValues.append(str(edges_cut))
            controlLevels.append(str(0))
        else:
            otherValues.append(str(edges_cut))
            # Emit the same level that gkey records. Using the raw index here
            # would label the first centrality "0" — the control's level —
            # and disagree with gkey for every centrality ahead of the control.
            otherLevels.append(str(level))
            # NOTE(review): never fires while a centrality has at most 40
            # scores; otherwise it caps non-control groups at 41 values —
            # confirm the intended limit.
            if i == 40:
                break

y = "Y_best <- c(" + ",".join(controlValues + otherValues) + ")"
g = "g_best <- as.factor(c(" + ",".join(controlLevels + otherLevels) + "))"
print(y)
print("")
print(g)

Y_best <- c(82.0,72.0,83.0,79.0,89.0,94.0,91.0,85.0,96.0,73.0,86.0,96.0,82.0,94.0,72.0,76.0,91.0,87.0,93.0,97.0,87.0,90.0,88.0,93.0,81.0,91.0,84.0,79.0,96.0,87.0,86.0,90.0,75.0,82.0,98.0,85.0,92.0,91.0,75.0,79.0,82.0,72.0,83.0,79.0,89.0,94.0,91.0,85.0,96.0,73.0,86.0,96.0,82.0,94.0,72.0,76.0,91.0,87.0,93.0,97.0,87.0,90.0,88.0,93.0,81.0,91.0,84.0,79.0,96.0,87.0,86.0,90.0,75.0,82.0,98.0,85.0,92.0,91.0,75.0,79.0,87.0,89.0,80.0,77.0,94.0,92.0,78.0,90.0,82.0,66.0,86.0,82.0,85.0,96.0,87.0,80.0,86.0,75.0,83.0,88.0,94.0,86.0,75.0,94.0,92.0,89.0,84.0,68.0,102.0,80.0,77.0,99.0,86.0,79.0,95.0,77.0,98.0,77.0,77.0,79.0,87.0,74.0,83.0,79.0,82.0,103.0,107.0,80.0,91.0,85.0,74.0,80.0,85.0,83.0,90.0,68.0,92.0,88.0,85.0,91.0,79.0,87.0,98.0,86.0,86.0,89.0,99.0,83.0,81.0,96.0,81.0,73.0,88.0,87.0,82.0,83.0,88.0,94.0,76.0,86.0,100.0,74.0,86.0,71.0,86.0,75.0,91.0,91.0,76.0,87.0,90.0,73.0,84.0,83.0,76.0,89.0,72.0,80.0,83.0,74.0,87.0,80.0,75.0,96.0,77.0,86.0,82.0,85.0,85.0,77.0,99.0,88.0,78.0,90.0,71.0,82.0,81.0

In [27]:
# Emit the R vector `k_best` naming the centrality behind every factor level,
# and report which level Leverage_centrality_HL ended up with.
quotedNames = []
for key in sorted(gkey.keys()):
    quotedNames.append('"' + gkey[key] + '"')
    if gkey[key] == 'Leverage_centrality_HL':
        print('target best mode centrality:', key)

k = 'k_best <- c(' + ','.join(quotedNames) + ')'
print(k)

target best mode centrality: 61
k_best <- c("BottleNeck_centrality_LH","AA_random","Alpha_HL","Alpha_LH","Average_distance_HL","Average_distance_LH","Barycenter_centrality_HL","Barycenter_centrality_LH","Betweenness_HL","Betweenness_LH","BottleNeck_centrality_HL","Bridging_centrality_HL","Bridging_centrality_LH","Centroid_centrality_HL","Centroid_centrality_LH","Closeness_Freeman_HL","Closeness_Freeman_LH","Closeness_VariantLatora_HL","Closeness_VariantLatora_LH","ClusterRank_HL","ClusterRank_LH","Communicability_betweenness_centrality_HL","Communicability_betweenness_centrality_LH","Community_centrality_HL","Community_centrality_LH","Core_decomposition_HL","Core_decomposition_LH","Cross_clique_centrality_LH","Cross_clique_connectivity_HL","Current_flow_closeness_centrality_HL","Current_flow_closeness_centrality_LH","Dangalchev_closeness_centrality_HL","Dangalchev_closeness_centrality_LH","Decay_centrality_HL","Decay_centrality_LH","Degree_centrality_HL","Degree_centrality_LH","Diffusi

In [28]:
# Kendall tau rank correlation between the modal values of four metrics,
# computed across all centralities.
from scipy.stats import kendalltau

# Positions of the compared metrics inside each score row:
# 2 = edges cut, 3 = TCV, 4 = Qds (modularity), 5 = loneliness
metricColumns = [2, 3, 4, 5]

# metricList[m] holds, per centrality, the modal value of metric m.
metricList = [[] for _ in metricColumns]

for i_key, key in enumerate(list(centralities.keys())):
    centrality = centralities[key]

    for slot, column in enumerate(metricColumns):
        # modeScores entries look like "<modal value>:<count>"
        modescore = centrality.modeScores[column].split(':')
        metric = float(modescore[0])
        metricList[slot].append(metric)

# taus[i][e] / pvals[i][e]: correlation and p-value between metrics i and e.
taus = []
pvals = []

for i in range(0, 4):
    tauRow = []
    pvalRow = []
    taus.append(tauRow)
    pvals.append(pvalRow)
    for e in range(0, 4):
        tau, pvalue = kendalltau(metricList[i], metricList[e])
        tauRow.append(tau)
        pvalRow.append(pvalue)

1.0	0.179643894705	-0.116625988565	0.137879900366
0.179643894705	1.0	-0.0582464256174	0.0155607029261
-0.116625988565	-0.0582464256174	1.0	-0.000613779411357
0.137879900366	0.0155607029261	-0.000613779411357	1.0


In [30]:
# Print the tau matrix followed by the p-value matrix, one row per line with
# tab-separated cells.
for table in (taus, pvals):
    for row in table:
        print("\t".join(str(item) for item in row))

1.0	0.179643894705	-0.116625988565	0.137879900366
0.179643894705	1.0	-0.0582464256174	0.0155607029261
-0.116625988565	-0.0582464256174	1.0	-0.000613779411357
0.137879900366	0.0155607029261	-0.000613779411357	1.0
1.53871593044e-56	0.00442255427753	0.0646226393117	0.0289171943463
0.00442255427753	1.53871593044e-56	0.356072286027	0.805257442494
0.0646226393117	0.356072286027	1.53871593044e-56	0.992240759871
0.0289171943463	0.805257442494	0.992240759871	1.53871593044e-56


In [None]:
# metrics important = 
import matplotlib.pyplot as plt

def findBin(value, edges):
    """Return the 1-based index of the histogram bin that contains ``value``.

    ``edges`` is a sequence of increasing bin edges (e.g. the second value
    returned by ``np.histogram``); bin ``i`` spans ``edges[i-1]`` to
    ``edges[i]``.

    Every bin is half-open ``[left, right)`` except the last one, which also
    includes its right edge -- matching ``np.histogram``. Without this, the
    largest sample (which equals ``edges[-1]``) was reported as bin
    ``len(edges)``, one past the last real bin.

    Returns:
        The bin index in ``1 .. len(edges) - 1`` when ``value`` lies inside
        the edges; ``len(edges)`` when it lies outside them (below the first
        edge, above the last edge, or NaN). For an empty ``edges`` this is 0.
    """
    num_edges = len(edges)

    for i in range(1, num_edges):
        if edges[i - 1] <= value < edges[i]:
            return i

    # The right-most bin is closed on the right.
    if num_edges > 1 and value == edges[-1]:
        return num_edges - 1

    # Out-of-range sentinel, kept from the original implementation.
    return num_edges

def binScore(bin_id, num_bins, low_is_max = True):
    """Score a bin index by how far it sits from the middle of the histogram.

    ``half`` is ``int(num_bins * 0.5)``.

    * ``low_is_max=True``: bins above ``half`` score 0; otherwise the score is
      ``(half + 1 - bin_id) * 25 / half``.
    * ``low_is_max=False``: bins at or below ``half`` score 0; otherwise the
      score is ``(bin_id - half) * 25 / half``.

    Raises ``ZeroDivisionError`` when ``half`` is 0 (``num_bins < 2``) and the
    bin is not on the zero-scoring side.
    """
    midpoint = int(num_bins * 0.5)

    # Bins on the "wrong" side of the midpoint contribute nothing.
    if low_is_max and bin_id > midpoint:
        return 0
    if not low_is_max and bin_id <= midpoint:
        return 0

    steps = (midpoint + 1 - bin_id) if low_is_max else (bin_id - midpoint)
    return steps * 25 / midpoint
            
        
# Indices into each centrality's avgScores / stdScores / skewnessScores that
# take part in the ranking.
# NOTE(review): the labels below are the original author's; the notebook's
# `cols` list names index 5 "CONDUCTANCE" -- confirm the score arrays'
# indexing.
stats_metrics = [2, 3, 4, 5] # "EDGES CUT", "TOTAL COMM VOLUME", "MODULARITY", "LONELINESS"
# Metrics scored with low_is_max=False (higher bins score better).
max_to_low_metrics = [4]

# Per-metric samples, one entry per centrality (later converted to arrays).
means = {stat: [] for stat in stats_metrics}
stds = {stat: [] for stat in stats_metrics}
skews = {stat: [] for stat in stats_metrics}

# Per-metric histogram counts and bin edges, filled in once the samples exist.
means_hist = {}
means_binedges = {}
stds_hist = {}
stds_binedges = {}
skews_hist = {}
skews_binedges = {}


# Gather mean / std / skewness of every tracked metric for every centrality.
for key in list(centralities.keys()):
    centrality = centralities[key]
    centralityCode = centrality.centralityType + ":" + centrality.orderType

    centrality.loadScores()
    centrality.computeStatsScore()

    for smetric in stats_metrics:
        means[smetric].append(centrality.avgScores[smetric])
        stds[smetric].append(centrality.stdScores[smetric])
        skews[smetric].append(centrality.skewnessScores[smetric])


# Histogram each statistic across centralities; the bin edges drive scoring.
for stat in stats_metrics:
    means[stat] = np.array(means[stat])
    stds[stat] = np.array(stds[stat])
    skews[stat] = np.array(skews[stat])

    means_hist[stat], means_binedges[stat] = np.histogram(means[stat], bins='auto')
    stds_hist[stat], stds_binedges[stat] = np.histogram(stds[stat], bins='auto')
    skews_hist[stat], skews_binedges[stat] = np.histogram(skews[stat], bins='auto')

# totalScore -> list of "centralityType:orderType" labels that achieved it.
rank = {}

for key in list(centralities.keys()):
    # compute scores for this statistic for each centrality
    centrality = centralities[key]
    centralityCode = centrality.centralityType + ":" + centrality.orderType

    centrality.totalScore = 0.0

    for smetric in stats_metrics:
        mu = centrality.avgScores[smetric]
        std = centrality.stdScores[smetric]
        skew = centrality.skewnessScores[smetric]

        # Bin counts must come from the histogram of the metric being scored.
        # They were previously indexed by `stat`, a stale variable left over
        # from the histogram loop above, so every metric used the bin counts
        # of the last metric.
        mean_bins = len(means_hist[smetric])
        std_bins = len(stds_hist[smetric])
        skew_bins = len(skews_hist[smetric])

        low_is_max = smetric not in max_to_low_metrics

        # mean should either be max or min
        mu_score = binScore(findBin(mu, means_binedges[smetric]), mean_bins, low_is_max)
        # std score should always be minimized
        std_score = binScore(findBin(std, stds_binedges[smetric]), std_bins)
        skew_score = binScore(findBin(skew, skews_binedges[smetric]), skew_bins, low_is_max)

        # override skew score: skewness is computed above but deliberately
        # excluded from the total
        skew_score = 0.0

        centrality.totalScore += mu_score + std_score + skew_score

    print(centralityCode, centrality.totalScore)
    rank.setdefault(centrality.totalScore, []).append(centralityCode)

# Print the ranking, best total score first, with a separator after the top 20.
count = 0
for key in sorted(rank, reverse=True):
    for item in rank[key]:
        count += 1
        print(count, key, item)
        if count == 20:
            print("===================")

# Diagnostic histograms of the mean / std / skewness distributions per metric.
# The leading `break` disables plotting; remove it to show the figures.
for stat in stats_metrics:
    break
    fig = plt.figure()

    ax1 = fig.add_subplot(1,3,1)
    ax1.hist(means[stat])

    ax2 = fig.add_subplot(1,3,2)
    ax2.hist(stds[stat])

    ax3 = fig.add_subplot(1,3,3)
    ax3.hist(skews[stat])


    plt.show()

In [None]:
# The random-ordering baseline; the lookup raises KeyError if it is missing.
randomCentrality = centralities["AA_random"]

# Mean and standard deviation, per metric, of the values in `means[stat]`.
mu_stats = {stat: np.mean(means[stat]) for stat in stats_metrics}
std_stats = {stat: np.std(means[stat]) for stat in stats_metrics}

# z - scores
for key in list(centralities.keys()):
    # compute scores for this statistic for each centrality
    centrality = centralities[key]
    if key == "AA_random":
        continue

    # compute z-score
    stats_metrics = [2, 3, 4, 5] # "EDGES CUT", "TOTAL COMM VOLUME", "MODULARITY", "LONELINESS"

    print("Z-SCORE: ", centrality.centralityType + ":" + centrality.orderType)

    for stat in stats_metrics:
        zscore = (centrality.avgScores[stat] - mu_stats[stat]) / std_stats[stat]
        # Index 3 gets one tab instead of two (presumably because its column
        # label is longer, to keep the printed columns aligned).
        tabs = "\t" if stat == 3 else "\t\t"
        # Standard-normal CDF of the z-score.
        cdf_value = sstats.norm.cdf(zscore)
        print("   ", cols[stat], tabs, "{0:.5f}".format(zscore), "\t", "{0:.5f}".format(cdf_value))
    