In [1]:
DATA_FILENAME = '/home/sami/py-graph/data/oneshot_fennel_weights.txt'

# Read input file for prediction model
PREDICTION_MODEL = ''

# Number of shelters
num_partitions = 4

# The number of iterations when making prediction model
num_iterations = 10

# Percentage of prediction model to use before discarding
# When set to 0, prediction model is discarded, useful for one-shot
prediction_model_cut_off = 0.0

# Alpha value used in one-shot (when restream_batches set to 1)
one_shot_alpha = 0.5

# Number of arrivals to batch before recalculating alpha and restreaming.
# When set to 1, one-shot is used with alpha value from above
restream_batches = 10

# Create virtual nodes based on prediction model
use_virtual_nodes = True

# Go to cell 3 to shuffle arrivals

In [2]:
import numpy as np
import networkit
import networkx as nx

# Reading data
# - neither networkit nor networkx handle node weights
# - networkit can read the METIS file format, networkx can't
# - networkit does not support extra attributes to nodes or
#    edges, however they can be added later when writing to
#    a GraphML file format[1]
# - networkx support node and edge attributes, so we can keep
#    the partition assignment with the node and also support
#    virtual nodes without needing to maintain a seperate
#    data structure.
# - the most sensible method for loading the graph data is to
#    read the METIS file with networkit, convert the graph to
#    a networkx graph, then read the METIS file once again
#    and load the node weights into a networkx node attribute
#
# Writing data
# - to be able to write the output data with the partition
#    each node is assigned to, a suitable file format to write
#    to is needed
# - writing to a METIS file will lose the partition assignments
# - if we use networkit to write the data, then the only function
#    available is GraphMLWriter()
# - networkx provides a richer set of output methods which
#    preserve the partition assignment
# - using networkit to write GML data causes a loss of edge weights and node weights
# - using networkx to write GML data preserves node and edge weights
# [1]: https://networkit.iti.kit.edu/data/uploads/docs/NetworKit-Doc/python/html/graphio.html#networkit.graphio.GraphMLWriter

# read METIS file
print("Loading graph data...")
nkG = networkit.graphio.METISGraphReader().read(DATA_FILENAME)

# convert to networkx Graph
G = networkit.nxadapter.nk2nx(nkG)

# add node weights from METIS file
with open(DATA_FILENAME, "r") as metis:
    
    # read meta data from first line
    first_line = next(metis).split()
    m_nodes = int(first_line[0])
    m_edges = int(first_line[1])

    for i, line in enumerate(metis):
        if line.strip():
            weight = line.split()[0]
            G.add_nodes_from([i], weight=str(weight))
        else:
            # blank line indicates no node weight
            G.add_nodes_from([i], weight=0.0)

edges = np.array(G.edges(), dtype=np.int32)
edge_weights = np.array([x[2]['weight'] for x in G.edges(data=True)], dtype=np.float32)
node_weights = np.array([x[1]['weight'] for x in G.nodes(data=True)], dtype=np.float32)

# sanity check
assert (m_nodes == G.number_of_nodes())
assert (m_nodes == len(node_weights))
assert (m_edges == G.number_of_edges())
assert (m_edges == len(edge_weights))
assert (m_edges == len(edges))

print("Nodes: {}".format(G.number_of_nodes()))
print("Edges: {}".format(G.number_of_edges()))



Loading graph data...
Nodes: 1000
Edges: 2939


In [3]:
# Order of people arriving
arrivals = list(range(0, G.number_of_nodes()))
#random.shuffle(arrivals)

# Alpha value used in prediction model
prediction_model_alpha = G.number_of_edges() * (num_partitions / G.number_of_nodes()**2)

In [4]:
%load_ext Cython
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [5]:
%%cython
import numpy as np
import networkx as nx
from shared import bincount_assigned

cdef int UNMAPPED = -1

def get_votes(graph, int node, float[::] edge_weights, int num_partitions, int[::] partition):
    seen = set()
    cdef float[::] partition_votes = np.zeros(num_partitions, dtype=np.float32)

    # find all neighbors from whole graph
    node_neighbors = list(nx.all_neighbors(graph, node))
    node_neighbors = [x for x in node_neighbors if x not in seen and not seen.add(x)]

    # calculate votes based on neighbors placed in partitions
    for n in node_neighbors:
        if partition[n] != UNMAPPED:
            partition_votes[partition[n]] += edge_weights[n]
            
    return partition_votes

def get_assignment(int node,
                   float[::] node_weights,
                   int num_partitions,
                   int[::] partition,
                   float[::] partition_votes,
                   float alpha,
                   int debug):

    cdef int arg = 0
    cdef int max_arg = 0
    cdef float max_val = 0
    cdef float val = 0
    cdef int previous_assignment = 0

    assert partition is not None, "Blank partition passed"

    cdef float[::] partition_sizes = np.zeros(num_partitions, dtype=np.float32)
    s = bincount_assigned(partition, num_partitions, weights=node_weights)
    partition_sizes = np.fromiter(s, dtype=np.float32)
    
    if debug:
        print("Assigning node {}".format(node))
        print("\tPn = Votes - Alpha x Size")

    # Remember placement of node in the previous assignment
    previous_assignment = partition[node]

    max_arg = 0
    max_val = partition_votes[0] - alpha * partition_sizes[0]
    if debug:
        print("\tP{} = {} - {} x {} = {}".format(0,
                                                 partition_votes[0],
                                                 alpha,
                                                 partition_sizes[0],
                                                 max_val))

    if previous_assignment == 0:
        # We remove the node from its current partition before
        # deciding to re-add it, so subtract alpha to give
        # result of 1 lower partition size.
        max_val += alpha

    for arg in range(1, num_partitions):
        val = partition_votes[arg] - alpha * partition_sizes[arg]

        if debug:
            print("\tP{} = {} - {} x {} = {}".format(arg,
                                                     partition_votes[arg],
                                                     alpha,
                                                     partition_sizes[arg],
                                                     val))
        if previous_assignment == arg:
            # See comment above
            val += alpha
        if val > max_val:
            max_arg = arg
            max_val = val

    if debug:
        print("\tassigned to P{}".format(max_arg))

    return max_arg

def fennel_rework(graph, 
                  float[::] edge_weights,
                  float[::] node_weights,
                  int num_partitions,
                  int[::] assignments,
                  int[::] fixed,
                  float alpha,
                  int debug):

    single_nodes = []
    for n in range(0, graph.number_of_nodes()):

        # Exclude single nodes, deal with these later
        neighbors = list(nx.all_neighbors(graph, n))
        if not neighbors:
            single_nodes.append(n)
            continue
            
        # Skip fixed nodes
        if fixed[n] != UNMAPPED:
            if debug:
                print("Skipping node {}".format(n))
            continue

        partition_votes = get_votes(graph, n, edge_weights, num_partitions, assignments)
        assignments[n] = get_assignment(n, node_weights, num_partitions, assignments, partition_votes, alpha, debug)

    # Assign single nodes
    for n in single_nodes:
        if assignments[n] == UNMAPPED:
            parts = bincount_assigned(assignments, num_partitions)
            smallest = parts.index(min(parts))
            assignments[n] = smallest

    return np.asarray(assignments)

In [6]:
import shared
UNMAPPED = -1

# reset
assignments = np.repeat(np.int32(UNMAPPED), len(node_weights))
fixed = np.repeat(np.int32(UNMAPPED), len(node_weights))

print("PREDICTION MODEL")
print("----------------\n")
print("WASTE\t\tCUT RATIO\tMISMATCH")

if PREDICTION_MODEL:
    with open(PREDICTION_MODEL, "r") as inf:
        assignments = np.fromiter(inf.readlines(), dtype=np.int32)
    x = shared.score(assignments, edges)
    print("{0:.5f}\t\t{1:.10f}\t{2}".format(x[0], x[1], x[2]))

else:
    for i in range(num_iterations):
        alpha = prediction_model_alpha
        assignments = fennel_rework(G, edge_weights, node_weights, num_partitions, assignments, fixed, alpha, 0)

        x = shared.score(assignments, edges)
        print("{0:.5f}\t\t{1:.10f}\t{2}".format(x[0], x[1], x[2]))

print("\nAssignments:")
shared.fixed_width_print(assignments)

nodes_fixed = len([o for o in fixed if o == 1])
print("\nFixed: {}".format(nodes_fixed))

shared.print_partitions(assignments, num_partitions, node_weights)

PREDICTION MODEL
----------------

WASTE		CUT RATIO	MISMATCH
0.00000		0.2177611432	640
0.03200		0.1429057503	420
0.00000		0.1272541681	374
0.00000		0.1248724056	367
0.00000		0.1224906431	360
0.00000		0.1224906431	360
0.00000		0.1224906431	360
0.00000		0.1224906431	360
0.00000		0.1224906431	360
0.00000		0.1224906431	360

Assignments:
[ 0  1  2  0  1  0  1  3  0  0  2  0  0  1  0  2  3  2  2  0  0  3  2  3  0  1  3  1  2  0  0  2  0  1  2  3  0  3  2  1  2  0  2  1  0  3  1  3  3  2  0  1  2  0  0  2  3  0  1  0  2  1  3  1  1  1  1  2  3  2  1  0  0  1  0  3  1  1  0  1  2  3  1  0  1  2  1  2  3  0  1  3  3  0  1  2  3  0  0  1  1  2  3  1  1  0  1  0  2  0  2  1  2  2  3  1  3  1  0  2  1  0  0  3  1  1  3  2  2  3  0  0  1  0  0  3  1  2  3  1  1  2  3  2  3  2  1  2  0  0  3  1  1  2  1  2  1  3  3  0  1  3  0  3  0  2  3  2  3  1  0  1  0  1  2  1  0  2  1  1  2  0  0  0  1  0  2  1  1  2  3  2  2  0  0  3  1  2  3  0  2  1  0  3  2  2  2  1  2  2  1  0  3  0  3  0  0  1  2  3  0  

In [7]:
if use_virtual_nodes:
    print("Creating virtual nodes and assigning edges based on prediction model")

    # create virtual nodes
    virtual_nodes = list(range(G.number_of_nodes(), G.number_of_nodes() + num_partitions))
    print("\nVirtual nodes:")
    print(virtual_nodes)

    # create virtual edges
    virtual_edges = []
    for n in range(0, G.number_of_nodes()):
        virtual_edges += [(n, virtual_nodes[assignments[n]])]

    # extend assignments
    assignments = np.append(assignments, np.array(list(range(0, num_partitions)), dtype=np.int32))
    fixed = np.append(fixed, np.array([1] * num_partitions, dtype=np.int32))

    G.add_nodes_from(virtual_nodes, weight=1.0)
    G.add_edges_from(virtual_edges, weight=1.0)

    edges = np.array(G.edges(), dtype=np.int32)
    edge_weights = np.array([x[2]['weight'] for x in G.edges(data=True)], dtype=np.float32)
    node_weights = np.array([x[1]['weight'] for x in G.nodes(data=True)], dtype=np.float32)

    print("\nAssignments:")
    shared.fixed_width_print(assignments)
    print("Last {} nodes are virtual nodes.".format(num_partitions))

Creating virtual nodes and assigning edges based on prediction model

Virtual nodes:
[1000, 1001, 1002, 1003]

Assignments:
[ 0  1  2  0  1  0  1  3  0  0  2  0  0  1  0  2  3  2  2  0  0  3  2  3  0  1  3  1  2  0  0  2  0  1  2  3  0  3  2  1  2  0  2  1  0  3  1  3  3  2  0  1  2  0  0  2  3  0  1  0  2  1  3  1  1  1  1  2  3  2  1  0  0  1  0  3  1  1  0  1  2  3  1  0  1  2  1  2  3  0  1  3  3  0  1  2  3  0  0  1  1  2  3  1  1  0  1  0  2  0  2  1  2  2  3  1  3  1  0  2  1  0  0  3  1  1  3  2  2  3  0  0  1  0  0  3  1  2  3  1  1  2  3  2  3  2  1  2  0  0  3  1  1  2  1  2  1  3  3  0  1  3  0  3  0  2  3  2  3  1  0  1  0  1  2  1  0  2  1  1  2  0  0  0  1  0  2  1  1  2  3  2  2  0  0  3  1  2  3  0  2  1  0  3  2  2  2  1  2  2  1  0  3  0  3  0  0  1  2  3  0  3  1  0  2  2  2  2  1  2  3  3  1  1  3  3  2  1  0  3  2  2  2  3  3  2  2  1  1  3  0  2  0  3  3  0  1  2  1  1  3  1  0  2  3  1  3  2  3  3  1  0  2  3  0  1  0  1  2  3  2  3  1  2  0  3  2  0  0  1  1  3

In [8]:
cut_off_value = int(prediction_model_cut_off * G.number_of_nodes())
if prediction_model_cut_off == 0:
    print("Discarding prediction model\n")
else:
    print("Assign first {} arrivals using prediction model, then discard\n".format(cut_off_value))

# fix arrivals
for a in arrivals:
    nodes_fixed = len([o for o in fixed if o == 1])
    if nodes_fixed >= cut_off_value:
        break
    fixed[a] = 1

# remove nodes not fixed, ie. discard prediction model
for i in range(0, len(assignments)):
    if fixed[i] == -1:
        assignments[i] = -1

print("WASTE\t\tCUT RATIO\tMISMATCH")
x = shared.score(assignments, edges, num_partitions)
print("{0:.5f}\t\t{1:.10f}\t{2}".format(x[0], x[1], x[2]))

print("\nAssignments:")
shared.fixed_width_print(assignments)

nodes_fixed = len([o for o in fixed if o == 1])
print("\nFixed: {}".format(nodes_fixed))

shared.print_partitions(assignments, num_partitions, node_weights)

Discarding prediction model

WASTE		CUT RATIO	MISMATCH
0.00000		0.2538715410	1000

Assignments:
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 

In [9]:
if restream_batches == 1:
    print("One-shot assignment mode")
    print("------------------------\n")
else:
    print("Re-streaming in batches of {}".format(restream_batches))
    print("--------------------------------\n")

batch_arrived = []
print("WASTE\t\tCUT RATIO\tMISMATCH\tALPHA")
for a in arrivals:
    # check if node is already arrived
    if fixed[a] == 1:
        continue

    # one-shot assigment: assign each node as it arrives
    if restream_batches == 1:
        alpha = one_shot_alpha
        partition_votes = get_votes(G, a, edge_weights, num_partitions, assignments)
        assignments[a] = get_assignment(a, node_weights, num_partitions, assignments, partition_votes, alpha, 0)
        fixed[a] = 1
        
        x = shared.score(assignments, edges, num_partitions)
        print("{0:.5f}\t\t{1:.10f}\t{2}\t\t{3:.10f}".format(x[0], x[1], x[2], alpha))
        continue
        
    batch_arrived.append(a)

    if restream_batches == len(batch_arrived):

        # make a subgraph of all arrived nodes
        nodes_arrived = []
        for n in range(0, len(assignments)):
            if fixed[n] == 1 or n in batch_arrived:
                nodes_arrived.append(n)
        Gsub = G.subgraph(nodes_arrived)

        # recalculate alpha
        if Gsub.is_directed():
            # as it's a directed graph, edges_arrived is actually double, so divide by 2
            edges_arrived = Gsub.number_of_edges() / 2
        else:
            edges_arrived = Gsub.number_of_edges()
        nodes_fixed = len([o for o in fixed if o == 1])
        alpha = (edges_arrived) * (num_partitions / (nodes_fixed + len(batch_arrived))**2)

        # restream
        for n in batch_arrived:
            partition_votes = get_votes(Gsub, n, edge_weights, num_partitions, assignments)
            assignments[n] = get_assignment(n, node_weights, num_partitions, assignments, partition_votes, alpha, 0)
            fixed[n] = 1

        x = shared.score(assignments, edges, num_partitions)
        print("{0:.5f}\t\t{1:.10f}\t{2}\t\t{3:.10f}".format(x[0], x[1], x[2], alpha))
        batch_arrived = []

# remove nodes not fixed
for i in range(0, len(assignments)):
    if fixed[i] == -1:
        assignments[i] = -1

print("\nAssignments:")
shared.fixed_width_print(assignments)

nodes_fixed = len([o for o in fixed if o == 1])
print("\nFixed: {}".format(nodes_fixed))

shared.print_partitions(assignments, num_partitions, node_weights)

Re-streaming in batches of 10
--------------------------------

WASTE		CUT RATIO	MISMATCH	ALPHA
0.00996		0.2645341457	1042		0.2040816327
0.01594		0.2746890074	1082		0.1458333333
0.01793		0.2863670982	1128		0.1211072664
0.01992		0.3008377761	1185		0.0971074380
0.01793		0.3117542523	1228		0.0850480110
0.02789		0.3249555725	1280		0.0703125000
0.01793		0.3315562325	1306		0.0620891161
0.02390		0.3432343234	1352		0.0561224490
0.02191		0.3551662859	1399		0.0511543685
0.02390		0.3673521198	1447		0.0469674556
0.02590		0.3754760091	1479		0.0433979686
0.01992		0.3843615131	1514		0.0411030177
0.02191		0.3924854024	1546		0.0385386500
0.02390		0.3975628332	1566		0.0368441358
0.02191		0.4026402640	1586		0.0364311014
0.02789		0.4122873826	1624		0.0352468769
0.02590		0.4176186849	1645		0.0338221694
0.03586		0.4244732166	1672		0.0330812854
0.03785		0.4333587205	1707		0.0323094909
0.03187		0.4402132521	1734		0.0309496348
0.02988		0.4447829398	1752		0.0303956677
0.02789		0.4523990861	1782		0.0297353316
0.

In [10]:
if use_virtual_nodes:
    print("Remove virtual nodes")
    
    print("\nCurrent graph:")
    print("Nodes: {}".format(G.number_of_nodes()))
    print("Edges: {}".format(G.number_of_edges()))

    G.remove_nodes_from(virtual_nodes)
    assignments = np.delete(assignments, virtual_nodes)
    fixed = np.delete(fixed, virtual_nodes)

    print("\nVirtual nodes removed:")
    print("Nodes: {}".format(G.number_of_nodes()))
    print("Edges: {}".format(G.number_of_edges()))

Remove virtual nodes

Current graph:
Nodes: 1004
Edges: 3939

Virtual nodes removed:
Nodes: 1000
Edges: 2939


In [11]:
# Add partition attribute to nodes and write to file
for i in range(0, len(assignments)):
    G.add_nodes_from([i], partition=str(assignments[i]))
nx.write_gml(G, "test.gml")

# print out assignments into a file with a single column
with open("assignments.txt", "w") as outf:
    for i in range(0, len(assignments)):
        outf.write("{}\n".format(assignments[i]))


# XXX: create metrics
# XXX: read 1000 input graphs, write to 1000 output GML, flat file, and stats
# XXX: read the 1000 stats files