# Partition SBM Baseline Code

In [None]:
# License

#
# Copyright 2017 MIT Lincoln Laboratory, Massachusetts Institute of Technology
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use these files except in compliance with
# the License.
#
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#

"""
Based on the Python script by

Authors: Steven Smith, Edward Kao
Date: 9 January 2017
Installation: Python 2.7

Description: This Python script performs the baseline graph partition algorithm based on the degree-corrected stochastic block model.

References:
Peixoto, Tiago P. "Entropy of stochastic blockmodel ensembles." Physical Review E 85, no. 5 (2012): 056122.
Peixoto, Tiago P. "Parsimonious module inference in large networks." Physical review letters 110, no. 14 (2013): 148701.
Karrer, Brian, and Mark EJ Newman. "Stochastic blockmodels and community structure in networks." Physical Review E 83, no. 1 (2011): 016107.
"""

In [None]:
using LightGraphs
using SimpleWeightedGraphs
import StatsBase: pweights, sample, countmap
using Distributions

### Loading the Graph

We load the graph into a LightGraphs graph. We do this by reading the edges to be added and adding them onto the graph.

In [None]:
# Base directory of the streaming graph pieces; `load_graph!` joins the sampling type,
# the "<n>_nodes" folder and the piece file name onto this path.
const INPUT_PATH = "data/streaming/"

In [4]:
"""
    load_graph!(g, sampling_type, num_vertices, streaming_num=1)

Add the edges of one streaming piece of a simulated block-model graph to `g`.

The piece is read from

    INPUT_PATH/<sampling_type>/<num_vertices>_nodes/
        simulated_blockmodel_graph_<num_vertices>_nodes_<sample_name>_<streaming_num>.tsv

where `<sample_name>` is `snowball` when `sampling_type == "snowballSampling"` and
`edgeSample` for any other value.

# Arguments
- `g`: graph that receives the edges (modified in place).
- `sampling_type`: name of the sampling sub-directory.
- `num_vertices`: number of vertices, used only to build the file name.
- `streaming_num`: index of the streaming piece to load (default `1`).

# Throws
- `ErrorException` when `add_edge!` reports that an edge could not be added.

# Notes
The three columns of every row are passed, in order, to `add_edge!(g, ·, ·, ·)`
(presumably source, destination and weight -- confirm against the data files).
"""
function load_graph!(g::SimpleWeightedDiGraph, sampling_type::String, num_vertices::Int64, streaming_num::Int64=1)
    if sampling_type == "snowballSampling"
        sample_name = "snowball"
    else
        sample_name = "edgeSample"
    end
    filename = joinpath(
            INPUT_PATH, sampling_type, "$(num_vertices)_nodes",
            "simulated_blockmodel_graph_$(num_vertices)_nodes_$(sample_name)_$(streaming_num).tsv")
    edgePieces = readdlm(filename)
    for i = 1:size(edgePieces, 1)
        success = add_edge!(g, edgePieces[i, 1], edgePieces[i, 2], edgePieces[i, 3])
        if success == false
            # Raise a proper exception object rather than throwing a bare String.
            error("Error adding edges.")
        end
    end
end

load_graph!

In [5]:
import Base.Test.@test
"""
    test_load_graph(num_nodes)

Load the nine snowball-sampled pieces for a graph of `num_nodes` vertices one after the
other. After each piece, check that the vertex count is still `num_nodes` and that the
edge count equals the running total of lines in the piece files read so far.
"""
function test_load_graph(num_nodes::Int64)
    graph = SimpleWeightedDiGraph(num_nodes)
    expected_edges = 0
    for piece in 1:9
        load_graph!(graph, "snowballSampling", num_nodes, piece)
        @test nv(graph) == num_nodes
        piece_file = joinpath(
                INPUT_PATH, "snowballSampling", "$(num_nodes)_nodes",
                "simulated_blockmodel_graph_$(num_nodes)_nodes_snowball_$(piece).tsv")
        expected_edges += countlines(piece_file)
        @test ne(graph) == expected_edges
    end
end
    

test_load_graph (generic function with 1 method)

In [6]:
test_load_graph(1000)



### Initializing the edge counts

In [7]:
"""
    initialize_edge_counts(g, B, b)

Build the `B`-by-`B` inter-block edge count matrix for graph `g` under the block
assignment `b`, together with the block degrees.

Every edge adds its weight to entry `[b[dst], b[src]]`, so column `t` collects the edges
leaving block `t` and row `t` the edges entering it.

Returns the tuple `(M, d_out, d_in, d)` where `d_out` holds the column sums of `M`,
`d_in` the row sums, and `d = d_out + d_in`.
"""
function initialize_edge_counts(g::SimpleWeightedDiGraph, B::Int64, b::Vector{Int64})
    M = zeros(Int64, B, B)
    for e in edges(g)
        to_block = b[dst(e)]
        from_block = b[src(e)]
        M[to_block, from_block] += weight(e)
    end
    # Collapsing dimension 1 leaves one total per column: the out-degree of each block.
    column_totals = sum(M, 1)
    # Collapsing dimension 2 leaves one total per row: the in-degree of each block.
    row_totals = sum(M, 2)
    d_out = reshape(column_totals, B)
    d_in = reshape(row_totals, B)
    return M, d_out, d_in, d_out + d_in
end

initialize_edge_counts

In [8]:
"""
    test_initialize_counts(num_nodes)

Load the first snowball-sampled piece, put every vertex in its own block and check that
`initialize_edge_counts` yields a count of 1 for every edge and block degrees equal to
the vertex out- and in-degrees reported by the graph.
"""
function test_initialize_counts(num_nodes::Int64)
    graph = SimpleWeightedDiGraph(num_nodes)
    load_graph!(graph, "snowballSampling", num_nodes, 1)
    singleton_blocks = collect(1:num_nodes)
    counts = initialize_edge_counts(graph, num_nodes, singleton_blocks)
    M, d_out, d_in = counts[1], counts[2], counts[3]
    for e in edges(graph)
        @test M[dst(e), src(e)] == 1
    end
    for v in vertices(graph)
        @test outdegree(graph, v) == d_out[v]
        @test indegree(graph, v) == d_in[v]
    end
end

test_initialize_counts (generic function with 1 method)

In [9]:
test_initialize_counts(1000)

## Propose a new block assignment for the current node or block

### Parameters
    r : Int64
            current block assignment for the node under consideration
    neighbors_out : Array{Int64, 2}, has 2 columns.
            out neighbors for the block
    neighbors_in : Array{Int64, 2}, has 2 columns.
            in neighbors for the block
    b : Vector{Int64}
        array of block assignment for each node
    M : Array{Int64, 2}, size is (B, B)
            edge count matrix between all the blocks.
    d : Vector{Int}
            total number of edges to and from each block
    B : Int64
            total number of blocks
    agg_move : Bool
            whether the proposal is a block move

### Returns
    s : int
            proposed block assignment for the node under consideration
    k_out : int
            the out degree of the node
    k_in : int
            the in degree of the node
    k : int
            the total degree of the node

### Notes
- $d_u$: degree of block u

Randomly select a neighbor of the current node and obtain its block assignment $u$. With probability $\frac{B}{d_u + B}$, propose
a block uniformly at random. Otherwise, randomly select a neighbor of block $u$ and propose its block assignment. For block (agglomerative) moves,
avoid proposing the current block.

In [10]:
# Draw a block uniformly at random from 1:B, excluding block `r`.
_random_block_excluding(r::Int64, B::Int64) = rand(setdiff(1:B, r))

"""
    propose_new_partition(r, neighbors_out, neighbors_in, b, M, d, B, agg_move)

Propose a new block `s` for the node (or, when `agg_move` is true, the block) that is
currently in block `r`.

# Arguments
- `r`: current block.
- `neighbors_out`, `neighbors_in`: two-column matrices; column 1 holds neighbor indices
  (used to index `b`), column 2 the corresponding edge counts.
- `b`: block assignment looked up for the sampled neighbor.
- `M`: inter-block edge count matrix.
- `d`: total degree of each block.
- `B`: number of blocks.
- `agg_move`: when true the proposal is forced to differ from `r`.

# Returns
`(s, k_out, k_in, k)`: the proposed block and the out, in and total degree computed from
column 2 of the neighbor matrices.

# Notes
- When the total degree `k` is zero there is no neighbor to sample from, so a block is
  proposed uniformly at random (excluding `r` for `agg_move`).
- Otherwise a neighbor is sampled with probability proportional to its edge count and
  `u` is its block. With probability `B / (d[u] + B)` a uniformly random block is
  proposed; otherwise a block is drawn with probability proportional to
  `M[:, u] + M[u, :]`.
"""
function propose_new_partition(
        r::Int64, neighbors_out::Array{Int64, 2}, neighbors_in::Array{Int64, 2}, b::Vector{Int64}, M::Array{Int64, 2},
        d::Vector{Int64}, B::Int64, agg_move::Bool
    )
    neighbors = vcat(neighbors_out, neighbors_in)
    k_out = sum(neighbors_out[:, 2])
    k_in = sum(neighbors_in[:, 2])
    k = k_out + k_in
    if k == 0
        # No neighbors: sampling one would divide by zero / sample from nothing.
        s = agg_move ? _random_block_excluding(r, B) : rand(1:B)
        return s, k_out, k_in, k
    end
    rand_neighbor = sample(neighbors[:, 1], pweights(neighbors[:, 2] / k))
    u = b[rand_neighbor]
    # Chance of a purely random proposal is inversely related to the degree of block u.
    if rand() <= B / (d[u] + B)
        s = agg_move ? _random_block_excluding(r, B) : rand(1:B)
        return s, k_out, k_in, k
    end
    # Otherwise draw from the blocks adjacent to u, weighted by edge count.
    multinomial_prob = (M[:, u] + M[u, :]) / d[u]
    if agg_move
        # Force the proposal to differ from the current block.
        multinomial_prob[r] = 0
        remaining = sum(multinomial_prob)
        if remaining == 0
            # Block u is connected only to r: fall back to a random different block.
            return _random_block_excluding(r, B), k_out, k_in, k
        end
        multinomial_prob = multinomial_prob / remaining
    end
    if !Distributions.isprobvec(multinomial_prob)
        # Diagnostics only; the Multinomial constructor below rejects the vector.
        nonzero_entries(v) = filter(x -> x != 0, v)
        @show nonzero_entries(multinomial_prob), sum(multinomial_prob)
        @show nonzero_entries(M[:, u]), nonzero_entries(M[u, :])
        @show nonzero_entries((M[:, u] + M[u, :]) / d[u])
        @show d[u]
        @show sum(M[:, u]) + sum(M[u, :])
    end
    candidates_vec = [t for t in eachindex(multinomial_prob) if multinomial_prob[t] != 0]
    draw = rand(Multinomial(1, multinomial_prob[candidates_vec]))
    s = candidates_vec[findfirst(x -> x == 1, draw)]
    return s, k_out, k_in, k
end

propose_new_partition (generic function with 1 method)

## Compute the two new rows and cols of the edge count matrix under the proposal for the current node or block

### Parameters
    M : ndarray or sparse matrix (int), shape = (#blocks, #blocks)
            edge count matrix between all the blocks.
    r : int
            current block assignment for the node under consideration
    s : int
            proposed block assignment for the node under consideration
    b_out : ndarray (int)
            blocks of the out neighbors
    count_out : ndarray (int)
            edge counts to the out neighbor blocks
    b_in : ndarray (int)
            blocks of the in neighbors
    count_in : ndarray (int)
            edge counts to the in neighbor blocks
    count_self : int
            edge counts to self
    agg_move : bool
            whether the proposal is a block move
    use_sparse : bool
            whether the edge count matrix is stored as a sparse matrix

### Returns
    M_r_row : ndarray or sparse matrix (int)
            the current block row of the new edge count matrix under proposal
    M_s_row : ndarray or sparse matrix (int)
            the proposed block row of the new edge count matrix under proposal
    M_r_col : ndarray or sparse matrix (int)
            the current block col of the new edge count matrix under proposal
    M_s_col : ndarray or sparse matrix (int)
            the proposed block col of the new edge count matrix under proposal

### Notes
The updates only involve changing the entries to and from the neighboring blocks

In [23]:
"""
    compute_new_rows_cols_interblock_edge_count_matrix(
        M, r, s, b_out, count_out, b_in, count_in, count_self, agg_move)

Compute rows `r`, `s` and columns `r`, `s` of the edge count matrix as they would look
after moving the current node (or merging the current block, when `agg_move`) from
block `r` into block `s`.

`M` follows the layout produced by `initialize_edge_counts`: `M[t1, t2]` counts edges
from block `t2` into block `t1`, so a column describes out-edges and a row in-edges.

# Arguments
- `M`: current inter-block edge count matrix (not modified).
- `r`, `s`: current and proposed block.
- `b_out`, `count_out`: blocks of the out-neighbors and the edge counts to them.
- `b_in`, `count_in`: blocks of the in-neighbors and the edge counts from them.
- `count_self`: edge count from the node/block to itself.
- `agg_move`: when true, row and column `r` of the result are all zeros.

# Returns
`(M_r_row, M_s_row, M_r_col, M_s_col)`, each a fresh vector of length `size(M, 1)`.
"""
function compute_new_rows_cols_interblock_edge_count_matrix(
        M::Array{Int64, 2}, r::Int64, s::Int64, b_out::Vector{Int64}, count_out::Vector{Int64},
        b_in::Vector{Int64}, count_in::Vector{Int64}, count_self::Int64, agg_move::Bool)
    B = size(M, 1)
    # Edge counts between the moving node and the two affected blocks.
    out_to_r = sum(count_out[b_out .== r])
    out_to_s = sum(count_out[b_out .== s])
    in_from_r = sum(count_in[b_in .== r])
    in_from_s = sum(count_in[b_in .== s])

    if agg_move
        # After a merge block r is empty.
        M_r_row = zeros(Int64, B)
        M_r_col = zeros(Int64, B)
    else
        M_r_row = M[r, :]  # in-edges of r (indexing already returns a copy)
        M_r_col = M[:, r]  # out-edges of r

        # The node's out-edges no longer leave r; edges it receives from r now land in s.
        M_r_col[b_out] -= count_out
        M_r_col[r] -= in_from_r
        M_r_col[s] += in_from_r

        # The node's in-edges no longer enter r; edges it sends into r now come from s.
        M_r_row[b_in] -= count_in
        M_r_row[r] -= out_to_r
        M_r_row[s] += out_to_r
    end
    M_s_row = M[s, :]  # in-edges of s
    M_s_col = M[:, s]  # out-edges of s

    # The node's out-edges now leave s; edges it receives from s become s -> s.
    M_s_col[b_out] += count_out
    M_s_col[r] -= count_self
    M_s_col[s] += count_self
    M_s_col[r] -= in_from_s
    M_s_col[s] += in_from_s

    # The node's in-edges now enter s; edges it sends into s become s -> s.
    M_s_row[b_in] += count_in
    M_s_row[r] -= out_to_s
    M_s_row[s] += out_to_s
    M_s_row[r] -= count_self
    M_s_row[s] += count_self

    return M_r_row, M_s_row, M_r_col, M_s_col
end

compute_new_rows_cols_interblock_edge_count_matrix (generic function with 1 method)

## Compute the new block degrees under the proposal for the current node or block

### Parameters
    r : int
            current block assignment for the node under consideration
    s : int
            proposed block assignment for the node under consideration
    d_out : ndarray (int)
            the current out degree of each block
    d_in : ndarray (int)
            the current in degree of each block
    d : ndarray (int)
            the current total degree of each block
    k_out : int
            the out degree of the node
    k_in : int
            the in degree of the node
    k : int
            the total degree of the node

### Returns
    d_out_new : ndarray (int)
            the new out degree of each block under proposal
    d_in_new : ndarray (int)
            the new in degree of each block under proposal
    d_new : ndarray (int)
            the new total degree of each block under proposal

### Notes
The updates only involve changing the degrees of the current and proposed block

In [24]:
"""
    compute_new_block_degrees(r, s, d_out, d_in, d, k_out, k_in, k)

Return the block degrees after moving a node with degrees `k_out`, `k_in`, `k` from
block `r` to block `s`.

The inputs are left untouched. The result is a three-element vector
`[d_out_new, d_in_new, d_new]` of copies in which entry `r` was decreased and entry `s`
increased by the matching node degree.
"""
function compute_new_block_degrees(
        r::Int64, s::Int64, d_out::Vector{Int64}, d_in::Vector{Int64}, d::Vector{Int64},
        k_out::Int64, k_in::Int64, k::Int64
    )
    # Copy `block_degrees` and shift `node_degree` from block r to block s.
    function shifted(block_degrees::Vector{Int64}, node_degree::Int64)
        moved = copy(block_degrees)
        moved[r] -= node_degree
        moved[s] += node_degree
        return moved
    end
    d_out_new = shifted(d_out, k_out)
    d_in_new = shifted(d_in, k_in)
    d_new = shifted(d, k)
    return [d_out_new, d_in_new, d_new]
end

compute_new_block_degrees (generic function with 1 method)

## Compute change in entropy under the proposal
Reduction in entropy means the proposed block is better than the current block.

### Parameters
    r : int
            current block assignment for the node under consideration
    s : int
            proposed block assignment for the node under consideration
    M : ndarray or sparse matrix (int), shape = (#blocks, #blocks)
            edge count matrix between all the blocks.
    M_r_row : ndarray or sparse matrix (int)
            the current block row of the new edge count matrix under proposal
    M_s_row : ndarray or sparse matrix (int)
            the proposed block row of the new edge count matrix under proposal
    M_r_col : ndarray or sparse matrix (int)
            the current block col of the new edge count matrix under proposal
    M_s_col : ndarray or sparse matrix (int)
            the proposed block col of the new edge count matrix under proposal
    d_out : ndarray (int)
            the current out degree of each block
    d_in : ndarray (int)
            the current in degree of each block
    d_out_new : ndarray (int)
            the new out degree of each block under proposal
    d_in_new : ndarray (int)
            the new in degree of each block under proposal
    use_sparse : bool
            whether the edge count matrix is stored as a sparse matrix

### Returns
    delta_entropy : float
            entropy under the proposal minus the current entropy

### Notes
- $M^-$: current edge count matrix between the blocks
- $M^+$: new edge count matrix under the proposal
- $d^-_{t, \rm in}$: current in degree of block $t$
- $d^-_{t, \rm out}$: current out degree of block $t$
- $d^+_{t, \rm in}$: new in degree of block $t$ under the proposal
- $d^+_{t, \rm out}$: new out degree of block $t$ under the proposal

The difference in entropy is computed as:

$\large \Delta S = \sum_{t_1, t_2} {\left[ -M_{t_1 t_2}^+ \log\left(\frac{M_{t_1 t_2}^+}{d_{t_1, \rm in}^+ d_{t_2, \rm out}^+}\right) + M_{t_1 t_2}^- \log\left(\frac{M_{t_1 t_2}^-}{d_{t_1, \rm in}^- d_{t_2, \rm out}^-}\right)\right]}$

where the sum runs over all entries $(t_1, t_2)$ in rows and cols $r$ and $s$ of the edge count matrix

In [25]:
"""
    compute_delta_entropy(r, s, M, M_r_col, M_s_col, M_r_row, M_s_row,
                          d_out, d_in, d_out_new, d_in_new)

Return the entropy under the proposal minus the current entropy, summed over the
non-zero entries of rows and columns `r` and `s` of the edge count matrix.

An entry `m` at `(t1, t2)` contributes `m * log(m / d_in[t1] / d_out[t2])`: subtracted
for the proposed vectors (with the new degrees) and added for the current matrix `M`
(with the current degrees). In the column sums the entries at `t1 == r` and `t1 == s`
are skipped because the row sums already cover them.
"""
function compute_delta_entropy(
        r::Int64, s::Int64, M::Array{Int64, 2},
        M_r_col::Vector{Int64}, M_s_col::Vector{Int64}, M_r_row::Vector{Int64},
        M_s_row::Vector{Int64}, d_out::Vector{Int64}, d_in::Vector{Int64}, d_out_new::Vector{Int64},
        d_in_new::Vector{Int64}
    )
    delta = 0.0
    # Proposed columns r and s (entries at r and s are left to the row sums).
    for (new_col, t2) in ((M_r_col, r), (M_s_col, s))
        for t1 in eachindex(new_col)
            if new_col[t1] == 0 || t1 == r || t1 == s
                continue
            end
            delta -= new_col[t1] * log(new_col[t1] / d_in_new[t1] / d_out_new[t2])
        end
    end
    # Proposed rows r and s.
    for (new_row, t1) in ((M_r_row, r), (M_s_row, s))
        for t2 in eachindex(new_row)
            if new_row[t2] == 0
                continue
            end
            delta -= new_row[t2] * log(new_row[t2] / d_in_new[t1] / d_out_new[t2])
        end
    end
    # Current columns r and s.
    for t2 in (r, s)
        for t1 in 1:size(M, 1)
            if M[t1, t2] == 0 || t1 == r || t1 == s
                continue
            end
            delta += M[t1, t2] * log(M[t1, t2] / d_in[t1] / d_out[t2])
        end
    end
    # Current rows r and s.
    for t1 in (r, s)
        for t2 in 1:size(M, 2)
            if M[t1, t2] == 0
                continue
            end
            delta += M[t1, t2] * log(M[t1, t2] / d_in[t1] / d_out[t2])
        end
    end
    return delta
end

compute_delta_entropy (generic function with 1 method)

## Execute the best merge (agglomerative) moves to reduce a set number of blocks

### Parameters
    delta_entropy_for_each_block : ndarray (float)
            the delta entropy for merging each block
    best_merge_for_each_block : ndarray (int)
            the best block to merge with for each block
    b : ndarray (int)
            array of block assignment for each node
    B : int
            total number of blocks in the current partition
    B_to_merge : int
            the number of blocks to merge

### Returns
    b : ndarray (int)
            array of new block assignment for each node after the merge
    B : int
            total number of blocks after the merge

In [26]:
"""
    carry_out_best_merges(delta_entropy_for_each_block, best_merge_for_each_block,
                          b, B, B_to_merge)

Merge `B_to_merge` blocks, taking the candidate merges in order of increasing delta
entropy, and relabel the surviving blocks as `1:(B - B_to_merge)`.

A candidate whose target (after following earlier merges) equals its own block is
skipped and does not count towards `B_to_merge`. `b` is overwritten in place while the
merges are applied; the relabelled assignment is returned as a new vector.

Returns `(b, B)`: the new block assignment and the new number of blocks.
"""
function carry_out_best_merges(
        delta_entropy_for_each_block::Vector{Float64}, best_merge_for_each_block::Vector{Int64},
        b::Vector{Int64}, B::Int64, B_to_merge::Int64
    )
    merge_order = sortperm(delta_entropy_for_each_block)
    block_map = collect(1:B)  # where each original block has ended up so far
    merges_done = 0
    cursor = 1
    while merges_done < B_to_merge
        merge_from = merge_order[cursor]
        merge_to = block_map[best_merge_for_each_block[merge_from]]
        cursor += 1
        if merge_to == merge_from
            continue
        end
        for i in eachindex(block_map)
            if block_map[i] == merge_from
                block_map[i] = merge_to
            end
        end
        for i in eachindex(b)
            if b[i] == merge_from
                b[i] = merge_to
            end
        end
        merges_done += 1
    end
    # Compact the surviving labels, in order of first appearance in b.
    survivors = unique(b)
    relabel = fill(-1, B)
    relabel[survivors] = collect(1:length(survivors))
    return relabel[b], B - B_to_merge
end

carry_out_best_merges (generic function with 1 method)


### Compute the overall entropy for the current partition
Compute the overall entropy, including the model entropy as well as the data entropy, on the current partition. The best partition with an optimal number of blocks will minimize this entropy.

### Parameters

M : ndarray or sparse matrix (int), shape = (#blocks, #blocks)
        edge count matrix between all the blocks.
        
d_out : ndarray (int)
        the current out degrees of each block
        
d_in : ndarray (int)
        the current in degrees of each block
        
B : int
        the number of blocks in the partition
        
N : int
        number of nodes in the graph
        
E : int
        number of edges in the graph
        

### Returns

S : float
        the overall entropy of the current partition

### Notes

$M$: current edge count matrix

$d_{t, \rm out}$: current out degree of block $t$

$d_{t, \rm in}$: current in degree of block $t$

$N$: number of nodes

$E$: number of edges

$B$: number of blocks

$C$: some constant invariant to the partition

The overall entropy of the partition is computed as:

$\large S = E\;h\left(\frac{B^2}{E}\right) + N \log(B) - \sum_{t_1, t_2} {M_{t_1 t_2} \log\left(\frac{M_{t_1 t_2}}{d_{t_1, \rm in} d_{t_2, \rm out}}\right)} + C$

where the function $h(x)=(1+x)\log(1+x) - x\log(x)$ and the sum runs over all entries $(t_1, t_2)$ in the edge count matrix

In [27]:
"""
    compute_overall_entropy(M, d_out, d_in, B, N, E)

Return the overall entropy (model entropy plus data entropy) of the current partition:

    S = E * h(B^2 / E) + N * log(B) - sum(M[t1, t2] * log(M[t1, t2] / d_in[t1] / d_out[t2]))

with `h(x) = (1 + x) * log(1 + x) - x * log(x)`. The sum runs over the non-zero entries
of `M` only, each visited exactly once.

# Arguments
- `M`: inter-block edge count matrix.
- `d_out`, `d_in`: out and in degree of each block.
- `B`: number of blocks.
- `N`: number of nodes.
- `E`: number of edges.
"""
function compute_overall_entropy(M::Array{Int64, 2}, d_out::Vector{Int64}, d_in::Vector{Int64}, B::Int64, N::Int64, E::Int64)
    summation_term = 0.0
    for col in 1:size(M, 2), row in 1:size(M, 1)
        count = M[row, col]
        # Zero entries contribute nothing; evaluating them would give 0 * log(0) = NaN.
        count == 0 && continue
        summation_term -= count * log(count / d_in[row] / d_out[col])
    end
    x = B^2 / E
    h = (1 + x) * log(1 + x) - x * log(x)
    model_S = E * h + N * log(B)
    return model_S + summation_term
end

compute_overall_entropy (generic function with 1 method)

## Compute the Hastings correction for the proposed block from the current block

### Parameters
    b_out : ndarray (int)
            blocks of the out neighbors
    count_out : ndarray (int)
            edge counts to the out neighbor blocks
    b_in : ndarray (int)
            blocks of the in neighbors
    count_in : ndarray (int)
            edge counts to the in neighbor blocks
    s : int
            proposed block assignment for the node under consideration
    M : ndarray or sparse matrix (int), shape = (#blocks, #blocks)
            edge count matrix between all the blocks.
    M_r_row : ndarray or sparse matrix (int)
            the current block row of the new edge count matrix under proposal
    M_r_col : ndarray or sparse matrix (int)
            the current block col of the new edge count matrix under proposal
    B : int
            total number of blocks
    d : ndarray (int)
            total number of edges to and from each block
    d_new : ndarray (int)
            new block degrees under the proposal
    use_sparse : bool
            whether the edge count matrix is stored as a sparse matrix

### Returns
    Hastings_correction : float
            term that corrects for the transition asymmetry between the current block and the proposed block
### Notes
- $p_{i, s \rightarrow r}$ : for node $i$, probability of proposing block $r$ if its current block is $s$
- $p_{i, r \rightarrow s}$ : for node $i$, probability of proposing block $s$ if its current block is $r$
- $r$ : current block for node $i$
- $s$ : proposed block for node $i$
- $M^-$: current edge count matrix between the blocks
- $M^+$: new edge count matrix under the proposal
- $d^-_t$: current degree of block $t$
- $d^+_t$: new degree of block $t$ under the proposal
- $\mathbf{b}_{\mathcal{N}_i}$: the neighboring blocks to node $i$
- $k_i$: the degree of node $i$
- $k_{i,t}$ : the degree of node $i$ to block $t$ (i.e. number of edges to and from block $t$)
- $B$ : the number of blocks

The Hastings correction is: 

$\huge \frac{p_{i, s \rightarrow r}}{p_{i, r \rightarrow s}}$

where

$\Large p_{i, r \rightarrow s} = \sum_{t \in \{\mathbf{b}_{\mathcal{N}_i}^-\}} \left[ {\frac{k_{i,t}}{k_i} \frac{M_{ts}^- + M_{st}^- + 1}{d^-_t+B}}\right]$

$\Large p_{i, s \rightarrow r} = \sum_{t \in \{\mathbf{b}_{\mathcal{N}_i}^-\}} \left[ {\frac{k_{i,t}}{k_i} \frac{M_{tr}^+ + M_{rt}^+ +1}{d_t^++B}}\right]$

summed over all the neighboring blocks $t$

In [28]:
"""
    compute_Hastings_correction(b_out, count_out, b_in, count_in, s, M,
                                M_r_row, M_r_col, B, d, d_new)

Return the ratio `p_backward / p_forward` that corrects for the asymmetry between
proposing block `s` from the current block and proposing the current block from `s`.

For every neighboring block `t`, with `degree` the total edge count between the node
and `t`:

- `p_forward` accumulates `degree * (M[t, s] + M[s, t] + 1) / (d[t] + B)`;
- `p_backward` accumulates `degree * (M_r_row[t] + M_r_col[t] + 1) / (d_new[t] + B)`.
"""
function compute_Hastings_correction(
        b_out::Vector{Int64}, count_out::Vector{Int64}, b_in::Vector{Int64}, count_in::Vector{Int64},
        s::Int64, M::Array{Int64, 2}, M_r_row::Vector{Int64}, M_r_col::Vector{Int64}, B::Int64,
        d::Vector{Int64}, d_new::Vector{Int64}
    )
    neighbor_blocks = vcat(b_out, b_in)
    edge_counts = vcat(count_out, count_in)
    # Total edge count between the node and each neighboring block.
    block_degree_map = countmap(neighbor_blocks, Distributions.weights(edge_counts))
    p_forward = 0.0
    p_backward = 0.0
    for t in keys(block_degree_map)
        degree = block_degree_map[t]
        p_forward += degree * (M[t, s] + M[s, t] + 1) / (d[t] + B)
        p_backward += degree * (M_r_row[t] + M_r_col[t] + 1) / (d_new[t] + B)
    end
    return p_backward / p_forward
end

compute_Hastings_correction (generic function with 1 method)

## Move the current node to the proposed block and update the edge counts

### Parameters
    b : ndarray (int)
            array of block assignments for each node before the move
    ni : int
            current node index
    r : int
            current block assignment for the node under consideration
    s : int
            proposed block assignment for the node under consideration
    M : ndarray or sparse matrix (int), shape = (#blocks, #blocks)
            edge count matrix between all the blocks.
    M_r_row : ndarray or sparse matrix (int)
            the current block row of the new edge count matrix under proposal
    M_s_row : ndarray or sparse matrix (int)
            the proposed block row of the new edge count matrix under proposal
    M_r_col : ndarray or sparse matrix (int)
            the current block col of the new edge count matrix under proposal
    M_s_col : ndarray or sparse matrix (int)
            the proposed block col of the new edge count matrix under proposal
    d_out_new : ndarray (int)
            the new out degree of each block under proposal
    d_in_new : ndarray (int)
            the new in degree of each block under proposal
    d_new : ndarray (int)
            the new total degree of each block under proposal
    use_sparse : bool
            whether the edge count matrix is stored as a sparse matrix
### Returns
    b : ndarray (int)
            array of block assignment for each node after the move
    M : ndarray or sparse matrix (int), shape = (#blocks, #blocks)
            edge count matrix between all the blocks after the move
    d_out_new : ndarray (int)
            the out degree of each block after the move
    d_in_new : ndarray (int)
            the in degree of each block after the move
    d_new : ndarray (int)
            the total degree of each block after the move

In [29]:
"""
    update_partition(b, ni, r, s, M, M_r_row, M_s_row, M_r_col, M_s_col,
                     d_out_new, d_in_new, d_new)

Move node `ni` to block `s` and write the proposed rows and columns into `M`.

`b` and `M` are modified in place. Rows `r` and `s` are written first, then columns `r`
and `s`, so the column values win where rows and columns intersect.

Returns `(b, M, d_out_new, d_in_new, d_new)`; the three degree vectors are passed
through unchanged.
"""
function update_partition(b::Vector{Int64}, ni::Int64, r::Int64, s::Int64, M::Array{Int64, 2}, M_r_row::Vector{Int64},
    M_s_row::Vector{Int64}, M_r_col::Vector{Int64}, M_s_col::Vector{Int64}, d_out_new::Vector{Int64},
    d_in_new::Vector{Int64}, d_new::Vector{Int64})
    b[ni] = s
    for (block, new_row) in ((r, M_r_row), (s, M_s_row))
        M[block, :] = new_row
    end
    for (block, new_col) in ((r, M_r_col), (s, M_s_col))
        M[:, block] = new_col
    end
    return b, M, d_out_new, d_in_new, d_new
end

update_partition (generic function with 1 method)

In [30]:
"""
    main(sampling_type::String, num_vertices::Int64)

Driver for the (partially ported) degree-corrected SBM partitioning baseline.

What the function currently does, in order:

1. Builds a `SimpleWeightedDiGraph` with `num_vertices` vertices and loads
   streaming piece 1 of the `sampling_type` data set into it via `load_graph!`.
2. Starts from the partition in which every vertex is its own block.
3. Agglomerative phase: for every block, evaluates
   `num_agg_proposals_per_block` merge proposals, remembers the one with the
   lowest change in entropy, and hands the results to `carry_out_best_merges`
   to remove `num_blocks_to_merge` blocks.
4. Nodal phase: for up to `max_num_nodal_itr` sweeps over all vertices, proposes
   a new block for each vertex and computes the Hastings correction and the
   change in entropy of the proposal.

The accept/reject step, the convergence test and the outer search over the
number of blocks have not been ported from Python yet (see the block comment at
the end of the body), so the nodal sweeps do not modify the partition.

# Arguments
- `sampling_type`: sampling scheme name forwarded to `load_graph!`.
- `num_vertices`: number of vertices of the graph, also the initial block count.

# Returns
`nothing`; progress and diagnostics are printed.
"""
function main(sampling_type::String, num_vertices::Int64)
    #input_filename = '../../data/static/simulated_blockmodel_graph_500_nodes'
    #true_partition_available = true
    #visualize_graph = True # whether to plot the graph layout colored with intermediate partitions
    #verbose = True # whether to print updates of the partitioning
    
    # Create the graph
    g = SimpleWeightedDiGraph(num_vertices)
    # Load the first part of the graph
    load_graph!(g, sampling_type, num_vertices, 1)
    info(g, "Loaded")

    # initialize by putting each node in its own block (N blocks)
    num_blocks = num_vertices
    partition = collect(1:num_vertices)

    # partition update parameters
    β = 3 # exploitation versus exploration (higher value favors exploitation)

    # agglomerative partition update parameters
    num_agg_proposals_per_block = 10 # number of proposals per block
    num_block_reduction_rate = 0.5 # fraction of blocks to reduce until the golden ratio bracket is established

    # nodal partition updates parameters
    max_num_nodal_itr = 100 # maximum number of iterations
    delta_entropy_threshold1 = 5e-4 # stop iterating when the change in entropy falls below this fraction of the overall entropy
                                    # lowering this threshold results in more nodal update iterations and likely better performance, but longer runtime
    delta_entropy_threshold2 = 1e-4 # threshold after the golden ratio bracket is established (typically lower to fine-tune to partition) 
    delta_entropy_moving_avg_window = 3 # width of the moving average window for the delta entropy convergence criterion

    # initialize edge counts and block degrees
    interblock_edge_count, block_degrees_out, block_degrees_in, block_degrees =
        initialize_edge_counts(g, num_blocks, partition)

    # initialize items before iterations to find the partition with the optimal number of blocks
    # NOTE: these placeholders are only consumed by the not-yet-ported outer loop below
    optimal_B_found = false
    old_b = [[], [], []]  # partition for the high, best, and low number of blocks so far
    old_M = [[], [], []]  # edge count matrix for the high, best, and low number of blocks so far
    old_d = [[], [], []]  # block degrees for the high, best, and low number of blocks so far
    old_d_out = [[], [], []]  # out block degrees for the high, best, and low number of blocks so far
    old_d_in = [[], [], []]  # in block degrees for the high, best, and low number of blocks so far
    old_S = [Inf, Inf, Inf] # overall entropy for the high, best, and low number of blocks so far
    old_B = [[], [], []]  # number of blocks for the high, best, and low number of blocks so far
    
    
    num_blocks_to_merge = floor(Int64, num_blocks*num_block_reduction_rate)

    # begin partitioning by finding the best partition with the optimal number of blocks
    # TODO: outer loop not ported yet; the body below (kept at loop indentation) runs exactly once
 #   while !optimal_B_found
        # begin agglomerative partition updates (i.e. block merging)
        println("\nMerging down blocks from $num_blocks to $(num_blocks - num_blocks_to_merge)")
        
        best_merge_for_each_block = fill(-1, num_blocks) # initialize to no merge
        delta_entropy_for_each_block = fill(Inf, num_blocks) # initialize criterion
        block_partition = collect(1:num_blocks)
        for current_block in 1:num_blocks # evaluate agglomerative updates for each block
            for proposal_idx in 1:num_agg_proposals_per_block
                # populate edges to neighboring blocks
                out_blocks = findn(interblock_edge_count[:, current_block])
                out_blocks_counts = hcat(out_blocks, interblock_edge_count[out_blocks, current_block])
                in_blocks = findn(interblock_edge_count[current_block, :])
                in_blocks_counts = hcat(in_blocks, interblock_edge_count[current_block, in_blocks])
            
                if (length(out_blocks) + length(in_blocks) == 0)
                    # isolated block: nothing to merge with, mark it as merging with itself
                    #println("No neighbors found for $(current_block). Skipping.")
                    best_merge_for_each_block[current_block] = current_block
                    continue
                end
                # propose a new block to merge with
                proposal, num_out_neighbor_edges, num_in_neighbor_edges, num_neighbor_edges =
                    propose_new_partition(
                        current_block, out_blocks_counts, in_blocks_counts, block_partition, 
                        interblock_edge_count, block_degrees, num_blocks, true
                    )

                # compute the two new rows and columns of the interblock edge count matrix
                new_interblock_edge_count_current_block_row, new_interblock_edge_count_new_block_row, new_interblock_edge_count_current_block_col, new_interblock_edge_count_new_block_col = 
                    compute_new_rows_cols_interblock_edge_count_matrix(
                        interblock_edge_count, current_block, proposal, 
                        out_blocks_counts[:, 1], out_blocks_counts[:, 2],
                        in_blocks_counts[:, 1], in_blocks_counts[:, 2], 
                        interblock_edge_count[current_block, current_block], 
                        true
                )    
                
                # compute new block degrees           
                block_degrees_out_new, block_degrees_in_new, block_degrees_new = 
                    compute_new_block_degrees(
                        current_block, proposal, block_degrees_out, block_degrees_in, block_degrees, 
                        num_out_neighbor_edges, num_in_neighbor_edges, num_neighbor_edges
                )

                # compute change in entropy / posterior
                delta_entropy = compute_delta_entropy(
                    current_block, proposal, interblock_edge_count, 
                    new_interblock_edge_count_current_block_col, new_interblock_edge_count_new_block_col,
                    new_interblock_edge_count_current_block_row, new_interblock_edge_count_new_block_row,
                    block_degrees_out, block_degrees_in, block_degrees_out_new, block_degrees_in_new
                )
                
                #@show current_block, proposal, delta_entropy
                if delta_entropy < delta_entropy_for_each_block[current_block] # a better block candidate was found
                    best_merge_for_each_block[current_block] = proposal
                    delta_entropy_for_each_block[current_block] = delta_entropy
                end
            end
            #info("$(current_block) done")
        end
        # carry out the best merges
        
        partition, num_blocks = carry_out_best_merges(
                    delta_entropy_for_each_block, best_merge_for_each_block, partition, 
                    num_blocks, num_blocks_to_merge
        )

        # re-initialize edge counts and block degrees
        interblock_edge_count, block_degrees_out, block_degrees_in, block_degrees = initialize_edge_counts(
                    g, num_blocks, partition)
        
        # perform nodal partition updates
        println("Beginning nodal updates")
        total_num_nodal_moves = 0            
        itr_delta_entropy = zeros(max_num_nodal_itr)

        # compute the global entropy for MCMC convergence criterion
        overall_entropy = compute_overall_entropy(
                    interblock_edge_count, block_degrees_out, block_degrees_in, num_blocks, nv(g), ne(g)
                )
    
        for itr in 1:max_num_nodal_itr
            num_nodal_moves = 0
            itr_delta_entropy[itr] = 0.0
            println("$itr, $(num_nodal_moves)")
            for current_node in vertices(g)
                current_block = partition[current_node] 

                # neighbouring *nodes* of the current node; looked up once and reused below
                out_nodes = out_neighbors(g, current_node)
                in_nodes = in_neighbors(g, current_node)
                if (length(out_nodes) + length(in_nodes) == 0)
                    #println("No neighbors found for $(current_block). Skipping.")
                    continue
                end

                # FIXME: weights are floats; they are rounded to integer edge counts here
                out_weights = [round(Int64, get_weight(g, current_node, n)) for n in out_nodes]
                in_weights = [round(Int64, get_weight(g, n, current_node)) for n in in_nodes]

                # propose a new block for this node (neighbour node ids in column 1, edge counts in column 2)
                out_blocks_counts = hcat(out_nodes, out_weights)
                in_blocks_counts = hcat(in_nodes, in_weights)
                #@show out_nodes, out_blocks_counts, in_nodes, in_blocks_counts
                proposal, num_out_neighbor_edges, num_in_neighbor_edges, num_neighbor_edges = propose_new_partition(
                            current_block, out_blocks_counts, in_blocks_counts, partition, 
                            interblock_edge_count, block_degrees, num_blocks, false
                        )
                
                # determine whether to accept or reject the proposal
                if (proposal != current_block)
                    # compute block counts of in and out neighbors:
                    # aggregate the edge counts per neighbouring *block*
                    out_blocks_count_map = countmap(partition[out_nodes], Distributions.weights(out_weights))
                    in_blocks_count_map = countmap(partition[in_nodes], Distributions.weights(in_weights))
                    
                    # from here on column 1 holds block ids, column 2 the edge count to/from that block
                    out_blocks_counts = zeros(Int64, length(keys(out_blocks_count_map)), 2)
                    in_blocks_counts = zeros(Int64, length(keys(in_blocks_count_map)), 2)
                    
                    for (idx, (block, count)) in enumerate(out_blocks_count_map)
                        out_blocks_counts[idx, 1] = block
                        out_blocks_counts[idx, 2] = count
                    end
                
                    for (idx, (block, count)) in enumerate(in_blocks_count_map)
                        in_blocks_counts[idx, 1] = block
                        in_blocks_counts[idx, 2] = count
                    end
                    
                    #@show out_blocks_counts, partition[out_nodes]
                    #@show in_blocks_counts, partition[in_nodes]
                    # compute the two new rows and columns of the interblock edge count matrix
                    if has_edge(g, current_node, current_node)
                        # rounded like every other weight so the count stays an Int64
                        # (the agglomerative call above passes an Int64 in this position)
                        self_edge_weight = round(Int64, get_weight(g, current_node, current_node))
                        println("self_edge_weight of $(self_edge_weight)")
                    else
                        self_edge_weight = 0
                    end
                
                    new_interblock_edge_count_current_block_row, new_interblock_edge_count_new_block_row, new_interblock_edge_count_current_block_col, new_interblock_edge_count_new_block_col = 
                        compute_new_rows_cols_interblock_edge_count_matrix(
                        interblock_edge_count, current_block, proposal, 
                        out_blocks_counts[:, 1], out_blocks_counts[:, 2],
                        in_blocks_counts[:, 1], in_blocks_counts[:, 2], 
                        self_edge_weight, false
                    )
                    # diagnostics: indices of negative entries in the proposed rows / column
                    @show find(x->x<0, new_interblock_edge_count_current_block_row)
                    @show find(x->x<0, new_interblock_edge_count_new_block_row)
                    @show find(x->x<0, new_interblock_edge_count_current_block_col)
                    # compute new block degrees           
                    block_degrees_out_new, block_degrees_in_new, block_degrees_new = compute_new_block_degrees(
                        current_block, proposal, block_degrees_out, block_degrees_in, block_degrees, 
                        num_out_neighbor_edges, num_in_neighbor_edges, num_neighbor_edges
                    )
                    # compute the Hastings correction
                    Hastings_correction = compute_Hastings_correction(
                        out_blocks_counts[:, 1], out_blocks_counts[:, 2], in_blocks_counts[:, 1], in_blocks_counts[:, 2], 
                        proposal, interblock_edge_count, new_interblock_edge_count_current_block_row, 
                        new_interblock_edge_count_current_block_col, num_blocks, block_degrees, block_degrees_new
                    )

                    # compute change in entropy / posterior
                    delta_entropy = compute_delta_entropy(current_block, proposal, interblock_edge_count,
                    new_interblock_edge_count_current_block_col, new_interblock_edge_count_new_block_col,
                    new_interblock_edge_count_current_block_row, new_interblock_edge_count_new_block_row, 
                    block_degrees_out, block_degrees_in, block_degrees_out_new, block_degrees_in_new)
                  end
            end
        end
    # Everything inside the block comment below is the remainder of the algorithm that has
    # not been ported yet: the accept/reject step (already in Julia) followed by the original
    # Python convergence test and block-count search. It is kept as the porting reference.
#=
                    # compute probability of acceptance
                    p_accept = min((e^(-β*delta_entropy))*Hastings_correction, 1)

                    # if accept the proposal, update the partition, inter_block_edge_count, and block degrees
                    if (rand() <= p_accept)
                        total_num_nodal_moves += 1
                        num_nodal_moves += 1
                        itr_delta_entropy[itr] += delta_entropy
                        println("Updating edge count for $(current_node) - $(current_block) to $(proposal)")
                        partition, interblock_edge_count, block_degrees_out, block_degrees_in, block_degrees = update_partition(
                            partition, current_node, current_block, proposal, interblock_edge_count,
                            new_interblock_edge_count_current_block_row, new_interblock_edge_count_new_block_row, 
                            new_interblock_edge_count_current_block_col, new_interblock_edge_count_new_block_col, 
                            block_degrees_out_new, block_degrees_in_new, block_degrees_new
                        )
                        for b_test in 1:num_blocks
                            d = sum(interblock_edge_count[:, b_test] + interblock_edge_count[b_test, :])
                            self_edge = interblock_edge_count[b_test, b_test]
                            if (d != block_degrees_new[b_test] )
                                println("$(b_test) inconsistent - $d, $(block_degrees_new[b_test]), $(self_edge)")
                            end
                        end
                    end

            if verbose:
                print("Itr: {}, number of nodal moves: {}, delta S: {:0.5f}".format(itr, num_nodal_moves, itr_delta_entropy[itr]/float(overall_entropy)))
            if itr>=(delta_entropy_moving_avg_window-1): # exit MCMC if the recent change in entropy falls below a small fraction of the overall entropy
                if not(np.all(np.isfinite(old_overall_entropy))): # golden ratio bracket not yet established 
                    if (-np.mean(itr_delta_entropy[(itr-delta_entropy_moving_avg_window+1):itr]) < (delta_entropy_threshold1*overall_entropy)):
                        break
                else: # golden ratio bracket is established. Fine-tuning partition.
                    if (-np.mean(itr_delta_entropy[(itr-delta_entropy_moving_avg_window+1):itr]) < (delta_entropy_threshold2*overall_entropy)):
                        break
                
        # compute the global entropy for determining the optimal number of blocks
        overall_entropy = compute_overall_entropy(interblock_edge_count, block_degrees_out, block_degrees_in, num_blocks, N, E, use_sparse_matrix)

        if verbose:
            print("Total number of nodal moves: {}, overall_entropy: {:0.2f}".format(total_num_nodal_moves, overall_entropy))
        if visualize_graph & use_graph_tool_options:
            graph_object = plot_graph_with_partition(out_neighbors, partition, graph_object)

        # check whether the partition with optimal number of block has been found; if not, determine and prepare for the next number of blocks to try
        partition, interblock_edge_count, block_degrees, block_degrees_out, block_degrees_in, num_blocks, num_blocks_to_merge, old_partition, old_interblock_edge_count, old_block_degrees, old_block_degrees_out, old_block_degrees_in, old_overall_entropy, old_num_blocks, optimal_num_blocks_found = \
            prepare_for_partition_on_next_num_blocks(overall_entropy, partition, interblock_edge_count, block_degrees, block_degrees_out, block_degrees_in, num_blocks, old_partition, old_interblock_edge_count, old_block_degrees, old_block_degrees_out, old_block_degrees_in, old_overall_entropy, old_num_blocks, num_block_reduction_rate)

        if verbose:
            print('Overall entropy: {}'.format(old_overall_entropy))
            print('Number of blocks: {}'.format(old_num_blocks))
            if optimal_num_blocks_found:
                print('\nOptimal partition found with {} blocks'.format(num_blocks))
    if use_timeit:
        t1 = timeit.default_timer()
        print('\nGraph partition took {} seconds'.format(t1-t0))

    # evaluate output partition against the true partition
    evaluate_partition(true_partition, partition)
"""
=#
    return nothing
end

main (generic function with 1 method)

In [31]:
# Run the partitioning driver on the "emergingEdges" sample with 1000 vertices.
main("emergingEdges", 1000)

[1m[36mINFO: [39m[22m[36m{1000, 1978} directed simple Int64 graph with Float64 weightsLoaded
[39m


Merging down blocks from 1000 to 500
Beginning nodal updates
1, 0


LoadError: DomainError:
log will only return a complex result if called with a complex argument. Try log(complex(x)).

In [20]:
"""
    foo()

Scratch driver that replays a single agglomerative merge proposal for block 1
on streaming piece 3 of the "emergingEdges" graph (1000 vertices, every vertex
in its own block) and prints the proposed block, the non-zero positions of the
proposed block's new column and row, and the resulting change in entropy.
"""
function foo()
    num_nodes = 1000
    num_blocks = num_nodes
    current_block = 1

    # graph and singleton partition
    g = SimpleWeightedDiGraph(num_nodes)
    load_graph!(g, "emergingEdges", num_nodes, 3)
    partition = collect(1:num_nodes)
    block_partition = collect(1:num_blocks)

    M, d_out, d_in, d_total = initialize_edge_counts(g, num_blocks, partition)

    # blocks adjacent to `current_block`, paired with the corresponding edge counts
    col_nz = findn(M[:, current_block])
    row_nz = findn(M[current_block, :])
    out_pairs = hcat(col_nz, M[col_nz, current_block])
    in_pairs = hcat(row_nz, M[current_block, row_nz])

    # draw a block to merge with
    merge_target, n_out_edges, n_in_edges, n_edges = propose_new_partition(
        current_block, out_pairs, in_pairs, block_partition, M, d_total, num_blocks, true
    )
    println(merge_target)

    # proposed rows / columns of the edge count matrix
    # (the two `new_block` names are kept because `@show` echoes the expression text)
    cur_row, new_interblock_edge_count_new_block_row, cur_col, new_interblock_edge_count_new_block_col =
        compute_new_rows_cols_interblock_edge_count_matrix(
            M, current_block, merge_target,
            out_pairs[:, 1], out_pairs[:, 2],
            in_pairs[:, 1], in_pairs[:, 2],
            M[current_block, current_block],
            true
        )
    @show findn.([new_interblock_edge_count_new_block_col, new_interblock_edge_count_new_block_row])

    # block degrees after the proposed merge
    d_out_new, d_in_new, d_total_new = compute_new_block_degrees(
        current_block, merge_target, d_out, d_in, d_total,
        n_out_edges, n_in_edges, n_edges
    )

    delta_entropy = compute_delta_entropy(
        current_block, merge_target, M,
        cur_col, new_interblock_edge_count_new_block_col,
        cur_row, new_interblock_edge_count_new_block_row,
        d_out, d_in, d_out_new, d_in_new
    )

    println(delta_entropy)
    return nothing
end

foo (generic function with 1 method)

In [None]:
# Execute the scratch driver `foo` defined in the previous cell.
foo()

In [None]:
# NOTE(review): `p` is not defined in the nearby cells; presumably a leftover
# from an interactive session - confirm it exists before re-running this cell.
sum(p)

In [None]:
# Scratch check: sum each of the two literal integer vectors, then add the two sums.
sum(sum.(([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1])))

In [None]:
# Print the type-inferred code of `main` for these arguments to look for type instabilities.
@code_warntype main("emergingEdges", 1000)

In [None]:
?copy