# scipy/sparse/csgraph/_traversal.pyx

"""
Routines for traversing graphs in compressed sparse format
"""

# Author: Jake Vanderplas  -- <vanderplas@astro.washington.edu>
# License: BSD, (C) 2012

import numpy as np
cimport numpy as np

from scipy.sparse import csr_matrix, isspmatrix, isspmatrix_csr, isspmatrix_csc
from scipy.sparse.csgraph._validation import validate_graph
from scipy.sparse.csgraph._tools import reconstruct_path

cimport cython
from libc cimport stdlib

include 'parameters.pxi'

def connected_components(csgraph, directed=True, connection='weak',
                         return_labels=True):
    """
    connected_components(csgraph, directed=True, connection='weak',
                         return_labels=True)

    Analyze the connected components of a sparse graph

    .. versionadded:: 0.11.0

    Parameters
    ----------
    csgraph : array_like or sparse matrix
        The N x N matrix representing the compressed sparse graph.  The input
        csgraph will be converted to csr format for the calculation.
    directed : bool, optional
        If True (default), then operate on a directed graph: only
        move from point i to point j along paths csgraph[i, j].
        If False, then treat the graph as undirected: nodes i and j are
        linked by either csgraph[i, j] or csgraph[j, i].
    connection : str, optional
        ['weak'|'strong'], compared case-insensitively.  For directed
        graphs, the type of connection to use.  Nodes i and j are strongly
        connected if a path exists both from i to j and from j to i.
        With 'weak' the edge directions are ignored, i.e. the graph is
        analyzed exactly as if directed == False.  If directed == False,
        both settings lead to the undirected analysis.
    return_labels : bool, optional
        If True (default), then return the labels for each of the connected
        components.

    Returns
    -------
    n_components: int
        The number of connected components.
    labels: ndarray
        The length-N array of labels of the connected components.
        Returned only if return_labels is True.

    Raises
    ------
    ValueError
        If connection is neither 'weak' nor 'strong'.
    """
    mode = connection.lower()
    if mode not in ('weak', 'strong'):
        raise ValueError("connection must be 'weak' or 'strong'")

    # Weakly connected components of a directed graph are the connected
    # components of the same graph with edge directions dropped.
    if mode == 'weak':
        directed = False

    csgraph = validate_graph(csgraph, directed, dense_output=False)

    labels = np.empty(csgraph.shape[0], dtype=ITYPE)
    labels.fill(NULL_IDX)

    if not directed:
        # The undirected search follows edges in both directions, so it
        # needs the CSR structure of the transpose as well.
        transposed = csgraph.T.tocsr()
        n_components = _connected_components_undirected(csgraph.indices,
                                                        csgraph.indptr,
                                                        transposed.indices,
                                                        transposed.indptr,
                                                        labels)
    else:
        n_components = _connected_components_directed(csgraph.indices,
                                                      csgraph.indptr,
                                                      labels)

    if not return_labels:
        return n_components
    return n_components, labels
    

def breadth_first_tree(csgraph, i_start, directed=True):
    r"""
    breadth_first_tree(csgraph, i_start, directed=True)

    Return the tree generated by a breadth-first search

    Note that a breadth-first tree from a specified node is unique.

    .. versionadded:: 0.11.0

    Parameters
    ----------
    csgraph : array_like or sparse matrix
        The N x N matrix representing the compressed sparse graph.  The input
        csgraph will be converted to csr format for the calculation.
    i_start : int
        The index of starting node.
    directed : bool, optional
        If True (default), then operate on a directed graph: only
        move from point i to point j along paths csgraph[i, j].
        If False, then traverse the graph as undirected: the
        algorithm can progress from point i to j along csgraph[i, j] or
        csgraph[j, i].

    Returns
    -------
    cstree : csr matrix
        The N x N directed compressed-sparse representation of the breadth-
        first tree drawn from csgraph, starting at the specified node.

    Examples
    --------
    The following example shows the computation of a breadth-first tree
    over a simple four-node graph, starting at node 0::

         input graph          breadth first tree from (0)

             (0)                         (0)
            /   \                       /   \
           3     8                     3     8
          /       \                   /       \
        (3)---5---(1)               (3)       (1)
          \       /                           /
           6     2                           2
            \   /                           /
             (2)                         (2)

    In compressed sparse representation, the solution looks like this:

    >>> from scipy.sparse import csr_matrix
    >>> from scipy.sparse.csgraph import breadth_first_tree
    >>> X = csr_matrix([[0, 8, 0, 3],
    ...                 [0, 0, 2, 5],
    ...                 [0, 0, 0, 6],
    ...                 [0, 0, 0, 0]])
    >>> Tcsr = breadth_first_tree(X, 0, directed=False)
    >>> Tcsr.toarray().astype(int)
    array([[0, 8, 0, 3],
           [0, 0, 2, 0],
           [0, 0, 0, 0],
           [0, 0, 0, 0]])

    Note that the resulting graph is a Directed Acyclic Graph which spans
    the graph.  A breadth-first tree from a given node is unique.
    """
    # Only the predecessor array is needed to rebuild the tree; the visiting
    # order itself is discarded.
    predecessors = breadth_first_order(csgraph, i_start, directed,
                                       return_predecessors=True)[1]
    return reconstruct_path(csgraph, predecessors, directed)


def depth_first_tree(csgraph, i_start, directed=True):
    r"""
    depth_first_tree(csgraph, i_start, directed=True)

    Return a tree generated by a depth-first search.

    Note that a tree generated by a depth-first search is not unique:
    it depends on the order that the children of each node are searched.

    .. versionadded:: 0.11.0

    Parameters
    ----------
    csgraph : array_like or sparse matrix
        The N x N matrix representing the compressed sparse graph.  The input
        csgraph will be converted to csr format for the calculation.
    i_start : int
        The index of starting node.
    directed : bool, optional
        If True (default), then operate on a directed graph: only
        move from point i to point j along paths csgraph[i, j].
        If False, then traverse the graph as undirected: the
        algorithm can progress from point i to j along csgraph[i, j] or
        csgraph[j, i].

    Returns
    -------
    cstree : csr matrix
        The N x N directed compressed-sparse representation of the depth-
        first tree drawn from csgraph, starting at the specified node.

    Examples
    --------
    The following example shows the computation of a depth-first tree
    over a simple four-node graph, starting at node 0::

         input graph           depth first tree from (0)

             (0)                         (0)
            /   \                           \
           3     8                           8
          /       \                           \
        (3)---5---(1)               (3)       (1)
          \       /                   \       /
           6     2                     6     2
            \   /                       \   /
             (2)                         (2)

    In compressed sparse representation, the solution looks like this:

    >>> from scipy.sparse import csr_matrix
    >>> from scipy.sparse.csgraph import depth_first_tree
    >>> X = csr_matrix([[0, 8, 0, 3],
    ...                 [0, 0, 2, 5],
    ...                 [0, 0, 0, 6],
    ...                 [0, 0, 0, 0]])
    >>> Tcsr = depth_first_tree(X, 0, directed=False)
    >>> Tcsr.toarray().astype(int)
    array([[0, 8, 0, 0],
           [0, 0, 2, 0],
           [0, 0, 0, 6],
           [0, 0, 0, 0]])

    Note that the resulting graph is a Directed Acyclic Graph which spans
    the graph.  Unlike a breadth-first tree, a depth-first tree of a given
    graph is not unique if the graph contains cycles.  If the above solution
    had begun with the edge connecting nodes 0 and 3, the result would have
    been different.
    """
    # Only the predecessor array is needed to rebuild the tree; the visiting
    # order itself is discarded.
    predecessors = depth_first_order(csgraph, i_start, directed,
                                     return_predecessors=True)[1]
    return reconstruct_path(csgraph, predecessors, directed)


def breadth_first_order(csgraph, i_start,
                        directed=True, return_predecessors=True):
    """
    breadth_first_order(csgraph, i_start, directed=True, return_predecessors=True)

    Return a breadth-first ordering starting with specified node.

    Note that a breadth-first order is not unique, but the tree which it
    generates is unique.

    .. versionadded:: 0.11.0

    Parameters
    ----------
    csgraph : array_like or sparse matrix
        The N x N compressed sparse graph.  The input csgraph will be
        converted to csr format for the calculation.
    i_start : int
        The index of starting node.  Must satisfy ``0 <= i_start < N``.
    directed : bool, optional
        If True (default), then operate on a directed graph: only
        move from point i to point j along paths csgraph[i, j].
        If False, then traverse the graph as undirected: the
        algorithm can progress from point i to j along csgraph[i, j] or
        csgraph[j, i].
    return_predecessors : bool, optional
        If True (default), then return the predecessor array (see below).

    Returns
    -------
    node_array : ndarray, one dimension
        The breadth-first list of nodes, starting with specified node.  The
        length of node_array is the number of nodes reachable from the
        specified node.
    predecessors : ndarray, one dimension
        Returned only if return_predecessors is True.
        The length-N list of predecessors of each node in a breadth-first
        tree.  If node i is in the tree, then its parent is given by
        predecessors[i]. If node i is not in the tree (and for the start
        node itself) then predecessors[i] = -9999.

    Raises
    ------
    ValueError
        If i_start is not a valid node index for the graph (this includes
        every i_start when the graph has no nodes).
    """
    global NULL_IDX
    csgraph = validate_graph(csgraph, directed, dense_output=False)
    cdef int N = csgraph.shape[0]

    cdef np.ndarray node_list = np.empty(N, dtype=ITYPE)
    cdef np.ndarray predecessors = np.empty(N, dtype=ITYPE)

    # The typed helpers use i_start directly as an index into node_list and
    # indptr, so an out-of-range value must be rejected here, before any
    # buffer is touched.
    if not (0 <= i_start < N):
        raise ValueError("i_start=%r is out of range for a graph with "
                         "%d nodes" % (i_start, N))

    node_list.fill(NULL_IDX)
    predecessors.fill(NULL_IDX)

    if directed:
        length = _breadth_first_directed(i_start,
                                csgraph.indices, csgraph.indptr,
                                node_list, predecessors)
    else:
        # Undirected traversal also follows incoming edges, which are the
        # rows of the transposed graph in CSR form.
        csgraph_T = csgraph.T.tocsr()
        length = _breadth_first_undirected(i_start,
                                           csgraph.indices, csgraph.indptr,
                                           csgraph_T.indices, csgraph_T.indptr,
                                           node_list, predecessors)

    # Only the first `length` slots of node_list were filled by the search.
    if return_predecessors:
        return node_list[:length], predecessors
    else:
        return node_list[:length]
    

cdef unsigned int _breadth_first_directed(
                           unsigned int head_node,
                           np.ndarray[ITYPE_t, ndim=1, mode='c'] indices,
                           np.ndarray[ITYPE_t, ndim=1, mode='c'] indptr,
                           np.ndarray[ITYPE_t, ndim=1, mode='c'] node_list,
                           np.ndarray[ITYPE_t, ndim=1, mode='c'] predecessors):
    # Breadth-first traversal of a directed graph given in CSR form.
    #
    # Inputs:
    #  head_node: (input) index of the node from which traversal starts
    #  indices: (input) CSR indices of graph
    #  indptr:  (input) CSR indptr of graph
    #  node_list: (output) breadth-first list of nodes
    #  predecessors: (output) list of predecessors of nodes in breadth-first
    #                tree.  Should be initialized to NULL_IDX
    # Returns:
    #  n_nodes: the number of nodes in the breadth-first tree
    global NULL_IDX

    cdef unsigned int edge, parent, child
    # node_list doubles as the FIFO queue: entries in [q_front, q_back) are
    # discovered but not yet expanded.
    cdef unsigned int q_front = 0
    cdef unsigned int q_back = 1

    node_list[0] = head_node

    while q_front < q_back:
        parent = node_list[q_front]
        q_front += 1

        for edge in range(indptr[parent], indptr[parent + 1]):
            child = indices[edge]
            # The head keeps predecessor NULL_IDX, so it has to be
            # recognised explicitly to avoid queueing it a second time.
            if child == head_node:
                continue
            if predecessors[child] == NULL_IDX:
                node_list[q_back] = child
                predecessors[child] = parent
                q_back += 1

    return q_front
    

cdef unsigned int _breadth_first_undirected(
                           unsigned int head_node,
                           np.ndarray[ITYPE_t, ndim=1, mode='c'] indices1,
                           np.ndarray[ITYPE_t, ndim=1, mode='c'] indptr1,
                           np.ndarray[ITYPE_t, ndim=1, mode='c'] indices2,
                           np.ndarray[ITYPE_t, ndim=1, mode='c'] indptr2,
                           np.ndarray[ITYPE_t, ndim=1, mode='c'] node_list,
                           np.ndarray[ITYPE_t, ndim=1, mode='c'] predecessors):
    # Breadth-first traversal that follows edges in both directions.
    #
    # Inputs:
    #  head_node: (input) index of the node from which traversal starts
    #  indices1: (input) CSR indices of graph
    #  indptr1:  (input) CSR indptr of graph
    #  indices2: (input) CSR indices of transposed graph
    #  indptr2:  (input) CSR indptr of transposed graph
    #  node_list: (output) breadth-first list of nodes
    #  predecessors: (output) list of predecessors of nodes in breadth-first
    #                tree.  Should be initialized to NULL_IDX
    # Returns:
    #  n_nodes: the number of nodes in the breadth-first tree
    global NULL_IDX

    cdef unsigned int edge, parent, child
    # node_list doubles as the FIFO queue: entries in [q_front, q_back) are
    # discovered but not yet expanded.
    cdef unsigned int q_front = 0
    cdef unsigned int q_back = 1

    node_list[0] = head_node

    while q_front < q_back:
        parent = node_list[q_front]
        q_front += 1

        # Neighbours reached through the rows of the graph itself.
        for edge in range(indptr1[parent], indptr1[parent + 1]):
            child = indices1[edge]
            # The head keeps predecessor NULL_IDX, so it has to be
            # recognised explicitly to avoid queueing it a second time.
            if child == head_node:
                continue
            if predecessors[child] == NULL_IDX:
                node_list[q_back] = child
                predecessors[child] = parent
                q_back += 1

        # Neighbours reached through the rows of the transposed graph.
        for edge in range(indptr2[parent], indptr2[parent + 1]):
            child = indices2[edge]
            if child == head_node:
                continue
            if predecessors[child] == NULL_IDX:
                node_list[q_back] = child
                predecessors[child] = parent
                q_back += 1

    return q_front


def depth_first_order(csgraph, i_start,
                      directed=True, return_predecessors=True):
    """
    depth_first_order(csgraph, i_start, directed=True, return_predecessors=True)

    Return a depth-first ordering starting with specified node.

    Note that a depth-first order is not unique.  Furthermore, for graphs
    with cycles, the tree generated by a depth-first search is not
    unique either.

    .. versionadded:: 0.11.0

    Parameters
    ----------
    csgraph : array_like or sparse matrix
        The N x N compressed sparse graph.  The input csgraph will be
        converted to csr format for the calculation.
    i_start : int
        The index of starting node.  Must satisfy ``0 <= i_start < N``.
    directed : bool, optional
        If True (default), then operate on a directed graph: only
        move from point i to point j along paths csgraph[i, j].
        If False, then traverse the graph as undirected: the
        algorithm can progress from point i to j along csgraph[i, j] or
        csgraph[j, i].
    return_predecessors : bool, optional
        If True (default), then return the predecessor array (see below).

    Returns
    -------
    node_array : ndarray, one dimension
        The depth-first list of nodes, starting with specified node.  The
        length of node_array is the number of nodes reachable from the
        specified node.
    predecessors : ndarray, one dimension
        Returned only if return_predecessors is True.
        The length-N list of predecessors of each node in a depth-first
        tree.  If node i is in the tree, then its parent is given by
        predecessors[i]. If node i is not in the tree (and for the start
        node itself) then predecessors[i] = -9999.

    Raises
    ------
    ValueError
        If i_start is not a valid node index for the graph (this includes
        every i_start when the graph has no nodes).
    """
    global NULL_IDX
    csgraph = validate_graph(csgraph, directed, dense_output=False)
    cdef int N = csgraph.shape[0]

    # The typed helpers use i_start directly as an index into node_list,
    # flag and indptr, so an out-of-range value must be rejected here,
    # before any buffer is touched.
    if not (0 <= i_start < N):
        raise ValueError("i_start=%r is out of range for a graph with "
                         "%d nodes" % (i_start, N))

    node_list = np.empty(N, dtype=ITYPE)
    predecessors = np.empty(N, dtype=ITYPE)
    # root_list is the explicit DFS stack; flag marks nodes already visited.
    root_list = np.empty(N, dtype=ITYPE)
    flag = np.zeros(N, dtype=ITYPE)
    node_list.fill(NULL_IDX)
    predecessors.fill(NULL_IDX)
    root_list.fill(NULL_IDX)

    if directed:
        length = _depth_first_directed(i_start,
                              csgraph.indices, csgraph.indptr,
                              node_list, predecessors,
                              root_list, flag)
    else:
        # Undirected traversal also follows incoming edges, which are the
        # rows of the transposed graph in CSR form.
        csgraph_T = csgraph.T.tocsr()
        length = _depth_first_undirected(i_start,
                                         csgraph.indices, csgraph.indptr,
                                         csgraph_T.indices, csgraph_T.indptr,
                                         node_list, predecessors,
                                         root_list, flag)

    # Only the first `length` slots of node_list were filled by the search.
    if return_predecessors:
        return node_list[:length], predecessors
    else:
        return node_list[:length]
    

cdef unsigned int _depth_first_directed(
                           unsigned int head_node,
                           np.ndarray[ITYPE_t, ndim=1, mode='c'] indices,
                           np.ndarray[ITYPE_t, ndim=1, mode='c'] indptr,
                           np.ndarray[ITYPE_t, ndim=1, mode='c'] node_list,
                           np.ndarray[ITYPE_t, ndim=1, mode='c'] predecessors,
                           np.ndarray[ITYPE_t, ndim=1, mode='c'] root_list,
                           np.ndarray[ITYPE_t, ndim=1, mode='c'] flag):
    # Iterative depth-first traversal of a directed graph in CSR form.
    #
    # Inputs:
    #  head_node: (input) index of the node from which traversal starts
    #  indices: (input) CSR indices of graph
    #  indptr:  (input) CSR indptr of graph
    #  node_list: (output) depth-first list of nodes
    #  predecessors: (output) predecessor of each visited node in the
    #                depth-first tree; entries of other nodes are untouched
    #  root_list: (scratch) explicit stack holding the current DFS path
    #  flag: (scratch) non-zero for nodes already visited; the caller is
    #        expected to pass it zero-initialized
    # Returns:
    #  the number of nodes written to node_list
    cdef unsigned int edge, child, parent
    cdef unsigned int n_visited = 1
    cdef unsigned int N = node_list.shape[0]
    cdef int top = 0
    cdef int descended

    node_list[0] = head_node
    root_list[0] = head_node
    flag[head_node] = 1

    while top >= 0:
        parent = root_list[top]
        descended = False

        # Step into the first neighbour that has not been visited yet.
        for edge in range(indptr[parent], indptr[parent + 1]):
            child = indices[edge]
            if not flag[child]:
                top += 1
                root_list[top] = child
                node_list[n_visited] = child
                predecessors[child] = parent
                flag[child] = 1
                n_visited += 1
                descended = True
                break

        # Every node has been reached: no need to unwind the stack.
        if n_visited == N:
            break

        # Dead end: backtrack to the previous node on the path.
        if not descended:
            top -= 1

    return n_visited
    

cdef unsigned int _depth_first_undirected(
                           unsigned int head_node,
                           np.ndarray[ITYPE_t, ndim=1, mode='c'] indices1,
                           np.ndarray[ITYPE_t, ndim=1, mode='c'] indptr1,
                           np.ndarray[ITYPE_t, ndim=1, mode='c'] indices2,
                           np.ndarray[ITYPE_t, ndim=1, mode='c'] indptr2,
                           np.ndarray[ITYPE_t, ndim=1, mode='c'] node_list,
                           np.ndarray[ITYPE_t, ndim=1, mode='c'] predecessors,
                           np.ndarray[ITYPE_t, ndim=1, mode='c'] root_list,
                           np.ndarray[ITYPE_t, ndim=1, mode='c'] flag):
    # Iterative depth-first traversal that follows edges in both directions.
    #
    # Inputs:
    #  head_node: (input) index of the node from which traversal starts
    #  indices1: (input) CSR indices of graph
    #  indptr1:  (input) CSR indptr of graph
    #  indices2: (input) CSR indices of transposed graph
    #  indptr2:  (input) CSR indptr of transposed graph
    #  node_list: (output) depth-first list of nodes
    #  predecessors: (output) predecessor of each visited node in the
    #                depth-first tree; entries of other nodes are untouched
    #  root_list: (scratch) explicit stack holding the current DFS path
    #  flag: (scratch) non-zero for nodes already visited; the caller is
    #        expected to pass it zero-initialized
    # Returns:
    #  the number of nodes written to node_list
    cdef unsigned int edge, child, parent
    cdef unsigned int n_visited = 1
    cdef unsigned int N = node_list.shape[0]
    cdef int top = 0
    cdef int descended

    node_list[0] = head_node
    root_list[0] = head_node
    flag[head_node] = 1

    while top >= 0:
        parent = root_list[top]
        descended = False

        # First try the neighbours listed in the graph itself.
        for edge in range(indptr1[parent], indptr1[parent + 1]):
            child = indices1[edge]
            if not flag[child]:
                top += 1
                root_list[top] = child
                node_list[n_visited] = child
                predecessors[child] = parent
                flag[child] = 1
                n_visited += 1
                descended = True
                break

        # Fall back to the neighbours listed in the transposed graph only
        # when the first scan found nothing new.
        if not descended:
            for edge in range(indptr2[parent], indptr2[parent + 1]):
                child = indices2[edge]
                if not flag[child]:
                    top += 1
                    root_list[top] = child
                    node_list[n_visited] = child
                    predecessors[child] = parent
                    flag[child] = 1
                    n_visited += 1
                    descended = True
                    break

        # Every node has been reached: no need to unwind the stack.
        if n_visited == N:
            break

        # Dead end: backtrack to the previous node on the path.
        if not descended:
            top -= 1

    return n_visited


cdef int _connected_components_directed(
                                 np.ndarray[ITYPE_t, ndim=1, mode='c'] indices,
                                 np.ndarray[ITYPE_t, ndim=1, mode='c'] indptr,
                                 np.ndarray[ITYPE_t, ndim=1, mode='c'] labels):
    """
    Uses an iterative version of Tarjan's algorithm to find the
    strongly connected components of a directed graph represented as a
    sparse matrix (scipy.sparse.csc_matrix or scipy.sparse.csr_matrix).

    The algorithmic complexity is for a graph with E edges and V
    vertices is O(E + V).
    The storage requirement is 2*V integer arrays.

    Uses an iterative version of the algorithm described here:
    http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.102.1707

    Inputs:
      indices: (input) compressed-sparse indices of graph
      indptr:  (input) compressed-sparse indptr of graph
      labels:  (output) length-N array.  Its incoming contents are
               overwritten; on return labels[i] is the component label
               of node i, with labels counting upward from 0.
    Returns:
      the number of component labels that were assigned
    """
    cdef int v, w, index, low_v, low_w, label, j
    cdef int SS_head, root, stack_head, f, b
    # Sentinels: VOID marks "not in this structure", END terminates a
    # linked list.  Both are negative, so neither can be a node index.
    cdef int VOID = -1
    cdef int END = -2
    cdef int N = labels.shape[0]
    cdef np.ndarray[ITYPE_t, ndim=1, mode="c"] SS, lowlinks, stack_f, stack_b

    # lowlinks is an alias of labels and stack_f is an alias of SS (same
    # memory), so only two extra length-N arrays are allocated here.
    lowlinks = labels
    SS = np.ndarray((N,), dtype=ITYPE)
    stack_b = np.ndarray((N,), dtype=ITYPE)
    stack_f = SS

    # The stack of nodes which have been backtracked and are in the current SCC
    SS.fill(VOID)
    SS_head = END

    # The array containing the lowlinks of nodes not yet assigned an SCC. Shares
    # memory with the labels array, since they are not used at the same time.
    lowlinks.fill(VOID)

    # The DFS stack. Stored with both forwards and backwards pointers to allow
    # us to move a node up to the top of the stack, as we only need to visit
    # each node once. stack_f shares memory with SS, as nodes aren't put on the
    # SS stack until after they've been popped from the DFS stack.
    stack_head = END
    stack_f.fill(VOID)
    stack_b.fill(VOID)

    index = 0
    # Count SCC labels backwards so as not to clash with lowlinks values.
    label = N - 1
    for v in range(N):
        # Start a new depth-first search from every node not yet visited.
        if lowlinks[v] == VOID:
            # DFS-stack push
            stack_head = v
            stack_f[v] = END
            stack_b[v] = END
            while stack_head != END:
                v = stack_head
                if lowlinks[v] == VOID:
                    # First time v is on top of the stack: give it the next
                    # visiting index and leave it on the stack.
                    lowlinks[v] = index
                    index += 1

                    # Add successor nodes
                    for j from indptr[v] <= j < indptr[v+1]:
                        w = indices[j]
                        if lowlinks[w] == VOID:
                            # DFS-stack push
                            if stack_f[w] != VOID:
                                # w is already inside the stack, so excise it.
                                f = stack_f[w]
                                b = stack_b[w]
                                if b != END:
                                    stack_f[b] = f
                                if f != END:
                                    stack_b[f] = b

                            # Link w in as the new top of the stack.
                            stack_f[w] = stack_head
                            stack_b[w] = END
                            stack_b[stack_head] = w
                            stack_head = w

                else:
                    # v is on top of the stack again after having been
                    # visited once.
                    # DFS-stack pop
                    stack_head = stack_f[v]
                    # END and VOID are negative, so this is true only when
                    # another node remains on the stack.
                    if stack_head >= 0:
                        stack_b[stack_head] = END
                    stack_f[v] = VOID
                    stack_b[v] = VOID

                    # v stays a root unless one of its successors carries a
                    # strictly smaller lowlink value.
                    root = 1 # True
                    low_v = lowlinks[v]
                    for j from indptr[v] <= j < indptr[v+1]:
                        low_w = lowlinks[indices[j]]
                        if low_w < low_v:
                            low_v = low_w
                            root = 0 # False
                    lowlinks[v] = low_v

                    if root: # Found a root node
                        index -= 1
                        # while S not empty and rindex[v] <= rindex[top[S]
                        while SS_head != END and lowlinks[v] <= lowlinks[SS_head]:
                            w = SS_head        # w = pop(S)
                            SS_head = SS[w]
                            SS[w] = VOID

                            labels[w] = label  # rindex[w] = c
                            index -= 1         # index = index - 1
                        labels[v] = label  # rindex[v] = c
                        label -= 1         # c = c - 1
                    else:
                        SS[v] = SS_head  # push(S, v)
                        SS_head = v

    # labels count down from N-1 to zero. Modify them so they
    # count upward from 0
    labels *= -1
    labels += (N - 1)
    # label was decremented once per component found.
    return (N - 1) - label

cdef int _connected_components_undirected(
                           np.ndarray[ITYPE_t, ndim=1, mode='c'] indices1,
                           np.ndarray[ITYPE_t, ndim=1, mode='c'] indptr1,
                           np.ndarray[ITYPE_t, ndim=1, mode='c'] indices2,
                           np.ndarray[ITYPE_t, ndim=1, mode='c'] indptr2,
                           np.ndarray[ITYPE_t, ndim=1, mode='c'] labels):
    # Label the connected components of a graph, following edges in both
    # directions.
    #
    # Inputs:
    #  indices1: (input) CSR indices of graph
    #  indptr1:  (input) CSR indptr of graph
    #  indices2: (input) CSR indices of transposed graph
    #  indptr2:  (input) CSR indptr of transposed graph
    #  labels: (output) length-N array, overwritten with the component
    #          label of every node (labels count upward from 0)
    # Returns:
    #  the number of components found
    cdef int seed, node, nbr, k
    cdef int n_found = 0
    cdef int n_nodes = labels.shape[0]
    cdef int VOID = -1
    cdef int END = -2
    cdef int top = END

    # The pending-node stack is a linked list threaded through the labels
    # array itself: a slot holds a "next" link while its node is waiting on
    # the stack and is overwritten with the final label once the node has
    # been popped.
    cdef np.ndarray[ITYPE_t, ndim=1, mode="c"] links = labels

    labels.fill(VOID)

    for seed in range(n_nodes):
        if labels[seed] != VOID:
            continue

        # Push the seed as the only element of a fresh stack.
        links[seed] = END
        top = seed

        while top != END:
            # Pop the top node and assign it to the current component.
            node = top
            top = links[node]
            labels[node] = n_found

            # Push neighbours onto the stack if they haven't been
            # seen at all yet.
            for k in range(indptr1[node], indptr1[node + 1]):
                nbr = indices1[k]
                if links[nbr] == VOID:
                    links[nbr] = top
                    top = nbr
            for k in range(indptr2[node], indptr2[node + 1]):
                nbr = indices2[k]
                if links[nbr] == VOID:
                    links[nbr] = top
                    top = nbr

        n_found += 1

    return n_found