# Graph search, Shortest path, and Data structure

In [None]:
import math
import random
import collections

## Graph search

Goals
- find everything findable from a given start vertex
- don't explore anything twice
- $O(n+m)$ time

Generic algorithm
- given graph G with vertex s
- initially s explored, all other vertices unexplored
- while possible
    - choose an edge (u, v) with u explored and v unexplored (if none, halt)
    - mark v explored

BFS
- explore nodes in layers
- can compute shortest path
- can compute connected components of undirected graph
- $O(n+m)$ using queue

BFS(Graph G, start vertex s)
- [all nodes initially unexplored]
- mark s as explored
- let Q = queue initialized with s
- while Q is not empty:
    - remove first node of Q, call it v
    - for each edge (v, w)
        - if w unexplored
            - mark w as explored
            - add w to Q (at the end)
            
Shortest path
- goal: compute dist(v), fewest # of edges from s to v
- assumption: every edge has length of 1 
- Extra code to BFS
    - initialize dist(v): 0 if v=s, large number if v != s
    - when considering edge (v,w)
        - if w unexplored, then set dist(w) = dist(v) + 1 
        
Undirected connectivity
- let G(V,E) undirected graph
- connected component = pieces of G
- goal: compute all connected components
- initialize: all nodes unexplored
- assume labelled 1 to n
- for i = 1 to n
    - if i not explored # in some previous BFS
        - BFS(G, i) # discovers precisely i's connected component
     
DFS
- backtrack only when necessary
- can compute topological ordering of a directed acyclic graph
- can compute strongly connected components of a directed graph
- $O(n+m)$ using stack

DFS(Graph G, start vertex s)
- mark s as explored
- for every edge (s, v)
    - if v is unexplored
        - DFS(G, v)
        
Topological ordering (straightforward)
- let v be a sink vertex of G (every directed acyclic graph has a sink vertex)
- set f(v) = n
- recurse on G - {v}

Topological ordering (DFS)

    DFS-loop(Graph G) 
    - mark all nodes unexplored
    - current_label = n # keep track of ordering
    - for each vertex v in G
        - if v not explored
            - DFS(G,v)
    DFS(Graph G, start vertex s)
    - mark s as explored
    - for every edge (s, v)
        - if v is unexplored
            - DFS (G, v)
    - set f(s) = current_label
    - current_label--
    
Strongly connected components
- maximal sets of vertices such that for every pair u, v there exist paths u->v and v->u in graph G

Kosaraju's two pass algorithm
- $O(m+n)$
- let G' = G with all arcs reversed
- run DFS_loop on G' (compute magical ordering of nodes)
- run DFS_loop on G (compute strongly connected component one by one)

DFS_loop(graph G)
- global variable t=0 # number of nodes processed so far
- global variable s=null # current source vertex
- assumes nodes labelled 1 to n
- for i = n to 1
    - if i not explored 
        - s = i
        - DFS(G, i)
        
DFS(graph G, node i)
- mark i as explored
- set leader(i) = node s
- for each arc (i,j) in G
    - if j not explored
        - DFS(G, j)
- t++
- set f(i) = t

In [None]:
def DFS_ordering(graph, node, explored_ordering):
    """
    Depth-first search from a vertex that records the order in which vertices finish

    Args:
    graph -- list of arcs, each arc indexable as [tail, head]
    node -- vertex to start exploring from
    explored_ordering -- list of vertices visited so far; extended in place

    Returns:
    None -- node is appended to the module-level list `ordering` once the loop over its successors is done
    """

    explored_ordering.append(node)
    for successor in get_next(graph, node):
        # membership is re-checked on every iteration because the recursive
        # calls below keep extending explored_ordering
        if successor in explored_ordering:
            continue
        DFS_ordering(graph, successor, explored_ordering)
    ordering.append(node)
            

def DFS_loop_ordering(graph, max_integer):
    """
    Run DFS_ordering from every vertex label that has not been visited yet

    Labels are tried from max_integer down to 1. The visited list is the
    module-level `explored_ordering`.

    Args:
    graph -- list of arcs, each arc indexable as [tail, head]
    max_integer -- largest vertex label to start from

    Returns:
    None
    """

    for label in range(max_integer, 0, -1):
        if label not in explored_ordering:
            DFS_ordering(graph, label, explored_ordering)
    

def DFS_loop_computing(graph, max_integer):
    """
    Run DFS_computing from every vertex label that has not been visited yet

    Labels are tried from max_integer down to 1. Before each search the start
    label is stored in the module-level one-element list `s`, which
    DFS_computing reads as the leader of every vertex it reaches. The visited
    list is the module-level `explored_computing`.

    Args:
    graph -- list of arcs, each arc indexable as [tail, head]
    max_integer -- largest vertex label to start from

    Returns:
    None
    """

    s[0] = 0
    for label in range(max_integer, 0, -1):
        if label in explored_computing:
            continue
        s[0] = label
        DFS_computing(graph, label, explored_computing)


def DFS_computing(graph, node, explored_computing):
    """
    Depth-first search from a vertex that tags every visited vertex with the current leader

    Args:
    graph -- list of arcs, each arc indexable as [tail, head]
    node -- vertex to start exploring from
    explored_computing -- list of vertices visited so far; extended in place

    Returns:
    None -- the current value of the module-level s[0] is appended to the module-level list `leader` once per visited vertex
    """

    explored_computing.append(node)
    leader.append(s[0])
    for successor in get_next(graph, node):
        # re-check on every iteration: recursion keeps extending the visited list
        if successor in explored_computing:
            continue
        DFS_computing(graph, successor, explored_computing)
            
            
def get_next(graph, node):
    """
    Collect the heads of all arcs that leave a given vertex

    Args:
    graph -- list of arcs, each arc indexable as [tail, head]
    node -- vertex whose outgoing arcs are wanted

    Returns:
    list of head vertices, in the order their arcs appear in graph
    """

    return [arc[1] for arc in graph if arc[0] == node]
    
    
def compute_max(graph):
    """
    Find the largest vertex label that occurs in any arc

    Args:
    graph -- non-empty list of arcs, each arc indexable as [tail, head]

    Returns:
    the largest value found among the tails and heads

    Raises:
    ValueError -- if graph is empty
    """

    return max(max(edge[0], edge[1]) for edge in graph)

        
def open_graph(file_path):
    """
    Imports an edge-list file and stores the data into a list of lists

    Every non-blank line is split on whitespace and its first two tokens are
    converted to int (tail and head of an arc). Blank lines -- typically the
    empty remainder after a trailing newline at the end of the file -- are
    skipped instead of reaching the int conversion and raising ValueError.

    Args:
    file_path -- location of file

    Returns:
    graph -- list of arcs, one [tail, head, ...] list per non-blank line

    Raises:
    ValueError -- if one of the first two tokens of a line is not an integer
    IndexError -- if a non-blank line holds fewer than two tokens
    """

    graph = []

    with open(file_path, 'r') as handle:
        for raw_line in handle:
            # split() with no argument also discards the empty token that a
            # trailing space before the newline would otherwise produce
            arc = raw_line.split()
            if not arc:
                continue
            arc[0] = int(arc[0])
            arc[1] = int(arc[1])
            graph.append(arc)

    return graph
    
    
# Load the arc list (alternative inputs are kept commented out for quick switching)
graph = open_graph("data/strongly-connected-component-test1.txt")
# graph = open_graph("data/strongly-connected-component-test2.txt")
# graph = open_graph("data/strongly-connected-component-test3.txt")
# graph = open_graph("data/strongly-connected-component-test4.txt")
# graph = open_graph("data/strongly-connected-component-test5.txt")
# graph = open_graph("data/strongly-connected-component.txt")


# First pass: record the order in which vertices finish (the "magical ordering")
ordering = []
explored_ordering = []
DFS_loop_ordering(graph, compute_max(graph))
# print(ordering)


# Reverse the direction of every arc in place
for edge in graph:
    edge[0], edge[1] = edge[1], edge[0]
# print(graph)


# Relabel every endpoint with its 1-based position in the finishing order
for arc in graph:
    arc[0] = ordering.index(arc[0]) + 1
    arc[1] = ordering.index(arc[1]) + 1
# print(graph)


# Second pass: tag every vertex with the leader of the search that reached it
leader = []
explored_computing = []
s = [-1]  # s[0] holds the leader (start vertex) of the search in progress
DFS_loop_computing(graph, compute_max(graph))
# print(leader)


# Show the result: number of vertices per leader, then the five largest groups
counter = collections.Counter(leader)
print(counter.values())
print(counter.most_common(5))

## Shortest path (Dijkstra's Algorithm)

- initialize: $X = \{s\}$ # vertices processed so far
- $A[s] = 0$ # computed shortest path distances
- $B[s] = null$ # computed shortest path (actual path like a->b->c)
- while $X$ != $V$ # assume there are two sets $X$ and $V-X$ 
    - among all edges $(v,w)$ with $v$ in $X$, $w$ not in $X$, pick the one that minimizes $A[v]$ + $l_{vw}$ # call it $v^{*}, w^{*}$
    - add $w^{*}$ to $X$
    - set $A[w^{*}]$ = $A[v^{*}] + l_{v^{*}w^{*}}$
    - set $B[w^{*}]$ = $B[v^{*}] + (v^{*}, w^{*})$

In [None]:
def open_graph(file_path, delimiter):
    """
    Reads a text file and breaks every row into fields

    The file content is cut at every newline character and each resulting row
    is cut at every occurrence of the delimiter. Nothing is filtered out: text
    that ends with a newline yields a final [''] row, and a row that ends with
    the delimiter yields a final '' field.

    Args:
    file_path -- location of file
    delimiter -- separator used to break each row into fields

    Returns:
    graph -- list of rows, each row a list of strings
    """

    with open(file_path, 'r') as handle:
        rows = handle.read().split("\n")

    return [row.split(delimiter) for row in rows]


def get_next(graph, source_vertex):
    """
    List the vertices that can be reached by one edge from a given vertex

    Args:
    graph -- list of rows; row[0] is a vertex and every later field is a "vertex,cost" string
    source_vertex -- vertex whose outgoing edges are wanted

    Returns:
    vertices -- the part before the first comma of every field after row[0], for every row whose row[0] equals source_vertex
    """

    return [
        entry.split(",")[0]
        for row in graph
        if row[0] == source_vertex
        for entry in row[1:]
    ]


def get_all_outgoing_path(graph, X):
    """
    Collect every edge that leaves the explored set

    Args:
    graph -- list of rows; row[0] is a vertex and every later field is a "vertex,cost" string
    X -- a list containing all explored vertices

    Returns:
    candidates -- list of (explored vertex, unexplored neighbour) tuples, in the order the explored vertices appear in X
    """

    return [
        (inside, outside)
        for inside in X
        for outside in get_next(graph, inside)
        if outside not in X
    ]


def get_candidates_for_source_vertex(candidates, destination_vertex):
    """
    Pick the start vertices of all candidate edges that end at a given vertex

    Args:
    candidates -- sequence of edges, each indexable as (source, destination)
    destination_vertex -- vertex the selected edges must end at

    Returns:
    candidates_for_source_vertex -- sources of the matching edges, in candidate order
    """

    return [edge[0] for edge in candidates if edge[1] == destination_vertex]


def get_cost(graph, source_vertex, destination_vertex):
    """
    Look up the cost of the edge between two vertices

    Args:
    graph -- list of rows; row[0] is a vertex and every later field is a "vertex,cost" string
    source_vertex -- vertex the edge starts at
    destination_vertex -- vertex the edge ends at

    Returns:
    cost -- cost of the edge as an int, or -1 when no such edge is listed; if the edge is listed more than once the last listing wins
    """

    found = -1
    for row in graph:
        if row[0] != source_vertex:
            continue
        for entry in row[1:]:
            fields = entry.split(",")
            if fields[0] == destination_vertex:
                found = fields[1]  # keep scanning: a later match overrides
    return int(found)


def pick_minimum(graph, A, X, candidates):
    """
    Among all candidate edges leaving X, settle the vertex that is cheapest to reach. Then update X and A

    For every candidate edge (vertex1, vertex2) the matching "vertex2,cost"
    fields in vertex1's row are looked up and A[vertex1] + cost is compared
    against the best total so far. Ties keep the first candidate found.

    When no candidate beats the initial bound of 1000000 (for example when
    candidates is empty), the placeholder vertex "" is still recorded with
    distance 1000000. shortest_path in this file relies on X growing on every
    call to terminate, so that behaviour is kept on purpose.

    Args:
    graph -- list of rows; row[0] is a vertex and every later field is a "vertex,cost" string
    A -- a dictionary whose key represents a vertex and value represents the cost from the initial source vertex to that vertex; updated in place
    X -- a list containing all explored vertices; the chosen vertex is appended
    candidates -- sequence of (vertex in X, vertex not in X) edges

    Returns:
    None

    Raises:
    KeyError -- if a matching candidate's first vertex has no entry in A
    """

    minimum_distance = 1000000
    minimum_vertex2 = ""

    for candidate in candidates:
        vertex1 = candidate[0]  # this vertex is in X
        vertex2 = candidate[1]  # this vertex is not in X

        for elem in graph:
            if elem[0] != vertex1:
                continue
            for entry in elem[1:]:
                fields = entry.split(",")  # split once, reuse both parts
                if fields[0] != vertex2:
                    continue
                distance = A[vertex1] + int(fields[1])
                if distance < minimum_distance:
                    minimum_distance = distance
                    minimum_vertex2 = vertex2

    # print("minimum_vertex2: " + str(minimum_vertex2) + " and minimum_distance: " + str(minimum_distance))
    A[minimum_vertex2] = minimum_distance
    X.append(minimum_vertex2)
    
                
def shortest_path(graph, source_vertex, destination_vertex, A, X, init=False):
    """
    Compute the shortest path cost on a directed graph (Dijkstra's algorithm)

    One vertex is settled per round via pick_minimum until the destination
    is settled or X holds as many entries as graph has rows. The rounds run in
    a loop rather than by recursion, so the number of vertices is not limited
    by the interpreter's recursion depth.

    Args:
    graph -- list of rows; row[0] is a vertex and every later field is a "vertex,cost" string
    source_vertex -- initial vertex to start from (only read when init is True)
    destination_vertex -- final vertex to end
    A -- a dictionary whose key represents a vertex and value represents the cost from the initial source vertex to that vertex; updated in place
    X -- a list containing all explored vertices; updated in place
    init -- flag to determine whether to initialize A and X with the source vertex

    Returns:
    A[destination_vertex] -- the cost from source vertex to final destination vertex, or 1000000 when the destination was never settled
    """

    if init:
        A[source_vertex] = 0  # distance from source_vertex to itself is 0
        X.append(source_vertex)  # source_vertex is the first vertex in the set

    while destination_vertex not in X and len(X) < len(graph):
        candidates = get_all_outgoing_path(graph, X)
        # pick_minimum always appends to X (a "" placeholder if nothing is
        # reachable), so len(X) grows every round and the loop terminates
        pick_minimum(graph, A, X, candidates)

    if destination_vertex not in X:  # if there is no path between the original source and destination
        A[destination_vertex] = 1000000
    return A[destination_vertex]


# Small test input (space separated), kept for quick manual checks
# graph = open_graph("data/shortest-path-test1.txt", " ")
# print(shortest_path(graph, "7", "1", {}, [], True))
# print(shortest_path(graph, "1", "0", {}, [], True))
# print(shortest_path(graph, "2", "1", {}, [], True))
# print(shortest_path(graph, "1", "5", {}, [], True))

graph = open_graph("data/shortest-path.txt", "\t")
# Drop the last field of every row, then the last row
# (presumably the residue of a trailing tab per line and a trailing newline -- confirm against the data file)
for row in graph:
    del row[-1]
del graph[-1]

# Shortest distance from vertex "1" to each requested destination, one per line
for destination in ("7", "37", "59", "82", "99", "115", "133", "165", "188", "197"):
    print(shortest_path(graph, "1", destination, {}, [], True))
# 2599
# 2610
# 2947
# 2052
# 2367
# 2399
# 2029
# 2442
# 2505
# 3068

## Heap (a.k.a priority queue)

Containers for objects that have keys
- Insert: add a new object to heap $O(logn)$
- Extract: remove an object with minimum key value $O(logn)$
- Heapify: $O(n)$
- Delete: $O(logn)$

Heapsort: Insert all into heap and extract minimum one by one $O(nlogn)$

Property
- at every node x, key[x] <= all keys in x's children
- object at root must have minimum key-value

Implementation
- Put node in the tree into array layer by layer
- parent(i) = i/2 if i is even, floor(i/2) if i is odd
- children(i) = 2*i and 2*i + 1

Insert
- stick k at the end of last level
- bubble-up k until k's parent <= k

Extract
- delete root
- move last node to new root
- bubble-down k (swap with its smaller child) until k <= both of its children

## Sorted array

- Search: $O(logn)$
- Select: $O(1)$
- Min/Max: $O(1)$
- Insertion/Deletion: $O(n)$

## Balanced search tree

- Search: $O(logn)$
- Select: $O(logn)$
- Min/Max: $O(logn)$
- Insertion/Deletion: $O(logn)$

## Binary search tree structure

- exactly one node per key
- each node has
    - left child pointer
    - right child pointer
    - parent
- all nodes left on node X are less than X
- all nodes right on node X are greater than X
- many possible trees for a set of keys
- generally operations are $O(height)$

Search
- move down left or right

Insert
- do search (which will return NULL)
- add pointer to NULL

Min/Max
- follow left (min case) / right (max case) until the bottom

Inorder traversal
- let r = root, Tr = right subtree, Tl = left subtree
- recurse on Tl
- print out root
- recurse on Tr

Delete
- search for k
- if k has no children -> delete k
- k has one child -> delete k, and put child under k's parent
- k has two children -> compute k's predecessor l, swap k and l, delete k

Select (ith order statistic)
- start at root x, with children y and z
- let a = size(y) # 0 if no left child
- if a = i-1, return x's key
- if a >= i, recurse to compute ith order statistic on new root y
- if a < i-1, recurse to compute (i-a-1)th order statistic on new root z

## Balanced search tree 

- ensure that heights are $O(logn)$
- Example: red-black tree, AVL, splay tree, B tree

## Red-Black tree

- each node red or black
- root is black
- no 2 reds in a row [red node => only black children]
- every root-null path (unsuccessful search) has the same number of black nodes

In [None]:
def open_file(file_path):
    """
    Read file line by line

    The newline that terminates a line is removed. A newline at the very end
    of the file therefore does not produce an extra empty element, and an
    empty file gives an empty list. Blank lines inside the file are kept as ''.

    Args:
    file_path -- location of file to be read

    Returns:
    array -- list whose element is each line in the input file
    """

    with open(file_path, 'r') as handle:
        # iterating the handle yields one line at a time, each with at most
        # one trailing "\n"
        return [line.rstrip("\n") for line in handle]


def adjust_two_heaps(low_heap, high_heap):
    """
    Trade elements between two lists until nothing in low_heap exceeds anything in high_heap

    While the largest element of low_heap is greater than the smallest element
    of high_heap, the two are exchanged: each is removed from its list and
    appended to the end of the other one. List sizes never change.

    Args:
    low_heap -- a non-empty list meant to hold the lower half of the values; modified in place
    high_heap -- a non-empty list meant to hold the upper half of the values; modified in place

    Returns:
    None

    Raises:
    ValueError -- if either list is empty
    """

    largest_low, smallest_high = max(low_heap), min(high_heap)

    while largest_low > smallest_high:
        high_heap.remove(smallest_high)
        high_heap.append(largest_low)

        low_heap.remove(largest_low)
        low_heap.append(smallest_high)

        largest_low, smallest_high = max(low_heap), min(high_heap)
        
        
def convert_string_to_int(array):
    """
    Replace every element of the list with its int() value, in place

    Args:
    array - a list whose elements can be passed to int(), e.g. numeric strings

    Returns:
    None
    """

    for position, text in enumerate(array):
        array[position] = int(text)
        
    
def compute_running_median(array):
    """
    Computes running median of input list

    After each element is consumed, the median of everything seen so far is
    recorded. For k elements this is the ((k+1)//2)-th smallest one, i.e. the
    lower median when k is even. The lower half is kept in a max-heap and the
    upper half in a min-heap, so each element costs O(log k) instead of a
    full scan of both halves.

    NOTE: the element types of the result are kept as they have always been:
    the first entry is the number itself, every later entry is its str()
    form. Callers in this file normalise the list with convert_string_to_int.

    Args:
    array -- a list containing numbers

    Returns:
    median -- a list containing running medians of input list (empty for empty input)
    """

    import heapq  # function-scope import: heapq is not among the file's top-level imports

    low_heap = []   # lower half; values stored negated so heapq's min-heap acts as a max-heap
    high_heap = []  # upper half; plain min-heap
    median = []

    for value in array:
        if len(low_heap) > len(high_heap):
            # upper half is one element short: it receives the largest of
            # (lower half + new value)
            heapq.heappush(high_heap, -heapq.heappushpop(low_heap, -value))
        else:
            # halves are level: the lower half grows and receives the smallest
            # of (upper half + new value)
            heapq.heappush(low_heap, -heapq.heappushpop(high_heap, value))

        lower_median = -low_heap[0]
        if high_heap:
            median.append(str(lower_median))
        else:
            # only happens for the very first element
            median.append(lower_median)

    return median
    

# Load the number stream (one value per line) and turn the strings into ints in place
array = open_file("data/median-maintenance.txt")
convert_string_to_int(array)
# Alternative inputs for quick manual checks:
# array = open_file("data/median-maintenance-test1.txt")
# array = [1,666,10,667,100,2,3]
# array = [6331, 2793, 1640, 9290, 225, 625, 6195, 2303, 5685, 1354]
# array = [78, 71, 99, 9, 24]
median = compute_running_median(array)
# compute_running_median returns a mix of numbers and strings, so normalise before summing
convert_string_to_int(median)
# Sum of all running medians, reduced modulo 10000
print(sum(median) % 10000)
# 1213  (presumably the output recorded for data/median-maintenance.txt)

## Hash table

- really a dictionary (w/o ordering elements)
- Insert & Delete & Lookup: $O(1)$

Implementation
- chaining:
    - keep linkedlist in each bucket
    - given a key/object x, perform insert/delete/lookup in the list in $A[h(x)]$

Good hash function
- spread data out
- easy to store
- fast to evaluate


In [None]:
def open_file(file_path):
    """
    Read file line by line

    The newline that terminates a line is removed. A newline at the very end
    of the file therefore does not produce an extra empty element, and an
    empty file gives an empty list. Blank lines inside the file are kept as ''.

    Args:
    file_path -- location of file to be read

    Returns:
    array -- list whose element is each line in the input file
    """

    with open(file_path, 'r') as handle:
        # iterating the handle yields one line at a time, each with at most
        # one trailing "\n"
        return [line.rstrip("\n") for line in handle]


def convert_string_to_int(array):
    """
    Overwrite each slot of the list with the int() of its current content

    Args:
    array - a list whose elements can be passed to int(), e.g. numeric strings

    Returns:
    None
    """

    for slot, raw in enumerate(array):
        array[slot] = int(raw)
        
        
def two_sum(array, t):
    """
    Find the complements t - x that are themselves present in the list

    Membership is tested against a set built once from the list (hash
    lookup, O(1) on average) instead of scanning the list for every element.

    Args:
    array -- a list of hashable numbers
    t -- target sum

    Returns:
    result -- t - x for every x in array (in list order) whose complement t - x also occurs in array; an element may be matched with itself when x == t - x
    """

    present = set(array)

    return [t - x for x in array if t - x in present]
        
        
# Load the input (one value per line), report how many entries were read,
# then turn the strings into ints in place
array = open_file("data/two-sum.txt")
print(len(array))
convert_string_to_int(array)


# Complements found for the single target -10000
print(two_sum(array, -10000))

# Disabled sweep over every target in [-10000, 10000]; it would count the
# targets for which two_sum returns exactly one complement
# ret = 0
# for t in range(-10000,10001):
#     result = two_sum(array, t)
#     if len(result) == 1:
#         ret += 1