# Greedy Algorithm, Minimum Spanning Tree, Dynamic Programming

## Greedy algorithm

- Example: Dijkstra's shortest path algorithm
- easy to propose
- easy runtime analysis
- hard to eatablish correctness
- most greedy algorithms are not correct

In [None]:
def open_file(file_path):
    """
    Read-in a file containing rows with weight and length, and compute difference and ratio
    
    Args:
    file_path -- location of file to read
    
    Returns:
    data_array -- an array of tuples 
    """
    
    data_array = []
    running_sum = 0
    
    with open(file_path, 'r') as line:
        array_of_array = line.read().split("\n")
        del array_of_array[0] # delete first element, which is just the length of data
        for array in array_of_array:
            subarray = array.split(" ")
            weight = int(subarray[0])
            length = int(subarray[1])
            difference = weight - length
            ratio = weight / length
            data_array.append((weight, length, difference, ratio))
    return data_array


array = open_file("data/job.txt")
# array = open_file("data/job-test1.txt")

sorted_array = sorted(array, key=lambda x: (x[2], x[0]), reverse = True) # sort by thrid element, them by first
# print(sorted_array) 
completion_time = 0
running_sum = 0
for item in sorted_array:
    running_sum += item[1]
    completion_time += item[0] * running_sum
print(completion_time)
# 69119377652

sorted_array = sorted(array, key=lambda x: (x[3], x[0]), reverse = True) # sort by fourth element, then by first
completion_time = 0
running_sum = 0
for item in sorted_array:
    running_sum += item[1]
    completion_time += item[0] * running_sum
print(completion_time)
# 67311454237

## Minimum spanning tree

- connect a bunch of points as cheaply as possible (minimum cost tree that spans all vertices)
- given undirected graph G=(V,E)

Prim's algorithm
- initialize $X = [s]$
- T = empty
- while X != V
    - let edge(u,v) be the cheapest edge with u in X and v not in X
    - add e to T
    - add v to X

In [None]:
def open_file(file_path):
    """
    Read-in a file containing rows with weight and length, and compute difference and ratio
    
    Args:
    file_path -- location of file to read
    
    Returns:
    data_array -- an array of tuplesrepresenting a graph
    """
    
    data_array = []
    num_nodes = 0
    
    with open(file_path, 'r') as line:
        array_of_array = line.read().split("\n")
        num_nodes = int(array_of_array[0].split(" ")[0]) 
        del array_of_array[0] # delete first element, which is just the length of data
        for array in array_of_array:
            subarray = array.split(" ")
            node1 = int(subarray[0])
            node2 = int(subarray[1])
            cost = int(subarray[2])
            data_array.append((node1, node2, cost))
    return (data_array, num_nodes)


def greedy_search(array, X, T):
    """
    For all node1 in X, find node2 that is not in X, that makes the cheapest edge between node1 and node2
    
    Args:
    array -- a list of tuples representing a graph
    X -- a list to store all vertices that consist minimun spanning tree
    T -- a list to store all costs of edges that consist minimun spanning tree
    
    Returns:
    None
    """
    
    minimum_cost = 1000000
    minimum_node1 = 0
    minimum_node2 = 0
    for node1 in X:
        for node2 in get_connected_node(node1, array):
            if node2 not in X:
                cost = get_cost(node1, node2, array)
                if cost < minimum_cost:
                    minimum_node1 = node1
                    minimum_node2 = node2
                    minimum_cost = cost
    
    X.append(minimum_node2)
    T.append(minimum_cost)
    
    
def get_connected_node(node1, array):
    """
    Find all nodes that are connected by an edge for node1
    
    Args:
    node1 -- input node
    array -- a list of tuples representing a graph
    """
    
    nodes = []
    
    for item in array:
        if item[0] == node1:
            nodes.append(item[1])
        elif item[1] == node1:
            nodes.append(item[0])
            
    return nodes


def get_cost(node1, node2, array):
    """
    Find cost of edge between node1 and node2
    
    Args:
    node1 -- first node of an edge
    node2 -- second node of an edge
    array -- a list of tuples representing a graph
    
    Returns:
    cost -- cost of edge between node1 and node2
    """
    
    cost = 0
    
    for item in array:
        if item[0] == node1 and item[1] == node2:
            cost = item[2]
        if item[0] == node2 and item[1] == node1:
            cost = item[2]
            
    return cost
            
    
tuple_obj = open_file("data/edge.txt")
# tuple_obj = open_file("data/edge-test1.txt") #7
# tuple_obj = open_file("data/edge-test2.txt") #15
# tuple_obj = open_file("data/edge-test3.txt") #14
array = tuple_obj[0]
num_nodes = tuple_obj[1]
s = array[0][0] # pick random node
X = [] # store explored nodes
X.append(s)
T = [] # store costs
T.append(0)

while len(X) < num_nodes:
    greedy_search(array, X, T)
    
print(sum(T))
# -3612829

## Kruskal's MST Algorithm

- sort edges in order of increasing cost (rename edges 1,2,3,... so that $c_{1} < c_{2} < \dots < c_{m}$)
- let T = empty set
- for i=1 to m
    - if T + {i} has no cycles
        - add i to T
- return T

Union-Find
- $Union(C_{i}, C_{j}$): fuse graph $C_{i}, C_{j}$ into a single one
- maintain one linked structure
- each vertex points to the leader of its component (none of a component inherited from leader vertex)
- given edge(u,v), can check if u and v are already in some component in $O(1)$ time (iff leader pointers of u and v match <=> Find(u) = Find(v)
- when new edge(u,v) added to T, connected components of u and v merge
- when two components merge, have smaller one inherit the leader of the larger one

Clustering
- given n points, classify into coherent groups
- initially, each point in a separate cluster
- repeat until only k clusters
    - let p,q = closest paif of separate points
    - merge the cluster containing p and q into a single cluster

In [None]:
def open_file(file_path):
    """
    Read-in a file containing rows with weight and length, and compute difference and ratio

    Args:
    file_path -- location of file to read

    Returns:
    data_array -- an array of tuplesrepresenting a graph
    """

    data_array = []
    num_nodes = 0

    with open(file_path, 'r') as line:
        array_of_array = line.read().split("\n")
        num_nodes = int(array_of_array[0].split(" ")[0])
        del array_of_array[0] # delete first element, which is just the length of data
        for array in array_of_array:
            subarray = array.split(" ")
            node1 = int(subarray[0])
            node2 = int(subarray[1])
            cost = int(subarray[2])
            data_array.append((node1, node2, cost))
    return (data_array, num_nodes)


def find_closest_pair_and_merge(sorted_array, T):
    """
    Find two nodes that are in different clusters, and merge them into a single cluster

    Args:
    sorted_array -- a list of tuple what is sorted by its thrid element (that is cost between two nodes)
    T -- a list of list that contains "clusers"

    Returns:
    None
    """

    node1 = sorted_array[0][0]
    node2 = sorted_array[0][1]
    cost = sorted_array[0][2]

    index_of_cluster_to_expand = find_cluster(node1, T)
    index_of_cluster_to_remove = find_cluster(node2, T)

    print(str(node1) + " and " + str(node2) + ": " + str(index_of_cluster_to_expand) + " => " + str(index_of_cluster_to_remove))

    if index_of_cluster_to_expand != index_of_cluster_to_remove: # if two nodes are already in the same cluster, no need to perform merge on T
        for node in T[index_of_cluster_to_remove]:
            T[index_of_cluster_to_expand].append(node) # add all nodes in the cluster where node2 belongs to node1's cluster
        del T[index_of_cluster_to_remove] # remove node2's cluster
        del sorted_array[0] # remove current tuple
    else:
        del sorted_array[0] # remove current tuple


def find_cluster(node, T):
    """
    Find a list inside T where node belongs

    Args:
    node -- an integer representing a node in a graph
    T -- a list of list that contains "clusers"

    Returns:
    i -- index of cluster of T
    """

    for i in range(0, len(T)):
        if node in T[i]:
            return i
    return -1


def get_max_spacing(T, sorted_array):
    """
    Return the minimum distance of two nodes that are in different clusters

    Args:
    sorted_array -- a list of tuple what is sorted by its thrid element (that is cost between two nodes)
    T -- a list of list that contains "clusers"

    Returns:
    item[2] -- the minimum cost
    """

    for item in sorted_array:
        cluster_of_node1 = find_cluster(item[0], T)
        cluster_of_node2 = find_cluster(item[1], T)
        if cluster_of_node1 != cluster_of_node2:
            return item[2]


tuple_obj = open_file("data/clustering.txt")
# tuple_obj = open_file("data/clustering-test1.txt")
array = tuple_obj[0]
sorted_array = sorted(array, key=lambda x: (x[2])) # sort by third element
num_nodes = tuple_obj[1]
print("len(array):" + str(len(sorted_array)))


T = []
for node in range(1, num_nodes+1):
    T.append([node])
print("len(T): " + str(len(T)))
print(T)

while len(T) > 4 and len(sorted_array) > 0:
    find_closest_pair_and_merge(sorted_array, T)

print(get_max_spacing(T, sorted_array))

# Max-spacing:100, two clusters: Nodes(1,2) Nodes(3,4,5)
# Max-spacing:105, four clusters

In [5]:
from networkx.utils.union_find import UnionFind


def open_file(file_path):
    """
    Read-in a file containing rows with weight and length, and compute difference and ratio

    Args:
    file_path -- location of file to read

    Returns:
    data_array -- an array of tuplesrepresenting a graph
    """

    data_dict = {}
    num_nodes = 0

    with open(file_path, 'r') as line:
        array_of_array = line.read().split("\n")
        num_nodes = int(array_of_array[0].split(" ")[0])
        num_bits = int(array_of_array[0].split(" ")[1])
        del array_of_array[0] # delete first element, which is just metadata
        for i in range(1, len(array_of_array)+1):
            data_dict[i] = []
            temp = array_of_array[i-1].split(" ")
            for bit in temp:
                data_dict[i].append(int(bit))
    return (data_dict, num_nodes, num_bits)


tuple_obj = open_file("data/clustering-big.txt")
data_dict = tuple_obj[0]
num_nodes = tuple_obj[1]
num_bits = tuple_obj[2]
print("len(data_dict): " + str(len(data_dict)))
print("num_nodes: " + str(num_nodes))
print("num_bits: " + str(num_bits))

ValueError: invalid literal for int() with base 10: ''