Question 1

The file clustering1.txt describes a distance function (equivalently, a complete graph with edge costs).  It has the following format:

[number_of_nodes]

[edge 1 node 1] [edge 1 node 2] [edge 1 cost]

[edge 2 node 1] [edge 2 node 2] [edge 2 cost]

...

There is one edge (i,j) for each choice of 1 <= i < j <= n, where n is the number of nodes.

For example, the third line of the file is "1 3 5250", indicating that the distance between nodes 1 and 3 (equivalently, the cost of the edge (1,3)) is 5250.  You can assume that distances are positive, but you should NOT assume that they are distinct.

Your task in this problem is to run the clustering algorithm from lecture on this data set, where the target number k of clusters is set to 4.  What is the maximum spacing of a 4-clustering?


In [1]:
class UFnode:
    def __init__(self, node):
        """
            A single element tracked by the Union Find class.
            It starts out as its own leader, in a cluster of size 1.
        """
        self.value, self.leader, self.size = node, node, 1

    def __repr__(self):
        # shown as the bare node value
        return format(self.value)

    def __lt__(self, other):
        # nodes are ordered by their value, which makes lists of UFnode sortable
        return self.value < other.value
        


class Union_find:
    """
        Union Find structure over the nodes 1..size.

        Every node keeps a direct pointer to the leader of its cluster, so find() is a single lookup.
        union() re-points all nodes of the smaller cluster to the leader of the larger cluster.
    """

    def __init__(self, size):
        # connected components, dictionary having key as leader and corresponding list of nodes that point to the leader
        self.clusters = {i: [i] for i in range(1, size + 1)}
        self.count = size     # track the count of clusters, initiate as the total count of nodes
        self.nodes = {i: UFnode(i) for i in range(1, size + 1)}   # storage of nodes

    def get_size(self, node):
        # node is an int
        # returns the size stored on that node; union() only keeps the size of leaders up to date,
        # so the value is the cluster size only when node is a leader
        return self.nodes[node].size

    def find(self, node):
        # node is an int
        # returns leader vertex, since each vertex points to its leader and leader vertex points to itself
        return self.nodes[node].leader

    def union(self, node1, node2):
        """
            Merge two components that node 1 and node 2 are in, have the smaller one inherit the leader of the large one
            Then reduce the cluster count by 1
            node 1 and node 2 are ints

            Does nothing when both nodes are already in the same component.
            On equal sizes the component of node 1 keeps its leader.
        """
        leader1 = self.find(node1)
        leader2 = self.find(node2)

        # nothing to merge; without this guard the cluster list would be extended with itself and then deleted
        if leader1 == leader2:
            return

        # sizes must be read from the leaders, the size stored on a non-leader node is never updated
        if self.get_size(leader1) >= self.get_size(leader2):
            larger_leader, smaller_leader = leader1, leader2
        else:
            larger_leader, smaller_leader = leader2, leader1

        # remove the smaller cluster, re-point its nodes to the leader of the larger cluster,
        # then move them into the larger cluster and update its size
        absorbed = self.clusters.pop(smaller_leader)
        for node in absorbed:
            self.nodes[node].leader = larger_leader
        self.nodes[larger_leader].size += len(absorbed)
        self.clusters[larger_leader].extend(absorbed)

        # reduce one cluster count after union
        self.count -= 1

    def check_cycle(self, node1, node2):
        # if node 1 and node 2 have the same leader, meaning they are in the same cluster
        # node1 and node2 are ints
        return self.find(node1) == self.find(node2)

    
class Graph:
    """
        Undirected weighted graph stored as a flat list of edges.
    """

    def __init__(self, node_count):
        self.edges = []       # list of edges in the form of (cost, node1, node2)
        self.edge_count = 0   # counter maintained by whoever fills self.edges
        self.node_count = node_count

    def max_space(self, k):

        """
            Algorithm to find the maximum spacing of k-clustering using Union Find Data Structure

            Sorts self.edges in place, then merges clusters from the cheapest edge upwards.
            Returns the cost of the first edge that joins two different clusters once exactly
            k clusters remain.
            Returns a message string when k is larger than the number of nodes.
            Returns None when the edges run out before such an edge is found (for example k = 1).
        """

        self.edges.sort()

        uf = Union_find(self.node_count)

        if k > uf.count:
            return f"You need to pick a cluster count less than {uf.count}!"

        # walk the edge list itself rather than range(self.edge_count), so an out-of-date counter
        # can neither hide edges nor index past the end of the list
        for cost, node1, node2 in self.edges:
            # an edge inside one cluster neither merges anything nor separates two clusters
            if uf.check_cycle(node1, node2):
                continue

            # with k clusters left, the cheapest edge between two of them is the spacing
            if uf.count == k:
                return cost

            uf.union(node1, node2)

        return None
                    
                


def load(filename):
    """
        Load data in file to a graph using Graph class

        The first non-blank line holds the node count, every other non-blank line holds
        "node1 node2 cost" separated by any amount of whitespace.
        Blank lines (such as an empty last line) are ignored.

        returns: the Graph with edges stored as (cost, node1, node2) and edge_count set
        raises: ValueError if the file has no data or a line can not be parsed
    """
    with open(filename) as file:
        lines = [line for line in file if line.strip()]

    if not lines:
        raise ValueError(f"{filename} contains no data")

    # extract the first line to be node_count
    node_count = int(lines[0])

    graph = Graph(node_count)

    # read the rest of lines to tuple of (cost, node1, node2) and add to the Graph.edges
    for line in lines[1:]:
        # split() without argument copes with tabs, repeated spaces and the trailing newline
        node1, node2, cost = line.split()
        graph.edges.append((int(cost), int(node1), int(node2)))

    graph.edge_count = len(graph.edges)

    return graph


if __name__ == "__main__":
    import time

    # perf_counter() is the clock intended for measuring short durations; time.time() is the wall
    # clock and can be too coarse or jump when the system time is adjusted
    # the timed section covers both loading the file and running the clustering
    start = time.perf_counter()
    graph = load("clustering1.txt")
    print("The maximum spacing of a 4-clustering is: \n", graph.max_space(4))
    end = time.perf_counter()
    print()
    print(f"The run time of Max Spacing of k-Clusterings Algorithm is: {end-start} second(s).")


            
        

The maximum spacing of a 4-clustering is: 
 106

The run time of Max Spacing of k-Clusterings Algorithm is: 0.18409490585327148 second(s).


Question 2

In this question your task is again to run the clustering algorithm from lecture, but on a MUCH bigger graph.  So big, in fact, that the distances (i.e., edge costs) are only defined implicitly, rather than being provided as an explicit list.

The data set is in clustering_big.txt

The format is:

[# of nodes] [# of bits for each node's label]

[first bit of node 1] ... [last bit of node 1]

[first bit of node 2] ... [last bit of node 2]

...

For example, the third line of the file "0 1 1 0 0 1 1 0 0 1 0 1 1 1 1 1 1 0 1 0 1 1 0 1" denotes the 24 bits associated with node #2.

The distance between two nodes u and v in this problem is defined as the Hamming distance --- the number of differing bits --- between the two nodes' labels.  For example, the Hamming distance between the 24-bit label of node #2 above and the label "0 1 0 0 0 1 0 0 0 1 0 1 1 1 1 1 1 0 1 0 0 1 0 1" is 3 (since they differ in the 3rd, 7th, and 21st bits).

The question is: what is the largest value of k such that there is a k-clustering with spacing at least 3?  That is, how many clusters are needed to ensure that no pair of nodes with all but 2 bits in common get split into different clusters?



In [2]:
import re
import itertools as it
from networkx.utils import UnionFind

def load2(filename):
    """
        Load data in file
        returns:
        node_count: total count of nodes in the file
        bit_count: total count of bits
        ints_to_nodes: a map that has ints that converted from bits as key and, as value, a tuple of
                       the numbers of all nodes with that label (nodes are numbered from 1 in file order)

        Blank lines after the first line are ignored.
    """
    ints_to_nodes = {}
    with open(filename) as file:
        # extract the first line to be node_count and bit_count
        node_count, bit_count = (int(token) for token in file.readline().split())

        # read the rest of lines, one bit label per node, and group the node numbers by label value
        node = 0
        for line in file:
            # drop every whitespace character so that only the 0/1 digits remain
            bits = "".join(line.split())
            if not bits:
                continue

            node += 1
            num = int(bits, 2)
            # several nodes may share one label, keep all of them instead of only the last one
            ints_to_nodes[num] = ints_to_nodes.get(num, ()) + (node,)

    return node_count, bit_count, ints_to_nodes


def create_bit_mask(n_bit):
    """
        Build the set of bit-masks for the distances 0, 1, 2, that is every int that has
        zero, one or two of its lowest n_bit bits set.
    """

    # distance 1: one int per bit position
    single_bits = [1 << shift for shift in range(n_bit)]

    # distance 2: two different single-bit values never overlap, so their sum has exactly both bits set
    double_bits = {low + high for low, high in it.combinations(single_bits, 2)}

    # distance 0 is the empty mask
    return {0} | set(single_bits) | double_bits
        
    
    

if __name__ == "__main__":

    # Basic idea is create a ints_to_nodes map and a bit_mask. Then use the keys in ints_to_nodes map and bit_mask to
    # calculate the number = key ^ bit_mask, if the number is in map, union the nodes in the map for both number and the key.
    # After union all, the length of Union Find data structure is the maximum clustering

    import time

    # perf_counter() is used for all timings: it is the clock intended for measuring short durations,
    # time.time() can be too coarse to resolve the fast steps

    start = time.perf_counter()

    node_count, bit_count, ints_to_nodes = load2("clustering_big.txt")

    end = time.perf_counter()

    print("Time to read the file is: ", end-start)

    start = time.perf_counter()

    uf = UnionFind()
    bit_mask = create_bit_mask(bit_count)

    end = time.perf_counter()

    print("Time to create bit mask is: ", end-start)

    start = time.perf_counter()

    for distance in bit_mask:
        for key1 in ints_to_nodes:
            key2 = key1 ^ distance

            # only labels that occur in the file can be merged; testing membership instead of
            # catching every exception keeps real errors visible
            # distance 0 gives key2 == key1, so every label is passed to uf.union at least once
            # NOTE(review): this relies on networkx's UnionFind registering elements on first use,
            # which is what makes unmerged labels show up in to_sets() -- confirm
            if key2 in ints_to_nodes:
                uf.union(ints_to_nodes[key1], ints_to_nodes[key2])

    end = time.perf_counter()
    print()
    print("The largest value of k such that there is a k-clustering with spacing at least 3 is:\n", len(list(uf.to_sets())))
    print()
    print(f"The run time of the algorithm is: {end - start} second(s)")
                

    

Time to read the file is:  1.6737103462219238
Time to create bit mask is:  0.0

The largest value of k such that there is a k-clustering with spacing at least 3 is:
 6118

The run time of the algorithm is: 28.55481481552124 second(s)
