In [1]:
import sys
sys.path.append('.')
sys.path.append('..')
from problem_loader import ProblemLoader

# Download URL for each problem's input file, keyed by problem name.
# NOTE(review): both URLs carry `Expires=` and `Signature=` query parameters, so
# they are presumably time-limited signed links — confirm they still resolve, or
# refresh them, before relying on a fresh download.
data_urls = {
    'problem1': 'https://d18ky98rnyall9.cloudfront.net/_fe8d0202cd20a808db6a4d5d06be62f4_clustering1.txt?Expires=1625961600&Signature=HM33xWd1vNYzke9ri~OBn-cBif-ezXnyGgNjFA4fmSj3iV54RB2VkM4sgc~Fq0~9H6-564AearxvgEQQUw2eErmoG92sp~SpFo8H7xUzxQvgZp0gWrOzUg0eEO2CNEpkJiMeWvr6x5SxUqYDb9i8-PEoHu11NAwdYtvoJHTP7hc_&Key-Pair-Id=APKAJLTNE6QMUY6HBC5A',
    'problem2': 'https://d18ky98rnyall9.cloudfront.net/_fe8d0202cd20a808db6a4d5d06be62f4_clustering_big.txt?Expires=1626998400&Signature=I8EperrYsjtCtEPVqS1B0W9V7ykOER5VsaYG7Xug5hclTKrqzaMKvlLVjBxHbUyGsIR-tD4kgw39e55q799aW3u8dEstSxeKY2H3MK5PfALsz10lZPsvx9APjHHzi9MJIC8K1cZHSSmBA6tNkzlj7yn6CJuoqEJDgqS9TrQXNfo_&Key-Pair-Id=APKAJLTNE6QMUY6HBC5A',
}

## Problem 1
In this programming problem and the next you'll code up the clustering algorithm from lecture for computing a max-spacing $k$-clustering.

This file describes a distance function (equivalently, a complete graph with edge costs).  It has the following format:

[number_of_nodes]

[edge 1 node 1] [edge 1 node 2] [edge 1 cost]

[edge 2 node 1] [edge 2 node 2] [edge 2 cost]

...

There is one edge $(i,j)$ for each choice of $1 \leq i \lt j \leq n$, where $n$ is the number of nodes.

For example, the third line of the file is "1 3 5250", indicating that the distance between nodes 1 and 3 (equivalently, the cost of the edge $(1,3)$) is 5250.  You can assume that distances are positive, but you should NOT assume that they are distinct.

Your task in this problem is to run the clustering algorithm from lecture on this data set, where the target number $k$ of clusters is set to $4$.  What is the maximum spacing of a 4-clustering?

In [2]:
from helpers import Edge, flatten, process_weighted_edges, undirected_graph_of_weighted_edges

# Load the Problem 1 input through ProblemLoader, handing it the download URL,
# a file name, and process_weighted_edges as the preprocessor.
# NOTE(review): `fname` looks like a local cache file name — confirm in problem_loader.
values = ProblemLoader(
    data_urls['problem1'], 
    fname="weighted-graph.p", 
    preprocessor=process_weighted_edges
).fetch()
# Sanity check: show only the first ten parsed records rather than the whole list.
print(values[:10])

[Edge(left=1, right=2, cost=6808), Edge(left=1, right=3, cost=5250), Edge(left=1, right=4, cost=74), Edge(left=1, right=5, cost=3659), Edge(left=1, right=6, cost=8931), Edge(left=1, right=7, cost=1273), Edge(left=1, right=8, cost=7545), Edge(left=1, right=9, cost=879), Edge(left=1, right=10, cost=7924), Edge(left=1, right=11, cost=7710)]


### Kruskal (Union-Find-Based)  

#### Input: 
a connected undirected graph $G = (V, E)$ in adjacency-list representation and a cost $c_e$ for each edge $e \in E$.  
#### Output: 
the edges of a minimum spanning tree of G.  

`// Initialization`  
$T := \emptyset$  
$U := Initialize(V)$ `// union-find data structure`  
sort edges of $E$ by cost `// e.g., using MergeSort`  
`// Main loop  `
for each $(v, w) \in E$, in nondecreasing order of cost do  
&nbsp;  if $Find(U, v) \neq Find(U, w)$ then  
&nbsp;&nbsp;    `// no v-w path in T, so OK to add (v, w)`  
&nbsp;&nbsp;    $T := T \cup \{(v, w)\}$  
&nbsp;&nbsp;    `// update due to component fusion` 
&nbsp;&nbsp;    $Union(U, v, w)$  
return $T$ 

In [3]:
from deehzee_unionfind import UnionFind

def Kruskal_MST(graph):
  """Select spanning-tree edges with Kruskal's algorithm and a union-find.

  Args:
    graph: mapping of vertex -> iterable of neighbour records that expose
      `.to` (the other endpoint) and `.cost`. The keys are taken as the
      vertex set.

  Returns:
    A set of `Edge(left, right, cost)` records: every edge that joined two
    previously separate components when scanned in nondecreasing cost order.
    For a connected graph this is a minimum spanning tree.
  """
  components = UnionFind(graph.keys())  # each vertex starts as its own component

  # Flatten the adjacency lists into a single edge list, keeping iteration order.
  candidates = []
  for vertex, neighbours in graph.items():
    for neighbour in neighbours:
      candidates.append(
        Edge(left=vertex, right=neighbour.to, cost=neighbour.cost))

  tree = set()
  # The sort is stable, so equal-cost edges are tried in adjacency-list order.
  for candidate in sorted(candidates, key=lambda e: e.cost):
    if components.find(candidate.left) == components.find(candidate.right):
      continue  # endpoints already connected; this edge would close a cycle
    tree.add(candidate)
    components.union(candidate.left, candidate.right)
  return tree


### approach

#### Bottom-Up Clustering 

#### Input: 
----
a set $X$ of data points, a symmetric similarity function $f$, and a positive integer $k \in \{1, 2, 3, \ldots, |X|\}$.  
#### Output: 
----
a partition of $X$ into $k$ non-empty sets.  

$C := \emptyset$ `// keeps track of current clusters`  
for each $x \in X$ do  
&nbsp;  add $\{x\}$ to $C$ `// each point in own cluster`  
`// Main loop`  
while $C$ contains more than $k$ clusters do  
&nbsp;  remove from $C$ the clusters $S_1, S_2$ that minimize  $F(S_1, S_2)$ `// e.g., satisfying` $\color{darkred}F(S_1, S_2) = \min\limits_{x \in S_1,y \in S_2} f(x, y)$.  
&nbsp;  add $S_1 \cup S_2$ to $C$ `// merge clusters`  
return $C$ 



In [4]:
from math import inf

def one(X, k=inf, E=None):
  """Return the spacing of the k-clustering produced by single-link merging.

  Edges are scanned in nondecreasing cost order and their endpoints merged in
  a union-find (Kruskal-style). Once exactly `k` components remain, the cost
  of the next edge that still crosses two different components is the
  maximum spacing.

  Args:
    X: mapping of vertex -> iterable of neighbour records exposing `.to` and
      `.cost`. Its keys seed the union-find; the values are read only when
      `E` is not supplied.
    k: target number of clusters. With the default `inf` the target is never
      reached and `inf` is returned.
    E: optional iterable of records exposing `.left`, `.right` and `.cost`.
      It is NOT modified; a sorted copy is used internally.

  Returns:
    The cost of the cheapest edge separating two clusters once `k` clusters
    remain, or `inf` if no such edge is encountered (e.g. `k == 1`).
  """
  U = UnionFind(list(X.keys())) # vertices
  if E is None:
    E = [ Edge(left=key, right=edge.to, cost=edge.cost)
      for key, edge_list in X.items()
        for edge in edge_list ]
  # sorted() builds a new list, so a caller-supplied E keeps its original
  # order (the previous in-place E.sort() silently reordered the argument).
  ordered_edges = sorted(E, key=lambda x: x.cost)

  for edge in ordered_edges:
    if U.find(edge.left) != U.find(edge.right):
      if U.n_comps == k:
        # first crossing edge after reaching k clusters == the spacing
        return edge.cost
      U.union(edge.left, edge.right)

  return inf

In [5]:
# Toggle: False runs the real Problem 1 input; True runs the small graph below,
# whose expected spacings (3 for k=3, 5 for k=2) are asserted.
__test__ = False
if not __test__:
  graph = undirected_graph_of_weighted_edges(values)
  print(one(graph, k=4))
else:
  v_test = process_weighted_edges("""
1 2 1
1 3 4
1 4 5
1 5 10
1 6 11
1 7 12
2 3 3
2 4 4
2 5 9
2 6 10
2 7 11
3 4 1
3 5 6
3 6 7
3 7 8
4 5 5
4 6 6
4 7 7
5 6 1
5 7 2
6 7 1""".encode('utf-8'))
  g_test = undirected_graph_of_weighted_edges(v_test)
  k3 = one(g_test, k=3, E=v_test)
  assert k3 == 3, f"k3 is {k3} not 3"
  k2 = one(g_test, k=2, E=v_test)
  # f-prefix added: without it the message printed the literal text "{k2}"
  assert k2 == 5, f"k2 is {k2} not 5"


106


## Problem 2
In this question your task is again to run the clustering algorithm from lecture, but on a MUCH bigger graph.  So big, in fact, that the distances (i.e., edge costs) are only defined implicitly, rather than being provided as an explicit list.

The format is:

[# of nodes] [# of bits for each node's label]

[first bit of node 1] ... [last bit of node 1]

[first bit of node 2] ... [last bit of node 2]

...

For example, the third line of the file "0 1 1 0 0 1 1 0 0 1 0 1 1 1 1 1 1 0 1 0 1 1 0 1" denotes the 24 bits associated with node #2.

The distance between two nodes $u$ and $v$ in this problem is defined as the Hamming distance – the number of differing bits – between the two nodes' labels.  For example, the Hamming distance between the 24-bit label of node #2 above and the label "0 1 0 0 0 1 0 0 0 1 0 1 1 1 1 1 1 0 1 0 0 1 0 1" is $3$ (since they differ in the 3rd, 7th, and 21st bits).

The question is: what is the largest value of $k$ such that there is a $k$-clustering with spacing at least $3$?  That is, how many clusters are needed to ensure that no pair of nodes with all but 2 bits in common get split into different clusters?

### NOTE: 
The graph implicitly defined by the data file is so big that you probably can't write it out explicitly, let alone sort the edges by cost.  So you will have to be a little creative to complete this part of the question.  For example, is there some way you can identify the smallest distances without explicitly looking at every pair of nodes?

In [6]:
from collections import defaultdict    

def process_hamming_codes(data):
    """Parse raw bytes of bit-label lines into a {label: [node indices]} map.

    The bytes are decoded as UTF-8 and split on newlines. Any line with two
    or fewer whitespace-separated tokens (the header, blank lines) is
    discarded. Each remaining line has its spaces removed and is read as a
    base-2 integer.

    Args:
        data: the file contents as bytes.

    Returns:
        A defaultdict(list) mapping each integer label to the 0-based
        positions (counted among the retained lines) of the nodes carrying
        that label, so duplicate labels collapse into a single key.
    """
    nodes_by_code = defaultdict(list)
    node_index = 0
    for row in data.decode('utf-8').split('\n'):
        if len(row.split()) <= 2:
            # header / blank line: not a node label, and it gets no index
            continue
        code = int(row.replace(' ', ''), base=2)
        nodes_by_code[code].append(node_index)
        node_index += 1
    return nodes_by_code

# Load the Problem 2 input through ProblemLoader with process_hamming_codes as
# the preprocessor, so `values` maps each integer label to its node indices.
# NOTE(review): this rebinds `values`, which held the Problem 1 edge list in an
# earlier cell — re-running that earlier cell after this one will see the dict.
values = ProblemLoader(
    data_urls['problem2'], 
    fname="hamming.p", 
    preprocessor=process_hamming_codes
).fetch()
# Preview the first ten (label, node indices) pairs only.
print(list(values.items())[:10])

[(14734287, [0]), (6709165, [1]), (7344869, [2]), (15449752, [3]), (5157860, [4]), (4854709, [5]), (1628832, [6]), (556504, [7]), (15091705, [8]), (8049727, [9])]


In [8]:

  import itertools
    from operator import xor

n_bits = 24

# One union-find element per distinct label. Nodes with identical labels
# (distance 0) already share a single dict key, so no zero mask is needed.
graph = dict(values)
vertices = list(graph.keys())
U = UnionFind(vertices)

# XOR masks: first every mask flipping exactly one bit, then every mask
# flipping exactly two bits. XOR-ing a label with a mask yields a label at
# Hamming distance 1 or 2 from it.
shifts = [1 << i for i in range(n_bits)]
shifts.extend(
  xor(1 << x, 1 << y)
  for x, y in itertools.combinations(list(range(n_bits)), 2)
)

# Merge every pair of existing labels that lie within distance 2 of each other.
for label in vertices:
  for mask in shifts:
    neighbour = xor(label, mask)
    if neighbour not in graph:
      continue  # no node carries this label
    if U.find(neighbour) != U.find(label):
      U.union(neighbour, label)

# Remaining components: the largest k with spacing at least 3.
print(U.n_comps)

6118
