In [8]:
import time
import pandas as pd

In [9]:
import io_utils


def isolate_lineage(node, data):
    """Collect the entries of ``data`` that are subsets or supersets of ``node``.

    Nodes are compared as sets of their elements. Entries whose length equals
    ``len(node)`` are skipped, so ``node`` itself (and any other same-length
    entry) is never picked up from ``data``.

    Args:
        node: A sized iterable of hashable items (e.g. a tuple).
        data: An iterable of nodes of the same kind.

    Returns:
        A ``(sub_lineage, super_lineage)`` pair of lists. Each list starts
        with ``node``; the first is followed by every entry of ``data`` whose
        items are all contained in ``node``, the second by every entry that
        contains all items of ``node``. Entries keep the order (and
        multiplicity) they have in ``data``.
    """
    # Loop-invariant: build the reference set and size once, not per entry.
    node_items = set(node)
    node_size = len(node)

    sub_lineage = [node]
    super_lineage = [node]

    for candidate in data:
        if len(candidate) == node_size:
            continue
        # One set per candidate serves both containment checks.
        candidate_items = set(candidate)
        if candidate_items <= node_items:
            sub_lineage.append(candidate)
        if node_items <= candidate_items:
            super_lineage.append(candidate)

    return sub_lineage, super_lineage


def break_by_lineage(data):
    """Split ``data`` into lineage chunks, working from the largest nodes down.

    Nodes are grouped by length and the groups are visited from the largest
    length to the smallest. Every node of the current group that has not yet
    appeared in an earlier chunk is passed to ``isolate_lineage`` together
    with the whole of ``data``; both lists it returns are appended to the
    result, and their members are marked as handled.

    Because each lineage is computed against all of ``data`` rather than only
    the still-unhandled nodes, the same node can appear in several chunks.

    Args:
        data: A re-iterable collection of hashable, sized nodes (it is
            traversed more than once).

    Returns:
        A flat list of chunks. For every selected node it contains that
        node's sub-lineage followed by its super-lineage. Selected nodes of
        equal length are processed in their order of first appearance in
        ``data``.
    """
    by_size = {}
    for node in data:
        by_size.setdefault(len(node), []).append(node)

    lineage_lines = []
    remaining = set(data)

    # Sort the sizes once instead of re-sorting the keys on every pass.
    for size in sorted(by_size, reverse=True):
        if not remaining:
            break

        # dict.fromkeys drops duplicates while keeping input order, so the
        # chunk order is reproducible (set iteration order is not guaranteed).
        # The layer is fixed before the loop so removals made while handling
        # it do not change which nodes are processed.
        layer = [node for node in dict.fromkeys(by_size[size]) if node in remaining]

        for node in layer:
            sub_lineage, super_lineage = isolate_lineage(node, data)
            lineage_lines.append(sub_lineage)
            lineage_lines.append(super_lineage)
            remaining.difference_update(sub_lineage, super_lineage)

    return lineage_lines

In [10]:
# Run the lineage split on the 6.txt dataset and show the resulting chunks.
data = io_utils.import_data('../data/6.txt')
small_lineages = break_by_lineage(data)
print(small_lineages)

[[(1, 2, 3, 4, 5), (2,), (1, 2), (2, 3), (1, 2, 3), (1, 2, 3, 4)], [(1, 2, 3, 4, 5)]]


In [11]:
# Time the lineage split on the selected dataset.
dataset = '3515'
data = io_utils.import_data(f'../data/{dataset}.txt')
# perf_counter() is the monotonic, high-resolution clock intended for
# measuring elapsed intervals; time.time() follows the wall clock and can
# jump if the system time is adjusted mid-run.
t0 = time.perf_counter()
chunks = break_by_lineage(data)
print(f"Time: {time.perf_counter() - t0}")

Time: 4.121416091918945


# Success? Checking how much the chunks overlap

In [12]:
# Measure how much the chunks overlap: a node is "doubled" as soon as it is
# seen a second time, in any chunk.
known_nodes = set()
doubled_nodes = set()

for chunk in chunks:
    for node in chunk:
        # First sighting goes to known_nodes, any later one to doubled_nodes.
        (doubled_nodes if node in known_nodes else known_nodes).add(node)

print(f"Of the {len(known_nodes)} nodes, {len(doubled_nodes)} are doubled")
print(f"{len(known_nodes) - len(doubled_nodes)} are unique")
print(f"\tOverlap: {len(doubled_nodes) * 100 / len(known_nodes)}%")

# Summary statistics of the chunk sizes.
print(f"{len(chunks)} chunks")
df = pd.DataFrame(list(map(len, chunks)))
df.describe()

Of the 3515 nodes, 3267 are doubled
248 are unique
	Overlap: 92.94452347083926%
656 chunks


Unnamed: 0,0
count,656.0
mean,212.285061
std,480.169454
min,1.0
25%,1.0
50%,2.0
75%,165.0
max,1999.0
