In [1]:
import numpy as np
import time
from libs.nodes import *

# Intersect Complete

Data generated from a given set of "observed" nodes by taking the intersections off all the nodes among each other. This process is repeated
(including with the new data) until no further intersections can be generated.

In [2]:
# Manually finding all possible inferred ancestors
def intersect_scan(nodes):
    features = set(nodes)
    frontier = set(nodes)
    
    passes = 0
    
    while len(frontier) > 0:
        new_frontier = set()
        new_features = set()
        
        passes += 1
        print(f" - Pass {passes} with {len(frontier)} new nodes")

        for n1 in frontier:
            for n2 in features:
                if len(n1) > len(n2):
                    continue
                
                intersect = n1 & n2
                
                if intersect not in features:
                    if intersect:
                        new_features.add(intersect)
                        new_frontier.add(intersect)
        
        frontier = list(new_frontier)
        features |= new_features
    return features
nodes = parse_nodes("../../data/dirty/1109.txt")
np.random.shuffle(nodes)
nodes = nodes[:212]

print(f"Input Nodes", len(nodes))

t0 = time.time()
scan = intersect_scan(nodes)
t1 = time.time()

print(f"Time: {t1-t0}")
print('Output Nodes', len(scan))


# Write results to file
results = list(scan)
write_nodes("../../data/int_complete/13516_out.nodes", results)

# Write input nodes to file
write_nodes("../../data/int_complete/setA/212_in.nodes", nodes)


Input Nodes 212
 - Pass 1 with 212 new nodes
 - Pass 2 with 2095 new nodes


KeyboardInterrupt: 

# Adjacent Intersections

Referring to intersections of given, observed nodes and other inferred nodes, but with the constraint that all inferred nodes share at least
one leg with an observed node. That is, intersections are generated either by observed <-> observed or observed <-> inferred, but never
inferred <-> inferred.

In [5]:
def int_adj(nodes: list[node]):
    total = set()
    frontier = set()
    
    frontier.update(nodes) # Start with all nodes
    
    while len(frontier) > 0:
        new_frontier = set()
        
        for n1 in nodes:
            for n2 in frontier:
                # n1 > n2 condition prevents duplicate comparisons
                if n1 == n2 or len(n1) > len(n2):
                    continue
                
                intersect = n1 & n2
                
                if intersect and intersect not in total:
                    new_frontier.add(intersect)
        
        total.update(frontier)
        frontier = new_frontier
        print(f"- {len(frontier)} new --> {len(total)} accumulated")
    
    return total

In [6]:
test_data = [
    node('1234'),
    node('1235'),
    node('1267'),
]

int_adj(test_data)

- 2 new --> 3 accumulated
- 0 new --> 5 accumulated


{(1,2), (1,2,3,4), (1,2,3,5), (1,2,6,7), (1,2,3)}

The test is successful. $(1,2,3)$ is an observed <-> observed inference. But, $(1, 2)$ is an observed <-> inferred inference.

In [7]:
nodes = parse_nodes("../../data/dirty/1109.txt")
np.random.shuffle(nodes)
nodes_in = nodes[:500]
nodes_out = int_adj(nodes)

- 6899 new --> 1109 accumulated
- 15845 new --> 7074 accumulated
- 11709 new --> 16954 accumulated
- 1883 new --> 18783 accumulated
- 54 new --> 18837 accumulated
- 0 new --> 18837 accumulated


In [8]:
write_nodes("../../data/int_adj/setA/500_in.nodes", nodes_in)
write_nodes("../../data/int_adj/setA/18837_out.nodes", nodes_out)