Skip to content

Commit

Permalink
Split item extraction into smaller functions
Browse files Browse the repository at this point in the history
  • Loading branch information
plafl committed Nov 16, 2015
1 parent 4ff0e4c commit cfcc08b
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 39 deletions.
103 changes: 69 additions & 34 deletions aile/kernel.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ def dtw_path(DTW):
return s, t


def dtw_match_1(s, t, D):
def dtw_match(s, t, D):
s = s.copy()
for i, j in enumerate(s):
m = k = i
Expand All @@ -277,11 +277,42 @@ def dtw_match_1(s, t, D):
return s


def dtw_match_2(s, t, D):
    """Run ``dtw_match_1`` with the roles of the two sequences exchanged.

    The sequences are passed in swapped order and the distance matrix is
    transposed so that its rows still correspond to the first argument
    handed to ``dtw_match_1``.
    """
    transposed = D.T
    return dtw_match_1(t, s, transposed)
def path_distance(p1, p2):
    """Return the matrix of pairwise distances between two lists of paths.

    The distance between two paths is the length of the longer path minus
    the length of their common prefix. Identical paths are therefore at
    distance 0, and paths that already differ in their first element are
    at distance ``max(len(q1), len(q2))``.

    Parameters:
        p1: sequence of paths; each path is a sequence of elements that
            can be compared with ``!=``.
        p2: second sequence of paths, same layout as ``p1``.

    Returns:
        A float numpy array ``D`` of shape ``(len(p1), len(p2))`` where
        ``D[i, j]`` is the distance between ``p1[i]`` and ``p2[j]``.
    """
    D = np.zeros((len(p1), len(p2)))
    for i, q1 in enumerate(p1):
        for j, q2 in enumerate(p2):
            # Count how many leading elements the two paths share.
            common = 0
            for a, b in zip(q1, q2):
                if a != b:
                    break
                common += 1
            D[i, j] = max(len(q1), len(q2)) - common
    return D


def extract_items(ptree, trees, labels):
def find_cliques(G, min_size):
    """Assign graph nodes to disjoint cliques of at least ``min_size`` nodes.

    The cliques reported by ``nx.find_cliques`` that have at least
    ``min_size`` members are processed from largest to smallest. Each one
    keeps only the nodes not already claimed by an earlier clique, and is
    discarded if that leaves it with fewer than ``min_size`` nodes.

    Returns a dict mapping every retained node to the index of its clique
    in that largest-first ordering.
    """
    candidates = sorted(
        (set(members) for members in nx.find_cliques(G)
         if len(members) >= min_size),
        key=len,
        reverse=True)

    # Make the cliques pairwise disjoint: larger cliques claim nodes first.
    claimed = set()
    for members in candidates:
        members -= claimed
        claimed |= members

    survivors = [members for members in candidates
                 if len(members) >= min_size]

    # The surviving cliques share no nodes, so no entry is ever overwritten.
    node_to_clique = {}
    for index, members in enumerate(survivors):
        for node in members:
            node_to_clique[node] = index
    return node_to_clique


def paths_and_nodes(ptree, trees, labels):
all_paths = []
all_nodes = []
for tree in trees:
Expand All @@ -293,51 +324,55 @@ def extract_items(ptree, trees, labels):
nodes.append(path[0])
all_paths.append(paths)
all_nodes.append(nodes)
return all_paths, all_nodes


def match_graph(all_paths, all_nodes):
    """Build a graph whose edges join nodes matched between pairs of trees.

    Parameters:
        all_paths: one list of paths per tree.
        all_nodes: one list of nodes per tree, parallel to ``all_paths``
            (``all_nodes[k][i]`` is the node that ``all_paths[k][i]``
            belongs to).

    Returns:
        An ``nx.Graph`` with an edge ``(n1[i], n2[j])`` for every pair of
        nodes that ``dtw_match`` pairs up across two different trees.
    """
    G = nx.Graph()
    for (p1, n1), (p2, n2) in itertools.combinations(
            zip(all_paths, all_nodes), 2):
        # The distance matrix comes from the shared helper rather than
        # being recomputed inline.
        D = path_distance(p1, p2)
        a1, a2 = dtw_path(dtw(D))
        m = dtw_match(a1, a2, D)
        for i, j in enumerate(m):
            # -1 marks an entry of the first tree left without a match.
            if j != -1:
                G.add_edge(n1[i], n2[j])
    return G

cliques = []
for K in nx.find_cliques(G):
if len(K) >= 0.5*len(trees):
cliques.append(K)
cliques.sort(reverse=True, key=lambda x: len(x))
node_to_clique = {}
for i, K in enumerate(cliques):
for node in K:
if node not in node_to_clique:
node_to_clique[node] = i

def align_items(ptree, trees, node_to_clique):
    """Lay out the clique nodes of every tree as one row of a table.

    Parameters:
        ptree: object whose ``match`` attribute is indexable by node; the
            nodes ``range(root, max(root + 1, ptree.match[root]))`` are
            scanned for each root.
        trees: sequence of trees, each an iterable of root nodes.
        node_to_clique: dict mapping a node to its clique (column) index.

    Returns:
        An integer numpy array of shape ``(len(trees), n_cols)`` where
        ``n_cols`` is the largest clique index plus one (0 when
        ``node_to_clique`` is empty). ``items[i, col]`` holds the node of
        tree ``i`` assigned to clique ``col``, or -1 when the tree has no
        node in that clique.
    """
    # max() of an empty sequence raises ValueError, so handle "no cliques".
    n_cols = (max(node_to_clique.values()) + 1) if node_to_clique else 0
    items = np.full((len(trees), n_cols), -1, dtype=int)
    for i, tree in enumerate(trees):
        for root in tree:
            for c in range(root, max(root + 1, ptree.match[root])):
                col = node_to_clique.get(c)
                # Compare against None: column 0 is a valid clique index.
                if col is not None:
                    items[i, col] = c
    return items


def extract_items(ptree, trees, labels):
    """Run the full item-extraction pipeline and return the aligned items.

    Steps: collect paths and nodes per tree, build the match graph between
    trees, keep the cliques that span at least half of the trees, and
    align the clique nodes into a table with one row per tree.
    """
    all_paths, all_nodes = paths_and_nodes(ptree, trees, labels)
    graph = match_graph(all_paths, all_nodes)
    node_to_clique = find_cliques(graph, 0.5*len(trees))
    return align_items(ptree, trees, node_to_clique)


class ItemExtract(object):
    """Run the extraction pipeline on a page tree and keep every stage.

    Attributes set by the constructor, in order of computation:
    ``page_tree``, ``kernel``, ``labels``, ``trees`` and ``items``.
    """

    def __init__(self, page_tree):
        self.page_tree = page_tree
        # Each stage is stored on the instance as soon as it is computed
        # and then fed to the next stage.
        self.kernel = similarity = kernel(page_tree)
        self.labels = labels = cluster(page_tree, similarity)
        self.trees = trees = extract_trees(page_tree, labels)
        self.items = extract_items(page_tree, trees, labels)


# Import cython functions
########################################################################
build_counts = _ker.build_counts
Expand Down
7 changes: 2 additions & 5 deletions test/test_kernel.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,7 @@ def build_tree(ptree, labels=None):

t1 = time.clock()
page_tree = ker.PageTree(page)
K = ker.kernel(page_tree)
l = ker.cluster(page_tree, K)
trees = ker.extract_trees(page_tree, l)
items = ker.extract_items(page_tree, trees, l)
ie = ker.ItemExtract(page_tree)
print 'Total time: {0} seconds'.format(time.clock() - t1)
t = build_tree(page_tree, labels=l)
t = build_tree(page_tree, labels=ie.labels)
t.show()

0 comments on commit cfcc08b

Please sign in to comment.