## Load data

In [185]:
import numpy as np
import pandas as pd
from bigtree import Node, dataframe_to_tree_by_relation, postorder_iter, levelorder_iter

In [186]:
path = "data/tree.txt"
with open(path) as f:
    # The first line is a lone integer (presumably the sequence length, going
    # by the variable name); the remaining lines are header-less CSV rows.
    seq_length = int(f.readline())
    df = pd.read_csv(f, header=None).rename(
        columns={0: 'parent_id', 1: 'distance', 2: 'seq'}
    )
df

Unnamed: 0,parent_id,distance,seq
0,707,0.026966,UUCGGUGGUUAUAGCGGUGGGGACACGCCCGGUCCCAUUCGAACCC...
1,708,0.008823,UUCGGUGGUCACAGCGGUGGGGAAACGCCCGGUCCCAUUCGAACCC...
2,709,0.026650,UUCGGUGGUAAUAGCGGUGGGGAAACGCCCGGUCCCAUUCGAACCC...
3,712,0.121237,-UCGGUGGCCAUAGCAGCAGGGAA-CGCCCGGACCCAUUCGAACCC...
4,713,0.017874,UUCGGUGGUUUUAGCGUCAGGGAAACGCCCGGUCCCAUUCGAACCC...
...,...,...,...
1405,1404,0.008787,
1406,1402,0.018323,
1407,1406,0.026961,
1408,1407,0.063071,


In [187]:
# Use each row's index label, converted to a string, as that node's own id.
df['child_id'] = df.index.map(str)

In [188]:
# Convert parent ids to strings as well, so they are the same type as the
# `child_id` values they are matched against.
df['parent_id'] = df.parent_id.map(str)

In [189]:
# Build the tree from the child -> parent relation, carrying `distance` and
# `seq` along as node attributes.
root = dataframe_to_tree_by_relation(
    df,
    child_col='child_id',
    parent_col='parent_id',
    attribute_cols=['distance', 'seq'],
)

## Better ordering

Here we sort the nodes according to their co-height (the height of the subtree below each node). This is still a topological ordering, and all leaves are listed first.

In [190]:
from bigtree import find_full_path

In [191]:
# Peel the tree layer by layer on a copy, so that `root` keeps its full
# structure for the later path look-ups.
aux = root.copy()
target_order = []
while aux.children:
    # One traversal per round: the current leaves, ordered by integer name.
    leaves = sorted(aux.leaves, key=lambda node: int(node.name))
    # Record the paths before detaching; a detached node no longer has its
    # ancestors in its path.
    target_order.extend(leaf.path_name for leaf in leaves)
    # Detach every leaf; parents left without children become the leaves of
    # the next round. The copy's root itself is never added to `target_order`.
    for leaf in leaves:
        leaf.parent = None

In [192]:
# Map each node's original name to its position in `target_order`.
old_to_new = {
    find_full_path(root, node_path).name: position
    for position, node_path in enumerate(target_order)
}

In [193]:
# Write one CSV row per node, in `target_order`. Node references in the row
# are the new indices taken from `old_to_new`.
save_path = 'data/tree_topological.csv'
with open(save_path, 'w') as file:
    file.write("parent,left,right,distance,sequence\n")
    for node_path in target_order:
        node = find_full_path(root, node_path)
        children = [str(old_to_new[child.name]) for child in node.children]
        # Pad to at least two entries so `left` and `right` are always present.
        children += [''] * (2 - len(children))
        if len(children) == 3:
            # A node with exactly three children is written without a parent
            # field, so its three child indices fill the first three columns.
            # NOTE(review): presumably the trifurcating root of an unrooted
            # tree -- confirm.
            links = children
        else:
            links = [str(old_to_new[node.parent.name]), *children]
        distance = getattr(node, 'distance', '')
        seq = getattr(node, 'seq', '')
        file.write(f"{','.join(links)},{distance},{seq}\n")

## DFS postorder

This is also a topological ordering, but it has the drawback that some non-terminal nodes come before some leaves.

In [6]:
# Start a fresh name -> new-index mapping. This rebinds `old_to_new`, so the
# mapping built in the previous section is no longer available afterwards.
old_to_new = {}

In [7]:
# Number every node by its position in a post-order walk starting at `root`.
old_to_new.update(
    (node.name, position) for position, node in enumerate(postorder_iter(root))
)

In [9]:
# Begin the export at the first child of `root` rather than at `root` itself.
# NOTE(review): presumably `root` is a placeholder parent with a single
# child -- confirm against the data.
start = root.children[0]

In [10]:
# Write one CSV row per node of the subtree under `start`, in post-order.
# Node references in the row are the new indices taken from `old_to_new`.
save_path = 'data/tree_preprocessed.csv'
with open(save_path, 'w') as file:
    file.write("parent,left,right,distance,sequence\n")
    for node in postorder_iter(start):
        children = [str(old_to_new[child.name]) for child in node.children]
        # Pad to at least two entries so `left` and `right` are always present.
        children += [''] * (2 - len(children))
        if len(children) == 3:
            # A node with exactly three children is written without a parent
            # field, so its three child indices fill the first three columns.
            # NOTE(review): presumably the trifurcating root of an unrooted
            # tree -- confirm.
            links = children
        else:
            links = [str(old_to_new[node.parent.name]), *children]
        distance = getattr(node, 'distance', '')
        seq = getattr(node, 'seq', '')
        file.write(f"{','.join(links)},{distance},{seq}\n")