In [3]:
import sys
import json
import pandas as pd

sys.path.append('..')
import ultrametric_distance

In [4]:
def convert_nodes_to_json_tree(node_list):
    """
    Convert hierarchical node names into a nested-dict (JSON-ready) tree.

    Each node name is a ';'-separated lineage such as "p__node1;c__node2".
    Every component containing '__' becomes one level of the tree, outermost
    first; components without '__' are dropped. Lineages that share a prefix
    share the corresponding branch.

    Args:
        node_list: Iterable of node-name strings, e.g.
            ["p__node1", "p__node1;c__node2", ...]. Empty or whitespace-only
            entries are skipped. The input is traversed exactly once, so
            one-shot iterators/generators are accepted as well as lists.

    Returns:
        {"root": tree}, where every key of ``tree`` is a single lineage
        component (e.g. "c__node2", not the full "p__node1;c__node2" string)
        and every value is the dict of that node's children. Leaf nodes map
        to an empty dict.
    """

    def parse_node_path(node_name):
        """Split a lineage string into its '__'-tagged components."""
        # Strip each component so "p__A; c__B" and "p__A;c__B" land on the
        # same branch instead of creating a separate " c__B" key.
        components = (part.strip() for part in node_name.split(';'))
        return [part for part in components if '__' in part]

    tree = {}

    for node_name in node_list:
        if not node_name.strip():
            continue

        current = tree
        for component in parse_node_path(node_name):
            # setdefault creates the child the first time it is seen and
            # returns the existing child otherwise; either way we descend.
            # No separate leaf-detection pass is needed: a node that never
            # receives children simply keeps its empty dict.
            current = current.setdefault(component, {})

    return {"root": tree}

In [8]:
# Sample whose row is used for the thresholded example tree, and the cutoff
# applied to that row's values. Named here so they can be tuned in one place.
EXAMPLE_SAMPLE_ID = "DO0561"
MIN_NODE_VALUE = 0.2

# Rows are indexed by sample_id; the column labels are the hierarchical node
# names that get turned into the tree.
taxa_matrix = pd.read_csv("../data/do1200_microbiome_taxa.csv", index_col='sample_id')
sample_nodes = taxa_matrix.columns.to_list()

# Convert to JSON tree, keep all nodes
tree_all = convert_nodes_to_json_tree(sample_nodes)

# Convert to JSON tree, keeping only the nodes whose value for the example
# sample is strictly greater than the threshold (values equal to it are dropped).
sample_values = taxa_matrix.loc[EXAMPLE_SAMPLE_ID]
tree_subset = convert_nodes_to_json_tree(
    sample_values.index[sample_values > MIN_NODE_VALUE].to_list()
)

print("\nSubset of tree:")
print(json.dumps(tree_subset, indent=2))


Subset of tree:
{
  "root": {
    "p__Firmicutes": {},
    "p__Bacteroidota": {
      "c__Bacteroidia": {
        "o__Bacteroidales": {
          "f__Muribaculaceae": {}
        }
      }
    },
    "p__Actinobacteriota": {
      "c__Actinobacteria": {
        "o__Bifidobacteriales": {
          "f__Bifidobacteriaceae": {
            "g__Bifidobacterium": {}
          }
        }
      }
    }
  }
}


In [9]:
# Smoke-test the tree distance by comparing the thresholded tree with itself.
# NOTE(review): the recorded output for this call is ~0.062, not 0, even though
# both arguments are the same object. If get_ultrametric_distance is meant to
# behave like a metric, identical inputs should presumably yield 0 -- confirm
# whether the non-zero value is expected (e.g. randomness or a baseline term
# inside the function) before relying on these distances.
ultrametric_distance.get_ultrametric_distance(tree_subset, tree_subset)

0.06196510982275294