In [89]:
# Load necessary libraries
library(ape)
library(phangorn)

In [99]:

# @description
# Find the nodes directly attached to the root of a tree by reading the
# tree$edge matrix.
# @param tree A phylo format phylogenetic tree; only tree$edge and
#   tree$tip.label are read.
# @return A vector of unique integers: the node numbers of the root's
#   children, in the order they appear in tree$edge.
find_root_edge <- function(tree) {
  edges <- tree$edge
  # In ape's node numbering the tips are 1..Ntip and the root is Ntip + 1.
  root_node <- length(tree$tip.label) + 1

  # Column 1 of the edge matrix holds the parent of each edge, so the rows
  # whose parent is the root carry the root's children in column 2. The
  # comparison is vectorised over the column instead of scanning row by row.
  children <- edges[edges[, 1] == root_node, 2]

  unique(children)
}

#' @description Process a phylogenetic tree by repeatedly rooting at the
#'   midpoint and splitting at the root until every subtree is within the
#'   specified size limits, then write each kept subtree to a Newick file.
#' @param tree_file Path to the Newick file containing the phylogenetic tree.
#' @param tree_size_lower_lim Minimum number of tips a subtree must have to be
#'   kept; smaller subtrees are discarded.
#' @param tree_size_upper_lim Maximum number of tips a subtree may have to be
#'   kept; larger subtrees are split again.
#' @param output_dir Directory to save the processed subtrees. It is created
#'   if it does not exist yet.
#' @return Invisibly, the list of kept subtrees, in the order in which they
#'   are written as subtree_<i>.nwk.
process_phylogenetic_tree <- function(tree_file, tree_size_lower_lim, tree_size_upper_lim, output_dir) {
  # Work queue of trees whose size still has to be checked.
  unsolve_subtree_list <- list(read.tree(tree_file))
  solved_subtree_list <- list()

  # Each pass empties the current queue and fills the queue for the next pass.
  while (length(unsolve_subtree_list) > 0) {
    process_tree_list <- list()

    for (tree in unsolve_subtree_list) {
      tree_size <- length(tree$tip.label)

      if (tree_size < tree_size_lower_lim) {
        # Discard the tree if it's too small
        next
      }
      if (tree_size <= tree_size_upper_lim) {
        # Keep the tree if it's within the size limits
        solved_subtree_list <- c(solved_subtree_list, list(tree))
        next
      }

      # The tree is too large: re-root it at the midpoint and split it at the
      # root into one subtree per root child.
      tree <- midpoint(tree, node.labels = "support")
      n_tips <- length(tree$tip.label)

      # Iterate over every child of the root rather than only the first two,
      # so no clade is lost when the root is a polytomy.
      for (child_node in find_root_edge(tree)) {
        if (child_node <= n_tips) {
          # The child is a single tip; extract.clade() only accepts internal
          # nodes, and a one-tip clade cannot be written as a subtree.
          next
        }
        subtree <- extract.clade(tree, child_node, collapse.singles = TRUE)
        if (length(subtree$tip.label) >= tree_size) {
          # Every split must shrink the tree, otherwise the loop never ends.
          stop("Splitting at the root did not reduce the tree size.",
               call. = FALSE)
        }
        process_tree_list <- c(process_tree_list, list(subtree))
      }
    }

    unsolve_subtree_list <- process_tree_list
  }

  # write.tree() does not create missing directories.
  if (!dir.exists(output_dir)) {
    dir.create(output_dir, recursive = TRUE)
  }

  # Save the solved subtrees to the output directory
  for (i in seq_along(solved_subtree_list)) {
    write.tree(solved_subtree_list[[i]], file = file.path(output_dir, paste0("subtree_", i, ".nwk")))
  }

  invisible(solved_subtree_list)
}

# Run the splitter on the reference tree, keeping subtrees of 3 to 30 tips.
process_phylogenetic_tree(
  tree_file = "/home/tim/project/GTDB_TREE/Result_rona/test/p__Marinisomatota_275/ref_tree.tre",
  tree_size_lower_lim = 3,
  tree_size_upper_lim = 30,
  output_dir = "/home/tim/project/GTDB_TREE/Result_rona/test/p__Marinisomatota_275/loop_1/mid_point_trees"
)


In [96]:


# Size limits (in tips) and input path for the step-by-step run of the loop.
tree_size_upper_lim <- 30
tree_size_lower_lim <- 3
tree_file <- "/home/tim/project/GTDB_TREE/Result_rona/test/p__Marinisomatota_275/ref_tree.tre"

# Empty accumulators, then the work queue seeded with the full tree.
process_tree_list <- list()
solved_subtree_list <- list()
unsolve_subtree_list <- list(read.tree(tree_file))


In [97]:
# @description
# Find the nodes directly attached to the root of a tree by reading the
# tree$edge matrix.
# @param tree A phylo format phylogenetic tree; only tree$edge and
#   tree$tip.label are read.
# @return A vector of unique integers: the node numbers of the root's
#   children, in the order they appear in tree$edge.
find_root_edge <- function(tree) {
  edges <- tree$edge
  # In ape's node numbering the tips are 1..Ntip and the root is Ntip + 1.
  root_node <- length(tree$tip.label) + 1

  # Column 1 of the edge matrix holds the parent of each edge, so the rows
  # whose parent is the root carry the root's children in column 2. The
  # comparison is vectorised over the column instead of scanning row by row.
  children <- edges[edges[, 1] == root_node, 2]

  unique(children)
}

# Loop until unsolve_subtree_list is empty. Each pass checks every queued tree:
# too-small trees are dropped, in-range trees are kept, and too-large trees are
# midpoint-rooted and split at the root into subtrees queued for the next pass.
while (length(unsolve_subtree_list) > 0) {
  for (tree in unsolve_subtree_list) {
    # Check the size of the tree (printed to trace the splitting progress)
    tree_size <- length(tree$tip.label)
    print(tree_size)
    if (tree_size < tree_size_lower_lim) {
      # Discard the tree if it's too small
      next
    } else if (tree_size <= tree_size_upper_lim) {
      # Keep the tree if it's within the size limits
      solved_subtree_list <- append(solved_subtree_list, list(tree))
    } else {
      # Process the tree if it's too large
      tree <- midpoint(tree, node.labels = "support")
      n_tips <- length(tree$tip.label)
      root_edge_nodes <- find_root_edge(tree)
      # Extract one subtree per root child rather than only the first two, so
      # no clade is lost when the root is a polytomy.
      for (child_node in root_edge_nodes) {
        if (child_node <= n_tips) {
          # The child is a single tip; extract.clade() only accepts internal
          # nodes, and a one-tip clade cannot be kept as a subtree.
          next
        }
        subtree <- extract.clade(tree, child_node, collapse.singles = TRUE)
        if (length(subtree$tip.label) >= tree_size) {
          # Every split must shrink the tree, otherwise the loop never ends.
          stop("Splitting at the root did not reduce the tree size.",
               call. = FALSE)
        }
        process_tree_list <- append(process_tree_list, list(subtree))
      }
    }
  }
  # Replace unsolve_subtree_list with process_tree_list and clear process_tree_list
  unsolve_subtree_list <- process_tree_list
  process_tree_list <- list()
}

[1] 266
[1] 44
[1] 222
[1] 10
[1] 34
[1] 147
[1] 75
[1] 16
[1] 18
[1] 16
[1] 131
[1] 44
[1] 31
[1] 102
[1] 29
[1] 19
[1] 25
[1] 23
[1] 8
[1] 65
[1] 37
[1] 30
[1] 35
[1] 23
[1] 14
[1] 19
[1] 16


In [98]:
# Display the subtrees that the loop kept (those within the size limits).
solved_subtree_list

[[1]]

Phylogenetic tree with 10 tips and 9 internal nodes.

Tip labels:
  G022560655, G020354025, G020355065, G020346745, G022563485, G030640955, ...
Node labels:
  100.00, 68.00, 76.00, 88.00, 100.00, 100.00, ...

Rooted; includes branch lengths.

[[2]]

Phylogenetic tree with 16 tips and 15 internal nodes.

Tip labels:
  G002719095, G002703645, G002171125, G002731795, G002170735, G002717625, ...
Node labels:
  75.00, 75.00, 76.00, 99.00, 100.00, 96.00, ...

Rooted; includes branch lengths.

[[3]]

Phylogenetic tree with 18 tips and 17 internal nodes.

Tip labels:
  G002704045, G002703375, G014381925, G002689995, G030747125, G030748955, ...
Node labels:
  75.00, 56.00, 38.00, 51.00, 100.00, 100.00, ...

Rooted; includes branch lengths.

[[4]]

Phylogenetic tree with 16 tips and 15 internal nodes.

Tip labels:
  G014381485, G004124475, G022562455, G002471865, G002402135, G002691365, ...
Node labels:
  100.00, 62.00, 64.00, 100.00, 100.00, 100.00, ...

Rooted; includes branch lengths.


In [75]:
# List the edges whose child (column 2) is the root node, Ntip + 1.
tree$edge[which(tree$edge[, 2] == 1 + length(tree$tip.label)), , drop = FALSE]

In [59]:
# Pull out the clade descending from node 268 of the current tree.
extract.clade(phy = tree, node = 268, collapse.singles = TRUE)


Phylogenetic tree with 5 tips and 4 internal nodes.

Tip labels:
  G022573815, G022567655, G022571215, G004402895, G004402915
Node labels:
  100.00, 94.00, 100.00, 100.00

Rooted; includes branch lengths.

In [77]:
# NOTE(review): the node argument is left empty in this call, so as written it
# presumably fails with a missing-argument error; the output shown below likely
# came from an earlier version that passed a node number. Confirm and restore it.
extract.clade(tree, , collapse.singles = TRUE)


Phylogenetic tree with 261 tips and 260 internal nodes.

Tip labels:
  G016784265, G018263365, G021734025, G001872685, G021734065, G021734045, ...
Node labels:
  100.00, 64.00, 100.00, 79.00, 48.00, 100.00, ...

Rooted; includes branch lengths.

In [78]:
# Node number used for the root throughout this notebook: tip count plus one.
length(tree$tip.label) + 1