# Collapsing branches in a phylogenetic tree

Setting up

In [1]:
import re
from collections import Counter
from ete3 import Tree
from ete3 import PhyloNode
import pandas as pd
from Bio import SeqIO

Loading tree

In [2]:
streps = Tree("../supplementary_file_10/strep/TBE.raxml.support")

Loading cluster members data

In [3]:
data = pd.read_csv("../supplementary_file_1/cluster_taxID_info.csv")

Get data for leaf annotation, and add annotations for outgroups.

In [4]:
data = data.loc[data['theresholds_id'] == '100%']

In [5]:
accession_members = {k:str(dict(Counter(v.split(', ')))).replace("{", "").replace('}', '').replace("'", '').replace(": ", ' _').replace(', ', '_ ')+'_' for k, v in  data.set_index('cluster_representative').to_dict()['member_names'].items()}

In [6]:
outgroups = list(SeqIO.parse("../supplementary_file_9/data/outgroups.fasta", "fasta"))

In [7]:
for _ in outgroups:
    if 'Bacteria;A' in _.description.split(' ')[1]:
        accession_members[_.description.split(' ')[0]] = _.description.split(';')[-1]
    if 's_' in _.description:
        accession_members[_.description.split(' ')[0]] = ' '.join(_.description.split(';')[-2::]).replace('g__', '').replace(' s__', '')
    if 'S004078527' in _.description:
        accession_members[_.description.split(' ')[0]] = ' '.join(_.description.split(' ')[1:3]).strip(';')

In [8]:
for _ in streps:
    _.name = accession_members[_.name]

Some species names are vague and unclear eg. *rock porewater*. These species can be ignored when collapsing branches in a phylogenetic tree.

List of species to ignore:

In [9]:
ignore_list = ("uncultured Streptomyces", "uncultured bacterium",
               "uncultured actinobacterium", "bacterium",
               "Streptomyces sp.", "synthetic construct",
               "Actinomycete", "Candidatus Streptomyces",
               "Streptomycetaceae SR", "uncultured Actinomycetales",
               "Bacillus sp.", "Streptomyces cf.",
               "Actinomycetales bacterium", "Streptomycetaceae bacterium", "rock porewater",
               "Actinobacteria bacterium", "Actinomyces sp.")

### Exploring *Strep* tree
Reasons why to explore tree before altering it:
- to understand what needs to be done
- to identify possible issues
- to find a solution for the identified issues

`ete3` allows to retive all leaf nodes with `get_leaves()`. 

This method searched the whole tree structure and returned the list of terminal nodes.

In [10]:
leaves = streps.get_leaves() # get all leaf nodes

The list of terminal nodes can be then searched to identify leaf nodes with specific species. For instance, we can search for leaf nodes with *Streptomyces malaysiensis*.

In [11]:
targets = [_ for _ in leaves if "malaysiensis" in _.name]

With `ete3` we can find common ancestor for a node or a list of nodes with `get_common_ancestor()` method. 

In [12]:
common_ancestor = targets[0].get_common_ancestor(targets)

In [13]:
print(common_ancestor)


      /-Streptomyces autolyticus _1_
     |
     |      /-Streptomyces sp. _3_
     |     |
     |     |      /-Streptomyces sp. _3_
     |     |   /-|
   /-|   /-|  |   \-Streptomyces sp. _3_
  |  |  |  |  |
  |  |  |  |  |         /-Streptomyces sp. _3_
  |  |  |  |  |      /-|
  |  |  |   \-|     |  |   /-Streptomyces sp. _3_
  |  |  |     |     |   \-|
  |  |  |     |   /-|      \-Streptomyces sp. _3_
  |   \-|     |  |  |
  |     |     |  |  |   /-Streptomyces sp. _1_
  |     |      \-|   \-|
  |     |        |      \-Streptomyces sp. _2_
  |     |        |
  |     |         \-Streptomyces sp. _3_
  |     |
  |      \-Streptomyces malaysiensis _1_ Streptomyces autolyticus _3_ Streptomyces sp. _3_
  |
  |   /-Streptomyces sp. _1_
  |  |
  |  |            /-Streptomyces sp. _3_
  |  |           |
  |  |           |   /-Streptomyces sp. _1_
  |  |           |  |
--|  |           |  |      /-Streptomyces malaysiensis _1_
  |  |           |  |     |
  |  |           |  |     |      /-

#### Checking what proportion of leaf nodes under an internal node share the same species name. 

In [14]:
def get_species_names(node, ignore=("Streptomyces sp")):
    """Return Counter of all species names under an internal node.
    
    :param ignore:  collection of species names to ignore
    """
    species_names = []  # hold list of species names that have been seen
    
    leaf_names = [_.name for _ in node.get_leaves()]  # what species names are there in *each leaf*?
    for name in leaf_names:
        species_names.extend([_.strip() for _ in re.split("_[0-9]*_", name) \
                              if (_ and _.strip() not in ignore)])
        
    return Counter(species_names)

In [15]:
get_species_names(common_ancestor)

Counter({'Streptomyces autolyticus': 3,
         'Streptomyces sp.': 35,
         'Streptomyces malaysiensis': 18,
         'Streptomyces solisilvae': 1,
         'Streptomyces cameroonensis': 1})

***Problem to solve***: When the sequences were clustered at 100% ID and after the trimmed alignment was dereplicated, all annotations associated with each representative sequences were recorded. The number in `_[0-9]_`, provides information of how many times the same species name has been seen within that particular cluster. 

Although the count of *Streptomyces malaysiensis* is 29, the function returns 21. This is because this function does not take into account ther number of occurance in `_[0-9]_`

So, to solve this problem `get_species_names` function was updated.   


In [16]:
def get_species_names(node, ignore=ignore_list):
    """Return Counter of all species names under an internal node.
    Takes into account number of species occurance in _[0-9]_.
    
    Parameters:
    
       node (ete3.coretype.tree.TreeNode): Leaf node.
       ignore (list): Collection of species names to ignore.
    
    Returns:
        Counter (dict): Counter of all species names under an internal node.
    
    
    Example:
        
        
        >>> t_node = Tree("(Streptomyces malaysiensis _1_,(Streptomyces autolyticus _2_ , \
                             (Streptomyces cameroonensis _1_,(Streptomyces samsuensis _2_, \
                             Streptomyces samsuensis _6_ Streptomyces sp. _1_))));") 
        >>> ignore_list = ['Streptomyces sp.']
        >>> get_species_names(t_node, ignore=ignore_list)
        Counter({'Streptomyces malaysiensis': 1,
             'Streptomyces autolyticus': 2,
             'Streptomyces cameroonensis': 1,
             'Streptomyces samsuensis': 8})
         
    """
    species_names = []  # hold list of species names that have been seen
    
    
    leaf_names = [_.name for _ in node.get_leaves()]  # what species names are there in *each leaf*?
    for name in leaf_names:
        species_names.extend([_.strip() for _ in re.split("_[0-9]*_", name) \
                              if (_ and _.strip() not in ignore)])
    
    
    
    
    count_leaf_names = [] # hold list of species names including their count
    
    for name in leaf_names:  # 
        count_leaf_names.extend([_.strip() for _ in re.split("_ ", name)])
    
    exclude_leaf_names_count = [] # hold list of species names including their count, 
                                        #but excluding those in ignore_list
        
    exclude_leaf_names_count.extend([_ for _ in count_leaf_names \
        if re.sub(" _[0-9]*_?","" ,_) not in ignore])
    
    count = [] # hold list for species count
    for _ in exclude_leaf_names_count:
        count.extend([num for num in re.findall(r'\d+', _)])
    

    
    all_species_count = zip(species_names, count) 
    
    new_species_names = [] # hold a list of species names that have been seen and multiply by their count
    for _ in all_species_count:
        new_species_names.extend([_[0] for i in range(int(_[1]))])
    return Counter(new_species_names)

Runing a test for *Streptomyces malaysiensis*, to see whether it will return the right counter for all species names.

In [17]:
get_species_names(common_ancestor)

Counter({'Streptomyces autolyticus': 5,
         'Streptomyces malaysiensis': 25,
         'Streptomyces solisilvae': 2,
         'Streptomyces cameroonensis': 2})

Now, the count for *Streptomyces malaysiensis* is 29. 


### Collapsing the tree: top-down approach

Each internal node is examined and leaf nodes below that node which contain only a single species name are identified with the following function:

In [18]:
def collapse_branches_with_one_species(node):
    """Return True if node can be collapsed.
    
    
    Ignores the same species as in get_species_names(), and
    collapses branches with a single species.  
    
    Parameters:
       node (ete3.coretype.tree.TreeNode): Leaf node.
    
    Returns:
       Boolean: True if node can be collapsed, otherwise False.

    """
      
    
    node_names = get_species_names(node)
    
    

    if len(node_names) == 0:  # no valid species names
        node.name = "no_species_info"
        return True
    
    if len(node_names) == 1:  # only one valid species name
        new_name = list(node_names.keys())[0]
        if not node.is_leaf():
            new_name += " [COLLAPSED]"
        node.name = new_name
        return True

    if node.is_leaf():
        return True
    else:
        return False

# Testing the function to see whether it exhibits "desired" behaviour 

Does it collapse any branches when no duplicates are found in the tree?

In [19]:
no_duplicates = Tree("(Streptomyces malaysiensis _1_,(Streptomyces autolyticus _2_ , \
             (Streptomyces cameroonensis _1_,(Streptomyces samsuensis _2_,Streptomyces solisilvae _1_))));")
print(f'Tree node before collapse: {no_duplicates}')

no_duplicates_collapsed = Tree(no_duplicates.write(is_leaf_fn=collapse_branches_with_one_species))
print(f'Tree node after collapse: {no_duplicates_collapsed}')

Tree node before collapse: 
   /-Streptomyces malaysiensis _1_
--|
  |   /-Streptomyces autolyticus _2_
   \-|
     |   /-Streptomyces cameroonensis _1_
      \-|
        |   /-Streptomyces samsuensis _2_
         \-|
            \-Streptomyces solisilvae _1_
Tree node after collapse: 
   /-Streptomyces malaysiensis
--|
  |   /-Streptomyces autolyticus
   \-|
     |   /-Streptomyces cameroonensis
      \-|
        |   /-Streptomyces samsuensis
         \-|
            \-Streptomyces solisilvae


Does it collapse any branches, when some nodes represent mutiple species?

In [20]:
multi_species = Tree("(Streptomyces malaysiensis _1_,(Streptomyces autolyticus _2_ , \
             (Streptomyces cameroonensis _1_,(Streptomyces samsuensis _2_,Streptomyces samsuensis _6_ Streptomyces solisilvae _1_))));")
print(f'Tree before collapse: {multi_species}')

multi_species_collapsed = Tree(multi_species.write(is_leaf_fn=collapse_branches_with_one_species))
print(f'Tree after collapse: {multi_species_collapsed}')

Tree before collapse: 
   /-Streptomyces malaysiensis _1_
--|
  |   /-Streptomyces autolyticus _2_
   \-|
     |   /-Streptomyces cameroonensis _1_
      \-|
        |   /-Streptomyces samsuensis _2_
         \-|
            \-Streptomyces samsuensis _6_ Streptomyces solisilvae _1_
Tree after collapse: 
   /-Streptomyces malaysiensis
--|
  |   /-Streptomyces autolyticus
   \-|
     |   /-Streptomyces cameroonensis
      \-|
        |   /-Streptomyces samsuensis
         \-|
            \-Streptomyces samsuensis _6_ Streptomyces solisilvae _1_


Does it collapse any branches when some nodes represent species in ignore list?

In [21]:
species_ignore = Tree("(Streptomyces malaysiensis _1_,(Streptomyces sp. _2_ , \
             (Streptomyces sp. _1_,(Streptomyces malaysiensis _2_, synthetic construct _1_))));")
print(f'Tree before collapse: {species_ignore}')

species_ignore_collapsed = Tree(species_ignore.write(is_leaf_fn=collapse_branches_with_one_species))
print(f'Tree after collapse: {species_ignore_collapsed}')

Tree before collapse: 
   /-Streptomyces malaysiensis _1_
--|
  |   /-Streptomyces sp. _2_
   \-|
     |   /-Streptomyces sp. _1_
      \-|
        |   /-Streptomyces malaysiensis _2_
         \-|
            \-synthetic construct _1_
Tree after collapse: 
--Streptomyces malaysiensis _COLLAPSED_


Does it collapse branches when duplicates are represented by 1 "valid" species and 1 species in ignore list?

In [22]:
multi_species_ignore = Tree("(Streptomyces malaysiensis _1_,(Streptomyces autolyticus _2_ , \
             (Streptomyces cameroonensis _1_,(Streptomyces samsuensis _2_,Streptomyces samsuensis _6_ Streptomyces sp. _1_))));")
print(f'Tree before collapse: {multi_species_ignore}')

multi_species_ignore_collapsed = Tree(multi_species_ignore.write(is_leaf_fn=collapse_branches_with_one_species))
print(f'Tree after collapse: {multi_species_ignore_collapsed}')

Tree before collapse: 
   /-Streptomyces malaysiensis _1_
--|
  |   /-Streptomyces autolyticus _2_
   \-|
     |   /-Streptomyces cameroonensis _1_
      \-|
        |   /-Streptomyces samsuensis _2_
         \-|
            \-Streptomyces samsuensis _6_ Streptomyces sp. _1_
Tree after collapse: 
   /-Streptomyces malaysiensis
--|
  |   /-Streptomyces autolyticus
   \-|
     |   /-Streptomyces cameroonensis
      \-|
         \-Streptomyces samsuensis _COLLAPSED_


Does the function collapse branches when duplicates are present? *Version 1*

In [23]:
duplicates_v1 = Tree("(Streptomyces malaysiensis _1_,(Streptomyces autolyticus _2_ , \
             (Streptomyces cameroonensis _1_,(Streptomyces samsuensis _2_,Streptomyces samsuensis _6_))));")
print(f'Tree before collapse: {duplicates_v1}')

duplicates_v1_collapsed = Tree(duplicates_v1.write(is_leaf_fn=collapse_branches_with_one_species))
print(f'Tree after collapse: {duplicates_v1_collapsed}')

Tree before collapse: 
   /-Streptomyces malaysiensis _1_
--|
  |   /-Streptomyces autolyticus _2_
   \-|
     |   /-Streptomyces cameroonensis _1_
      \-|
        |   /-Streptomyces samsuensis _2_
         \-|
            \-Streptomyces samsuensis _6_
Tree after collapse: 
   /-Streptomyces malaysiensis
--|
  |   /-Streptomyces autolyticus
   \-|
     |   /-Streptomyces cameroonensis
      \-|
         \-Streptomyces samsuensis _COLLAPSED_


Does it collapse branches when majority of one species members are present in a subtree? *Version 2*

In [24]:
duplicates_v2 = Tree("(Streptomyces autolyticus _1_,(Streptomyces autolyticus _2_ , \
             (Streptomyces autolyticus _1_,(Streptomyces samsuensis _2_,Streptomyces solisilvae _6_))));")
print(f'Tree before collapse: {duplicates_v2}')

duplicates_v2_collapsed = Tree(duplicates_v2.write(is_leaf_fn=collapse_branches_with_one_species))
print(f'Tree after collapse: {duplicates_v2_collapsed}')

Tree before collapse: 
   /-Streptomyces autolyticus _1_
--|
  |   /-Streptomyces autolyticus _2_
   \-|
     |   /-Streptomyces autolyticus _1_
      \-|
        |   /-Streptomyces samsuensis _2_
         \-|
            \-Streptomyces solisilvae _6_
Tree after collapse: 
   /-Streptomyces autolyticus
--|
  |   /-Streptomyces autolyticus
   \-|
     |   /-Streptomyces autolyticus
      \-|
        |   /-Streptomyces samsuensis
         \-|
            \-Streptomyces solisilvae


We can check this function on an examplar subtree for *Streptomyces lilacinus*

In [25]:
def get_subtree(tree, species_name):
    """Return subtree corresponding to species name
    
    Parameters:
    tree (TreeNode): newick format phylogenetic tree
    species_name (str): species for subtree to be extracted from the tree
    
    """
    targets = [_ for _ in tree.get_leaves() if species_name in _.name]
    common_ancestor = targets[0].get_common_ancestor(targets)
    
    return common_ancestor

Getting subtree for *Streptomyces lilacinus*

In [26]:
subtree = get_subtree(streps, "Streptomyces lilacinus")
print(subtree)


      /-Streptomyces sp. _2_
     |
     |      /-Streptomyces sp. _2_ Streptomyces caeruleus _1_ Actinoalloteichus cyanogriseus _1_
     |     |
     |     |   /-Streptomyces hiroshimensis _3_
     |     |  |
     |     |  |      /-Streptomyces hiroshimensis _9_ Streptomyces sp. _3_
     |     |  |     |
     |     |  |     |                  /-Streptomyces lacticiproducens _3_ Streptomyces sp. _1_
     |     |  |     |               /-|
     |     |  |     |              |  |   /-Streptomyces caatingaensis _1_
     |     |  |     |            /-|   \-|
     |     |  |     |           |  |      \-Streptomyces caatingaensis _6_
     |     |  |     |         /-|  |
     |     |  |     |        |  |   \-Streptomyces sp. _2_
     |     |  |     |        |  |
     |     |  |     |      /-|   \-Streptomyces lacticiproducens _2_ Streptomyces sp. _1_
     |     |  |     |     |  |
     |     |  |     |     |  |   /-Streptomyces abikoensis _2_ Streptomyces sp. _1_
     |     |  |     |   /-| 

Collaping branches within *Streptomyces lilacinus* subtree

In [27]:
collapsed = Tree(subtree.write(is_leaf_fn=collapse_branches_with_one_species))
print(collapsed)


      /-no_species_info
     |
     |      /-Streptomyces sp. _2_ Streptomyces caeruleus _1_ Actinoalloteichus cyanogriseus _1_
     |     |
     |     |   /-Streptomyces hiroshimensis
     |     |  |
     |     |  |      /-Streptomyces hiroshimensis
     |     |  |     |
     |     |  |     |                  /-Streptomyces lacticiproducens
     |     |  |     |               /-|
     |     |  |     |            /-|   \-Streptomyces caatingaensis _COLLAPSED_
     |     |  |     |           |  |
     |     |  |     |         /-|   \-no_species_info
     |     |  |     |        |  |
     |     |  |     |      /-|   \-Streptomyces lacticiproducens
     |     |  |     |     |  |
     |     |  |     |   /-|   \-Streptomyces abikoensis _COLLAPSED_
     |     |  |     |  |  |
     |     |  |     |  |   \-Streptomyces abikoensis _COLLAPSED_
     |     |  |     |  |
     |     |  |     |  |      /-Streptomyces gamaensis _COLLAPSED_
     |     |  |     |  |     |
     |     |  |     |  |     |

Now applying the approach to the complete tree

In [28]:
print(f"Before collapse, tree has {len(streps)} children")

collapsed_streps = Tree(streps.write(is_leaf_fn=collapse_branches_with_one_species))

print(f"After collapse, tree has {len(collapsed_streps)} children")


Before collapse, tree has 9049 children
After collapse, tree has 4996 children


Test: Do we get the same subtree for *Streptomyces lilacinus*?

In [29]:
subtree_t1 = get_subtree(collapsed_streps, "Streptomyces lilacinus")
print(subtree_t1)


      /-no_species_info
     |
     |      /-Streptomyces sp. _2_ Streptomyces caeruleus _1_ Actinoalloteichus cyanogriseus _1_
     |     |
     |     |   /-no_species_info
     |     |  |
     |     |  |      /-no_species_info
     |     |  |     |
     |     |  |     |                  /-no_species_info
     |     |  |     |               /-|
     |     |  |     |            /-|   \-Streptomyces caatingaensis _COLLAPSED_
     |     |  |     |           |  |
     |     |  |     |         /-|   \-no_species_info
     |     |  |     |        |  |
     |     |  |     |      /-|   \-no_species_info
     |     |  |     |     |  |
     |     |  |     |   /-|   \-Streptomyces abikoensis _COLLAPSED_
     |     |  |     |  |  |
     |     |  |     |  |   \-Streptomyces abikoensis _COLLAPSED_
     |     |  |     |  |
     |     |  |     |  |      /-Streptomyces gamaensis _COLLAPSED_
     |     |  |     |  |     |
     |     |  |     |  |     |         /-no_species_info
     |     |  |   /-|  

**Something to watch out for**: It collapses nodes with `no_species_info`. 

Test 1: Do we get the same number of children, if we reload the tree and collapse branches if no subtree is retirved beforehand?

In [30]:
streps = Tree("../supplementary_file_10/strep/TBE.raxml.support")

for _ in streps:
    _.name = accession_members[_.name]

In [31]:
print(f"Before collapse, tree has {len(streps)} children")

collapsed_streps = Tree(streps.write(is_leaf_fn=collapse_branches_with_one_species))

print(f"After collapse, tree has {len(collapsed_streps)} children")

Before collapse, tree has 9049 children
After collapse, tree has 5199 children


What the subtree for *Streptomyces lilacinus* looks like now?

In [32]:
subtree_t2 = get_subtree(collapsed_streps, "Streptomyces lilacinus")
print(subtree_t2)


      /-no_species_info
     |
     |      /-Streptomyces sp. _2_ Streptomyces caeruleus _1_ Actinoalloteichus cyanogriseus _1_
     |     |
     |     |   /-Streptomyces hiroshimensis
     |     |  |
     |     |  |      /-Streptomyces hiroshimensis
     |     |  |     |
     |     |  |     |                  /-Streptomyces lacticiproducens
     |     |  |     |               /-|
     |     |  |     |            /-|   \-Streptomyces caatingaensis _COLLAPSED_
     |     |  |     |           |  |
     |     |  |     |         /-|   \-no_species_info
     |     |  |     |        |  |
     |     |  |     |      /-|   \-Streptomyces lacticiproducens
     |     |  |     |     |  |
     |     |  |     |   /-|   \-Streptomyces abikoensis _COLLAPSED_
     |     |  |     |  |  |
     |     |  |     |  |   \-Streptomyces abikoensis _COLLAPSED_
     |     |  |     |  |
     |     |  |     |  |      /-Streptomyces gamaensis _COLLAPSED_
     |     |  |     |  |     |
     |     |  |     |  |     |

When reloading the tree nodes with no_species_info are no longer treated as a species name.


Testing what would happen if we update `ignore_list` with no_species_info

In [33]:
streps = Tree("../supplementary_file_10/strep/TBE.raxml.support") # Loading tree
for _ in streps:
    _.name = accession_members[_.name]

In [34]:
subtree_t3 = get_subtree(streps, "Streptomyces lilacinus") # Getting subtree for S. lilacinus
print(subtree_t3)


      /-Streptomyces sp. _2_
     |
     |      /-Streptomyces sp. _2_ Streptomyces caeruleus _1_ Actinoalloteichus cyanogriseus _1_
     |     |
     |     |   /-Streptomyces hiroshimensis _3_
     |     |  |
     |     |  |      /-Streptomyces hiroshimensis _9_ Streptomyces sp. _3_
     |     |  |     |
     |     |  |     |                  /-Streptomyces lacticiproducens _3_ Streptomyces sp. _1_
     |     |  |     |               /-|
     |     |  |     |              |  |   /-Streptomyces caatingaensis _1_
     |     |  |     |            /-|   \-|
     |     |  |     |           |  |      \-Streptomyces caatingaensis _6_
     |     |  |     |         /-|  |
     |     |  |     |        |  |   \-Streptomyces sp. _2_
     |     |  |     |        |  |
     |     |  |     |      /-|   \-Streptomyces lacticiproducens _2_ Streptomyces sp. _1_
     |     |  |     |     |  |
     |     |  |     |     |  |   /-Streptomyces abikoensis _2_ Streptomyces sp. _1_
     |     |  |     |   /-| 

In [35]:
# make ignore_list with no_species_info
ignore_list = ("uncultured Streptomyces", "uncultured bacterium",
               "uncultured actinobacterium", "bacterium",
               "Streptomyces sp.", "synthetic construct",
               "Actinomycete", "Candidatus Streptomyces",
               "Streptomycetaceae SR", "uncultured Actinomycetales",
               "Bacillus sp.", "Streptomyces cf.",
               "Actinomycetales bacterium", "Streptomycetaceae bacterium", "rock porewater",
               "Actinobacteria bacterium", "Actinomyces sp.", "no_species_info")

In [36]:
print(f"Before collapse, tree has {len(streps)} children")

collapsed_streps = Tree(streps.write(is_leaf_fn=collapse_branches_with_one_species))

print(f"After collapse, tree has {len(collapsed_streps)} children")

Before collapse, tree has 9049 children
After collapse, tree has 5199 children


No the tree also has 5199 children. 

Let's see how *Streptomyces lilacinus* subtree looks like now.

In [37]:
subtree_t4 = get_subtree(collapsed_streps, "Streptomyces lilacinus") # Getting subtree for S. lilacinus
print(subtree_t4)


      /-no_species_info
     |
     |      /-Streptomyces sp. _2_ Streptomyces caeruleus _1_ Actinoalloteichus cyanogriseus _1_
     |     |
     |     |   /-Streptomyces hiroshimensis
     |     |  |
     |     |  |      /-Streptomyces hiroshimensis
     |     |  |     |
     |     |  |     |                  /-Streptomyces lacticiproducens
     |     |  |     |               /-|
     |     |  |     |            /-|   \-Streptomyces caatingaensis _COLLAPSED_
     |     |  |     |           |  |
     |     |  |     |         /-|   \-no_species_info
     |     |  |     |        |  |
     |     |  |     |      /-|   \-Streptomyces lacticiproducens
     |     |  |     |     |  |
     |     |  |     |   /-|   \-Streptomyces abikoensis _COLLAPSED_
     |     |  |     |  |  |
     |     |  |     |  |   \-Streptomyces abikoensis _COLLAPSED_
     |     |  |     |  |
     |     |  |     |  |      /-Streptomyces gamaensis _COLLAPSED_
     |     |  |     |  |     |
     |     |  |     |  |     |

Writing the collapsed tree to a Newick file.

For tree anntation in R, each node needs to be unique, and must not contain space.
Therefore, before saving the tree as a Newick file, we first must change the leaf names.

In [38]:
ids = (["%#06d" % num for num in range(0, len(collapsed_streps))])

In [39]:
counter = 0
for node in collapsed_streps:
    node.name = node.name.replace(" ", "_") + '_id' + str("%#06d" % counter)
    counter += 1
    print(node.name)

Streptomyces_xiamenensis_id000000
no_species_info_id000001
Streptomyces_xiamenensis__COLLAPSED__id000002
Streptomyces_xiamenensis__COLLAPSED__id000003
no_species_info_id000004
no_species_info_id000005
Streptomyces_carpaticus__COLLAPSED__id000006
Streptomyces_carpaticus__COLLAPSED__id000007
bacterium_IM_8A__COLLAPSED__id000008
Streptomyces_carpaticus__COLLAPSED__id000009
no_species_info_id000010
no_species_info_id000011
Streptomyces_carpaticus__COLLAPSED__id000012
no_species_info_id000013
actinobacterium_HMJKDS1__COLLAPSED__id000014
no_species_info_id000015
unidentified__COLLAPSED__id000016
uncultured_soil_id000017
Streptomyces_harbinensis_id000018
no_species_info_id000019
Streptomyces_carpaticus_id000020
no_species_info_id000021
streptomyces_sp.__COLLAPSED__id000022
Streptomyces_carpaticus_id000023
no_species_info_id000024
Streptomyces_carpaticus_id000025
Streptomyces_cheonanensis_id000026
Streptomyces_carpaticus_id000027
no_species_info_id000028
actinobacterium_HMYKDS15_id000029
Strep

Streptomyces_hiroshimensis__COLLAPSED__id001734
Streptomyces_cinnamoneus__COLLAPSED__id001735
Streptomyces_cinnamoneus__8__Streptomyces_sp.__4__Streptomyces_cinnamonensis__1__id001736
Streptomyces_pseudoechinosporeus_id001737
no_species_info_id001738
Actinoalloteichus_cyanogriseus_id001739
Streptomyces_hiroshimensis__2__Streptomyces_roseoverticillatus__1__Streptomyces_sp.__1__id001740
Streptomyces_hiroshimensis_id001741
Streptomyces_thioluteus_id001742
Streptomyces_hiroshimensis_id001743
Streptomyces_morookaense_id001744
Streptomyces_morookaense__COLLAPSED__id001745
Streptoverticillium_reticulum_id001746
Streptomyces_lavenduligriseus_id001747
Streptomyces_thioluteus__COLLAPSED__id001748
Streptomyces_variabilis_id001749
Streptomyces_olivoverticillatus__2__Streptomyces_viridiflavus__1__Streptomyces_sp.__1__id001750
no_species_info_id001751
Streptomyces_hiroshimensis_id001752
Streptomyces_griseocarneus__6__Streptomyces_sp.__2__Streptomyces_septatus__2__id001753
Streptomyces_sp.__2__Strept

Streptomyces_guanduensis__COLLAPSED__id002233
Streptomyces_yeochonensis_id002234
Streptomyces_platensis__COLLAPSED__id002235
Streptomyces_rubidus__COLLAPSED__id002236
Streptomyces_ferralitis__COLLAPSED__id002237
no_species_info_id002238
no_species_info_id002239
Streptomyces_cocklensis_id002240
Streptomyces_bryophytorum__COLLAPSED__id002241
Streptomyces_cocklensis__COLLAPSED__id002242
no_species_info_id002243
Streptomyces_paucisporeus__COLLAPSED__id002244
Streptomyces_cocklensis__COLLAPSED__id002245
no_species_info_id002246
Streptomyces_bryophytorum__COLLAPSED__id002247
Streptomyces_ferralitis__COLLAPSED__id002248
no_species_info_id002249
Streptomyces_griseoplanus_id002250
Streptomyces_griseoplanus__COLLAPSED__id002251
Streptacidiphilus_sp._id002252
Kitasatospora_sp._id002253
Streptomyces_griseoplanus_id002254
Streptomyces_griseoplanus_id002255
no_species_info_id002256
Streptomyces_alkaliphilus__COLLAPSED__id002257
Streptomyces_uncialis__COLLAPSED__id002258
Streptomyces_calidiresistens_

Streptomyces_xanthocidicus_id003733
Streptomyces_chrysomallus__COLLAPSED__id003734
Streptomyces_nigrogriseolus_id003735
Streptomyces_citricolor__COLLAPSED__id003736
Streptomyces_phosalacineus_id003737
Streptomyces_aureofaciens_id003738
Streptomyces_kaniharaensis_id003739
Kitasatospora_kepongensis_id003740
Streptomyces_kaniharaensis__2__Streptomyces_xanthocidicus__1__id003741
Streptomyces_xanthocidicus_id003742
Streptomyces_kaniharaensis_id003743
no_species_info_id003744
no_species_info_id003745
Streptomyces_aburaviensis__COLLAPSED__id003746
Streptomyces_lactacystinicus__COLLAPSED__id003747
Streptomyces_aureofaciens_id003748
Streptomyces_aureofaciens_id003749
unidentified__COLLAPSED__id003750
Streptomyces_avellaneus_id003751
Streptomyces_aureofaciens_id003752
Streptomyces_viridifaciens__7__Kitasatospora_aureofaciens__2__id003753
no_species_info_id003754
Streptomyces_varius__COLLAPSED__id003755
Streptomyces_sayamaensis_id003756
Streptomyces_psammoticus__COLLAPSED__id003757
no_species_inf

Streptomyces_flavovirens__COLLAPSED__id004232
Streptomyces_piomogenus_id004233
Streptomyces_sanglieri_id004234
no_species_info_id004235
no_species_info_id004236
no_species_info_id004237
Streptomyces_sanglieri_id004238
Streptomyces_atratus_id004239
Streptomyces_pulveraceus_id004240
Streptomyces_sanglieri_id004241
Streptomyces_sanglieri__COLLAPSED__id004242
Streptomyces_sanglieri_id004243
no_species_info_id004244
Streptomyces_drozdowiczii__COLLAPSED__id004245
Streptomyces_drozdowiczii__COLLAPSED__id004246
no_species_info_id004247
no_species_info_id004248
Streptomyces_laculatispora__COLLAPSED__id004249
Streptomyces_brevispora_id004250
no_species_info_id004251
no_species_info_id004252
uncultured_actinomycete__COLLAPSED__id004253
Streptomyces_drozdowiczii_id004254
no_species_info_id004255
Streptomyces_beijiangensis__COLLAPSED__id004256
Streptomyces_brevispora__3__Streptomyces_beijiangensis__4__Streptomyces_sp.__2__Streptomyces_avidinii__1__id004257
Streptomyces_beijiangensis_id004258
Strept

In [40]:
collapsed_streps.write(format=2, outfile="collapsed_strep_tbe.new")

# Getting annotation data

In [41]:
df = pd.DataFrame([_.name for _ in collapsed_streps], columns =['Name'])

In [42]:
all_types_dispersions = {_.name:k.replace('_', '') for _ in collapsed_streps for k in ['_clavuligerus_', '_scabiei_', '_griseus_', '_lydicus_'] if k in _.name}
df['representative_disperions'] = df['Name'].map(all_types_dispersions)

In [43]:
disperse = {_.name:k.replace('_', '') for _ in collapsed_streps for k in ['_griseus_', '_albus_'] if k in _.name}
df['disperse'] = df['Name'].map(disperse)

In [44]:
no_dispersion = {_.name:k.replace('_', '') for _ in collapsed_streps for k in ['_albulus_', '_venezuelae_', '_lydicus_'] if k in _.name}
df['no_dispersion'] = df['Name'].map(no_dispersion)

In [45]:
no_data = {_.name:k.replace('_', '') for _ in collapsed_streps for k in ['_clavuligerus_', '_coelicolor_'] if k in _.name}
df['no_data'] = df['Name'].map(no_data)

In [46]:
slight_dispersion = {_.name:k.replace('_', '') for _ in collapsed_streps for k in ['_lavendulae_', '_rimosus_', '_scabiei_'] if k in _.name}
df['slight_dispersion'] = df['Name'].map(slight_dispersion)

In [47]:
Actinacidiphila = {_.name:k.replace('_', '') for _ in collapsed_streps for k in ['_bryophytorum_', '_paucisporeus_', '_alni_', '_rubidus_', '_yeochonensis_', '_guanduensis_', '_yanglinensis_', '_acididurans_', '_soli_', '_oryziradicis_', '_glauciniger_'] if k in _.name}
df['Actinacidiphila'] = df['Name'].map(Actinacidiphila)

In [48]:
Phaeacidiphilus = {_.name:k.replace('_', '') for _ in collapsed_streps for k in ['_oryzae_'] if k in _.name}
df['Phaeacidiphilus'] = df['Name'].map(Phaeacidiphilus)

In [49]:
Mangrovactinospora = {_.name:k.replace('_', '') for _ in collapsed_streps for k in ['_gilvigriseus_'] if k in _.name}
df['Mangrovactinospora'] = df['Name'].map(Mangrovactinospora)

In [50]:
Wenjunlia = {_.name:k.replace('_', '') for _ in collapsed_streps for k in ['_vitaminophilus_', '_tyrosinilyticus_'] if k in _.name}
df['Wenjunlia'] = df['Name'].map(Wenjunlia)

In [51]:
Streptantibioticus = {_.name:k.replace('_', '') for _ in collapsed_streps for k in ['_cattleya_'] if k in _.name}
df['Streptantibioticus'] = df['Name'].map(Streptantibioticus)

In [52]:
df = df.fillna('NA')

In [53]:
df.to_csv("phylogenetic_tree_data_annotation.csv", index=False)