In [13]:
import dendropy
import matplotlib.pyplot as plt
import numpy as np
import os, sys
import pandas as pd
import seaborn as sns

# Make the author's local helper modules (config_readwrite, plot_params) importable.
# NOTE(review): absolute, user-specific paths below — this cell only runs on the
# original machine; consider moving them into configuration.
sys.path.append("/Users/sarahfong/tools/py_")
import config_readwrite as crw
import plot_params as pp

pp.fonts()  # presumably applies the shared plotting font settings — confirm in plot_params

# Switch to the project directory before reading the config by its bare filename.
os.chdir("/Users/sarahfong/wynton_git/mpra_nullomer_crispr/")
name = "local_config.ini"
config, cfn = crw.read_config(name)

section = "PHYLO"
NEWICK = config[section]["newick"]  # path of the newick tree file loaded below
RE = config[section]["results"]  # directory created below

# Create the directory (and any missing parents); a no-op when it already exists.
# Replaces the exists()/mkdir() pair, which failed when a parent was missing.
os.makedirs(RE, exist_ok=True)

In [11]:
# Parse the newick file at NEWICK into a DendroPy tree; `dtree` is reused by later cells.
dtree=dendropy.Tree.get(path=NEWICK, schema="newick")

# Text rendering of the tree, as a quick sanity check.
print(dtree.as_ascii_plot())

                                                         /----- single-null-344
                                   /---------------------+                     
                                   |                     |  /-- single-null-123
                                   |                     \--+                  
                                   |                        \-- single-null-920
                                   |                                           
                                   |                /---------- single-null-940
                         /---------+ /--------------+                          
                         |         | |              |  /------- single-null-32 
                         |         | |              \--+                       
                         |         | |                 | /----- single-null-394
                         |         | |                 \-+                     
                         |         | |  

# tree pruning

In [33]:
def getNodesPruneTree(tree, min_):
    """
    Select the leaf taxa to keep when thinning a tree by a minimum edge length.

    Leaves are visited in the iteration order of `tree`. A leaf whose edge is
    at least `min_` long is always kept. Consecutive leaves whose edges are
    shorter than `min_` are pooled as one run of close relatives, and a single
    taxon is drawn at random (np.random.choice) from every run that holds two
    or more leaves. A run holding only one short leaf contributes nothing.

    input
        tree (dendropy tree obj) - tree to parse through. Only iteration over
            its nodes and each node's `.taxon` / `.edge_length` are used.
        min_ (float) - minimum edge length; leaves with shorter edges are
            candidates for pruning.

    method
        1. make empty lists
            (1) ids - taxa that will be kept,
            (2) too_short - the current run of short-edged leaf taxa.
        2. walk the nodes of `tree` (the argument, not a notebook global)
        3. skip internal nodes (their taxon is None)
        4. keep the taxon if the leaf edge is >= min_
        5. a long edge closes the current run: if the run holds more than one
           taxon, keep one of them at random
        6. reset the run
        7. otherwise the edge is short: add the taxon to the current run
        8. after the walk, close a trailing run the same way as in step 5

    return
        len(ids) (int) - count of taxa kept.
        ids (list) - the kept taxa, in the order they were selected.

    """

    # convert the threshold once instead of on every comparison
    min_len = float(min_)

    #1
    ids, too_short = [], []
    print("pruning branches less than", min_)

    #2
    for nd in tree:

        #3
        if nd.taxon is None:
            continue

        edge_len = float(nd.edge_length)

        #4
        if edge_len >= min_len:
            ids.append(nd.taxon)

            #5 only runs of two or more short leaves yield a representative
            if len(too_short) > 1:
                ids.append(list(np.random.choice(too_short, size=1))[0])

            #6
            too_short = []

        #7
        elif edge_len < min_len:
            too_short.append(nd.taxon)

    #8 the tree may end inside a run of short leaves; closing it after the loop
    #  does not depend on counting leaves or on the traversal order
    if len(too_short) > 1:
        ids.append(list(np.random.choice(too_short, size=1))[0])

    return len(ids), ids

In [34]:
# Raise the minimum edge length in 0.01 steps until fewer than MAX_LEAVES taxa
# are selected; `ids` then holds the selection for that threshold.
MAX_LEAVES = 260  # stop once the selection drops below this many taxa
SEED = 0  # fixes which taxon is drawn from each run of short-edged leaves

# getNodesPruneTree draws with np.random.choice; seed so a rerun selects the same taxa.
np.random.seed(SEED)

for min_ in np.arange(0.01, 1, 0.01):

    count, ids = getNodesPruneTree(dtree, min_)
    print(min_, len(ids))
    if count < MAX_LEAVES:
        print("done!", min_, len(ids))
        break
else:
    # no threshold got below the target; `ids` is just the last selection tried
    print("warning: no threshold below 1 selected fewer than", MAX_LEAVES, "taxa")

pruning branches less than 0.01
0.01 1000
pruning branches less than 0.02
0.02 1000
pruning branches less than 0.03
0.03 1000
pruning branches less than 0.04
0.04 999
pruning branches less than 0.05
0.05 999
pruning branches less than 0.060000000000000005
0.060000000000000005 999
pruning branches less than 0.06999999999999999
0.06999999999999999 959
pruning branches less than 0.08
0.08 959
pruning branches less than 0.09
0.09 956
pruning branches less than 0.09999999999999999
0.09999999999999999 956
pruning branches less than 0.11
0.11 864
pruning branches less than 0.12
0.12 857
pruning branches less than 0.13
0.13 857
pruning branches less than 0.14
0.14 710
pruning branches less than 0.15000000000000002
0.15000000000000002 680
pruning branches less than 0.16
0.16 674
pruning branches less than 0.17
0.17 561
pruning branches less than 0.18000000000000002
0.18000000000000002 549
pruning branches less than 0.19
0.19 511
pruning branches less than 0.2
0.2 494
pruning branches less than 

In [35]:
# Number of taxa returned by the last getNodesPruneTree call of the threshold scan.
len(ids)

259

In [37]:
# Keep only the selected taxa on dtree (the tree object itself is modified),
# then report its size and redraw it as text.
dtree.retain_taxa(ids)
print(len(dtree))
print(dtree.as_ascii_plot())

259
                                     /------------------------- single-null-344
                             /-------+                                         
                             |       |  /---------------------- single-null-943
                             |       \--+                                      
                             |          |   /------------------ single-null-148
                             |          \---+                                  
                             |              |   /-------------- single-null-114
                             |              \---+                              
                             |                  |  /----------- single-null-936
                             |                  \--+                           
                             |                     |   /------- single-null-275
                             |                     \---+                       
                             |      

In [None]:
# Display the taxa that were passed to dtree.retain_taxa.
# NOTE(review): this echoes the whole list; a slice such as ids[:10] would keep the output short.
ids

# phylotrees

In [None]:
 # Read the same newick file a second time, via Phylo.read.
 # NOTE(review): `Phylo` is not imported anywhere in the visible notebook, so this
 # cell raises NameError on a fresh kernel — presumably Biopython's
 # `from Bio import Phylo`; confirm and add it to the import cell.
 trees = Phylo.read(NEWICK,"newick")

In [None]:
# Walk the terminal nodes and record a (branch_length, name) pair each time the
# branch length differs from the most recently recorded one; terminals that
# repeat the previous recorded length are skipped.
last_length = 0
keep = []
for i in trees.get_terminals():
    if i.branch_length != last_length:
        keep.append((i.branch_length, i.name))
        print(i.branch_length, i.name)
        last_length = i.branch_length

# view tree as text

In [None]:
# Assemble the ete3 command line for a text view of the tree and show it;
# the command is only printed here, not executed.
cmd = ["ete3", "view --text -t", NEWICK]
print(" ".join(cmd))

# view tree as svg

In [None]:
from IPython.display import SVG

In [None]:
# Location of an SVG file, presumably a rendering of the tree (the display call is commented out).
# NOTE(review): absolute user-specific path; `tree` is also rebound to a PhyloTree
# in a later cell, so the name means two different things in this notebook.
tree = "/Users/sarahfong/tree.svg"
#SVG(filename=tree)

# load tree

In [18]:
# Load the newick file as an ete3 PhyloTree.
# NOTE(review): the recorded run of this cell failed with ModuleNotFoundError
# (ete3 missing from the kernel's environment); install it or drop the cell so
# Restart & Run All completes. The import also belongs in the top import cell.
from ete3 import PhyloTree
tree = PhyloTree(NEWICK)

ModuleNotFoundError: No module named 'ete3'

In [None]:
%%bash
# List the packages conda reports as installed (used here to check for ete3).
conda list