# Tree generator for plantMASST 

The following code is used to generate the final JSON file used for tree visualisation in plantMASST.

The tree is generated using the ETE3 Toolkit (http://etetoolkit.org/download/)

In [None]:
# Import required packages
from ete3 import PhyloTree, NCBITaxa, Tree
import pandas as pd
import json, random, os

In [None]:
# Set the working directory; every relative path below resolves against it
os.chdir(
    '/Users/simonezuffa/PycharmProjects/microbe_masst/trees/plant_masst_tree'
)

Import list of NCBI IDs extracted from lineages obtained from plantmasst_tree_generator.R

In [None]:
# Table of NCBI entries; its 'id', 'name' and 'rank' columns are used below
df = pd.read_csv(filepath_or_buffer='list_ncbi_plantmasst.csv')

In [None]:
# IDs that are handed to ncbi.get_topology() to build the tree
id_list = df.loc[:, 'id']

In [None]:
# Handle used below to build the tree topology from the list of NCBI IDs
ncbi = NCBITaxa()
# ncbi.update_taxonomy_database()  # run this line if NCBI IDs are missing downstream

Generate a raw tree and save it

In [None]:
# Build the topology for the requested IDs and store it on disk (format 1)
tree = ncbi.get_topology(id_list)
tree.write(outfile="plantmasst_tree_raw.nw", format=1)

Read the raw tree and extract extra nodes

In [None]:
# Reload the raw tree written above as a plain ete3 Tree
plant_tree = Tree("plantmasst_tree_raw.nw", format=1)

Convert tree into JSON 

In [None]:
# Function to convert ete3 tree to json.
# Code adapted from https://gist.github.com/jhcepas/9205262
def get_json(node):
    """Recursively convert a tree node into a nested, JSON-serialisable dict.

    Args:
        node: Object exposing ``name`` (str), ``children`` (iterable of
            nodes of the same kind) and, optionally, ``evoltype``.

    Returns:
        dict with the keys ``"NCBI"`` (node name without single quotes),
        ``"duplication"`` (``"Y"`` or ``"N"``) and ``"type"`` (``"node"``
        when the node has children, otherwise ``"leaf"``). Nodes with
        children additionally carry a ``"children"`` list of converted
        child dicts.

    Side effects:
        Single quotes are stripped from ``node.name`` in place.
    """
    # Read ETE tag for duplication or speciation events
    evoltype = getattr(node, 'evoltype', None)
    if evoltype == "S":
        dup = "N"
    elif evoltype == "D":
        dup = "Y"
    else:
        # No tag, or a tag other than "S"/"D": fall back to a random flag.
        # Previously an unrecognised tag left `dup` unassigned and raised
        # UnboundLocalError.
        dup = random.choice(['N', 'Y'])

    node.name = node.name.replace("'", '')

    # Named `node_json` so the imported `json` module is not shadowed
    node_json = {
        "NCBI": node.name,
        "duplication": dup,
        "type": "node" if node.children else "leaf",
    }
    if node.children:
        node_json["children"] = [get_json(ch) for ch in node.children]
    return node_json

In [None]:
# Convert the raw tree (root node downwards) into nested dicts
plant_tree_json = get_json(node=plant_tree)

In [None]:
# Write the raw tree to disk as indented JSON
with open('plantmasst_tree_raw.json', 'w') as outfile:
    raw_tree_text = json.dumps(plant_tree_json, indent=5)
    outfile.write(raw_tree_text)

Check extra nodes present in the tree generated with ete3. These will have to be removed

In [None]:
# Collect the value stored under `key` for every node of a nested-dict tree
def traverse_tree(value_list, node, key):
    """Append each node's ``key`` value to ``value_list`` in pre-order.

    Args:
        value_list: List that receives the values; it is mutated in place.
        node: Dict for the current node; child dicts are read from its
            ``"children"`` entry when that entry is present.
        key: Dictionary key to look up on every node.

    Returns:
        The same ``value_list`` object. Nodes whose ``key`` value is missing
        or ``None`` contribute nothing.
    """
    found = node.get(key)
    if found is not None:
        value_list.append(found)

    descendants = node.get("children")
    if descendants is None:
        return value_list

    for descendant in descendants:
        traverse_tree(value_list, descendant, key)
    return value_list

In [None]:
# Gather every node's "NCBI" value, sort them and export them as a CSV
names = traverse_tree([], plant_tree_json, "NCBI")
names.sort()
dataframe_name = pd.DataFrame(names).rename(columns={0: 'NCBI'})
dataframe_name.to_csv('Extracted_nodes_plant.csv')

Remove extra nodes that have been generated by ETE3 (make sure csv file is fixed - one column with no NAs)
Check plantmasst_tree_generator.R to generate nodes_to_remove.csv

In [None]:
# Node names to drop from the tree; every column is read as a string
nodes_to_remove_table = pd.read_csv('nodes_to_remove.csv', dtype=str)
nodes_to_remove = nodes_to_remove_table['NCBI']

In [None]:
# Delete the first node matching each listed name from the tree.
# Indexing [0] raises IndexError when a listed name is not found in the tree.
for node_name in nodes_to_remove:
    matching_nodes = tree.search_nodes(name=node_name)
    matching_nodes[0].delete()

Save trimmed tree

In [None]:
# Store the trimmed tree on disk (format 1)
tree.write(outfile="plantmasst_tree_trimmed.nw", format=1)

Read in trimmed tree and convert it to a JSON

In [None]:
# Reload the trimmed tree and rebuild the nested-dict representation from it
tree_trimmed = Tree("plantmasst_tree_trimmed.nw", format=1)
plant_tree_json = get_json(node=tree_trimmed)

In [None]:
# Create a dictionary from the initial df: str(id) -> str(name)
lineage_dict = {
    str(taxon_id): str(taxon_name)
    for taxon_id, taxon_name in zip(df['id'], df['name'])
}

In [None]:
# Function to add a "name" entry to each node whose ID is in the lookup
def append_tree(node, lineage_dict, key = "NCBI"):
    """Set ``node['name']`` on every node that has an entry in the lookup.

    Args:
        node: Dict for the current node; child dicts are read from its
            ``"children"`` entry when that entry is present.
        lineage_dict: Mapping from a node's ``key`` value to the name to
            store on that node.
        key: Node entry used for the lookup (defaults to ``"NCBI"``).

    Returns:
        None. The tree is modified in place; nodes without a match are left
        untouched.
    """
    label = lineage_dict.get(node.get(key))
    if label is not None:
        node['name'] = label

    descendants = node.get("children")
    if descendants is None:
        return
    for descendant in descendants:
        append_tree(descendant, lineage_dict, key)

In [None]:
# Attach the names from lineage_dict to the matching nodes of the tree
append_tree(node=plant_tree_json, lineage_dict=lineage_dict)

In [None]:
# Add nodes for Blanks and QCs as extra children of the root node
Blank = dict(name="Blank", NCBI="Blank")
QC = dict(name="QC", NCBI="QC")

plant_tree_json['children'].extend([Blank, QC])

Add number of files available for each node

In [None]:
# Read the plantMASST file table; every column is kept as a string
table_plantMASST = pd.read_csv(
    'plant_masst_table.csv',
    dtype='str',
    encoding='latin-1',
)

In [None]:
# Count how many files are available for each NCBI ID.
# The count column is named explicitly through reset_index(name=...) rather
# than by renaming a "Taxa_NCBI" column afterwards: pandas >= 2.0 names the
# value_counts() result "count", so that rename silently did nothing and the
# later lookup of 'File_available' raised KeyError. The result always has the
# two columns "NCBI_ID" and "File_available".
available_files = (
    table_plantMASST['Taxa_NCBI']
    .value_counts()
    .rename_axis("NCBI_ID")
    .reset_index(name="File_available")
)

In [None]:
# Make a dictionary: NCBI ID -> number of available files
numfiles_dict = available_files.set_index('NCBI_ID')['File_available'].to_dict()

In [None]:
# Function to add number of available files
def filenum_tree(node, numfile_dict, key = "NCBI"):
    """Set ``node['group_size']`` on every node that has a file count.

    Args:
        node: Dict for the current node; child dicts are read from its
            ``"children"`` entry when that entry is present.
        numfile_dict: Mapping from a node's ``key`` value to the number of
            files available for it.
        key: Node entry used for the lookup (defaults to ``"NCBI"``).

    Returns:
        None. The tree is modified in place; nodes without a match are left
        untouched.
    """
    value = node.get(key)
    # Look up in the mapping that was passed in. The body previously read the
    # module-level `numfiles_dict`, so the argument was ignored and the
    # function raised NameError when that global did not exist.
    number_files = numfile_dict.get(value)
    if number_files is not None:
        node['group_size'] = number_files
    children = node.get("children")
    if children is not None:
        for child in children:
            filenum_tree(child, numfile_dict, key)

In [None]:
# Attach the per-ID file counts to the matching nodes of the tree
filenum_tree(node=plant_tree_json, numfile_dict=numfiles_dict)

In [None]:
# Make a dictionary for ranks: str(id) -> str(rank)
rank_dict = {
    str(taxon_id): str(taxon_rank)
    for taxon_id, taxon_rank in zip(df['id'], df['rank'])
}

In [None]:
# Function to add the taxonomic level ("Rank") to nodes found in the lookup
def append_rank(node, rank_dict, key = "NCBI"):
    """Set ``node['Rank']`` on every node that has an entry in the lookup.

    Args:
        node: Dict for the current node; child dicts are read from its
            ``"children"`` entry when that entry is present.
        rank_dict: Mapping from a node's ``key`` value to the rank to store
            on that node.
        key: Node entry used for the lookup (defaults to ``"NCBI"``).

    Returns:
        None. The tree is modified in place; nodes without a match are left
        untouched.
    """
    rank = rank_dict.get(node.get(key))
    if rank is not None:
        node['Rank'] = rank

    descendants = node.get("children")
    if descendants is None:
        return
    for descendant in descendants:
        append_rank(descendant, rank_dict, key)

In [None]:
# Attach the taxonomic ranks from rank_dict to the matching nodes of the tree
append_rank(node=plant_tree_json, rank_dict=rank_dict)

In [None]:
# Write the final annotated tree to disk as indented JSON
with open('plant_masst_tree.json', 'w') as outfile:
    final_tree_text = json.dumps(plant_tree_json, indent=5)
    outfile.write(final_tree_text)

Manually add Root to the final JSON tree - "NCBI": "131567" and "name": "Root"