## Tree generator for tissueMASST

As a taxonomic tree cannot be generated, a different approach has been adopted. This tool is designed to help researchers identify whether known or unknown MS/MS spectra have been observed in specific biofluids or tissues, and if so, in relation to which diseases. The first layer represents bio-localization, the second layer disease-associated status, the third the organism in which it was found (data was restricted to Homo sapiens, Mus musculus, and Rattus norvegicus), the fourth layer sex, and the fifth age. The tree relies on the metadata information available via PanReDU.

In [1]:
import pandas as pd
import numpy as np
import json

Read in the table derived from ReDU metadata and curated in R

In [2]:
# Load the sample metadata table (derived from ReDU and curated in R beforehand)
data = pd.read_csv(filepath_or_buffer="tissue_masst_table.csv")

Extract vectors of unique terms

In [3]:
# Distinct terms found in each metadata column; one column per tree level
Biolocalization_array, Disease_array, Taxa_array, Sex_array, Age_array = (
    data[column].unique()
    for column in (
        'UBERONBodyPartName',
        'DOIDCommonName',
        'NCBITaxonomy',
        'BiologicalSex',
        'LifeStage',
    )
)

# Flattened copies of the arrays above, used when building the tree
Biolocalization_vector, Disease_vector, Taxa_vector, Sex_vector, Age_vector = (
    terms.flatten()
    for terms in (
        Biolocalization_array,
        Disease_array,
        Taxa_array,
        Sex_array,
        Age_array,
    )
)

Create dictionary and sequentially append terms

In [4]:
# Build the exhaustive tree: every combination of
# biolocalization > disease > taxon > sex > age gets a node, whether or not
# that combination is backed by any sample.

def _expand_level(levels, lineage=()):
    """Return one node per term of ``levels[0]``, nested over the later levels.

    A node's ID is the underscore-joined path of terms from the first level
    down to the node itself. Nodes on the last level are typed "leaf" and
    carry no "children" key; all others are typed "node".
    """
    terms, deeper_levels = levels[0], levels[1:]
    nodes = []
    for term in terms:
        path = (*lineage, term)
        node = {
            "ID": "_".join(path),
            "duplication": "Y",
            "type": "node" if deeper_levels else "leaf",
            "name": term,
        }
        if deeper_levels:
            node["children"] = _expand_level(deeper_levels, path)
        nodes.append(node)
    return nodes


tree = {
    "ID": "Root",
    "duplication": "Y",
    "type": "node",
    "name": "Root",
    "children": _expand_level(
        (
            Biolocalization_vector,
            Disease_vector,
            Taxa_vector,
            Sex_vector,
            Age_vector,
        )
    ),
}

# Serialise the unfiltered tree to JSON text
json_str = json.dumps(tree, indent=5)

In [6]:
# Write the unfiltered tree to disk
with open('tissue_masst_tree_initial.json', mode='w') as tree_file:
    tree_file.write(json_str)

Add number of available files 

In [7]:
# Load the table of file counts; every column is read as text
table_tissue = pd.read_csv(
    'tissue_masst_id_count.csv',
    dtype='str',
    encoding='latin-1',
)

In [8]:
# Map each value of the 'ID' column to the matching value of the 'count' column
numfiles_dict = dict(zip(table_tissue['ID'], table_tissue['count']))

In [9]:
# Function to add number of available files
def filenum_tree(node, numfile_dict, key="ID"):
    """Annotate a tree in place with the number of available files per node.

    For ``node`` and every descendant reachable through its "children" lists,
    the value stored under ``key`` is looked up in ``numfile_dict``. When an
    entry exists, it is stored on the node as "group_size". Nodes without an
    entry are left untouched.

    Parameters:
        node: dict describing the (sub)tree root; may hold a "children" list
            of dicts with the same layout.
        numfile_dict: mapping from node identifier to file count.
        key: name of the node field used as the lookup identifier.

    Returns:
        None. The tree is modified in place.
    """
    # Look the count up in the mapping that was passed in, not in a global of
    # a similar name, so the function works with any mapping and in any scope.
    number_files = numfile_dict.get(node.get(key))
    if number_files is not None:
        node['group_size'] = number_files
    children = node.get("children")
    if children is not None:
        for child in children:
            filenum_tree(child, numfile_dict, key)

In [10]:
# Attach the file counts to the matching tree nodes (modifies `tree` in place)
filenum_tree(node=tree, numfile_dict=numfiles_dict)

In [11]:
# Write the count-annotated tree to disk
with open('tissue_masst_tree_count.json', mode='w') as tree_file:
    serialized_tree = json.dumps(tree, indent=5)
    tree_file.write(serialized_tree)

The generated JSON is very big because all possible combinations are represented.
Trim it in order to keep only the children that are actually present in the metadata file.

In [12]:
def filter_json(data):
    """Drop leaves that carry no "group_size", working on the tree in place.

    The "children" list of ``data`` is rebuilt from the children that survive
    the same filtering; the key is (re)assigned on every dict visited, so a
    leaf that is kept ends up with an empty "children" list.

    Returns ``data`` itself, or None when ``data`` is not a dict or is a
    "leaf" node without a "group_size" entry.
    """
    if not isinstance(data, dict):
        return None

    survivors = [
        pruned
        for pruned in map(filter_json, data.get("children", []))
        if pruned is not None
    ]
    data["children"] = survivors

    uncounted_leaf = data.get("type") == "leaf" and "group_size" not in data
    return None if uncounted_leaf else data


In [13]:
# Prune leaves without a file count, walking the whole tree from the root
filtered_data = filter_json(data=tree)

In [14]:
# Write the leaf-filtered tree to disk
with open('tissue_masst_tree_final.json', mode='w') as tree_file:
    serialized_tree = json.dumps(filtered_data, indent=5)
    tree_file.write(serialized_tree)

Remove nodes that have neither children nor any samples

In [15]:
def filter_json_children(data):
    """Drop "node" entries that have no "group_size" and no remaining children.

    Children are filtered first, so a branch whose descendants are all removed
    is removed as well. The tree is modified in place: the "children" list of
    every dict visited is replaced by the list of surviving children.

    Returns ``data`` itself, or None when ``data`` is not a dict or is an
    empty, uncounted "node".
    """
    if not isinstance(data, dict):
        return None

    survivors = [
        pruned
        for pruned in map(filter_json_children, data.get("children", []))
        if pruned is not None
    ]
    data["children"] = survivors

    if data.get("type") != "node":
        return data
    if "group_size" in data or data["children"]:
        return data
    return None


In [16]:
# Prune nodes left with neither a file count nor any children
filtered_data1 = filter_json_children(data=filtered_data)

In [17]:
# Write the fully pruned tree to disk
with open('tissue_masst_tree.json', mode='w') as tree_file:
    serialized_tree = json.dumps(filtered_data1, indent=5)
    tree_file.write(serialized_tree)