In [20]:
from openbabel import openbabel

In [2]:
from rdkit import Chem
from rdkit.Chem import rdBase
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Descriptors
from rdkit.Chem import Crippen

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as ss

In [4]:
from treelib import Node, Tree

In [5]:
# Notebook-level tree that the following cells populate by hand.
# map_tree creates its own, separate Tree internally.
tree = Tree()

In [6]:
class Molecule:
    """Payload object attached to a tree node via ``data=``.

    Attributes:
        path: stored exactly as given. The cells in this notebook pass either
            a string or a list of node tags.
    """

    def __init__(self, path):
        self.path = path

In [7]:
# Root node: tag and identifier are both 'H'; its Molecule payload has an empty path.
tree.create_node('H', 'H', data=Molecule(''))

Node(tag=H, identifier=H, data=<__main__.Molecule object at 0x1110b0a00>)

In [8]:
# Child 'F' under the root 'H'; the payload's path is the string 'H'.
tree.create_node('F', 'F', parent='H', data=Molecule('H'))

Node(tag=F, identifier=F, data=<__main__.Molecule object at 0x111182d90>)

In [9]:
# Child 'A' under the root 'H'; the payload's path is the string 'H'.
tree.create_node('A', 'A', parent='H', data=Molecule('H'))

Node(tag=A, identifier=A, data=<__main__.Molecule object at 0x14802a3d0>)

In [10]:
# Child 'G' under the root 'H'; the payload's path is the string 'H'.
tree.create_node('G', 'G', parent='H', data=Molecule('H'))

Node(tag=G, identifier=G, data=<__main__.Molecule object at 0x14802a1f0>)

In [11]:
# Print the tree by node tag. In the recorded output the children appear as
# A, F, G rather than in insertion order (F, A, G).
tree.show()

H
├── A
├── F
└── G



In [12]:
# Print each node's data.path instead of its tag. The root's path is '' so its
# line is blank; every child shows 'H'.
tree.show(data_property="path")


├── H
├── H
└── H



In [13]:
# Fetch the node that was created with identifier 'G'.
node = tree.get_node('G')

In [14]:
# Sanity check: the path stored on node 'G' is a plain str (it was created with Molecule('H')).
type(node.data.path) == str

True

In [19]:
# NOTE(review): `b` is not defined in any cell visible in this notebook, so this
# cell relies on leftover kernel state and will fail on Restart & Run All.
# Presumably `b` was a list of nodes (e.g. tree.all_nodes()) -- confirm or remove.
tree.depth(b[0])

0

In [44]:
def split(word):
    """Return the items of ``word`` as a list (one character per entry for a str)."""
    return list(word)

In [45]:
def index_finder(Product, rels, path):
    """Find the first usable reaction that forms ``Product``.

    Rows of ``rels`` are scanned in order. A row is a candidate when its
    'Reagent' equals ``Product`` and its 'Formed/Produced' equals 1. The
    candidate is accepted when none of the precursors that ``precursor_finder``
    reports for its 'Index' value occur in ``path``.

    Args:
        Product: reagent value to look for.
        rels: table indexable as ``rels[column][row]`` with the columns
            'Reagent', 'Formed/Produced' and 'Index'.
        path: container tested with ``in`` against each precursor.

    Returns:
        The 'Index' value of the first accepted candidate, or None when no
        candidate is accepted.

    Side effects:
        Prints the 'Index' value and the precursor list of every candidate
        examined, accepted or not.
    """
    reagents = rels['Reagent']
    for row in range(len(reagents)):
        # Guard clause: skip rows that do not form the requested product.
        if not (reagents[row] == Product and rels['Formed/Produced'][row] == 1):
            continue
        candidate = rels['Index'][row]
        precursors = precursor_finder(candidate, rels)
        print(candidate)
        print(precursors)
        # Reject reactions that would re-use something already on the path.
        if not any(precursor in path for precursor in precursors):
            return candidate
    return None

In [46]:
def precursor_finder(index, rels):
    """List the reagents recorded with -1 for the reaction ``index``.

    Args:
        index: value compared against the 'Index' column.
        rels: table indexable as ``rels[column][row]`` with the columns
            'Index', 'Formed/Produced' and 'Reagent'.

    Returns:
        A list, in row order, of the 'Reagent' entries of every row whose
        'Index' equals ``index`` and whose 'Formed/Produced' equals -1
        (presumably the consumed species -- confirm against the data file).
        Empty when nothing matches.
    """
    reaction_ids = rels['Index']
    signs = rels['Formed/Produced']
    reagents = rels['Reagent']
    return [
        reagents[row]
        for row in range(len(reaction_ids))
        if reaction_ids[row] == index and signs[row] == -1
    ]

In [61]:
def find_component(node, rels):
    """Return the precursors of the reaction chosen for ``node``.

    The reaction is picked by ``index_finder`` using the node's tag as the
    product and the node's ``data.path`` as the path to check against; its
    precursors are then collected by ``precursor_finder``.

    Args:
        node: object exposing ``tag`` and ``data.path``.
        rels: relations table passed through to both helpers.

    Returns:
        The list produced by ``precursor_finder`` for the chosen reaction.
    """
    reaction = index_finder(node.tag, rels, node.data.path)
    return precursor_finder(reaction, rels)

In [62]:
def map_tree(Smiles, rels, base_molecules=('N', 'C=O', 'C(CO)=O', 'O')):
    """Expand ``Smiles`` level by level into a tree of precursors.

    Starting from a root node tagged ``Smiles``, every node at the current
    depth whose tag is not a base molecule is expanded: ``find_component``
    supplies its precursors and each precursor becomes a child node. The loop
    stops at the first depth that has no node left to expand.

    Args:
        Smiles: tag of the root node.
        rels: path of a tab-separated relations file, loaded with
            ``pd.read_csv(rels, sep='\\t')``.
        base_molecules: tags that are never expanded. The default is the
            FormoseAmm set; for the pyruvic-acid network pass
            ``['C(C(C)=O)(O)=O', 'O']``.

    Returns:
        A treelib ``Tree``. Node identifiers are running integers (the root
        is 0), tags are the molecule strings, and each node's ``data`` is a
        ``Molecule`` whose ``path`` is the list of tags from the root down to
        that node.

    Side effects:
        Prints the current depth before each level is scanned.
    """
    rels_table = pd.read_csv(rels, sep='\t')
    tree = Tree()
    tree.create_node(Smiles, 0, data=Molecule([Smiles]))
    level_counter = 0
    node_counter = 0
    while True:
        print(level_counter)
        # Nodes sitting at the current depth that still need expanding.
        active_nodes = [
            node for node in tree.all_nodes()
            if tree.depth(node) == level_counter and node.tag not in base_molecules
        ]
        if not active_nodes:
            break
        for node in active_nodes:
            precursors = find_component(node, rels_table)
            # Copy so every child gets its own path list.
            parent_path = list(node.data.path)
            for precursor in precursors:
                node_counter += 1
                tree.create_node(
                    precursor,
                    node_counter,
                    parent=node.identifier,
                    data=Molecule(parent_path + [precursor]),
                )
        level_counter += 1
    return tree

In [66]:
%%time
# Build the precursor tree for the target SMILES from the FormoseAmm relations
# file (tab-separated). The printed lines are the depth counter plus, for each
# candidate reaction examined, its index and precursor list.
a = map_tree('C(C(C(C)NCCC(C=O)=O)O)=O', './Rels/FormoseAmm_rels.tsv')

0
43902_0
['C=CC(C=O)=O', 'C(C(C(C)N)O)=O']
1
1867_0
['C(C(CCO)=O)=O']
2193_0
['C(CN)(O)=O', 'C(C(C(C)=O)O)=O']
2
90_0
['C(C(C(CO)O)O)=O']
63_0
['C=O', 'O', 'C(CN)=O']
96_0
['C(C(C(CO)O)O)=O']
3
7_0
['C(CO)=O', 'C(CO)=O']
10_0
['N', 'C(CO)=O']
7_0
['C(CO)=O', 'C(CO)=O']
4
CPU times: user 44 s, sys: 125 ms, total: 44.2 s
Wall time: 44.3 s


In [67]:
# Display the generated tree, one molecule tag per node.
a.show()

C(C(C(C)NCCC(C=O)=O)O)=O
├── C(C(C(C)N)O)=O
│   ├── C(C(C(C)=O)O)=O
│   │   └── C(C(C(CO)O)O)=O
│   │       ├── C(CO)=O
│   │       └── C(CO)=O
│   └── C(CN)(O)=O
│       ├── C(CN)=O
│       │   ├── C(CO)=O
│       │   └── N
│       ├── C=O
│       └── O
└── C=CC(C=O)=O
    └── C(C(CCO)=O)=O
        └── C(C(C(CO)O)O)=O
            ├── C(CO)=O
            └── C(CO)=O



In [68]:
# Same tree, but printing each node's data.path (the list of tags from the
# root down to that node) instead of its tag.
a.show(data_property='path')

['C(C(C(C)NCCC(C=O)=O)O)=O']
├── ['C(C(C(C)NCCC(C=O)=O)O)=O', 'C(C(C(C)N)O)=O']
│   ├── ['C(C(C(C)NCCC(C=O)=O)O)=O', 'C(C(C(C)N)O)=O', 'C(C(C(C)=O)O)=O']
│   │   └── ['C(C(C(C)NCCC(C=O)=O)O)=O', 'C(C(C(C)N)O)=O', 'C(C(C(C)=O)O)=O', 'C(C(C(CO)O)O)=O']
│   │       ├── ['C(C(C(C)NCCC(C=O)=O)O)=O', 'C(C(C(C)N)O)=O', 'C(C(C(C)=O)O)=O', 'C(C(C(CO)O)O)=O', 'C(CO)=O']
│   │       └── ['C(C(C(C)NCCC(C=O)=O)O)=O', 'C(C(C(C)N)O)=O', 'C(C(C(C)=O)O)=O', 'C(C(C(CO)O)O)=O', 'C(CO)=O']
│   └── ['C(C(C(C)NCCC(C=O)=O)O)=O', 'C(C(C(C)N)O)=O', 'C(CN)(O)=O']
│       ├── ['C(C(C(C)NCCC(C=O)=O)O)=O', 'C(C(C(C)N)O)=O', 'C(CN)(O)=O', 'C(CN)=O']
│       │   ├── ['C(C(C(C)NCCC(C=O)=O)O)=O', 'C(C(C(C)N)O)=O', 'C(CN)(O)=O', 'C(CN)=O', 'C(CO)=O']
│       │   └── ['C(C(C(C)NCCC(C=O)=O)O)=O', 'C(C(C(C)N)O)=O', 'C(CN)(O)=O', 'C(CN)=O', 'N']
│       ├── ['C(C(C(C)NCCC(C=O)=O)O)=O', 'C(C(C(C)N)O)=O', 'C(CN)(O)=O', 'C=O']
│       └── ['C(C(C(C)NCCC(C=O)=O)O)=O', 'C(C(C(C)N)O)=O', 'C(CN)(O)=O', 'O']
└── ['C(C(C(C)NCCC(C=O