In [1]:
from openbabel import openbabel

In [2]:
from rdkit import Chem
from rdkit.Chem import rdBase
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Descriptors
from rdkit.Chem import Crippen

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as ss

In [4]:
# Requires the third-party `treelib` package; if it is missing, install it once with: %pip install treelib

In [5]:
from treelib import Node, Tree

In [6]:
tree = Tree()

In [7]:
class Molecule:
    """Payload stored in a tree node's ``data`` attribute.

    Attributes:
        path: the value handed to the constructor, kept unchanged. In this
            notebook it is either a plain string or a list of SMILES strings
            running from the root of the tree down to the node.
    """

    def __init__(self, path):
        self.path = path

In [8]:
tree.create_node('H', 'H', data=Molecule(''))

Node(tag=H, identifier=H, data=<__main__.Molecule object at 0x10374c7f0>)

In [9]:
tree.create_node('F', 'F', parent='H', data=Molecule('H'))

Node(tag=F, identifier=F, data=<__main__.Molecule object at 0x13a683340>)

In [10]:
tree.create_node('A', 'A', parent='H', data=Molecule('H'))

Node(tag=A, identifier=A, data=<__main__.Molecule object at 0x13a6830a0>)

In [11]:
tree.create_node('G', 'G', parent='H', data=Molecule('H'))

Node(tag=G, identifier=G, data=<__main__.Molecule object at 0x13a6831f0>)

In [12]:
tree.show()

H
├── A
├── F
└── G



In [13]:
tree.show(data_property="path")


├── H
├── H
└── H



In [14]:
node = tree.get_node('G')

In [393]:
# Sanity check: the toy tree above stores each node's path as a plain string.
isinstance(node.data.path, str)

True

In [43]:
b = tree.all_nodes()

In [129]:
b[0]

Node(tag=H, identifier=H, data=<__main__.Molecule object at 0x13a7bfb80>)

In [280]:
def split(word):
    """Return the items of ``word`` as a new list (one entry per character for a string)."""
    return list(word)

In [339]:
def index_finder(Product, rels, path):
    """Find the first reaction that yields ``Product`` without reusing ``path``.

    Rows of ``rels`` are scanned in order. A row is a candidate when its
    'Reagent' equals ``Product`` and its 'Formed/Produced' flag is 1. The
    candidate is accepted when none of the precursors that
    ``precursor_finder`` reports for the row's 'Index' occur in ``path``.

    Each candidate's index and precursor list are printed as a trace,
    whether or not the candidate ends up being accepted.

    Args:
        Product: value looked up in the 'Reagent' column.
        rels: table exposing 'Reagent', 'Formed/Produced' and 'Index' columns
            of equal length (a pandas DataFrame or a dict of sequences).
        path: container of molecules that must not show up again as
            precursors; membership is tested with ``in``.

    Returns:
        The 'Index' value of the first accepted row, or None when no row
        qualifies.
    """
    # Walk the columns directly instead of looking rows up with ``column[i]``:
    # on a pandas Series that is a label lookup, which is slow and raises
    # KeyError when the frame does not carry a default 0..n-1 index.
    rows = zip(rels['Reagent'], rels['Formed/Produced'], rels['Index'])
    for reagent, flag, reaction_index in rows:
        if not (reagent == Product and flag == 1):
            continue
        precursors = precursor_finder(reaction_index, rels)
        print(reaction_index)
        print(precursors)
        # Reject reactions that would need a molecule already on the path.
        if not any(precursor in path for precursor in precursors):
            return reaction_index
    return None

In [340]:
def precursor_finder(index, rels):
    """Return the reagents listed with flag -1 for reaction ``index``.

    Args:
        index: reaction identifier compared against the 'Index' column.
        rels: table exposing 'Index', 'Formed/Produced' and 'Reagent' columns
            of equal length (a pandas DataFrame or a dict of sequences).

    Returns:
        A list of 'Reagent' values, in row order, taken from the rows whose
        'Index' equals ``index`` and whose 'Formed/Produced' flag is -1.
        The list is empty when no row matches (for example when ``index``
        is None).
    """
    # Iterate the columns side by side rather than indexing with ``column[i]``;
    # on a pandas Series that is a label lookup, which is slow and fails with
    # KeyError on frames that do not have a default 0..n-1 index.
    rows = zip(rels['Index'], rels['Formed/Produced'], rels['Reagent'])
    return [
        reagent
        for reaction_index, flag, reagent in rows
        if reaction_index == index and flag == -1
    ]

In [347]:
def find_component(node, rels):
    """Return the precursors of the reaction selected for ``node``.

    ``node.tag`` is the molecule to produce and ``node.data.path`` holds the
    molecules that ``index_finder`` refuses to accept as precursors. The
    reaction index it returns is expanded with ``precursor_finder``.
    """
    reaction_index = index_finder(node.tag, rels, node.data.path)
    return precursor_finder(reaction_index, rels)

In [433]:
def map_tree(Smiles, rels, base_molecules=None):
    """Build a tree that expands ``Smiles`` into precursors, level by level.

    The root node is tagged ``Smiles``. Every node whose tag is not one of
    ``base_molecules`` receives one child per precursor returned by
    ``find_component``. Expansion proceeds one depth level at a time and
    stops at the first level that contains no expandable node. The current
    level number is printed at the start of every pass.

    Node identifiers are integers: 0 for the root, then 1, 2, ... in creation
    order. Each node's ``data`` is a ``Molecule`` whose ``path`` is the list
    of tags from the root down to that node.

    Args:
        Smiles: tag of the root node.
        rels: path to a tab-separated relations file, or an already loaded
            pandas DataFrame with the same content (lets callers read the
            file once and reuse it).
        base_molecules: tags that are never expanded. Defaults to the
            pyruvic-acid set ``['C(C(C)=O)(O)=O', 'O']``; pass e.g.
            ``['N', 'C=O', 'C(CO)=O', 'O']`` for the FormoseAmm set.

    Returns:
        The populated ``Tree``.
    """
    if base_molecules is None:
        base_molecules = ['C(C(C)=O)(O)=O', 'O'] #PyruvicAcid
    if not isinstance(rels, pd.DataFrame):
        rels = pd.read_csv(rels, sep='\t')

    tree = Tree()
    root = tree.create_node(Smiles, 0, data=Molecule([Smiles]))

    # Nodes created during the previous pass, in creation order. These are
    # exactly the nodes at the current depth, so the whole tree does not have
    # to be rescanned (and every node's depth recomputed) on each pass.
    frontier = [root]
    level_counter = 0
    node_counter = 0
    while True:
        print(level_counter)
        active_nodes = [node for node in frontier if node.tag not in base_molecules]
        if not active_nodes:
            break
        frontier = []
        for node in active_nodes:
            parent_path = list(node.data.path)
            for precursor in find_component(node, rels):
                node_counter += 1
                child = tree.create_node(
                    precursor,
                    node_counter,
                    parent=node.identifier,
                    data=Molecule(parent_path + [precursor]),
                )
                frontier.append(child)
        level_counter += 1
    return tree

In [437]:
%%time
a = map_tree('C=CC(C(C=O)=CC(O)=O)(C)O', './PyruvicAcidData/PyruvicAcid_rels.tsv')

0
178666_0
['C(C(C)C(C=O)C(C=C)(C)O)(O)=O']
1
23199_0
['C(C=C(C(O)=O)C)=O', 'C=CC(C)O']
2
259_0
['C(C(C)(CC=O)O)(O)=O']
1935_0
['CC(CCO)O']
3
16_0
['C(C(C)=O)(O)=O', 'C(C)=O']
180_0
['O', 'C(CC(C)O)=O', 'C=O']
4
8_0
['C(C(C)=O)(O)=O']
10_0
['C(C)=O', 'C(C)=O']
32_0
['C(C(CO)O)(O)=O']
5
8_0
['C(C(C)=O)(O)=O']
8_0
['C(C(C)=O)(O)=O']
5_0
['C(C(C)=O)(O)=O', 'O']
6
CPU times: user 48.9 s, sys: 141 ms, total: 49 s
Wall time: 49.4 s


In [438]:
a.show()

C=CC(C(C=O)=CC(O)=O)(C)O
└── C(C(C)C(C=O)C(C=C)(C)O)(O)=O
    ├── C(C=C(C(O)=O)C)=O
    │   └── C(C(C)(CC=O)O)(O)=O
    │       ├── C(C(C)=O)(O)=O
    │       └── C(C)=O
    │           └── C(C(C)=O)(O)=O
    └── C=CC(C)O
        └── CC(CCO)O
            ├── C(CC(C)O)=O
            │   ├── C(C)=O
            │   │   └── C(C(C)=O)(O)=O
            │   └── C(C)=O
            │       └── C(C(C)=O)(O)=O
            ├── C=O
            │   └── C(C(CO)O)(O)=O
            │       ├── C(C(C)=O)(O)=O
            │       └── O
            └── O



In [439]:
a.show(data_property='path')

['C=CC(C(C=O)=CC(O)=O)(C)O']
└── ['C=CC(C(C=O)=CC(O)=O)(C)O', 'C(C(C)C(C=O)C(C=C)(C)O)(O)=O']
    ├── ['C=CC(C(C=O)=CC(O)=O)(C)O', 'C(C(C)C(C=O)C(C=C)(C)O)(O)=O', 'C(C=C(C(O)=O)C)=O']
    │   └── ['C=CC(C(C=O)=CC(O)=O)(C)O', 'C(C(C)C(C=O)C(C=C)(C)O)(O)=O', 'C(C=C(C(O)=O)C)=O', 'C(C(C)(CC=O)O)(O)=O']
    │       ├── ['C=CC(C(C=O)=CC(O)=O)(C)O', 'C(C(C)C(C=O)C(C=C)(C)O)(O)=O', 'C(C=C(C(O)=O)C)=O', 'C(C(C)(CC=O)O)(O)=O', 'C(C(C)=O)(O)=O']
    │       └── ['C=CC(C(C=O)=CC(O)=O)(C)O', 'C(C(C)C(C=O)C(C=C)(C)O)(O)=O', 'C(C=C(C(O)=O)C)=O', 'C(C(C)(CC=O)O)(O)=O', 'C(C)=O']
    │           └── ['C=CC(C(C=O)=CC(O)=O)(C)O', 'C(C(C)C(C=O)C(C=C)(C)O)(O)=O', 'C(C=C(C(O)=O)C)=O', 'C(C(C)(CC=O)O)(O)=O', 'C(C)=O', 'C(C(C)=O)(O)=O']
    └── ['C=CC(C(C=O)=CC(O)=O)(C)O', 'C(C(C)C(C=O)C(C=C)(C)O)(O)=O', 'C=CC(C)O']
        └── ['C=CC(C(C=O)=CC(O)=O)(C)O', 'C(C(C)C(C=O)C(C=C)(C)O)(O)=O', 'C=CC(C)O', 'CC(CCO)O']
            ├── ['C=CC(C(C=O)=CC(O)=O)(C)O', 'C(C(C)C(C=O)C(C=C)(C)O)(O)=O', 'C=CC(C)O', 'CC(CCO)O