Calculating a path for each input molecule, no energy

In [1]:
from openbabel import openbabel

In [2]:
from rdkit import Chem
from rdkit.Chem import rdBase
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Descriptors
from rdkit.Chem import Crippen

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as ss

In [4]:
from ast import literal_eval

In [5]:
from treelib import Node, Tree

In [6]:
class Molecule:
    """Data payload attached to a tree node.

    Attributes:
        path: the value handed to the constructor, stored unchanged.
            ``map_tree`` passes the list of node tags leading from the
            root of the tree down to the node carrying this payload.
    """

    def __init__(self, path):
        self.path = path

In [7]:
def split(word):
    """Return the items of ``word`` as a list (one entry per character for a string)."""
    return list(word)

In [8]:
def index_finder(Product, rels, path):
    """Find the first usable reaction that yields ``Product``.

    Rows of ``rels`` are scanned in order. A row is usable when ``Product``
    appears in its parsed 'Products' entry and none of the reagents that
    ``precursor_finder`` reports for that row's 'Index' value already occur
    in ``path``.

    Args:
        Product: tag of the molecule to look up among the reaction products.
        rels: table with 'Index', 'Products' and 'Reagents' columns; the
            'Products' entries are Python list literals stored as text.
        path: collection of tags that must not reappear as reagents.

    Returns:
        The 'Index' value of the first usable row, or ``None`` when no row
        qualifies.
    """
    for row in range(len(rels['Index'])):
        products = literal_eval(rels['Products'][row])
        if Product not in products:
            continue
        reagents = precursor_finder(rels['Index'][row], rels)
        # A reagent that is already on the path would send the search in a circle.
        if any(reagent in path for reagent in reagents):
            continue
        return rels['Index'][row]
    return None

In [9]:
def precursor_finder(index, rels):
    """Collect the reagents of every row of ``rels`` whose 'Index' equals ``index``.

    Args:
        index: value compared against each entry of the 'Index' column.
        rels: table with 'Index' and 'Reagents' columns; the 'Reagents'
            entries are Python list literals stored as text.

    Returns:
        A flat list of reagents in row order; empty when no row matches.
    """
    reagents = []
    for row in range(len(rels['Index'])):
        if rels['Index'][row] != index:
            continue
        reagents.extend(literal_eval(rels['Reagents'][row]))
    return reagents

In [10]:
def find_component(node, rels):
    """Return the precursors of the molecule held by a tree node.

    The node's tag is the product to look up and the ``path`` of its data
    payload lists the tags that may not be reused as reagents. The reaction
    is chosen by ``index_finder`` and its reagents are read back with
    ``precursor_finder``.

    Returns:
        The list of precursors; empty when no reaction was found.
    """
    reaction_index = index_finder(node.tag, rels, node.data.path)
    return precursor_finder(reaction_index, rels)

In [15]:
def _build_tree(smiles, rels, base_molecules):
    """Expand one target molecule into a tree of its precursors.

    The root node carries ``smiles``. Level by level, every node whose tag is
    not in ``base_molecules`` receives one child per precursor returned by
    ``find_component``. The ``Molecule`` payload of each node lists the tags
    from the root down to that node.

    Returns:
        The finished ``Tree``, or the string 'NaN' as soon as a non-base
        node turns out to have no precursors.
    """
    tree = Tree()
    tree.create_node(smiles, 0, data=Molecule([smiles]))
    node_counter = 0
    depth = 0
    while True:
        # Snapshot the nodes of the current level before any children are added.
        frontier = [
            node for node in tree.all_nodes()
            if tree.depth(node) == depth and node.tag not in base_molecules
        ]
        if not frontier:
            return tree
        for node in frontier:
            precursors = find_component(node, rels)
            if not precursors:
                return 'NaN'
            ancestry = list(node.data.path)
            for precursor in precursors:
                node_counter += 1
                tree.create_node(
                    precursor,
                    node_counter,
                    parent=node.identifier,
                    data=Molecule(ancestry + [precursor]),
                )
        depth += 1


def map_tree(matches_data, rels, name, max_molecules=5, base_molecules=None):
    """Build a precursor tree for each matched molecule and append the result to a TSV.

    Args:
        matches_data: path of a tab-separated file with 'Smiles',
            'Generation' and 'Inchi' columns.
        rels: path of a tab-separated reactions file with 'Index',
            'Products' and 'Reagents' columns.
        name: prefix of the output file, which is ``f'{name}Trees.tsv'``.
        max_molecules: number of leading rows of ``matches_data`` to process.
            Defaults to 5, the limit that used to be hard-coded; pass
            ``None`` to process every row. Values larger than the file are
            clamped to the number of rows available.
        base_molecules: tags at which the expansion stops. Defaults to the
            PyruvicAcid set.

    Returns:
        A DataFrame with 'Generation', 'Smiles', 'Inchi' and 'Path' columns.
        'Path' holds a ``Tree`` per molecule, or the string 'NaN' when the
        molecule could not be traced back to the base molecules.

    Side effects:
        Prints the position of each molecule as it is processed and the
        number of failures. The frame is appended, without header or index,
        to ``f'{name}Trees.tsv'``, so repeated calls add rows to that file.
    """
    # Starting-material sets used for the other reaction networks:
    #   FormoseAmm: ['N', 'C=O', 'C(CO)=O', 'O']
    #   Formose:    ['C=O', 'C(CO)=O', 'O']
    #   GlucoseAmm: ['N', 'C(C(C(C(C(CO)O)O)O)O)=O', 'O']
    #   Glucose:    ['C(C(C(C(C(CO)O)O)O)O)=O', 'O']
    if base_molecules is None:
        base_molecules = ['C(C(C)=O)(O)=O', 'O']  # PyruvicAcid

    matches = pd.read_csv(matches_data, sep='\t')
    available = len(matches['Smiles'])
    if max_molecules is None:
        count = available
    else:
        count = max(0, min(max_molecules, available))

    molecules = [matches['Smiles'][row] for row in range(count)]
    generations = [matches['Generation'][row] for row in range(count)]
    inchi = [matches['Inchi'][row] for row in range(count)]

    reactions = pd.read_csv(rels, sep='\t')

    trees = []
    errors = 0
    for position, smiles in enumerate(molecules):
        print(position)
        # Exactly one entry per molecule keeps 'Path' aligned with the other columns.
        result = _build_tree(smiles, reactions, base_molecules)
        if isinstance(result, str):
            errors += 1
        trees.append(result)

    df = pd.DataFrame({'Generation': generations, 'Smiles': molecules, 'Inchi': inchi, 'Path': trees})
    df.to_csv(f'{name}Trees.tsv', header=None, index=None, sep='\t', mode='a')
    print(f'No. Errors = {errors}')
    return df

In [16]:
%%time
# Build the precursor trees for the PyruvicAcid matches. The returned DataFrame
# is kept in `a` for the cells below; map_tree also appends it to PyruvicAcidTrees.tsv.
a = map_tree('./MatchesFiles/PyruvicAcidMatches.tsv', './ProcessedRels/PyruvicAcid/PyruvicAcid_3ProcessedRels.tsv', 'PyruvicAcid')

0
1
2
3
4
No. Errors = 0
CPU times: user 70.2 ms, sys: 4.69 ms, total: 74.9 ms
Wall time: 79.2 ms


In [20]:
# Print the tree stored in the first row of the 'Path' column.
a['Path'][0].show()

C(C(CO)O)(O)=O
├── C(C(C)=O)(O)=O
└── O



In [143]:
# Load the Formose matches table (tab-separated).
# NOTE(review): the map_tree call shown in this notebook was run on the PyruvicAcid
# matches, not on this file - confirm which dataset `a` is expected to hold here.
data = pd.read_csv('./MatchesFiles/FormoseMatches.tsv', sep='\t')

In [144]:
# Combine the metadata columns of `data` with the trees stored in `a` and
# append the table to Testdf2.csv (tab-separated, no header, no index).
# `a` must hold one tree per row of `data`; checking that up front gives a
# clear error instead of a KeyError part-way through the lookups.
if len(a['Path']) < len(data['Smiles']):
    raise ValueError(
        f"`a` holds {len(a['Path'])} trees but `data` has {len(data['Smiles'])} rows; "
        "run map_tree on the same matches file, with all rows, before this cell"
    )
row_labels = range(len(data['Smiles']))
molecules = [data['Smiles'][i] for i in row_labels]
generations = [data['Generation'][i] for i in row_labels]
inchi = [data['Inchi'][i] for i in row_labels]
trees = [a['Path'][i] for i in row_labels]
dictionary = {'Generation':generations, 'Smiles':molecules, 'Inchi':inchi, 'Path':trees}
df = pd.DataFrame(dictionary)
# mode='a' appends: running this cell again adds the same rows to the file once more.
df.to_csv('Testdf2.csv', header=None, index=None, sep='\t', mode='a')

In [142]:
# Print the tree stored under row label 2345 of `a`.
# NOTE(review): the map_tree call shown in this notebook processes only 5 molecules,
# so this lookup presumably relied on an earlier full run - confirm before re-running.
a['Path'][2345].show()

C1(C(C(C(C=1)=O)O)O)C(CO)O
└── C1(C(C(C(C(CO)O)(C1)O)O)O)=O
    └── C(C(C(C(C(CO)O)=O)O)O)(C)=O
        ├── C(C(C)=O)=O
        │   └── C(C(CO)O)=O
        │       ├── C(CO)=O
        │       └── C=O
        └── C(CO)(C(CO)O)=O
            └── C(C(C(CO)O)O)=O
                ├── C(CO)=O
                └── C(CO)=O



In [94]:
def parser(string): #read in paths from csv file and display nicely
    """Print a stored tree string line by line.

    The text is split on '\\n' and each piece is printed on its own line.
    A single trailing newline does not produce an extra blank line. Text
    without any newline is printed as one line, and a final line that is not
    newline-terminated is printed as well.

    Args:
        string: the tree text, e.g. one cell of a saved table. An iterable
            of characters is accepted too and is joined first.
    """
    text = string if isinstance(string, str) else ''.join(string)
    lines = text.split('\n')
    # A trailing newline leaves an empty last piece; drop it so that no
    # spurious blank line is printed at the end.
    if lines[-1] == '':
        lines.pop()
    for line in lines:
        print(line)

In [134]:
# Load a previously saved table from ./TestDF.csv (tab-separated; first line used as header).
# NOTE(review): no visible cell writes this file - the export cell above targets
# 'Testdf2.csv' and writes no header row - so confirm where TestDF.csv comes from.
df1 = pd.read_csv('./TestDF.csv', sep='\t')

In [136]:
# Print the tree text stored in the 'Tree' column under row label 99.
parser(df1['Tree'][99])

C(C1(CC(C(C(C1)=O)O)O)O)=O
└── C(C)(C(C(CC(C=O)=O)O)O)=O
    ├── C(C(C(C)=O)O)=O
    │   └── C(C(C(CO)O)O)=O
    │       ├── C(CO)=O
    │       └── C(CO)=O
    └── C(C(C)=O)=O
        └── C(C(CO)O)=O
            ├── C(CO)=O
            └── C=O
