Calculating all paths, energies (suitable for SMILES strings)

In [1]:
from openbabel import openbabel

In [2]:
from rdkit import Chem
from rdkit.Chem import rdBase
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Descriptors
from rdkit.Chem import Crippen

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as ss

In [4]:
from treelib import Node, Tree

In [5]:
from ast import literal_eval

In [6]:
class Molecule(object):
    def __init__(self, path):
        self.path = path

In [106]:
def split(word):
    return[char for char in word]

In [107]:
def tree_copier(tree, identifier_factor):
    tree_copy = Tree()
    nodes = tree.all_nodes()
    for i in range(len(nodes)):
        dummy_tag = nodes[i].tag
        dummy_identifier = nodes[i].identifier
        dummy_data = nodes[i].data
        try:
            dummy_parent = (tree.parent(dummy_identifier)).identifier 
        except:
            dummy_parent = -1
        if dummy_parent == -1:
            tree_copy.create_node(dummy_tag, (dummy_identifier+identifier_factor), data = dummy_data)
        else:
            tree_copy.create_node(dummy_tag, (dummy_identifier+identifier_factor), parent=(dummy_parent+identifier_factor), data = dummy_data)
    return(tree_copy)

In [108]:
def index_finder(Product, rels, path):
    indexes = []
    for i in range(len(rels['Index'])):
        if rels['Energy Change'][i] != 'NaN':
            if rels['Energy Change'][i] < 0: 
                place = literal_eval(rels['Products'][i])
                for j in range(len(place)):
                    if place[j] == Product:
                        indexes.append(rels['Index'][i])
    valid_indexes = []
    for i in range(len(indexes)):
        valid = True
        precursors = precursor_finder(indexes[i], rels)
        for j in range(len(precursors)):
            if precursors[j] in path:
                valid = False
                break
        if valid == True:
            valid_indexes.append(indexes[i])
    return(valid_indexes)

In [109]:
def precursor_finder(index, rels):
    precursors = []
    for i in range(len(rels['Index'])):
        if rels['Index'][i] == index:
            dummy = literal_eval(rels['Reagents'][i])
            for j in range(len(dummy)):
                precursors.append(dummy[j])
    return(precursors)

In [110]:
def find_component(node, rels):
    Product = node.tag
    path = (node.data).path
    indexes = index_finder(Product, rels, path)
    precursors = []
    for i in range(len(indexes)):
        precursors.append(precursor_finder(indexes[i], rels))
    return(indexes, precursors)

In [187]:
def map_tree(Smiles, rels, max_trees):
    #rels = pd.read_csv(rels, sep='\t')
    #base_molecules = ['F', 'G', 'H', 'I', 'J', 'K']
    #base_molecules = ['N', 'C=O', 'C(CO)=O', 'O'] #FormoseAmm
    base_molecules = ['C=O', 'C(CO)=O', 'O'] #Formose
    all_trees = []
    tree_statuses = []
    tree1 = Tree()
    tree1.create_node(Smiles, 0, data=Molecule([Smiles]))
    all_trees.append(tree1)
    tree_statuses.append(False)
    reactions = [[]]
    
    complete = False
    early_complete = False
    while complete == False and early_complete == False:
        for i in range(len(all_trees)):
            if tree_statuses[i] == False:
                finished = False
                current_nodes = all_trees[i].all_nodes()
                current_depths = []
                for j in range(len(current_nodes)):
                    current_depths.append(all_trees[i].depth(current_nodes[j]))
                level_counter = max(current_depths)
                node_counter = len(current_nodes)-1
                while finished == False:
                    nodes = all_trees[i].all_nodes()
                    active_nodes = []
                    nodes_with_children = []
                    for k in range(len(nodes)):
                        try:     
                            temp = (all_trees[i].parent(nodes[k].identifier)).identifier
                        except:
                            temp = -1
                        if temp != -1:
                            nodes_with_children.append(temp)
                    for k in range(len(nodes)):
                        tag = nodes[k].tag
                        level = all_trees[i].depth(nodes[k])
                        if nodes[k].identifier not in nodes_with_children and tag not in base_molecules:
                            active_nodes.append(nodes[k])
                    if active_nodes == []:
                        finished = True
                        break
                    else:
                        for z in range(len(active_nodes)):
                            indexes, precursors = find_component(active_nodes[z], rels)
                            if len(precursors) == 0:
                                all_trees[i] = 'NaN'
                                finished = True
                                break
                            else:
                                product = active_nodes[z].identifier
                                dummy = (active_nodes[z].data).path
                                place = []
                                num_trees = len(all_trees)
                                for m in range(len(dummy)):
                                    place.append(dummy[m])
                                if len(precursors) > 1:
                                    for p in range(1, len(precursors)):
                                        tree = tree_copier(all_trees[i], (num_trees*1000))
                                        dummy_product = product 
                                        tree_statuses.append(False)
                                        dummy_node_counter = num_trees*1000 + node_counter
                                        for q in range(len(precursors[p])):
                                            dummy_node_counter += 1
                                            tree.create_node(precursors[p][q], dummy_node_counter, parent=(product+num_trees*1000), data=Molecule(place + [precursors[p][q]])) 
                                        all_trees.append(tree)
                                        reactions.append(reactions[i] + [indexes[p]])
                                        num_trees+=1
                                for n in range(len(precursors[0])):
                                    node_counter +=1
                                    all_trees[i].create_node(precursors[0][n], node_counter, parent=product, data=Molecule(place + [precursors[0][n]])) 
                                reactions[i].append(indexes[0])
            if finished == True:
                tree_statuses[i] = True
        #print(f'No. trees = {len(all_trees)}')
        num_complete_trees = 0
        for i in range(len(tree_statuses)):
            if tree_statuses[i] == True and all_trees[i] != 'NaN':
                num_complete_trees += 1
        #print(f'No. complete trees = {num_complete_trees}')    
        final_trees = []
        final_reactions = []
        if num_complete_trees >= max_trees:
            early_complete = True
        else:
            dummy = True
            for i in range(len(tree_statuses)):
                if tree_statuses[i] == False:
                    dummy = False
            if dummy == True:
                complete = True
    if early_complete == True or complete == True:
        final_trees = []
        final_reactions = []
        for i in range(len(all_trees)):
            if tree_statuses[i] == True and all_trees[i] != 'NaN':
                final_trees.append(all_trees[i])
                final_reactions.append(reactions[i])
        final_energies = []
        EnergyChanges = []
        ReactionIDs = []
        for i in range(len(rels['Index'])):
            EnergyChanges.append(rels['Energy Change'][i])
            ReactionIDs.append(rels['Index'][i])
        for i in range(len(final_trees)):
            dummy = 0
            for j in range(len(final_reactions[i])):
                dummy += EnergyChanges[ReactionIDs.index(final_reactions[i][j])].round(2)
            final_energies.append(dummy)
        data = {'Tree':final_trees, 'Reaction IDs':final_reactions, 'Energy Change':final_energies}
    df = pd.DataFrame(data)
    #df.to_csv('Testdf.csv', header=None, index=None, sep='\t', mode='a')
    return(final_trees, final_reactions, final_energies)

In [188]:
def gen_finder(string):
    string = list(string)
    return(int(string[1]))

In [205]:
def pathway_finder(matches_file, network, min_generation, max_generation): 
    matches_data = pd.read_csv(matches_file, sep='\t')
    rels_data = []
    for i in range(min_generation, max_generation+1):
        rels_data.append(pd.read_csv(f'./RelsDataWithThermo/{network}/{network}_{i}RelsWithThermo.tsv', sep='\t'))
    all_smiles = []
    all_inchi = []
    all_gen = []
    all_trees = []
    all_energies = []
    all_reactions = []
    for i in range(100, 101): #len(matches_data['Generation'])
        gen = gen_finder(matches_data['Generation'][i])
        trees, reactions, energies = map_tree(matches_data['Smiles'][i], rels_data[gen-1], 10e10)
        all_trees.append(trees)
        all_energies.append(energies)
        all_reactions.append(reactions)
        all_smiles.append(matches_data['Smiles'][i])
        all_inchi.append(matches_data['Inchi'][i])
        all_gen.append(matches_data['Generation'][i])
            
    data = {'Generation':all_gen, 'Smiles':all_smiles, 'Inchi':all_inchi, 'Pathways':all_trees, 'Path Energies':all_energies, 'Reaction IDs':all_reactions}
    df = pd.DataFrame(data)
    return(df)
    

In [206]:
%%time
a = pathway_finder('./MatchesFiles/FormoseMatches.tsv', 'Formose', 1, 5)

KeyboardInterrupt: 

In [22]:
a['Reaction IDs']

0    [28_0, 4_0]
1    [28_0, 4_1]
Name: Reaction IDs, dtype: object

In [23]:
a['Energy Change']

0   -63.01
1   -63.01
Name: Energy Change, dtype: float64

In [59]:
def parser(string): #read in paths from csv file and display nicely
    chars = list(string)
    newline_indexes = []
    for i in range(len(chars)):
        if chars[i] == '\n':
            newline_indexes.append(i)
    dummy = ''
    for i in range(0, newline_indexes[0]):
        dummy += chars[i]
    print(dummy)
    for i in range(len(newline_indexes)-1):
        dummy = ''
        for j in range(newline_indexes[i]+1, newline_indexes[i+1]):
            dummy += chars[j]
        print(dummy)

In [572]:
df1 = pd.read_csv('./TestDF.csv', sep='\t')

In [575]:
parser(df1['Tree'][0])

A
├── D
│   └── I
└── E
    └── K
