In [1]:
import numpy as np
import pandas as pd
from equilibrator_api import ComponentContribution, Q_
# Shared ComponentContribution instance used by the functions below:
# CacheGen passes CC.ccache to get_or_create_compound, and ThermoGen queries
# CC.predictor.preprocess for per-compound predictions.
CC = ComponentContribution()
from equilibrator_assets.generate_compound import create_compound, get_or_create_compound

In [2]:
from ast import literal_eval

In [3]:
def CacheGen(filepath):
    """Collect the distinct compounds of a reactions TSV and build their cache.

    Parameters
    ----------
    filepath : str or path-like
        Tab-separated file with ``Reagents`` and ``Products`` columns whose
        cells are Python list literals of structure strings (they are parsed
        with ``ast.literal_eval`` and handed on with ``mol_format="smiles"``).

    Returns
    -------
    tuple
        ``(compound_cache, compounds)`` where ``compound_cache`` is whatever
        ``get_or_create_compound`` returns for the distinct compounds, and
        ``compounds`` is the list of distinct structure strings in first-seen
        order (row by row, reagents before products).

    Notes
    -----
    ``./ErrorLog.tsv`` is passed as the ``error_log`` argument; ``ThermoGen``
    reads that same path afterwards.
    """
    df = pd.read_csv(filepath, sep='\t')

    compounds = []
    # Set membership keeps de-duplication linear; `compounds` alone carries
    # the first-seen ordering that the returned list exposes.
    seen = set()

    for reagents_cell, products_cell in zip(df['Reagents'], df['Products']):
        for cell in (reagents_cell, products_cell):
            for structure in literal_eval(cell):
                if structure not in seen:
                    seen.add(structure)
                    compounds.append(structure)

    compound_cache = get_or_create_compound(CC.ccache, compounds, mol_format="smiles", error_log='./ErrorLog.tsv')

    return (compound_cache, compounds)

In [4]:
def ThermoGen(filepath, compound_cache, compounds, name):
    """Attach an estimated energy change to every reaction in a TSV file.

    Parameters
    ----------
    filepath : str or path-like
        Tab-separated reactions file with ``Index``, ``Rule``, ``Reagents``
        and ``Products`` columns; the last two hold Python list literals.
    compound_cache : iterable
        Compound objects to query for predictions (the first value returned
        by ``CacheGen``). They are assumed to be in the same order as the
        entries of ``compounds`` whose status in ``./ErrorLog.tsv`` is
        ``'valid'``.
    compounds : list
        Structure strings in the order they were submitted for creation (the
        second value returned by ``CacheGen``). Row ``i`` of
        ``./ErrorLog.tsv`` is taken to describe ``compounds[i]``.
    name : str
        Prefix of the output file ``{name}RelsWithThermo.tsv``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Index``, ``Reagents``, ``Products``, ``Rule`` and
        ``Energy Change``. The energy change is the sum over products minus
        the sum over reagents of each compound's predicted mean (labelled
        kJ/mol), rounded to 2 decimals, or the string ``'NaN'`` when any
        participant has no valid compound.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of valid
        compounds, since the two are matched purely by position.

    Notes
    -----
    Only the mean prediction is used; prediction uncertainties are not
    propagated. The frame is appended (``mode='a'``) without a header, so
    calling this again with the same ``name`` adds the rows a second time.
    """
    rels = pd.read_csv(filepath, sep='\t')
    indexes = rels['Index'].tolist()
    rules = rels['Rule'].tolist()
    # Parse each list literal once and reuse it for both the output columns
    # and the per-reaction computation.
    Reagents = [literal_eval(cell) for cell in rels['Reagents']]
    Products = [literal_eval(cell) for cell in rels['Products']]

    # One prediction call per compound; element 0 of the result is used as
    # the compound's mean estimate.
    mus = [
        CC.predictor.preprocess.get_compound_prediction(c)[0]
        for c in compound_cache
    ]

    # Keep only the compounds whose creation was logged as 'valid'.
    error_log = pd.read_csv('./ErrorLog.tsv', sep='\t')
    statuses = error_log['status']
    final_compounds = [
        compound
        for i, compound in enumerate(compounds)
        if statuses[i] == 'valid'
    ]

    print(len(mus))
    print(len(final_compounds))

    # mus[k] is used as the prediction for final_compounds[k]; a length
    # mismatch would silently pair compounds with the wrong values.
    if len(mus) != len(final_compounds):
        raise ValueError(
            f"{len(mus)} predictions but {len(final_compounds)} valid "
            "compounds; positions cannot be matched"
        )

    # Constant-time lookup from structure string to its position in `mus`.
    # setdefault keeps the first position, like list.index would.
    mu_position = {}
    for position, compound in enumerate(final_compounds):
        mu_position.setdefault(compound, position)

    EnergyChanges = []
    for reagents, products in zip(Reagents, Products):
        species = list(reagents) + list(products)
        if all(s in mu_position for s in species):
            # Stoichiometry: -1 for each reagent, +1 for each product.
            S = np.array([-1] * len(reagents) + [1] * len(products), dtype=float)
            reaction_mus = Q_([mus[mu_position[s]] for s in species], "kJ/mol")
            standard_dg = S @ reaction_mus
            EnergyChanges.append(standard_dg.magnitude.round(2))
        else:
            # Kept as the string 'NaN' so the returned frame and the written
            # TSV stay as they were.
            EnergyChanges.append('NaN')

    outputdata = {'Index':indexes, 'Reagents':Reagents, 'Products':Products, 'Rule':rules, 'Energy Change':EnergyChanges}
    outputdf = pd.DataFrame(outputdata)
    outputdf.to_csv(f'{name}RelsWithThermo.tsv', header=None, index=None, sep='\t', mode='a')
    return outputdf
        

In [5]:
%%time
# Build the compound cache for the FormoseAmm_4 reactions file.
# `a` receives the compound cache and `b` the list of distinct compound
# strings (the two values CacheGen returns). The recorded run of this cell
# took over 11 h of wall time, so avoid re-running it casually.
a, b = CacheGen('./formoseammprocessedrelsdata/FormoseAmm_4ProcessedRels.tsv')




CPU times: user 5h 20min, sys: 3h 15min 42s, total: 8h 35min 43s
Wall time: 11h 42min 41s


In [6]:
%%time
# Compute energy changes for the same reactions file; ThermoGen appends its
# rows to 'FormoseAmm_4RelsWithThermo.tsv' and returns them as a DataFrame.
# NOTE(review): this rebinds `a` from the compound cache to the returned
# DataFrame, so re-running this cell without first re-running the CacheGen
# cell would pass a DataFrame as `compound_cache`.
a = ThermoGen('./formoseammprocessedrelsdata/FormoseAmm_4ProcessedRels.tsv', a, b, 'FormoseAmm_4')

35291
35291
CPU times: user 15min 25s, sys: 6.53 s, total: 15min 31s
Wall time: -1.01e+13 ns


In [72]:
# Display whatever `a` currently holds (the reactions table with energy changes).
# NOTE(review): this cell's execution count (72) does not follow In [6];
# confirm the output below is still reproduced by Restart & Run All.
a

Unnamed: 0,Index,Reagents,Products,Rule,Energy Change
0,5_0,[C(C(C(C(C(CO)O)O)O)O)=O],"[C(CO)=O, C(C(C(CO)O)O)=O]",Retro Aldol,13.37
1,5_1,[C(C(C(C(C(CO)O)O)O)O)=O],"[C(CO)=O, C(C(C(CO)O)O)=O]",Knoevenagel H (inv),13.37
2,7_0,[C(C(C(C(C(CO)O)O)O)O)=O],[C(CO)(C(C(C(CO)O)O)O)=O],Keto-enol migration twice,-2.0
3,9_0,"[N, C(C(C(C(C(CO)O)O)O)O)=O]","[O, C(CN)(C(C(C(CO)O)O)O)=O]",Amadori/Heyns Rearrangement,-20.34
4,12_0,"[O, C(C(C(C(C(CO)O)O)O)O)=O, C(C(C(C(C(CO)O)O)...","[C(C(C(C(C(CO)O)O)O)O)(O)=O, C(C(C(C(C(CO)O)O)...","Cannizarro 2, glucose (oxidation)",-17.27
...,...,...,...,...,...
108347,120083_0,"[C(C(C(C=O)O)O)(CCN)=O, C(C(CC(C(C=O)O)O)N)(O)=O]","[C(CC(C(C=O)O)O)=O, C(=O)=O, C(C(C(C(CCN)N)O)O...","Strecker Degradation, CC",-52.53
108348,120084_0,"[C(C(C(C=O)O)O)(CCN)=O, C(C(C(CC(CO)O)=O)N)(O)=O]","[C(C(CC(CO)O)=O)=O, C(=O)=O, C(C(C(C(CCN)N)O)O...","Strecker Degradation, CC",-80.1
108349,120085_0,"[C(C(C(C=O)O)O)(CCN)=O, C(CC(C(O)=O)N)(C(CO)O)=O]","[C(CC(C(CO)O)=O)=O, C(=O)=O, C(C(C(C(CCN)N)O)O...","Strecker Degradation, CC",-65.36
108350,120086_0,"[C(C(C(C=O)O)O)(CCN)=O, C(C(C(C(CCO)=O)O)N)(O)=O]","[C(C(C(CCO)=O)O)=O, C(=O)=O, C(C(C(C(CCN)N)O)O...","Strecker Degradation, CC",-109.48
