In [1]:
import sys
from rdkit import Chem
from rdkit.Chem import Descriptors, Crippen, Lipinski
from rdkit.Chem import PandasTools
import pandas as pd
from sklearn.linear_model import LinearRegression
from collections import namedtuple

In [2]:
class ESOLCalculator:
    """
    Estimate aqueous solubility with the ESOL linear model (Delaney).

    The model is a linear combination of four descriptors computed with the RDKit:
    molecular weight (mw), Crippen logP (logp), number of rotatable bonds (rotors)
    and aromatic proportion (ap).
    """

    def __init__(self):
        # SMARTS "a" matches any aromatic atom; compiled once per calculator
        self.aromatic_query = Chem.MolFromSmarts("a")
        self.Descriptor = namedtuple("Descriptor", "mw logp rotors ap")

    def calc_ap(self, mol):
        """
        Calculate aromatic proportion #aromatic atoms/#atoms total
        :param mol: input molecule
        :return: aromatic proportion, 0.0 for a molecule without atoms
        """
        num_atoms = mol.GetNumAtoms()
        if num_atoms == 0:
            # a molecule without atoms has no aromatic atoms; avoid ZeroDivisionError
            return 0.0
        # the query matches single atoms, so there can never be more matches than atoms;
        # raising maxMatches to the atom count keeps the default cap from truncating the count
        matches = mol.GetSubstructMatches(self.aromatic_query, maxMatches=num_atoms)
        return len(matches) / num_atoms

    def calc_esol_descriptors(self, mol):
        """
        Calculate mw,logp,rotors and aromatic proportion (ap)
        :param mol: input molecule
        :return: named tuple with descriptor values
        """
        mw = Descriptors.MolWt(mol)
        logp = Crippen.MolLogP(mol)
        rotors = Lipinski.NumRotatableBonds(mol)
        ap = self.calc_ap(mol)
        return self.Descriptor(mw=mw, logp=logp, rotors=rotors, ap=ap)

    def _apply_linear_model(self, mol, intercept, coef):
        """
        Evaluate intercept + sum(coef[name] * descriptor[name]) for one molecule
        :param mol: input molecule
        :param intercept: model intercept
        :param coef: dictionary with the keys "logp", "mw", "rotors" and "ap"
        :return: predicted solubility
        """
        desc = self.calc_esol_descriptors(mol)
        # term order is kept fixed so the floating point result does not change
        esol = intercept + coef["logp"] * desc.logp + coef["mw"] * desc.mw + coef["rotors"] * desc.rotors \
               + coef["ap"] * desc.ap
        return esol

    def calc_esol_orig(self, mol):
        """
        Original parameters from the Delaney paper, just here for comparison
        :param mol: input molecule
        :return: predicted solubility
        """
        # just here as a reference don't use this!
        intercept = 0.16
        coef = {"logp": -0.63, "mw": -0.0062, "rotors": 0.066, "ap": -0.74}
        return self._apply_linear_model(mol, intercept, coef)

    def calc_esol(self, mol):
        """
        Calculate ESOL based on descriptors in the Delaney paper, coefficients refit for the RDKit using the
        routine refit_esol below
        :param mol: input molecule
        :return: predicted solubility
        """
        intercept = 0.26121066137801696
        coef = {'mw': -0.0066138847738667125, 'logp': -0.7416739523408995, 'rotors': 0.003451545565957996, 'ap': -0.42624840441316975}
        return self._apply_linear_model(mol, intercept, coef)

In [24]:
# Relative paths to descriptor .tsv files; these are candidate values for the
# `path` argument of add_esol. They resolve against the current working directory.
path1 = './MatchedDataNoStereo/FormoseAmm/FormoseAmmDescriptors.tsv'
path2 = './MatchedDataNoStereo/FormoseFinal/FormoseFinalDescriptors.tsv'
path3 = './MatchedDataNoStereo/GlucoseAmm/GlucoseAmmDescriptors.tsv'
path4 = './MatchedDataNoStereo/GlucoseFinal/GlucoseFinalDescriptors.tsv'
path5 = './MatchedDataNoStereo/PyruvicAcid/PyruvicAcidDescriptors.tsv'

In [25]:
def calculate_esol(array, smiles_position):
    """
    Predict ESOL solubility for one table row.

    :param array: indexable row (e.g. the ndarray passed by DataFrame.apply with raw=True)
    :param smiles_position: index into ``array`` of the SMILES string
    :return: predicted solubility from ESOLCalculator.calc_esol
    :raises ValueError: if the entry is not a string or the RDKit cannot parse it
    """
    smiles = array[smiles_position]
    # Chem.MolFromSmiles returns None for an unparsable SMILES, and a missing cell
    # (e.g. NaN) is not a string at all; report either case with the offending value
    # instead of failing later inside the descriptor code.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        raise ValueError(f"Could not parse SMILES {smiles!r} at position {smiles_position}")
    esol_calculator = ESOLCalculator()
    return esol_calculator.calc_esol(mol)

In [26]:
def add_esol(path, smiles_position, name):
    """
    Add an 'ESOL' column to a tab-separated descriptor table and write it out.

    Note: the output file is opened in append mode and written without header
    or index, so calling this again with the same ``name`` appends the rows again.

    :param path: path of the tab-separated input file
    :param smiles_position: positional index of the SMILES entry within each row
    :param name: prefix of the output file, written as '<name>Descriptors.tsv'
    :return: the data frame including the new 'ESOL' column
    """
    table = pd.read_csv(path, sep='\t')
    # raw=True hands each row to calculate_esol as a plain ndarray
    esol_values = table.apply(
        calculate_esol,
        axis=1,
        raw=True,
        result_type='expand',
        args=[smiles_position],
    )
    table['ESOL'] = esol_values
    output_file = f'{name}Descriptors.tsv'
    table.to_csv(output_file, header=None, index=None, sep='\t', mode='a')
    return table

In [33]:
# The SMILES entry is at position 1 of each row in this table.
df = add_esol(path=path5, smiles_position=1, name='PyruvicAcid')

In [34]:
# Show the table returned by add_esol, including the new ESOL column
# (the rendered output below is truncated in the middle).
df

Unnamed: 0,Generation,Smiles,Inchi,Molecular Weight,Molecular Formula,No. Carbons,No. Hydrogens,No. Oxygens,cLogP,cLogS,H-Acceptors,H-Donors,Polar Surface Area,Druglikeness,No. Heteroatoms,ESOL
0,G1,C(C(CO)O)(O)=O,RBNPOMFGQQGHHO,106.026609,C3H6O4,3,6,4,-1.7932,0.177,4,3,77.76,-0.24550,4,0.735263
1,G2,C(C(CO)(C(C)O)O)(O)=O,WKRKESUWTIYINN,150.052823,C5H10O5,5,10,5,-1.9800,0.197,5,4,97.99,0.77366,5,0.632029
2,G3,C(C(C(CC(O)=O)O)C(C)O)=O,YTCZJXDURLPWNQ,176.068473,C7H12O5,7,12,5,-1.4458,-0.609,5,3,94.83,-2.13140,5,-0.158288
3,G3,C(C(C=O)C(CC(C)O)O)(O)=O,IHSFWOVWMRACIR,176.068473,C7H12O5,7,12,5,-1.4458,-0.609,5,3,94.83,-5.09350,5,-0.158288
4,G3,C(C(C(C)O)C(C(CO)=O)O)=O,CPUSMWJJBISCJI,176.068473,C7H12O5,7,12,5,-2.1492,-0.279,5,3,94.83,-2.08150,5,0.519082
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4966,G6,C(C(CCO)O)=CC(O)=O,VXPJFXNRJWTXQI,146.057909,C6H10O4,6,10,4,-0.6822,-0.405,4,3,77.76,-0.77902,4,-0.224666
4967,G6,C(C(CCO)O)=C(C(O)=O)C=O,NWSYMZZMMOJWHB,174.052823,C7H10O5,7,10,5,-1.2148,-0.237,5,3,94.83,-3.13860,5,-0.086882
4968,G6,C=CC(C(C=CC(O)=O)O)O,URGIIXSTFYWKTB,158.057909,C7H10O4,7,10,4,-0.5084,-0.722,4,3,77.76,-3.58150,4,-0.426110
4969,G6,C(C(C(CO)O)O)=CC(O)=O,JCOCWZYTGFRYRY,162.052823,C6H10O5,6,10,5,-1.7040,-0.006,5,4,97.99,0.62537,5,0.432850
