In [1]:
import sys
from rdkit import Chem
from rdkit.Chem import Descriptors, Crippen, Lipinski
from rdkit.Chem import PandasTools
import pandas as pd
from sklearn.linear_model import LinearRegression
from collections import namedtuple

In [2]:
class ESOLCalculator:
    """
    ESOL-style solubility estimator built on four RDKit descriptors.

    The prediction is a linear model over molecular weight (mw), Crippen
    logP (logp), rotatable-bond count (rotors) and aromatic proportion (ap).
    Two coefficient sets are provided: the values quoted from the Delaney
    paper (`calc_esol_orig`) and a set refit for RDKit descriptors
    (`calc_esol`).
    """

    def __init__(self):
        # SMARTS "a" matches any aromatic atom; compiled once per instance.
        self.aromatic_query = Chem.MolFromSmarts("a")
        self.Descriptor = namedtuple("Descriptor", "mw logp rotors ap")

    def calc_ap(self, mol):
        """
        Calculate aromatic proportion #aromatic atoms/#atoms total
        :param mol: input molecule
        :return: aromatic proportion; 0.0 for a molecule without atoms
        """
        num_atoms = mol.GetNumAtoms()
        if num_atoms == 0:
            # An atom-less molecule (e.g. parsed from an empty SMILES) has no
            # aromatic atoms; avoid dividing by zero.
            return 0.0
        # A single-atom query can match at most once per atom, so raising the
        # match cap to the atom count guarantees the count is never truncated
        # by GetSubstructMatches' default limit.
        matches = mol.GetSubstructMatches(self.aromatic_query, maxMatches=num_atoms)
        return len(matches) / num_atoms

    def calc_esol_descriptors(self, mol):
        """
        Calculate mw, logp, rotors and aromatic proportion (ap)
        :param mol: input molecule
        :return: named tuple with descriptor values
        """
        mw = Descriptors.MolWt(mol)
        logp = Crippen.MolLogP(mol)
        rotors = Lipinski.NumRotatableBonds(mol)
        ap = self.calc_ap(mol)
        return self.Descriptor(mw=mw, logp=logp, rotors=rotors, ap=ap)

    def _predict(self, mol, intercept, coef):
        """
        Evaluate the linear ESOL model for one molecule
        :param mol: input molecule
        :param intercept: constant term of the model
        :param coef: dict with the keys "logp", "mw", "rotors" and "ap"
        :return: predicted solubility
        """
        desc = self.calc_esol_descriptors(mol)
        # Terms are summed in a fixed order (logp, mw, rotors, ap) so that the
        # floating-point result matches the hand-written expression exactly.
        return intercept + coef["logp"] * desc.logp + coef["mw"] * desc.mw \
               + coef["rotors"] * desc.rotors + coef["ap"] * desc.ap

    def calc_esol_orig(self, mol):
        """
        Original parameters from the Delaney paper, just here for comparison
        :param mol: input molecule
        :return: predicted solubility
        """
        # just here as a reference don't use this!
        intercept = 0.16
        coef = {"logp": -0.63, "mw": -0.0062, "rotors": 0.066, "ap": -0.74}
        return self._predict(mol, intercept, coef)

    def calc_esol(self, mol):
        """
        Calculate ESOL based on descriptors in the Delaney paper, with
        coefficients refit for the RDKit descriptors. The original docstring
        attributes the refit to a routine named refit_esol, which is not
        defined in this class.
        :param mol: input molecule
        :return: predicted solubility
        """
        intercept = 0.26121066137801696
        coef = {'mw': -0.0066138847738667125, 'logp': -0.7416739523408995, 'rotors': 0.003451545565957996, 'ap': -0.42624840441316975}
        return self._predict(mol, intercept, coef)

In [24]:
# Locations of the per-dataset descriptor tables (.tsv), relative to the
# notebook's working directory. Bound in one statement so the five datasets
# read as a single group.
path1, path2, path3, path4, path5 = (
    './MatchedDataNoStereo/FormoseAmm/FormoseAmmDescriptors.tsv',
    './MatchedDataNoStereo/FormoseFinal/FormoseFinalDescriptors.tsv',
    './MatchedDataNoStereo/GlucoseAmm/GlucoseAmmDescriptors.tsv',
    './MatchedDataNoStereo/GlucoseFinal/GlucoseFinalDescriptors.tsv',
    './MatchedDataNoStereo/PyruvicAcid/PyruvicAcidDescriptors.tsv',
)

In [25]:
def calculate_esol(array, smiles_position, calculator=None):
    """
    Predict ESOL solubility for one row of a descriptor table.

    Intended as the callable for ``DataFrame.apply(..., axis=1, raw=True)``,
    where each row arrives as a positional array.

    :param array: row values, indexable by integer position
    :param smiles_position: index within ``array`` that holds the SMILES string
    :param calculator: optional ESOLCalculator to reuse across calls; a new
        one is created when omitted
    :return: predicted solubility, or NaN when the cell is not a string, the
        SMILES cannot be parsed, or the parsed molecule has no atoms
    """
    smiles = array[smiles_position]
    if not isinstance(smiles, str):
        # Missing cells (NaN / None) cannot be parsed as SMILES.
        return float("nan")

    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles returns None for invalid SMILES instead of raising; an
    # atom-less molecule (empty SMILES) has no meaningful descriptors.
    if mol is None or mol.GetNumAtoms() == 0:
        return float("nan")

    if calculator is None:
        calculator = ESOLCalculator()
    return calculator.calc_esol(mol)

In [26]:
def add_esol(path, smiles_position, name, mode='w'):
    """
    Read a tab-separated descriptor table, add an 'ESOL' column and save it.

    The augmented table is written to ``{name}Descriptors.tsv`` in the current
    working directory, tab-separated, without header row or index column.

    :param path: path of the input .tsv file
    :param smiles_position: positional index of the SMILES column
    :param name: prefix of the output file name
    :param mode: file mode handed to ``DataFrame.to_csv``. Defaults to 'w' so
        that re-running the cell replaces the file instead of appending a
        duplicate copy of every row; pass 'a' to append.
    :return: the DataFrame including the new 'ESOL' column
    """
    df = pd.read_csv(path, sep='\t')
    # raw=True hands each row to calculate_esol as a positional array, which is
    # what lets smiles_position be an integer index. The function returns a
    # scalar per row, so no result_type is needed.
    df['ESOL'] = df.apply(calculate_esol, axis=1, raw=True, args=(smiles_position,))
    df.to_csv(f'{name}Descriptors.tsv', header=False, index=False, sep='\t', mode=mode)
    return df

In [27]:
# Add ESOL predictions to the FormoseFinal table; position 1 is passed as the
# location of the SMILES value within each row.
df = add_esol(path=path2, smiles_position=1, name='FormoseFinal')

In [28]:
# Show the table returned by add_esol, including the new ESOL column.
df

Unnamed: 0,Generation,Smiles,Inchi,Molecular Weight,Molecular Formula,No. Carbons,No. Hydrogens,No. Oxygens,cLogP,cLogS,H-Acceptors,H-Donors,Polar Surface Area,Druglikeness,No. Heteroatoms,ESOL
0,G1,C(C(C(CO)O)O)=O,YTBSYETUWUMLBZ,120.042259,C4H8O4,4,8,4,-2.2233,0.073,4,3,77.76,-3.27480,4,1.035172
1,G2,C(C(C(C(CO)O)O)O)=O,PYMYPHUHKUWMLA,150.052823,C5H10O5,5,10,5,-2.7907,0.202,5,4,97.99,-2.87440,5,1.314038
2,G2,C(CO)(C(CO)O)=O,UQPHVQVXLPRNCX,120.042259,C4H8O4,4,8,4,-2.0422,0.237,4,3,77.76,1.21880,4,1.033985
3,G2,C(C(CO)O)O,PEDCQBHIVMGVHV,92.047344,C3H8O3,3,8,3,-1.4436,0.159,3,3,60.69,-3.07560,3,0.896201
4,G2,C(C(C(CO)O)O)O,UNXHWFMMPAWVPI,122.057909,C4H10O4,4,10,4,-2.0110,0.288,4,4,80.92,-1.13460,4,1.175068
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3399,G5,C(CC(CO)CC(O)=O)(O)=O,IEPDKRIDROAXQP,162.052823,C6H10O5,6,10,5,-0.9206,-0.397,5,3,94.83,-3.38420,5,-0.455933
3400,G5,C(C(CCC=O)(C(C)O)O)(O)=O,OJWYSSAHOCVIKP,176.068473,C7H12O5,7,12,5,-1.2835,-0.558,5,3,94.83,-2.93770,5,-0.265164
3401,G5,C(CCC(C)(C(C(O)=O)O)O)=O,NHUIPZYESYXBMF,176.068473,C7H12O5,7,12,5,-1.2835,-0.558,5,3,94.83,0.97344,5,-0.265164
3402,G5,C(CCCC(C(C(O)=O)O)O)=O,LTJIULFQDNBLDI,176.068473,C7H12O5,7,12,5,-1.2097,-0.719,5,3,94.83,-5.06330,5,-0.261712
