In [2]:
import pandas as pd
import numpy as np
import os, sys
from openeye.oechem import *
from openeye.oequacpac import *


def prepare_inchi(mol):
    """Build the InChI of an OEGraphMol with the fixed hydrogen layer included.

    Parameters
    ----------
    mol : OEGraphMol
        Molecule handed unchanged to ``OECreateInChI``.

    Returns
    -------
    The result of ``OECreateInChI`` for ``mol`` using the options set below.
    """
    inchi_options = OEInChIOptions()
    # Ask for the fixed hydrogen layer in the generated InChI.
    inchi_options.SetFixedHLayer(True)
    # Hydrogens must not be added.
    inchi_options.SetHydrogens(False)
    return OECreateInChI(mol, inchi_options)

# Table of SAMPL7 molecule IDs and their isomeric SMILES, read from the CSV
# that sits next to this notebook; shown as the cell output.
df_sm = pd.read_csv(filepath_or_buffer="SAMPL7_molecule_ID_and_SMILES.csv")
df_sm

Unnamed: 0,SAMPL7 Molecule ID,isomeric SMILES
0,SM25,O=C(NS(C1=CC=CC=C1)(=O)=O)CCC2=CC=CC=C2
1,SM26,O=S(CCC1=CC=CC=C1)(NC(C)=O)=O
2,SM27,O=S(CCC1=CC=CC=C1)(NC2(C)COC2)=O
3,SM28,O=S(CC1(NC(C)=O)CCC2=CC=CC=C2)(C1)=O
4,SM29,CS(NC1(COC1)CCC2=CC=CC=C2)(=O)=O
5,SM30,O=S(NC1(COC1)CCC2=CC=CC=C2)(C3=CC=CC=C3)=O
6,SM31,O=S(NC1(COC1)CCC2=CC=CC=C2)(N(C)C)=O
7,SM32,CS(NC1(CSC1)CCC2=CC=CC=C2)(=O)=O
8,SM33,O=S(NC1(CSC1)CCC2=CC=CC=C2)(C3=CC=CC=C3)=O
9,SM34,O=S(NC1(CSC1)CCC2=CC=CC=C2)(N(C)C)=O


In [4]:
#From https://github.com/openforcefield/openforcefield/blob/864990d06d1153e6b632b94df1df756a9a5f071c/openforcefield/utils/toolkits.py#L2319
def off_enumerate_tautomers(molecule, max_states=20):
    """
    Enumerate the possible tautomers of the current molecule.

    Parameters
    ----------
    molecule: openforcefield.topology.Molecule
        The molecule whose state we should enumerate
    max_states: int optional, default=20
        The maximum amount of molecules that should be returned

    Returns
    -------
    molecules: List[openforcefield.topology.Molecule]
        A list of openforcefield.topology.Molecule instances not including the input molecule.
    """

    from rdkit import Chem
    from rdkit.Chem.MolStandardize import rdMolStandardize

    # Imported here, like the RDKit modules above, so the function does not
    # rely on some other cell having imported Molecule before it is called.
    from openforcefield.topology import Molecule

    enumerator = rdMolStandardize.TautomerEnumerator()
    # Enumeration runs on the molecule with hydrogens removed; they are added
    # back to every tautomer before it is converted below.
    rdmol = Chem.RemoveHs(molecule.to_rdkit())

    # make a list of openforcefield molecules excluding the input molecule
    molecules = []
    for taut in enumerator.Enumerate(rdmol):
        taut_hs = Chem.AddHs(taut)
        mol = Molecule.from_smiles(Chem.MolToSmiles(taut_hs), allow_undefined_stereo=True)
        if mol != molecule:
            molecules.append(mol)

    # Truncate only after filtering, so up to max_states *other* tautomers are returned.
    return molecules[:max_states]

In [6]:
# Adapted from https://github.com/bergazin/sampl6-physicochemical-properties/blob/master/pKa_microstate_enumeration/microstate-enumeration-with-Epik-and-OpenEye/generate_microstates_for_all_molecules.ipynb

from openforcefield.topology import Molecule
  
for i, row in df_sm.iterrows():
    molecule_ID = row["SAMPL7 Molecule ID"]
    smiles = row["isomeric SMILES"]
    print(molecule_ID, ":", smiles)
    
    # Create the SMILES file
    !echo "Creating molecule.smi containing SMILES string..."
    #!echo "c1ccc(cc1)n2c3c(cn2)c(ncn3)N" >> molecule.smi
    with open("molecule.smi", "w") as input_file:
        input_file.write(smiles)
        input_file.write("\n")
    
    
    ####################################################
    #### Enumerate microstates with OpenEye QuacPac ####
    ####################################################
    
    # Enumerate tautomers/protomers with OpenEye QuacPac
    !echo "Generating microstates with OpenEye..."

    ifs = oemolistream()
    if not ifs.open("molecule.smi"):
        OEThrow.Fatal("Unable to open SMI for reading")

    # Tautomer enumeration options
    tautomer_maxCount = 200
    tautomerOptions = OETautomerOptions(tautomer_maxCount)
    #tautomerOptions.SetLevel(5)
    #tautomerOptions.SetMaxSearchTime(240)
    tautomerOptions.SetApplyWarts(False)
    tautomerOptions.SetCarbonHybridization(False)
    tautomerOptions.SetSaveStereo(True)
    

    # Formal charge enumeration options
    charge_maxCount = 200
    chargeOptions = OEFormalChargeOptions(charge_maxCount)

    mol = OEGraphMol()

    # Unique microstate SMILES will be stored in a set
    smiles_set = set()

    while OEReadMolecule(ifs, mol):
        OERemoveFormalCharge(mol)

        ### Enumerate charges first and tautomers for each charged state
        for charged_mol in OEEnumerateFormalCharges(mol, chargeOptions):
            # Enumerate tautomers using OFF code
            # Convert OE molecule to OFF molecule
            off_molecule = Molecule.from_openeye(charged_mol)
            for off_tautomer in off_enumerate_tautomers(off_molecule, max_states=200):
                #convert off_molecule to OEMOL
                oemol = off_tautomer.to_openeye()
                smiles = OEMolToSmiles(oemol)
                smiles_set.add(smiles) # unique SMILES are added to the set
            
            # Enumerate tautomers using OE tools    
            for oe_tautomer in OEEnumerateTautomers(charged_mol, tautomerOptions):
                smiles = OEMolToSmiles(oe_tautomer)
                smiles_set.add(smiles) # unique SMILES are added to the set
                

        ### Enumerate tautomers first and charges after for each tautomer
        
        # Convert OE molecule to OFF molecule
        off_molecule = Molecule.from_openeye(mol)
        
        # Enumerate tautomers using OFF based code, then charges with OE tools
        for off_tautomer in off_enumerate_tautomers(off_molecule, max_states=200):
            # Convert OFF molecule to OE molecule
            oemol = off_tautomer.to_openeye()
            # Enumerate charges with OE tools
            for charged_tautomer in OEEnumerateFormalCharges(oemol, chargeOptions):
                smiles = OEMolToSmiles(oemol)
                smiles_set.add(smiles)# unique SMILES are added to the set      
        
        # Enumerate tautomers with OE tools, then formal charges with OE tools        
        for tautomer in OEEnumerateTautomers(mol, tautomerOptions):
            for charged_tautomer in OEEnumerateFormalCharges(tautomer, chargeOptions):
                smiles = OEMolToSmiles(charged_tautomer)
                smiles_set.add(smiles) # unique SMILES are added to the set  
                

    with open("oe_off_microstates.smi", "w") as output:
        for smiles in smiles_set:
            output.write(smiles)
            output.write("\n")

    print("Done!")

    !wc -l oe_off_microstates.smi
    !cp oe_off_microstates.smi oe_off_microstates.csv

    # Clean up
    #!trash molecule.smi
    os.remove("molecule.smi")
    

    
    #### Create a set to store Canonical Isomeric SMILES of unique microstates
    microstates_set = set()

    # Convert OE output to OpenEye Canonical Isomeric SMILES

    df_oe_off_microstates = pd.read_csv("oe_off_microstates.csv", header=None)
    df_oe_off_microstates.columns = ["OpenEye output"]
    df_oe_off_microstates["Canonical Isomeric SMILES"] = None

    for i, row in enumerate(df_oe_off_microstates.iterrows()):
        smiles = row[1].values[0]

        mol = OEGraphMol()
        OESmilesToMol(mol, smiles)
        canonical_smiles = OEMolToSmiles(mol)

        df_oe_off_microstates.loc[i, "Canonical Isomeric SMILES"] = canonical_smiles
        microstates_set.add(canonical_smiles)

    # Number of microstates with unique canonical isomeric SMILES.
    # It may still include replicates due to resonance structures.
    print("Number of generated canonical isomeric SMILES (microstates + resonance str.): ", len(microstates_set))
    
    
    #### Remove duplicate resonance structures of the same microstate ####
    
    # Canonical isomeric SMILES are different for tautomers, protomers and resonance structures. 
    # We will only consider tautomers and protomers as unique microstates in this study.
    
    inchi_set = set()
    smiles_for_removal_set = set()

    for smiles in microstates_set:
        mol = OEGraphMol()
        OESmilesToMol(mol, smiles)
        inchi = prepare_inchi(mol)

        #if the same InChI is already in InChI set, drop microstate from microstates set.
        if inchi in inchi_set:
            print("Duplicate resonance structure detected. Remove:", smiles)
            smiles_for_removal_set.add(smiles)

        else:
            inchi_set.add(inchi)

    for smiles in smiles_for_removal_set:
        microstates_set.remove(smiles)

    print(len(microstates_set), " microstates were generated for ", molecule_ID, "." )
    
    
    #### Write generated microstates ####
    
    with open("{}_microstate_SMILES.smi".format(molecule_ID), "w") as output:
        for smiles in microstates_set:
            output.write(smiles)
            output.write("\n")
        
    df = pd.read_csv("{}_microstate_SMILES.smi".format(molecule_ID), header =None)
    df.columns = ["Canonical Isomeric SMILES"]
    df["Microstate ID"] = None

    for i, row in enumerate(df.iterrows()):
        id = molecule_ID+"_micro"+str(i+1).zfill(3) 
        df.loc[i, "Microstate ID"]=id

    df_microstate_ID = pd.DataFrame()
    df_microstate_ID["microstate ID"] = df["Microstate ID"] 
    df_microstate_ID["canonical isomeric SMILES"] = df["Canonical Isomeric SMILES"]
    df_microstate_ID.to_csv("{}_microstates.csv".format(molecule_ID), index=False)
    
    print("Finished writing microstate files for {}!".format(molecule_ID))
    print("\n")
    print("\n")

SM25 : O=C(NS(C1=CC=CC=C1)(=O)=O)CCC2=CC=CC=C2
Creating molecule.smi containing SMILES string...
Generating microstates with OpenEye...


Problematic bonds are: [<openeye.oechem.OEBondBase; proxy of <Swig Object of type 'OEChem::OEBondBase *' at 0x7fcb1e6b3ae0> >]

Problematic bonds are: [<openeye.oechem.OEBondBase; proxy of <Swig Object of type 'OEChem::OEBondBase *' at 0x7fcb1e6b5780> >]

Problematic bonds are: [<openeye.oechem.OEBondBase; proxy of <Swig Object of type 'OEChem::OEBondBase *' at 0x7fcb1e6b5de0> >]

Problematic bonds are: [<openeye.oechem.OEBondBase; proxy of <Swig Object of type 'OEChem::OEBondBase *' at 0x7fcb1e6b2900> >]

Problematic bonds are: [<openeye.oechem.OEBondBase; proxy of <Swig Object of type 'OEChem::OEBondBase *' at 0x7fcb1e6b5c30> >]



Done!
       6 oe_off_microstates.smi
Number of generated canonical isomeric SMILES (microstates + resonance str.):  6
Duplicate resonance structure detected. Remove: c1ccc(cc1)CCC(=NS(=O)(=O)c2ccccc2)[O-]
5  microstates were generated for  SM25 .
Finished writing microstate files for SM25!




SM26 : O=S(CCC1=CC=CC=C1)(NC(C)=O)=O
Creating molecule.smi containing SMILES string...
Generating microstates with OpenEye...


Problematic bonds are: [<openeye.oechem.OEBondBase; proxy of <Swig Object of type 'OEChem::OEBondBase *' at 0x7fcb1e6aff00> >]

Problematic bonds are: [<openeye.oechem.OEBondBase; proxy of <Swig Object of type 'OEChem::OEBondBase *' at 0x7fcb1e636b40> >]



Done!
       6 oe_off_microstates.smi
Number of generated canonical isomeric SMILES (microstates + resonance str.):  6
Duplicate resonance structure detected. Remove: CC(=O)[N-]S(=O)(=O)CCc1ccccc1
5  microstates were generated for  SM26 .
Finished writing microstate files for SM26!




SM27 : O=S(CCC1=CC=CC=C1)(NC2(C)COC2)=O
Creating molecule.smi containing SMILES string...
Generating microstates with OpenEye...
Done!
       2 oe_off_microstates.smi
Number of generated canonical isomeric SMILES (microstates + resonance str.):  2
2  microstates were generated for  SM27 .
Finished writing microstate files for SM27!




SM28 : O=S(CC1(NC(C)=O)CCC2=CC=CC=C2)(C1)=O
Creating molecule.smi containing SMILES string...
Generating microstates with OpenEye...


Problematic bonds are: [<openeye.oechem.OEBondBase; proxy of <Swig Object of type 'OEChem::OEBondBase *' at 0x7fcb1ebd6bd0> >]

Problematic bonds are: [<openeye.oechem.OEBondBase; proxy of <Swig Object of type 'OEChem::OEBondBase *' at 0x7fcb1eb25f00> >]



Done!
       3 oe_off_microstates.smi
Number of generated canonical isomeric SMILES (microstates + resonance str.):  3
3  microstates were generated for  SM28 .
Finished writing microstate files for SM28!




SM29 : CS(NC1(COC1)CCC2=CC=CC=C2)(=O)=O
Creating molecule.smi containing SMILES string...
Generating microstates with OpenEye...
Done!
       2 oe_off_microstates.smi
Number of generated canonical isomeric SMILES (microstates + resonance str.):  2
2  microstates were generated for  SM29 .
Finished writing microstate files for SM29!




SM30 : O=S(NC1(COC1)CCC2=CC=CC=C2)(C3=CC=CC=C3)=O
Creating molecule.smi containing SMILES string...
Generating microstates with OpenEye...
Done!
       2 oe_off_microstates.smi
Number of generated canonical isomeric SMILES (microstates + resonance str.):  2
2  microstates were generated for  SM30 .
Finished writing microstate files for SM30!




SM31 : O=S(NC1(COC1)CCC2=CC=CC=C2)(N(C)C)=O
Creating molecule.smi containing SMILES string...
Generating 

Problematic bonds are: [<openeye.oechem.OEBondBase; proxy of <Swig Object of type 'OEChem::OEBondBase *' at 0x7fcb1e72d810> >]

Problematic bonds are: [<openeye.oechem.OEBondBase; proxy of <Swig Object of type 'OEChem::OEBondBase *' at 0x7fcb1e6b3540> >]



Done!
       4 oe_off_microstates.smi
Number of generated canonical isomeric SMILES (microstates + resonance str.):  4
Duplicate resonance structure detected. Remove: CS(=O)(=O)N=c1cc(o[n-]1)c2ccccc2
3  microstates were generated for  SM41 .
Finished writing microstate files for SM41!




SM42 : O=S(NC1=NOC(C2=CC=CC=C2)=C1)(C3=CC=CC=C3)=O
Creating molecule.smi containing SMILES string...
Generating microstates with OpenEye...


Problematic bonds are: [<openeye.oechem.OEBondBase; proxy of <Swig Object of type 'OEChem::OEBondBase *' at 0x7fcb1eb1e810> >]

Problematic bonds are: [<openeye.oechem.OEBondBase; proxy of <Swig Object of type 'OEChem::OEBondBase *' at 0x7fcb1ebd0360> >]



Done!
       4 oe_off_microstates.smi
Number of generated canonical isomeric SMILES (microstates + resonance str.):  4
Duplicate resonance structure detected. Remove: c1ccc(cc1)c2cc(no2)[N-]S(=O)(=O)c3ccccc3
3  microstates were generated for  SM42 .
Finished writing microstate files for SM42!




SM43 : O=S(NC1=NOC(C2=CC=CC=C2)=C1)(N(C)C)=O
Creating molecule.smi containing SMILES string...
Generating microstates with OpenEye...


Problematic bonds are: [<openeye.oechem.OEBondBase; proxy of <Swig Object of type 'OEChem::OEBondBase *' at 0x7fcb1fca5180> >]

Problematic bonds are: [<openeye.oechem.OEBondBase; proxy of <Swig Object of type 'OEChem::OEBondBase *' at 0x7fcb1ebf25a0> >]



Done!
       4 oe_off_microstates.smi
Number of generated canonical isomeric SMILES (microstates + resonance str.):  4
Duplicate resonance structure detected. Remove: CN(C)S(=O)(=O)[N-]c1cc(on1)c2ccccc2
3  microstates were generated for  SM43 .
Finished writing microstate files for SM43!




SM44 : O=S(NC(N=N1)=CN1C2=CC=CC=C2)(C)=O
Creating molecule.smi containing SMILES string...
Generating microstates with OpenEye...
Done!
       5 oe_off_microstates.smi
Number of generated canonical isomeric SMILES (microstates + resonance str.):  5
Duplicate resonance structure detected. Remove: CS(=O)(=O)[N-]c1c[n+](n[n-]1)c2ccccc2
Duplicate resonance structure detected. Remove: CS(=O)(=O)[N-]c1cn(nn1)c2ccccc2
Duplicate resonance structure detected. Remove: CS(=O)(=O)N=c1c[n+]([n-][n-]1)c2ccccc2
2  microstates were generated for  SM44 .
Finished writing microstate files for SM44!




SM45 : O=S(NC(N=N1)=CN1C2=CC=CC=C2)(C3=CC=CC=C3)=O
Creating molecule.smi containing SMILES string...
Generating mic