In [2]:
import sys


import string
import numpy as np

import os
import lxml.etree as etree

#from simtk.openmm.app import element as elem
#from simtk.openmm.app import Topology

from IPython.display import Image
import matplotlib.pyplot as plt
import os
import math
import copy
import re
import numpy
import random

import openeye.oechem
import openeye.oeomega
import openeye.oequacpac

from openeye import oechem, oequacpac
from rdkit import Chem
from rdkit.Chem import AllChem

from rdkit.Chem import rdBase
from rdkit.Chem import Draw
from matplotlib.lines import Line2D
from rdkit import RDConfig
from openeye.oechem import *
from openeye.oedepict import *
from IPython.core.display import Image, display
def RetrieveMol2Block(fileLikeObject, delimiter="@<TRIPOS>MOLECULE"):
    """generator which retrieves one mol2 block at a time
    """
    mol2 = []
    for line in fileLikeObject:
        if line.startswith(delimiter) and mol2:
            yield "".join(mol2)
            mol2 = []
        mol2.append(line)
    if mol2:
        yield "".join(mol2)

def getSMIRKSMatches_OEMol(oemol, smirks, aromaticity_model = None): 
    """Find all sets of atoms in the provided oemol that match the provided SMIRKS strings.

    Parameters
    ----------
    oemol : OpenEye oemol
        oemol to process with the SMIRKS in order to find matches
    smirks : str
        SMIRKS string with tagged atoms.
        If there are N tagged atoms numbered 1..N, the resulting matches will be N-tuples of atoms that match the corresponding tagged atoms.
    aromaticity_model : str (optional)
        OpenEye aromaticity model designation as a string, such as "OEAroModel_MDL". Default: None. If none is provided, molecule is processed exactly as provided; otherwise it is prepared with this aromaticity model prior to querying.

    Returns
    -------
    matches : list of tuples of atoms numbers
        matches[index] is an N-tuple of atom numbers from the oemol
        Matches are returned in no guaranteed order.
    """

    # Make a copy of molecule so we don't influence original (probably safer than deepcopy per C Bayly)
    mol = oechem.OEMol(oemol)
    OEAddExplicitHydrogens(mol)
    # Set up query.
    qmol = oechem.OEQMol()
    if not oechem.OEParseSmarts(qmol, smirks):
        raise Exception("Error parsing SMIRKS '%s'" % smirks)

    # Determine aromaticity model
    if aromaticity_model:
        if type(aromaticity_model) == str:
            # Check if the user has provided a manually-specified aromaticity_model
            if hasattr(oechem, aromaticity_model):
                oearomodel = getattr(oechem, aromaticity_model)
            else:
                raise ValueError("Error: provided aromaticity model not recognized by oechem.")
        else:
            raise ValueError("Error: provided aromaticity model must be a string.")

        # If aromaticity model was provided, prepare molecule
        oechem.OEClearAromaticFlags( mol)
        oechem.OEAssignAromaticFlags( mol, oearomodel)
        # avoid running OEPrepareSearch or we lose desired aromaticity, so instead:
        oechem.OEAssignHybridization( mol)
        oechem.OEAssignFormalCharges( mol)
        oechem.OEAssignImplicitHydrogens( mol)

    # Perform matching on each mol
    matches = list()

    # We require non-unique matches, i.e. all matches
    unique = False
    ss = oechem.OESubSearch(qmol)
    matches = []
    for match in ss.Match( mol, unique):
        # Compile list of atom indices that match the pattern tags
        atom_indices = dict()
        for ma in match.GetAtoms():
            if ma.pattern.GetMapIdx() != 0:
                atom_indices[ma.pattern.GetMapIdx()-1] = ma.target.GetIdx()
        # Compress into list
        atom_indices = [ atom_indices[index] for index in range(len(atom_indices)) ]
        # Store
        matches.append( tuple(atom_indices) )

    return matches

def getSMIRKSMatches_RDKMol(rdkmol, smirks, aromaticity_model = None):
    """Find all sets of atoms in the provided rdkmol that match the provided SMIRKS strings.
    06/04/2017

    Parameters
    ----------
    rdkmol : RDKit rdkmol
        RDKit molecule to process with the SMIRKS in order to find matches
    smirks : str
        SMIRKS string with tagged atoms.
        If there are N tagged atoms numbered 1..N, the resulting matches will be N-tuples of atoms that match the corresponding tagged atoms.
    aromaticity_model : str (optional)
        OpenEye aromaticity model designation as a string, such as "OEAroModel_MDL". Default: None. If none is provided, molecule is processed exactly as provided; otherwise it is prepared with this aromaticity model prior to querying.

    Returns
    -------
    matches : list of tuples of atoms numbers
        matches[index] is an N-tuple of atom numbers from the rdkmol
        Matches are returned in no guaranteed order.
    """

    # Make a copy of molecule so we don't influence original (probably safer than deepcopy per C Bayly)
    mol = Chem.Mol(rdkmol)
    # mol = Chem.AddHs(mol)

    # Set up query.
    qmol = Chem.MolFromSmarts(smirks)   #cannot catch the error
    ind_map = {}
    for atom in qmol.GetAtoms():
        map_num = atom.GetAtomMapNum()
        if map_num:
            ind_map[map_num - 1] = atom.GetIdx()
    map_list = [ind_map[x] for x in sorted(ind_map)]

    matches = list()

    for match in mol.GetSubstructMatches(qmol, uniquify = False) :
        mas = [match[x] for x in map_list]
        matches.append(tuple(mas))

    return matches



def tree_parse(root, forces):
    smirks = []
    for child in root:
        if child.tag in forces:
            for i in child:
                try:
    #                 print i.attrib
                    smirks.append(i.attrib["smirks"])
                except KeyError: 
                    print i
    return smirks

def get_smirks_mapping_for_single_molecule(smirks_list, mol):
    oe, rdk = mol[0], mol[1]
    
    mapping = {}
    for smirks in smirks_list:
        try :   
            x = list(set(getSMIRKSMatches_OEMol(oe, smirks)) )
            y = list(set(getSMIRKSMatches_RDKMol(rdk, smirks)))
            x.sort()
            y.sort()
            to_add = (x, y)
            if to_add != ([] , []) and x != y:
#             if to_add != ([] , []) :
                mapping[smirks] = to_add
            
        except AttributeError: print "does not read"

    return mapping



In [4]:
def merge_images(file1, file2):
    from PIL import Image
    """Merge two images into one, displayed side by side
    :param file1: path to first image file
    :param file2: path to second image file
    :return: the merged Image object
    """
    image1 = Image.open(file1)
    image2 = Image.open(file2)

    (width1, height1) = image1.size
    (width2, height2) = image2.size

    result_width = width1 + width2
    result_height = max(height1, height2)

    result = Image.new('RGB', (result_width, result_height))
    result.paste(im=image1, box=(0, 0))
    result.paste(im=image2, box=(width1, 0))
    return result


# def AtomsByIndex(atom, indices):
#     return atom.GetIdx() in indices

class AtomsByIndex(OEUnaryAtomPred):
    def __init__(self, alist):
        OEUnaryAtomPred.__init__(self)
        self.atomiclist = alist

    def __call__(self, atom):
        return (atom.GetIdx() in self.atomiclist)

    def CreateCopy(self):
        # __disown__ is required to allow C++ to take ownership of this
        # object and its memory
        return AtomsByIndex(self.atomiclist).__disown__()
    
def inspect(mol, inconsistent, only_first = False):  
    counter = 0
    for key, value in inconsistent.iteritems():

        for i in key:
            oe, rdk = i
            print(value)
            print(oe)
            print(rdk)
            print(mol)
            
            #RDK images
            rdk = [i for j in  list(rdk) for i in j]
            highlights = [rdk]
            rdk_mol = Chem.MolFromSmiles(mol)
            rdk_mol = Chem.AddHs(rdk_mol)
            x = Draw.MolsToGridImage( [rdk_mol], molsPerRow = 1, highlightAtomLists=highlights,subImgSize=(500, 500)  )
            x.save("tmp1.png")


            #OE images
            oe = [i for j in  list(oe) for i in j]
            rep = OEGraphMol()
            OESmilesToMol(rep, mol)
            #OEAddExplicitHydrogens(rep)
            opts = OE2DMolDisplayOptions(500.0, 500.0, OEScale_AutoScale)
            
            #OEPrepareDepiction(rep)
            dopt = OEPrepareDepictionOptions()
            dopt.SetDepictOrientation( OEDepictOrientation_Horizontal)
            dopt.SetSuppressHydrogens(False)
            OEPrepareDepiction(rep, dopt)
            OEAddExplicitHydrogens(rep)
            OEGenerate2DCoordinates(rep)
            disp = OE2DMolDisplay(rep,opts)
            OEAddHighlighting(disp, OEColor(OERed), OEHighlightStyle_Stick, AtomsByIndex(oe))
            OERenderMolecule("tmp2.png", disp)
            
            # Combine the two
            
            output = merge_images("tmp2.png", "tmp1.png")
            display(output)
            #output.save(str(counter) + ".png")
            counter += 1
            if only_first:
                break
    os.remove("tmp1.png")        
    os.remove("tmp2.png")

In [5]:
ff = os.path.dirname(os.path.abspath(os.path.join('..'))) + "/openforcefield/data/forcefield/smirnoff99Frosst.ffxml"
file = open(ff, "r")
parser = etree.XMLParser(remove_blank_text = True)
tree= etree.parse(file, parser)
root = tree.getroot()


mol_filename = "/home/shuzhe/Documents/DrugBank_singlemol.mol2"

mol = oechem.OEGraphMol()
ifs = oechem.oemolistream(mol_filename)
flavor = oechem.OEIFlavor_Generic_Default | oechem.OEIFlavor_MOL2_Default | oechem.OEIFlavor_MOL2_Forcefield
ifs.SetFlavor( oechem.OEFormat_MOL2, flavor)
oechem.OEReadMolecule(ifs, mol )
oechem.OETriposAtomNames(mol)

rdkmol = Chem.MolFromMol2File(mol_filename, sanitize = True, removeHs = False)
print mol
print rdkmol

IOError: Bad input file /home/shuzhe/Documents/DrugBank_singlemol.mol2

# Torsion

In [8]:
# smirks = tree_parse(root, ["NonbondedForce"])
smirks = tree_parse(root, ["PeriodicTorsionForce"])
smirks


mol_filename = os.path.dirname(os.path.abspath(os.path.join('..'))) + "/openforcefield/data/molecules/DrugBank_tripos.mol2"
    
istream = oechem.oemolistream(mol_filename)
mol = oechem.OEMol()
flavor = oechem.OEIFlavor_Generic_Default | oechem.OEIFlavor_MOL2_Default | oechem.OEIFlavor_MOL2_Forcefield
istream.SetFlavor( oechem.OEFormat_MOL2, flavor)
oemols_tripos = []
while oechem.OEReadMolecule(istream, mol):
    oechem.OETriposAtomNames(mol)
    oemols_tripos.append(oechem.OEMol(mol))
istream.close()

counter = 0
smirks_count = {}
for mol2 in RetrieveMol2Block(open(mol_filename, "r")):
    rdkmol = Chem.MolFromMol2Block(mol2,  removeHs = False)
    if rdkmol == None:
        print "RDK parsing error"
        continue
    out = get_smirks_mapping_for_single_molecule(smirks, [oemols_tripos[counter], rdkmol])
    
    if bool(out):
        print Chem.MolToMolBlock(rdkmol)[0:20]    
        for i in smirks:
            if i in out:
                if i in smirks_count:
                    smirks_count[i] += 1
                else:
                    smirks_count[i] = 1
                        

    counter += 1
    

DrugBank_349
     RD
DrugBank_1420
     R
DrugBank_1543
     R
DrugBank_1671
     R
DrugBank_1800
     R
DrugBank_2029
     R
DrugBank_2344
     R
DrugBank_2450
     R
DrugBank_2567
     R
DrugBank_2798
     R
DrugBank_3051
     R
DrugBank_3090
     R
DrugBank_3267
     R
DrugBank_3448
     R
DrugBank_3505
     R
DrugBank_3632
     R
DrugBank_3869
     R
DrugBank_4346
     R
DrugBank_4523
     R
DrugBank_4552
     R
DrugBank_4676
     R
DrugBank_4684
     R
RDK parsing error
DrugBank_4706
     R
DrugBank_4708
     R
DrugBank_4709
     R
DrugBank_4710
     R
DrugBank_4711
     R
DrugBank_4712
     R
DrugBank_4713
     R
DrugBank_4714
     R
DrugBank_4715
     R
DrugBank_4717
     R
DrugBank_4718
     R
DrugBank_4719
     R
DrugBank_4720
     R
DrugBank_4721
     R
DrugBank_4726
     R
DrugBank_4727
     R
DrugBank_4728
     R
DrugBank_4730
     R
DrugBank_4732
     R
DrugBank_4733
     R
DrugBank_4734
     R
DrugBank_4735
     R
DrugBank_4736
     R
DrugBank_4737
     R
DrugBank_4738
  

In [27]:
# Bar chart count

print len(labels),len(smirks)
# plt.figure(figsize=(200,100))
# bar_width = 0.35
# labels = [i for i in smirks_count]
# counts = [smirks_count[i] for i in smirks_count]
# plt.bar(np.arange(len(labels)),counts, bar_width)
# plt.xticks(np.arange(len(labels)), labels)
# plt.show()

144 163


In [None]:
# smirks = tree_parse(root, ["NonbondedForce"])
smirks = tree_parse(root, ["PeriodicTorsionForce"])
smirks


mol_filename = os.path.dirname(os.path.abspath(os.path.join('..'))) + "/openforcefield/data/molecules/DrugBank_tripos.mol2"
    
istream = oechem.oemolistream(mol_filename)
mol = oechem.OEMol()
flavor = oechem.OEIFlavor_Generic_Default | oechem.OEIFlavor_MOL2_Default | oechem.OEIFlavor_MOL2_Forcefield
istream.SetFlavor( oechem.OEFormat_MOL2, flavor)
oemols_tripos = []
while oechem.OEReadMolecule(istream, mol):
    oechem.OETriposAtomNames(mol)
    oemols_tripos.append(oechem.OEMol(mol))
istream.close()

counter = 0
for mol2 in RetrieveMol2Block(open(mol_filename, "r")):
    rdkmol = Chem.MolFromMol2Block(mol2,  removeHs = False)
    if rdkmol == None:
        print "RDK parsing error"
        continue
    out = get_smirks_mapping_for_single_molecule(smirks, [oemols_tripos[counter], rdkmol])
    
    if bool(out):
        print Chem.MolToMolBlock(rdkmol)[0:20]
        for i in smirks:
            if i in out:
                print i
                print out[i][0]
                print out[i][1]
                print

    counter += 1
    

# Nonbonding

In [None]:
smirks = tree_parse(root, ["NonbondedForce"])
# smirks = tree_parse(root, ["PeriodicTorsionForce"])
smirks


mol_filename = os.path.dirname(os.path.abspath(os.path.join('..'))) + "/openforcefield/data/molecules/DrugBank_tripos.mol2"
    
istream = oechem.oemolistream(mol_filename)
mol = oechem.OEMol()
flavor = oechem.OEIFlavor_Generic_Default | oechem.OEIFlavor_MOL2_Default | oechem.OEIFlavor_MOL2_Forcefield
istream.SetFlavor( oechem.OEFormat_MOL2, flavor)
oemols_tripos = []
while oechem.OEReadMolecule(istream, mol):
    oechem.OETriposAtomNames(mol)
    oemols_tripos.append(oechem.OEMol(mol))
istream.close()

# counter = 0
# for mol2 in RetrieveMol2Block(open(mol_filename, "r")):
#     rdkmol = Chem.MolFromMol2Block(mol2,  removeHs = False)
#     if rdkmol == None:
#         print "RDK parsing error"
#         continue
#     out = get_smirks_mapping_for_single_molecule(smirks, [oemols_tripos[counter], rdkmol])
    
#     if bool(out):
#         print Chem.MolToMolBlock(rdkmol)[0:20]
#         for i in smirks:
#             if i in out:
#                 print i
#                 print out[i][0]
#                 print out[i][1]
#                 print

#     counter += 1
    

# BONDS

In [None]:
smirks = tree_parse(root, ["HarmonicBondForce"])
# smirks = tree_parse(root, ["PeriodicTorsionForce"])
smirks

out = get_smirks_mapping_for_single_molecule(smirks, [mol, rdkmol])
# for i in smirks:
#     if i in out:
#         print i
#         print out[i][0]
#         print out[i][1]
#         print

In [None]:
mol_filename = os.path.dirname(os.path.abspath(os.path.join('..'))) + "/openforcefield/data/molecules/DrugBank_tripos.mol2"
    
istream = oechem.oemolistream(mol_filename)
mol = oechem.OEMol()
flavor = oechem.OEIFlavor_Generic_Default | oechem.OEIFlavor_MOL2_Default | oechem.OEIFlavor_MOL2_Forcefield
istream.SetFlavor( oechem.OEFormat_MOL2, flavor)
oemols_tripos = []
while oechem.OEReadMolecule(istream, mol):
    oechem.OETriposAtomNames(mol)
    oemols_tripos.append(oechem.OEMol(mol))
istream.close()

# counter = 0
# for mol2 in RetrieveMol2Block(open(mol_filename, "r")):
#     rdkmol = Chem.MolFromMol2Block(mol2,  removeHs = False)
#     if rdkmol == None:
#         print "RDK parsing error"
#         continue
#     out = get_smirks_mapping_for_single_molecule(smirks, [oemols_tripos[counter], rdkmol])
    
#     if bool(out):
#         print Chem.MolToMolBlock(rdkmol)[0:20]
#         for i in smirks:
#             if i in out:
#                 print i
#                 print out[i][0]
#                 print out[i][1]
#                 print

#     counter += 1
    