In [1]:
import ast
import gzip
import sys
from itertools import islice

import numpy as np
import pandas as pd
import rdkit
from tqdm import tqdm


# Ignore rdkit warnings: https://github.com/rdkit/rdkit/issues/2683
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

In [2]:
# Record the RDKit version this notebook was run with.
print(rdkit.__version__)

2020.03.1


In [3]:
from warnings import warn
from rdkit.Chem.Descriptors import NumRadicalElectrons

def iter_mols(sdf_path='20200415_radical_database.sdf.gz', total=289639):
    """ Iterate over the SDF file, yielding a pandas Series for each molecule.
    While this is likely not the fastest way to load the data, the SDF file
    format can be read by many different softwares, and the loop allows additional
    post-processing to occur while reading the molecules.

    The list-valued properties ('AtomCharges', 'AtomSpins', 'VibFreqs',
    'IRIntensity', 'RotConstants') are stored as text and are converted to
    Python objects here. When a value is not a plain Python literal a warning
    is issued and the raw text is kept; this is expected for rotational
    constants of molecules that have infinite values.

    By default, 20200415_radical_database.sdf.gz is expected to have been
    downloaded from figshare to the same directory.

    Parameters
    ----------
    sdf_path : str
        Path of the gzip-compressed SDF file to read.
    total : int or None
        Expected number of records; only used to scale the progress bar.

    Yields
    ------
    pandas.Series
        The SDF properties of one molecule, plus 'mol' (the RDKit molecule),
        'type' ('molecule' when the molecule has no radical electrons,
        otherwise 'fragment') and 'Name' (the record's '_Name' property).
    """
    list_props = ('AtomCharges', 'AtomSpins', 'VibFreqs', 'IRIntensity', 'RotConstants')

    with gzip.open(sdf_path) as sdffile:
        mol_suppl = rdkit.Chem.ForwardSDMolSupplier(sdffile, removeHs=False)
        for index, mol in enumerate(tqdm(mol_suppl, total=total)):
            # The supplier hands back None for records RDKit cannot parse.
            if mol is None:
                warn("Skipping SDF record {}: RDKit could not parse it".format(index))
                continue

            props = mol.GetPropsAsDict()
            props['mol'] = mol
            for item in list_props:
                value = props.get(item)
                # Only text needs parsing; anything else is left as-is.
                if not isinstance(value, str):
                    continue
                try:
                    # NOTE(security): this used to be eval(), which would run
                    # arbitrary code embedded in the downloaded file.
                    # literal_eval only accepts Python literals, so values
                    # such as `inf` are rejected and reported below.
                    props[item] = ast.literal_eval(value)
                except (ValueError, SyntaxError):
                    warn("Error with molecule {} property {}".format(props.get('SMILES'), item))

            props['type'] = 'molecule' if NumRadicalElectrons(mol) == 0 else 'fragment'
            props['Name'] = mol.GetProp('_Name')
            yield pd.Series(props)

In [4]:
# Load the molecules into a pandas dataframe: each Series yielded by
# iter_mols() becomes one row.
# This takes approximately 5 minutes for my compute node. Faster read/write
# speeds can be obtained for subsequent analysis by saving as a pickle or hdf5 file.
df = pd.DataFrame(iter_mols())

100%|██████████| 289639/289639 [06:09<00:00, 784.41it/s]


In [5]:
from rdkit import Chem
from tqdm import tqdm
# Register tqdm with pandas so that `progress_apply` is available on Series/DataFrames.
tqdm.pandas()

def radical_type(smiles):
    """Return a description of the element and degree of the formal radical center.

    Parameters
    ----------
    smiles : str
        SMILES string of a radical species.

    Returns
    -------
    pandas.Series
        'element': atomic symbol of the atom returned by `get_radical`;
        'degree': that atom's `GetDegree()` value.

    Raises
    ------
    ValueError
        If the SMILES cannot be parsed, or if the parsed molecule contains no
        atom with radical electrons.
    """
    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals a parse failure by returning None rather than raising.
    if mol is None:
        raise ValueError("Could not parse SMILES {!r}".format(smiles))

    atom = get_radical(mol)
    # get_radical returns None when no atom carries an unpaired electron.
    if atom is None:
        raise ValueError("No radical center found in {!r}".format(smiles))

    return pd.Series({'element': atom.GetSymbol(),
                      'degree': atom.GetDegree()})
    
    
def get_radical(mol):
    """Find the first atom of ``mol`` that carries at least one radical electron.

    Atoms are examined in the order produced by ``mol.GetAtoms()``. ``None`` is
    returned when no atom has an unpaired electron.
    """
    radical_atoms = (
        candidate
        for candidate in mol.GetAtoms()
        if candidate.GetNumRadicalElectrons() > 0
    )
    return next(radical_atoms, None)

  from pandas import Panel


In [6]:
# Split the database into radicals on different atom types, and count the results
frag_df = df[df.type == 'fragment']
radical_types = frag_df.SMILES.progress_apply(radical_type)
radical_types.pivot_table(index='element', columns='degree', aggfunc='size').drop('H').fillna(0).astype(int)

100%|██████████| 246363/246363 [01:50<00:00, 2221.99it/s]


degree,0,1,2,3
element,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C,1,56067,121369,28135
N,1,11349,14084,0
O,2,15354,0,0


In [19]:
# For carbon-centered radicals, count how many match various radical types.
#
# Every pattern starts from the same atom, `[#6;X3v3+0]`: a carbon (#6) with
# three total connections (X3), a total valence of three (v3) and zero formal
# charge (+0), i.e. a neutral trivalent carbon. The remainder of each pattern
# describes what is attached to that carbon.
#
# NOTE(review): `alpha_to_edg` accepts any aliphatic N/O neighbour, including
# one that is itself part of the multiple bond matched by `alpha_to_ewg`
# (e.g. the N in [CH2]N=O). Such a radical matches both patterns through the
# same atom but not `captodative`, which needs two separate substituents.

# ...single-bonded to a carbon of a C=C double bond (far carbon has three connections).
allylic_radical = Chem.MolFromSmarts('[#6;X3v3+0]-[#6]=[#6X3]')
# ...single-bonded to a carbon of a C#C triple bond.
propargylic_radical = Chem.MolFromSmarts('[#6;X3v3+0]-[#6]#[#6]')
# ...single-bonded to an aromatic carbon.
benzylic_radical = Chem.MolFromSmarts('[#6;X3v3+0]-[c]')
# ...single-bonded to an aliphatic C or N that is double- or triple-bonded to N or O
# (presumably EWG = electron-withdrawing group).
alpha_to_ewg = Chem.MolFromSmarts('[#6;X3v3+0]-[C,N]=,#[N,O]')
# ...single-bonded to an aliphatic O or N (presumably EDG = electron-donating group).
alpha_to_edg = Chem.MolFromSmarts('[#6;X3v3+0]-[O,N]')
# ...carrying both kinds of substituent on two different branches.
captodative = Chem.MolFromSmarts('[#6;X3v3+0](-[O,N])-[C,N]=,#[N,O]')


def match_smarts(smiles):
    """Report which of the carbon-radical SMARTS patterns occur in a molecule.

    Parameters
    ----------
    smiles : str
        SMILES string of the species to test.

    Returns
    -------
    pandas.Series
        One boolean per pattern, indexed by 'allylic', 'propargylic',
        'benzylic', 'alpha_to_ewg', 'alpha_to_edg' and 'captodative'.

    Raises
    ------
    ValueError
        If the SMILES cannot be parsed.
    """
    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals a parse failure by returning None rather than raising.
    if mol is None:
        raise ValueError("Could not parse SMILES {!r}".format(smiles))

    # (label, query) pairs; the order fixes the order of the returned index.
    patterns = (
        ('allylic', allylic_radical),
        ('propargylic', propargylic_radical),
        ('benzylic', benzylic_radical),
        ('alpha_to_ewg', alpha_to_ewg),
        ('alpha_to_edg', alpha_to_edg),
        ('captodative', captodative),
    )
    return pd.Series({label: mol.HasSubstructMatch(query) for label, query in patterns})

# Test every fragment SMILES against all patterns: one row per fragment,
# one boolean column per pattern.
radical_matches = frag_df.SMILES.progress_apply(match_smarts)

100%|██████████| 246363/246363 [01:54<00:00, 2145.89it/s]


In [20]:
# Number of fragments matching each pattern (booleans summed along axis 0).
radical_matches.sum(0)

allylic         16229
propargylic      1887
benzylic         8286
alpha_to_ewg    18758
alpha_to_edg    55136
captodative      4386
dtype: int64

In [14]:
# Fragments whose captodative flag disagrees with (alpha_to_ewg AND alpha_to_edg).
odd = radical_matches.loc[
    radical_matches['captodative'].ne(
        radical_matches['alpha_to_ewg'] & radical_matches['alpha_to_edg']
    )
]

In [15]:
# Show the pattern flags for the inconsistent fragments.
radical_matches.reindex(odd.index)

Unnamed: 0,allylic,propargylic,benzylic,alpha_to_ewg,alpha_to_edg,captodative
1896,False,False,False,True,True,False
4975,False,False,False,True,True,False
6086,False,False,False,True,True,False
6560,True,False,False,True,True,False
6652,True,False,False,True,True,False
...,...,...,...,...,...,...
283028,False,False,False,True,True,False
286791,False,False,False,True,True,False
287186,False,False,False,True,True,False
288837,True,False,False,True,True,False


In [16]:
# Look up the full database records (SMILES, energies, ...) for the same fragments.
frag_df.reindex(odd.index)

Unnamed: 0,SMILES,Enthalpy,FreeEnergy,SCFEnergy,AtomCharges,RotConstants,VibFreqs,IRIntensity,mol,type,Name,AtomSpins
1896,[CH2]N=O,-169.136098,-169.164813,-169.172584,"[-0.239451, 0.180677, -0.231109, 0.143008, 0.1...","[120.14351189, 7.87392583, 7.5252908]","[456.9416, 715.7201, 864.7123, 1141.6749, 1284...","[2.5171, 0.195, 61.4921, 21.7009, 40.4606, 2.0...",<rdkit.Chem.rdchem.Mol object at 0x7fa319069490>,fragment,2485_0,"[-0.07221, 0.437526, 0.575327, 0.048876, 0.010..."
4975,[CH2]/N=N/N(C)C(=O)NC,-451.826973,-451.875122,-451.983480,"[-0.184106, -0.185088, -0.131194, 0.01195, -0....","[3.01737514, 1.03149802, 0.77573072]","[52.0856, 94.8721, 104.0051, 141.0771, 156.220...","[1.234, 6.0998, 2.9798, 13.5744, 1.0741, 0.557...",<rdkit.Chem.rdchem.Mol object at 0x7fa2bb17e300>,fragment,6215_0,"[0.460597, -0.188341, 0.50849, 0.193117, -0.01..."
6086,[CH]1C=N1,-131.965859,-131.994140,-132.001768,"[-0.157061, 0.011078, -0.172444, 0.147074, 0.1...","[30.56007425, 26.23028452, 14.87463341]","[758.3426, 840.5035, 954.0973, 980.3461, 1054....","[60.4474, 19.2726, 12.2953, 1.7124, 22.4289, 0...",<rdkit.Chem.rdchem.Mol object at 0x7fa2ba7f9a80>,fragment,7628_0,"[0.853147, -0.066797, 0.191501, 0.02583, -0.00..."
6560,[CH]1C=CN=N1,-225.444863,-225.476622,-225.505740,"[-0.011781, -0.214143, -0.012588, -0.127983, -...","[10.443884, 7.52718929, 4.37507198]","[459.3126, 522.5265, 585.7645, 754.6024, 884.6...","[33.6166, 98.7078, 0.0034, 31.6484, 0.0497, 5....",<rdkit.Chem.rdchem.Mol object at 0x7fa2ba43ddf0>,fragment,10653_0,"[0.475084, -0.150931, 0.46876, 0.11852, 0.1147..."
6652,[CH]1C=CC=CN=N1,-302.802447,-302.838745,-302.899821,"[-0.053738, -0.183458, -0.060961, -0.183393, -...","[4.15057034, 3.52386506, 1.96753131]","[191.643, 212.546, 397.52, 423.7564, 490.8819,...","[2.2498, 8.327, 1.0814, 10.4923, 11.4881, 3.81...",<rdkit.Chem.rdchem.Mol object at 0x7fa2ba348440>,fragment,10810_0,"[-0.10901, 0.500127, -0.190678, 0.499819, -0.1..."
...,...,...,...,...,...,...,...,...,...,...,...,...
283028,[CH2]/N=N/N(C)C(=O)OCC,-510.983607,-511.033777,-511.157179,"[-0.183166, -0.158618, -0.09359, -0.019793, -0...","[2.07935383, 0.80034976, 0.58477543]","[57.0922, 90.7949, 105.7194, 112.7013, 140.295...","[0.2075, 0.4782, 2.852, 0.8516, 11.3571, 0.861...",<rdkit.Chem.rdchem.Mol object at 0x7fa21d7f95d0>,fragment,516256_28b5d0,"[0.490814, -0.19868, 0.539773, 0.131223, -0.01..."
286791,O=N[C]1CCCCC1,-364.361866,-364.402326,-364.526716,"[-0.277615, 0.152135, 0.094731, -0.337801, -0....","[3.94098565, 1.31896166, 1.07618875]","[83.2245, 178.7958, 248.0376, 292.9028, 383.72...","[0.941, 1.567, 1.0221, 0.439, 0.1337, 1.597, 0...",<rdkit.Chem.rdchem.Mol object at 0x7fa21ba751c0>,fragment,520642_da3503,"[0.585606, 0.426291, -0.086444, 0.061088, -0.0..."
287186,O=N[C]1CCNCC1,-380.407799,-380.447953,-380.561375,"[-0.275843, 0.149599, 0.087175, -0.285688, -0....","[4.11753401, 1.37155988, 1.12577291]","[84.5694, 191.716, 249.9321, 309.2608, 387.059...","[1.6604, 2.8551, 0.2094, 2.126, 1.6374, 1.2704...",<rdkit.Chem.rdchem.Mol object at 0x7fa21b7384e0>,fragment,521094_895edf,"[0.586635, 0.425315, -0.084684, 0.030199, -0.0..."
288837,O=N[C]1C=CC=C1,-322.684194,-322.720507,-322.770167,"[-0.210403, 0.189167, 0.103653, -0.217508, -0....","[7.74762416, 1.86988501, 1.52799841]","[163.3934, 210.4404, 416.7147, 509.3522, 575.1...","[0.0031, 0.8742, 0.2747, 5.8403, 1.7, 22.09, 1...",<rdkit.Chem.rdchem.Mol object at 0x7fa21a96fb70>,fragment,522942_6f028d,"[0.597355, 0.468705, -0.088166, 0.03088, -0.03..."
