In [1]:
import numpy as np
import sys
import gzip
from itertools import islice
import pandas as pd
import rdkit
from tqdm import tqdm


# Ignore rdkit warnings: https://github.com/rdkit/rdkit/issues/2683
# DisableLog is given the wildcard spec 'rdApp.*', i.e. every RDKit log
# channel under the rdApp prefix is switched off for this kernel session.
# NOTE(review): this also hides parse-error messages, so molecules that fail
# to load will not be reported by RDKit itself.
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

In [2]:
# Record the RDKit version this notebook was executed with, for reproducibility.
print(rdkit.__version__)

2020.03.1


In [3]:
from warnings import warn
from rdkit.Chem.Descriptors import NumRadicalElectrons

def iter_mols(sdf_path='20200415_radical_database.sdf.gz', total=289639):
    """Iterate over a gzipped SDF file, yielding a pandas Series per molecule.

    While this is likely not the fastest way to load the data, the SDF file
    format can be read by many different programs, and the loop allows
    additional post-processing to occur while reading the molecules.

    Each yielded Series holds the SDF properties of the record plus:

    * ``mol``  -- the RDKit molecule object (hydrogens kept, ``removeHs=False``)
    * ``type`` -- ``'molecule'`` when the molecule has no radical electrons,
      otherwise ``'fragment'``
    * ``Name`` -- the record's ``_Name`` property

    The properties ``AtomCharges``, ``AtomSpins``, ``VibFreqs``,
    ``IRIntensity`` and ``RotConstants`` are stored as text in the file and are
    evaluated into Python objects when present. Some errors for rotational
    constants are expected for molecules that have infinite values; those
    properties are left as their raw string and a warning is issued.

    Parameters
    ----------
    sdf_path : str or path-like, optional
        Location of the gzipped SDF file. Defaults to
        ``20200415_radical_database.sdf.gz`` in the working directory (as
        downloaded from figshare).
    total : int or None, optional
        Expected number of records, used only to size the progress bar.

    Yields
    ------
    pandas.Series
        One Series per successfully parsed record.
    """
    evaluated_props = ('AtomCharges', 'AtomSpins', 'VibFreqs',
                       'IRIntensity', 'RotConstants')

    with gzip.open(sdf_path) as sdffile:
        mol_suppl = rdkit.Chem.ForwardSDMolSupplier(sdffile, removeHs=False)
        for record_index, mol in enumerate(tqdm(mol_suppl, total=total)):
            # The supplier yields None for records it cannot parse; skip them
            # instead of failing with AttributeError on the next line.
            if mol is None:
                warn("Skipping SDF record {}: could not be parsed".format(record_index))
                continue

            props = mol.GetPropsAsDict()
            props['mol'] = mol
            for item in evaluated_props:
                if item in props:
                    try:
                        # SECURITY NOTE: eval() executes whatever expression is
                        # stored in the file. Only use this loader on SDF files
                        # from a trusted source.
                        props[item] = eval(props[item])
                    except NameError:
                        # .get() so a record lacking a SMILES property cannot
                        # turn this warning into a KeyError.
                        warn("Error with molecule {} property {}".format(props.get('SMILES'), item))

            props['type'] = 'molecule' if NumRadicalElectrons(mol) == 0 else 'fragment'
            props['Name'] = mol.GetProp('_Name')
            yield pd.Series(props)

In [4]:
# Collect every Series yielded by iter_mols() into a single DataFrame
# (one row per molecule).
# The author reports roughly 5 minutes for this step on their compute node.
# Saving the result as a pickle or hdf5 file gives faster read/write speeds
# for subsequent analysis.
df = pd.DataFrame(data=iter_mols())

100%|██████████| 289639/289639 [06:09<00:00, 784.41it/s]


In [5]:
from rdkit import Chem
from tqdm import tqdm
# Register tqdm's pandas integration; the `progress_apply` calls made on
# pandas objects later in this notebook rely on this having been run.
tqdm.pandas()

def radical_type(smiles):
    """Return a description of the element and degree of the formal radical center.

    Parameters
    ----------
    smiles : str
        SMILES string of a radical species.

    Returns
    -------
    pandas.Series
        ``element`` (atomic symbol of the radical atom) and ``degree`` (value
        of ``GetDegree()`` for that atom).

    Raises
    ------
    ValueError
        If the SMILES string cannot be parsed, or if the parsed molecule has
        no atom carrying a radical electron.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None.
        raise ValueError("Could not parse SMILES: {!r}".format(smiles))

    atom = get_radical(mol)
    if atom is None:
        # get_radical returns None for closed-shell molecules.
        raise ValueError("No radical center found in SMILES: {!r}".format(smiles))

    return pd.Series({'element': atom.GetSymbol(),
                      'degree': atom.GetDegree()})
    
    
def get_radical(mol):
    """Find the first atom of ``mol`` that carries an unpaired electron.

    Atoms are examined in the order produced by ``mol.GetAtoms()``. The first
    one whose ``GetNumRadicalElectrons()`` is positive is returned; ``None``
    is returned when no atom qualifies.
    """
    radical_atoms = (
        candidate
        for candidate in mol.GetAtoms()
        if candidate.GetNumRadicalElectrons() > 0
    )
    return next(radical_atoms, None)

  from pandas import Panel


In [6]:
# Keep only the radical fragments, classify each one by the element and
# degree of its radical center, then tabulate the counts.
frag_df = df.loc[df['type'] == 'fragment']
radical_types = frag_df['SMILES'].progress_apply(radical_type)

# Rows are elements, columns are degrees; the 'H' row is dropped and
# element/degree combinations that never occur are shown as 0.
(
    radical_types
    .pivot_table(index='element', columns='degree', aggfunc='size')
    .drop('H')
    .fillna(0)
    .astype(int)
)

100%|██████████| 246363/246363 [01:50<00:00, 2221.99it/s]


degree,0,1,2,3
element,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C,1,56067,121369,28135
N,1,11349,14084,0
O,2,15354,0,0


In [7]:
# SMARTS queries used to count carbon-centered radicals of various types.
# Every query starts from '[#6;X3v3+0]': a neutral carbon with three
# connections and a total valence of three, i.e. the radical center.
(allylic_radical,
 propargylic_radical,
 benzylic_radical,
 alpha_to_ewg,
 alpha_to_edg) = (
    Chem.MolFromSmarts(smarts)
    for smarts in (
        # allylic: radical carbon next to a C=C double bond
        '[#6;X3v3+0]-[#6]=[#6X3]',
        # propargylic: radical carbon next to a C#C triple bond
        '[#6;X3v3+0]-[#6]#[#6]',
        # benzylic: radical carbon attached to an aromatic carbon
        '[#6;X3v3+0]-[c]',
        # alpha to an electron-withdrawing group: neighbour C or N that is
        # double- or triple-bonded to N or O
        '[#6;X3v3+0]-[C,N]=,#[N,O]',
        # alpha to an electron-donating group: neighbour O or N
        '[#6;X3v3+0]-[O,N]',
    )
)

def match_smarts(smiles):
    """Report which radical-type SMARTS queries match a SMILES string.

    The SMILES is parsed with ``Chem.MolFromSmiles`` and tested against the
    module-level queries ``allylic_radical``, ``propargylic_radical``,
    ``benzylic_radical``, ``alpha_to_ewg`` and ``alpha_to_edg``.

    Returns a pandas Series of booleans indexed by ``allylic``,
    ``propargylic``, ``benzylic``, ``alpha_to_ewg`` and ``alpha_to_edg``.
    """
    parsed = Chem.MolFromSmiles(smiles)
    labelled_queries = (
        ('allylic', allylic_radical),
        ('propargylic', propargylic_radical),
        ('benzylic', benzylic_radical),
        ('alpha_to_ewg', alpha_to_ewg),
        ('alpha_to_edg', alpha_to_edg),
    )
    flags = {
        label: parsed.HasSubstructMatch(query)
        for label, query in labelled_queries
    }
    return pd.Series(flags)

# One row of boolean flags per radical fragment.
radical_matches = frag_df['SMILES'].progress_apply(match_smarts)

# A radical is captodative when it is alpha to both a donating and a
# withdrawing group.
radical_matches['captodative'] = (
    radical_matches.alpha_to_edg & radical_matches.alpha_to_ewg
)

100%|██████████| 246363/246363 [01:56<00:00, 2122.44it/s]


In [8]:
# Number of fragments matching each radical type (column-wise sum of the flags).
radical_matches.sum(axis=0)

allylic         16229
propargylic      1887
benzylic         8286
alpha_to_ewg    18758
alpha_to_edg    55136
captodative      4675
dtype: int64