## Notebook for calculating various molecular desciptors

In [7]:
# Imports
from rdkit import Chem, RDLogger
from rdkit.Chem import AllChem
import os
from mordred import Calculator, descriptors
import pandas as pd
from more_itertools import chunked
from tqdm.notebook import tqdm
import numpy as np
# Capturing RDKit errors
from io import StringIO
from rdkit.Chem.MolStandardize import standardize_smiles
import sys
# For DScribe descriptors
import ase
from dscribe import descriptors as ddescriptors
from collections import defaultdict

In [4]:
def mol2_to_mol(file=None, sanitize=True):
    mols={}
    with open(file, 'r') as f:
        line =f.readline()
        # Counter represents the unique index for molecules.
        counter = 0
        broken_counter = 0
        while not f.tell() == os.fstat(f.fileno()).st_size:
            if line.startswith("@<TRIPOS>MOLECULE"):
                mol = []
                mol.append(line)
                line = f.readline()
                while not line.startswith("@<TRIPOS>MOLECULE"):
                    mol.append(line)
                    line = f.readline()
                    if f.tell() == os.fstat(f.fileno()).st_size:
                        mol.append(line)
                        break
                mol[-1] = mol[-1].rstrip() # removes blank line at file end
                block = ",".join(mol).replace(',','')
                m=Chem.MolFromMol2Block(block, sanitize=sanitize, removeHs=False)
            mols[counter] = m
            counter += 1
    return mols

In [None]:
def get_3d_geometries():
    

In [3]:
def parse_molecules(mols):
    # Removes molecules that generate warnings when parsing through RDKit
    # This could lead to incorrect values depending on how the descriptor
    # is calculated.
    working_mols = {}
    nonworking_mols = {}
    sio = sys.stderr = StringIO()
    for idx, mol in mols.items():
        if mol is None:
            nonworking_mols[idx] = mol
            continue
        Chem.SanitizeMol(mol)
        res = sio.getvalue()
        if 'WARNING' in res:
            nonworking_mols[idx] = mol
            print(res)
            # Reset stderr
            sio = sys.stderr = StringIO()
        else:
            working_mols[idx] = mol
            sio = sys.stderr = StringIO()
    return working_mols, nonworking_mols

def calculate_descriptors_pandas(mols, check_mols=True):
    # Calculates 2D and 3D descriptors using Mordred.
    # Returns a DataFrame containing descriptors.
    calc = Calculator(descriptors, ignore_3D=False)
    if check_mols:
        mols, _ = parse_molecules(mols)
        print(f'There are {len(_)} non-working molecules.')
    df = calc.pandas(list(mols.values()), ipynb=True, quiet=False)
    df.index.name = 'Molecule_Number'
    df['SMILES'] = [Chem.MolToSmiles(m) for m in list(mols.values())]
    df['RDKit_Molecule'] = [m for m in list(mols.values())]
    df.index = list(mols.keys())
    return df

In [4]:
def calculate_fingerprints(mols):
    # Converts an RDKit molecule into a fingerprint. 
    # This function is only given as an example of a fingerprint calculation.
    # There are multiple different approaches to calculate molecular fingerprints.
    # A radius 2 and bit length of 1024 was chosen for this calculations.
    fps = []
    for mol in mols:
        fp = np.array(AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=1024))
        fps.append(fp)
    return fps

In [5]:
# Calculating DScribe descriptors
def calculate_dscribe_descriptors(mols):
    dscribe_descriptors = defaultdict(list)
    for mol in tqdm(mols):
        mol_block = StringIO(Chem.MolToMolBlock(mol))
        ase_mol = ase.io.mol.read_mol(mol_block)
        unique_species = list(set(ase_mol.get_chemical_symbols()))
        # Setting up the DScribe descriptors
        # The parameters for all these descriptors are unoptimised.
        cm = ddescriptors.CoulombMatrix(
            n_atoms_max=1000,
        )
        cm_result = cm.create(ase_mol, n_jobs=-1)
        sm = ddescriptors.SineMatrix(
            n_atoms_max=1000,
            permutation="sorted_l2",
            sparse=False,
            flatten=True
        )
        sm_result = cm.create(ase_mol, n_jobs=-1)
        dscribe_descriptors['Coulomb_Matrix'].append(cm_result)
        dscribe_descriptors['Sine_Matrix'].append(sm_result)
    return dscribe_descriptors

In [26]:
geometries = {}
embed_params = Chem.rdDistGeom.ETKDGv3()
for mol in tqdm(mols.items()):
    try:
        smiles = standardize_smiles(unstandardised_smiles)
    except:
        print('Skipping')
        continue
    if mol[1] is None:
        continue
    if smiles in geometries:
        # Get the RDKit molecule with conformers already created.
        rdmol = geometries[smiles]
    else:
        rdmol = mol[1]
        conf_ids = Chem.rdDistGeom.EmbedMultipleConfs(rdmol, 10, embed_params)


  0%|          | 0/29374 [00:00<?, ?it/s]

RDKit ERROR: [14:35:50] UFFTYPER: Unrecognized charge state for atom: 24
RDKit ERROR: [14:35:50] UFFTYPER: Unrecognized charge state for atom: 25
RDKit ERROR: [14:35:51] Can't kekulize mol.  Unkekulized atoms: 3 9
RDKit ERROR: 
RDKit ERROR: [14:35:51] UFFTYPER: Unrecognized atom type: B_5 (0)
RDKit ERROR: [14:35:51] UFFTYPER: Unrecognized atom type: B_6 (1)
RDKit ERROR: [14:35:51] UFFTYPER: Unrecognized atom type: B_5 (2)
RDKit ERROR: [14:35:51] UFFTYPER: Unrecognized atom type: B_6 (3)
RDKit ERROR: [14:35:51] UFFTYPER: Unrecognized atom type: B_6 (4)
RDKit ERROR: [14:35:51] UFFTYPER: Unrecognized atom type: B_6 (5)
RDKit ERROR: [14:35:51] UFFTYPER: Unrecognized atom type: B_6 (19)
RDKit ERROR: [14:35:51] UFFTYPER: Unrecognized atom type: B_6 (20)
RDKit ERROR: [14:35:51] UFFTYPER: Unrecognized atom type: B_6 (21)
RDKit ERROR: [14:35:51] UFFTYPER: Unrecognized atom type: B_6 (3)
RDKit ERROR: [14:35:51] UFFTYPER: Unrecognized atom type: B_6 (4)
RDKit ERROR: [14:35:51] UFFTYPER: Unrecogni

Skipping
Skipping
Skipping
Skipping


RDKit ERROR: [14:36:15] UFFTYPER: Unrecognized atom type: C_6 (1)
RDKit ERROR: [14:36:15] UFFTYPER: Unrecognized atom type: C_6 (2)
RDKit ERROR: [14:36:15] UFFTYPER: Unrecognized atom type: C_6 (4)
RDKit ERROR: [14:36:15] UFFTYPER: Unrecognized atom type: C_6 (6)
RDKit ERROR: [14:36:15] UFFTYPER: Unrecognized atom type: C_6 (8)
RDKit ERROR: [14:36:16] UFFTYPER: Unrecognized charge state for atom: 5
RDKit ERROR: [14:36:16] UFFTYPER: Unrecognized charge state for atom: 21
RDKit ERROR: [14:36:17] UFFTYPER: Unrecognized atom type: Se2+2 (0)
RDKit ERROR: [14:36:17] UFFTYPER: Unrecognized atom type: Se2+2 (1)
RDKit ERROR: [14:36:17] UFFTYPER: Unrecognized atom type: Se2+2 (13)
RDKit ERROR: [14:36:17] UFFTYPER: Unrecognized atom type: Se2+2 (14)
RDKit ERROR: [14:36:17] UFFTYPER: Unrecognized atom type: Se2+2 (0)
RDKit ERROR: [14:36:17] UFFTYPER: Unrecognized atom type: Se2+2 (1)
RDKit ERROR: [14:36:17] UFFTYPER: Unrecognized atom type: Se2+2 (13)
RDKit ERROR: [14:36:17] UFFTYPER: Unrecognized

Skipping


RDKit ERROR: [14:36:32] UFFTYPER: Unrecognized charge state for atom: 0


Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping


RDKit ERROR: [14:36:33] UFFTYPER: Unrecognized atom type: O_5 (2)
RDKit ERROR: [14:36:33] UFFTYPER: Unrecognized atom type: N_5 (3)
RDKit ERROR: [14:36:33] UFFTYPER: Unrecognized atom type: C_6 (4)
RDKit ERROR: [14:36:33] UFFTYPER: Unrecognized atom type: C_6 (5)
RDKit ERROR: [14:36:33] UFFTYPER: Unrecognized atom type: C_6 (6)


Skipping
Skipping


RDKit ERROR: [14:36:41] UFFTYPER: Unrecognized atom type: O_5 (1)
RDKit ERROR: [14:36:41] UFFTYPER: Unrecognized atom type: C_6 (3)
RDKit ERROR: [14:36:41] UFFTYPER: Unrecognized atom type: O_5 (4)
RDKit ERROR: [14:36:41] UFFTYPER: Unrecognized atom type: C_6 (11)
RDKit ERROR: [14:36:41] UFFTYPER: Unrecognized atom type: C_6 (12)
RDKit ERROR: [14:36:43] UFFTYPER: Unrecognized charge state for atom: 0
RDKit ERROR: [14:36:43] UFFTYPER: Unrecognized charge state for atom: 0
RDKit ERROR: [14:36:43] UFFTYPER: Unrecognized charge state for atom: 0
RDKit ERROR: [14:36:43] UFFTYPER: Unrecognized charge state for atom: 0
RDKit ERROR: [14:36:43] UFFTYPER: Unrecognized charge state for atom: 1
RDKit ERROR: [14:36:44] UFFTYPER: Unrecognized charge state for atom: 0
RDKit ERROR: [14:36:44] UFFTYPER: Unrecognized atom type: Se2+2 (0)
RDKit ERROR: [14:36:44] UFFTYPER: Unrecognized atom type: B_6 (4)
RDKit ERROR: [14:36:44] UFFTYPER: Unrecognized atom type: B_6 (5)
RDKit ERROR: [14:36:44] UFFTYPER: Un

Skipping


RDKit ERROR: [14:37:21] UFFTYPER: Unrecognized charge state for atom: 0
RDKit ERROR: [14:37:23] UFFTYPER: Unrecognized atom type: C_5 (0)
RDKit ERROR: [14:37:23] UFFTYPER: Unrecognized atom type: C_5 (1)
RDKit ERROR: [14:37:23] UFFTYPER: Unrecognized atom type: C_6 (2)
RDKit ERROR: [14:37:23] UFFTYPER: Unrecognized atom type: N_6 (3)
RDKit ERROR: [14:37:25] UFFTYPER: Unrecognized atom type: B_1 (0)
RDKit ERROR: [14:37:26] UFFTYPER: Unrecognized charge state for atom: 0
RDKit ERROR: [14:37:26] UFFTYPER: Unrecognized atom type: C_5 (2)
RDKit ERROR: [14:37:26] UFFTYPER: Unrecognized atom type: C_6 (3)
RDKit ERROR: [14:37:26] UFFTYPER: Unrecognized atom type: C_6 (4)
RDKit ERROR: [14:37:26] UFFTYPER: Unrecognized atom type: C_6 (11)
RDKit ERROR: [14:37:26] UFFTYPER: Unrecognized atom type: C_5 (12)


Skipping


RDKit ERROR: [14:37:27] UFFTYPER: Unrecognized charge state for atom: 0
RDKit ERROR: [14:37:27] UFFTYPER: Unrecognized charge state for atom: 2
RDKit ERROR: [14:37:29] UFFTYPER: Unrecognized atom type: Se2+2 (1)


Skipping
Skipping


RDKit ERROR: [14:37:32] UFFTYPER: Unrecognized atom type: C_6 (1)
RDKit ERROR: [14:37:32] UFFTYPER: Unrecognized atom type: C_6 (2)
RDKit ERROR: [14:37:32] UFFTYPER: Unrecognized atom type: C_6 (3)
RDKit ERROR: [14:37:32] UFFTYPER: Unrecognized atom type: C_6 (4)
RDKit ERROR: [14:37:32] UFFTYPER: Unrecognized atom type: C_6 (5)
RDKit ERROR: [14:37:32] UFFTYPER: Unrecognized atom type: C_6 (6)
RDKit ERROR: [14:37:32] UFFTYPER: Unrecognized charge state for atom: 9
RDKit ERROR: [14:37:32] UFFTYPER: Unrecognized charge state for atom: 10
RDKit ERROR: [14:37:32] UFFTYPER: Unrecognized atom type: C_6 (0)
RDKit ERROR: [14:37:32] UFFTYPER: Unrecognized hybridization for atom: 1
RDKit ERROR: [14:37:32] UFFTYPER: Unrecognized atom type: C_ (1)
RDKit ERROR: [14:37:32] UFFTYPER: Unrecognized atom type: C_6 (2)
RDKit ERROR: [14:37:32] UFFTYPER: Unrecognized atom type: C_6 (3)
RDKit ERROR: [14:37:32] UFFTYPER: Unrecognized atom type: C_6 (4)
RDKit ERROR: [14:37:32] UFFTYPER: Unrecognized atom type:

Skipping


RDKit ERROR: [14:37:39] UFFTYPER: Unrecognized charge state for atom: 0
RDKit ERROR: [14:37:40] Explicit valence for atom # 23 N, 5, is greater than permitted


AtomValenceException: Explicit valence for atom # 23 N, 5, is greater than permitted

In [None]:
molecules_formatted = defaultdict(list)
for mol in mols.items():
    smiles = standardize_smiles(Chem.MolToSmiles(mol[1]))
    molecules_formatted["SMILES"].append(smiles)
    molecules_formatted["Index"].append(mol[0])

In [5]:
# Code cell performs all descriptor calculations
mols = mol2_to_mol('../small_molecule_search.mol2')
# df = calculate_descriptors_pandas(mols, ctryheck_mols=True)
# df = pd.DataFrame(df)
# df['Fingerprints'] = calculate_fingerprints(df['RDKit_Molecule'])
# # df_dscribe_descriptors = pd.DataFrame(df)

RDKit ERROR: [13:11:17] Explicit valence for atom # 5 C, 7, is greater than permitted
RDKit ERROR: [13:11:18] Can't kekulize mol.  Unkekulized atoms: 0 1 2 4 6 7
RDKit ERROR: 
RDKit ERROR: [13:11:18] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 6 7
RDKit ERROR: 
RDKit ERROR: [13:11:18] Explicit valence for atom # 0 C, 6, is greater than permitted
RDKit ERROR: [13:11:18] Explicit valence for atom # 12 C, 6, is greater than permitted
RDKit ERROR: [13:11:19] Explicit valence for atom # 0 C, 6, is greater than permitted
RDKit ERROR: [13:11:19] Explicit valence for atom # 3 C, 5, is greater than permitted
RDKit ERROR: [13:11:19] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5
RDKit ERROR: 
RDKit ERROR: [13:11:19] Explicit valence for atom # 9 C, 5, is greater than permitted
RDKit ERROR: [13:11:19] Explicit valence for atom # 5 C, 5, is greater than permitted
RDKit ERROR: [13:11:19] Explicit valence for atom # 8 C, 6, is greater than permitted
RDKit ERROR: [13:11:19] Can't kekulize mol.

In [None]:
df.to_pickle('Calculated_Descriptors.pkl')

In [None]:
df_pickled = pd.read_pickle('Calculated_Descriptors.pkl')

In [43]:
dscribe_descriptors = calculate_dscribe_descriptors(df['RDKit_Molecule'].to_list())

  0%|          | 0/29079 [00:00<?, ?it/s]

In [None]:
len(df['SMILES'].drop_duplicates())-len(df)