# **Descriptor**

**Author:** Raissa Lohanna

**Date:** May 16th, 2023

**Objective:** Generate additional simple molecular descriptors that depend only on the SMILES representation.

## Importing libraries and configuration

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
from tqdm import tqdm

In [2]:
# Register tqdm's progress-bar helpers (progress_apply / progress_map) on pandas objects.
# NOTE(review): no progress_* call appears in the cells visible here — confirm this is still needed.
tqdm.pandas()

In [3]:
import warnings
# Suppress every warning for the rest of the session to keep cell output short.
# NOTE(review): a blanket 'ignore' also hides deprecation and data-quality warnings —
# consider restricting the filter to the specific categories that are noisy.
warnings.filterwarnings('ignore')

In [4]:
# Do not truncate wide DataFrames: show every column when a frame is displayed.
pd.options.display.max_columns = None

In [5]:
# Global plotting defaults: dark grid background, "husl" palette, 10 x 5 figures.
sns.set_theme(
    style="darkgrid",
    palette="husl",
    rc={"figure.figsize": (10, 5)},
)

In [6]:
# import packages
from rdkit import Chem
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator

## Load data

In [8]:
# Load the joined dataset; the descriptor computation further down reads its
# 'isomeric_smiles' column.
# NOTE(review): the recorded run of this cell failed with FileNotFoundError, while
# later cells read from "../Datasets/" — confirm the intended location of this file.
df = pd.read_parquet("joined_data.parquet")

FileNotFoundError: [Errno 2] No such file or directory: 'joined_data.parquet'

In [7]:
# The 200 RDKit molecular descriptor names selected for this analysis
chosen_descriptors = [
    'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 
    'Chi3v', 'Chi4n', 'Chi4v', 'EState_VSA1', 'EState_VSA10', 'EState_VSA11', 'EState_VSA2', 'EState_VSA3', 
    'EState_VSA4', 'EState_VSA5', 'EState_VSA6', 'EState_VSA7', 'EState_VSA8', 'EState_VSA9', 'ExactMolWt', 
    'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3', 'FractionCSP3', 'HallKierAlpha', 'HeavyAtomCount', 
    'HeavyAtomMolWt', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'MaxAbsEStateIndex', 'MaxAbsPartialCharge', 
    'MaxEStateIndex', 'MaxPartialCharge', 'MinAbsEStateIndex', 'MinAbsPartialCharge', 'MinEStateIndex', 
    'MinPartialCharge', 'MolLogP', 'MolMR', 'MolWt', 'NHOHCount', 'NOCount', 'NumAliphaticCarbocycles', 
    'NumAliphaticHeterocycles', 'NumAliphaticRings', 'NumAromaticCarbocycles', 'NumAromaticHeterocycles', 
    'NumAromaticRings', 'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms', 'NumRadicalElectrons', 'NumRotatableBonds', 
    'NumSaturatedCarbocycles', 'NumSaturatedHeterocycles', 'NumSaturatedRings', 'NumValenceElectrons', 'PEOE_VSA1', 
    'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 
    'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'RingCount', 'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 
    'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA8', 'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA10', 
    'SlogP_VSA11', 'SlogP_VSA12', 'SlogP_VSA2', 'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5', 'SlogP_VSA6', 'SlogP_VSA7', 
    'SlogP_VSA8', 'SlogP_VSA9', 'TPSA', 'VSA_EState1', 'VSA_EState10', 'VSA_EState2', 'VSA_EState3', 'VSA_EState4', 
    'VSA_EState5', 'VSA_EState6', 'VSA_EState7', 'VSA_EState8', 'VSA_EState9', 'fr_Al_COO', 'fr_Al_OH', 
    'fr_Al_OH_noTert', 'fr_ArN', 'fr_Ar_COO', 'fr_Ar_N', 'fr_Ar_NH', 'fr_Ar_OH', 'fr_COO', 'fr_COO2', 'fr_C_O', 
    'fr_C_O_noCOO', 'fr_C_S', 'fr_HOCCN', 'fr_Imine', 'fr_NH0', 'fr_NH1', 'fr_NH2', 'fr_N_O', 'fr_Ndealkylation1', 
    'fr_Ndealkylation2', 'fr_Nhpyrrole', 'fr_SH', 'fr_aldehyde', 'fr_alkyl_carbamate', 'fr_alkyl_halide', 
    'fr_allylic_oxid', 'fr_amide', 'fr_amidine', 'fr_aniline', 'fr_aryl_methyl', 'fr_azide', 'fr_azo', 'fr_barbitur', 
    'fr_benzene', 'fr_benzodiazepine', 'fr_bicyclic', 'fr_diazo', 'fr_dihydropyridine', 'fr_epoxide', 'fr_ester', 
    'fr_ether', 'fr_furan', 'fr_guanido', 'fr_halogen', 'fr_hdrzine', 'fr_hdrzone', 'fr_imidazole', 'fr_imide', 
    'fr_isocyan', 'fr_isothiocyan', 'fr_ketone', 'fr_ketone_Topliss', 'fr_lactam', 'fr_lactone', 'fr_methoxy', 
    'fr_morpholine', 'fr_nitrile', 'fr_nitro', 'fr_nitro_arom', 'fr_nitro_arom_nonortho', 'fr_nitroso', 'fr_oxazole', 
    'fr_oxime', 'fr_para_hydroxylation', 'fr_phenol', 'fr_phenol_noOrthoHbond', 'fr_phos_acid', 'fr_phos_ester', 
    'fr_piperdine', 'fr_piperzine', 'fr_priamide', 'fr_prisulfonamd', 'fr_pyridine', 'fr_quatN', 'fr_sulfide', 
    'fr_sulfonamd', 'fr_sulfone', 'fr_term_acetylene', 'fr_tetrazole', 'fr_thiazole', 'fr_thiocyan', 'fr_thiophene', 
    'fr_unbrch_alkane', 'fr_urea', 'qed']


In [8]:
def calculate_descriptors(smi, chosen_descriptors):
    """Compute the requested RDKit descriptors for a single SMILES string.

    Parameters
    ----------
    smi : str
        SMILES representation of the molecule.
    chosen_descriptors : sequence of str
        Descriptor names handed to ``MolecularDescriptorCalculator``.

    Returns
    -------
    list
        One value per entry of ``chosen_descriptors``, in the same order.
        If ``smi`` cannot be parsed into a molecule, every value is ``nan``
        so the result keeps its length and can be stored alongside valid rows.
    """
    names = list(chosen_descriptors)

    # convert SMILES string to RDKit mol object; MolFromSmiles yields None
    # when the string cannot be parsed
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # Do not hand None to the calculator: report "missing" explicitly.
        return [float("nan")] * len(names)

    # create the calculator for the requested names and evaluate it on the molecule
    calculator = MolecularDescriptorCalculator(names)
    return list(calculator.CalcDescriptors(mol))

In [9]:
# Names of every property the RDKit ``Properties`` calculator offers
# (43 names in the recorded run of this notebook).
# The rdMolDescriptors submodule is imported explicitly instead of being reached
# through ``Chem.``, so this cell does not depend on another import having
# loaded the submodule as a side effect.
descriptor_names = list(rdMolDescriptors.Properties.GetAvailableProperties())

# Calculator object built once for all names and reused for every molecule.
get_descriptors = rdMolDescriptors.Properties(descriptor_names)

In [11]:
def smi_to_descriptors(smile):
    """Compute the ``get_descriptors`` property vector for one SMILES string.

    Parameters
    ----------
    smile : str
        SMILES representation of the molecule.

    Returns
    -------
    numpy.ndarray
        The values produced by ``get_descriptors.ComputeProperties`` for the
        parsed molecule, or an empty float array when ``smile`` cannot be
        parsed. The result is always an ndarray, so callers never have to
        handle a list in the failure case.
    """
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        # Unparseable SMILES: keep the "empty result" contract, but as an array.
        return np.array([], dtype=float)
    return np.array(get_descriptors.ComputeProperties(mol))

In [12]:
from math import sqrt
from joblib import Parallel, delayed

In [13]:
# Run smi_to_descriptors over every entry of the 'isomeric_smiles' column using
# joblib's thread-based workers (n_jobs=-1) and store the result, in input order,
# as a new 'descriptors' column.
df['descriptors'] = Parallel(n_jobs=-1, prefer="threads", verbose=10)(
    delayed(smi_to_descriptors)(smiles_string)
    for smiles_string in df['isomeric_smiles']
)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapse

In [14]:
df['descriptors']

cid
1           [203.115758024, 203.23799999999997, 5.0, 0.0, ...
2           [204.12303447609, 204.24599999999998, 5.0, 1.0...
3           [156.042258736, 156.137, 4.0, 3.0, 1.0, 3.0, 3...
4           [75.068413908, 75.11099999999999, 2.0, 3.0, 1....
5           [169.014008986, 169.073, 6.0, 4.0, 4.0, 3.0, 4...
                                  ...                        
75276555    [210.09391744799998, 210.306, 4.0, 2.0, 4.0, 3...
75277118    [235.076391748, 235.714, 2.0, 0.0, 4.0, 0.0, 1...
75277120    [215.131014164, 215.29599999999994, 2.0, 0.0, ...
75277121    [235.120843404, 235.283, 4.0, 0.0, 4.0, 0.0, 3...
75277270    [236.094963004, 236.274, 3.0, 0.0, 1.0, 0.0, 2...
Name: descriptors, Length: 3332473, dtype: object

In [15]:
descriptor_names

['exactmw',
 'amw',
 'lipinskiHBA',
 'lipinskiHBD',
 'NumRotatableBonds',
 'NumHBD',
 'NumHBA',
 'NumHeavyAtoms',
 'NumAtoms',
 'NumHeteroatoms',
 'NumAmideBonds',
 'FractionCSP3',
 'NumRings',
 'NumAromaticRings',
 'NumAliphaticRings',
 'NumSaturatedRings',
 'NumHeterocycles',
 'NumAromaticHeterocycles',
 'NumSaturatedHeterocycles',
 'NumAliphaticHeterocycles',
 'NumSpiroAtoms',
 'NumBridgeheadAtoms',
 'NumAtomStereoCenters',
 'NumUnspecifiedAtomStereoCenters',
 'labuteASA',
 'tpsa',
 'CrippenClogP',
 'CrippenMR',
 'chi0v',
 'chi1v',
 'chi2v',
 'chi3v',
 'chi4v',
 'chi0n',
 'chi1n',
 'chi2n',
 'chi3n',
 'chi4n',
 'hallKierAlpha',
 'kappa1',
 'kappa2',
 'kappa3',
 'Phi']

In [16]:
len(descriptor_names)

43

In [17]:
len(df['descriptors'].loc[1])

43

In [18]:
# Count rows whose 'descriptors' entry is missing.
# NOTE(review): smi_to_descriptors returns an empty sequence (not a null) for SMILES
# that fail to parse, so such rows are NOT counted here; checking the entry length
# would be needed to detect them.
df['descriptors'].isnull().sum()

0

In [10]:
# Replace the in-memory frame with one read from disk; the following cells use its
# 'cid' and 'descriptors' columns.
# NOTE(review): presumably this file holds the output of the parallel computation
# above — no cell visible here writes it, so confirm where it comes from.
df = pd.read_parquet("../Datasets/joined_w_rdkit_descriptors.parquet")

In [11]:
# Turn the index into a regular column, then keep only the compound id and the
# descriptor vectors.
df = df.reset_index().loc[:, ['cid', 'descriptors']]

In [12]:
df.head()

Unnamed: 0,cid,descriptors
0,1,"[203.115758024, 203.23799999999997, 5.0, 0.0, ..."
1,2,"[204.12303447609, 204.24599999999998, 5.0, 1.0..."
2,3,"[156.042258736, 156.137, 4.0, 3.0, 1.0, 3.0, 3..."
3,4,"[75.068413908, 75.11099999999999, 2.0, 3.0, 1...."
4,5,"[169.014008986, 169.073, 6.0, 4.0, 4.0, 3.0, 4..."


In [17]:
# Expand each per-row descriptor vector into one column per name in
# descriptor_names, attach those columns to the frame, and drop the original
# array-valued 'descriptors' column.
df = (
    df
    .join(pd.DataFrame(df.descriptors.tolist(), index=df.index, columns=descriptor_names))
    .drop(columns='descriptors')
)

In [18]:
df.head()

Unnamed: 0,cid,exactmw,amw,lipinskiHBA,lipinskiHBD,NumRotatableBonds,NumHBD,NumHBA,NumHeavyAtoms,NumAtoms,NumHeteroatoms,NumAmideBonds,FractionCSP3,NumRings,NumAromaticRings,NumAliphaticRings,NumSaturatedRings,NumHeterocycles,NumAromaticHeterocycles,NumSaturatedHeterocycles,NumAliphaticHeterocycles,NumSpiroAtoms,NumBridgeheadAtoms,NumAtomStereoCenters,NumUnspecifiedAtomStereoCenters,labuteASA,tpsa,CrippenClogP,CrippenMR,chi0v,chi1v,chi2v,chi3v,chi4v,chi0n,chi1n,chi2n,chi3n,chi4n,hallKierAlpha,kappa1,kappa2,kappa3,Phi
0,1,203.115758,203.238,5.0,0.0,5.0,0.0,4.0,14.0,31.0,5.0,0.0,0.777778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,83.859416,66.43,-1.2357,48.0774,9.071771,4.380117,1.478325,1.478325,1.17499,9.071771,4.380117,1.478325,1.478325,1.17499,-1.1,12.9,4.950243,7.544022,4.561296
1,2,204.123034,204.246,5.0,1.0,5.0,1.0,3.0,14.0,32.0,5.0,0.0,0.777778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,83.859416,63.6,0.099,50.7062,9.110736,4.3996,1.486279,1.486279,1.183861,9.110736,4.3996,1.486279,1.486279,1.183861,-1.1,12.9,4.950243,7.544022,4.561296
2,3,156.042259,156.137,4.0,3.0,1.0,3.0,3.0,11.0,19.0,4.0,0.0,0.285714,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,63.08809,77.76,-0.711,36.8724,5.63664,3.104812,1.47316,1.47316,0.858665,5.63664,3.104812,1.47316,1.47316,0.858665,-1.13,7.971317,2.85575,1.470981,2.069463
3,4,75.068414,75.111,2.0,3.0,1.0,2.0,2.0,5.0,14.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,31.60326,46.25,-0.6741,20.7352,3.309021,1.652046,0.341112,0.341112,0.0,3.309021,1.652046,0.341112,0.341112,0.0,-0.08,4.92,2.175102,3.92,2.1403
4,5,169.014009,169.073,6.0,4.0,4.0,3.0,4.0,10.0,18.0,7.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,57.222171,109.85,-1.3765,32.0575,5.952377,3.903599,1.086832,1.086832,0.527175,5.057949,2.373303,0.621383,0.621383,0.215009,-0.26,9.74,3.798894,5.869199,3.700123


In [19]:
df.shape

(3332473, 44)

## Saving

In [20]:
# Write the final table ('cid' plus one column per descriptor) to disk as Parquet.
df.to_parquet("../Datasets/rdkit_descriptors.parquet")