In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from IPython.display import clear_output

In [3]:
# Fix NumPy's global (legacy) random state so any later NumPy-based randomness is repeatable.
np.random.seed(seed=0)

# Extract Data

In [4]:
import tabula
# Supplementary-information PDF; the table spanning pages 7-10 is read into one frame.
supp_file_path='d0se00687d1.pdf'
# `multiple_tables` is the tabula-py option for "merge everything into a single
# table"; the original `multiple_datas` is not a tabula-py keyword.
tables = tabula.read_pdf(supp_file_path, pages="7-10", multiple_tables=False)
# Depending on the tabula-py version the call returns either a DataFrame or a
# list of DataFrames; normalise both shapes to a single DataFrame.
data = pd.concat(tables, ignore_index=True) if isinstance(tables, list) else tables
# Replace the extracted header with stable column names used by the rest of the notebook.
data.columns=['Sr. No.','FG','FG Position', 'Redox Potential']

def mapFG(x):
    """Return the functional-group label in the spelling used by the structure keys.

    ``'N(CH3)2'`` is rewritten to ``'2CH3N'`` (compare keys such as
    ``2CH3N_R1_neutral``); any other value is returned unchanged.
    """
    return '2CH3N' if x=='N(CH3)2' else x
# Normalise the functional-group labels, then expose the column as `FG_name`,
# which is the name every later cell reads.
data = (
    data
    .assign(FG=data['FG'].map(mapFG))
    .rename(columns={'FG':'FG_name'})
)
data

Unnamed: 0,Sr. No.,FG_name,FG Position,Redox Potential
0,Phenazine,-,-,-1.74
1,1,2CH3N,1,-1.85
2,2,2CH3N,2,-1.98
3,3,NH2,1,-1.85
4,4,NH2,2,-1.92
...,...,...,...,...
184,185,NO2,1 2 6 7,-0.35
185,186,NO2,1 4 6 9,-0.52
186,187,NO2,2 3 7 8,-0.52
187,188,NO2,1 2 3 4 6 9,0.16


In [5]:
cords_file_path='SI_coordinates_vs1_20200617.txt'
# Read through a context manager so the file handle is closed deterministically.
with open(cords_file_path,'r') as cords_file:
    lines=cords_file.readlines()

# Maps "<NAME>_<state>" (header tokens joined by "_") to the list of
# tab-separated atom records that follow that header in the file.
mol_dict_xyz={}
key=None
for line in lines:
    tokens=line.split()
    if not tokens:
        # Blank or whitespace-only separator: must not be counted as an atom line.
        continue
    if 'neutral' in line.lower() or 'anion' in line.lower():
        # Header line: start a new (or restart an existing) molecule record.
        key='_'.join(tokens)
        mol_dict_xyz[key]=[]
    elif key is None:
        # Content before the first header belongs to no molecule; skip it
        # instead of failing with a KeyError on `None`.
        continue
    else:
        mol_dict_xyz[key].append('\t'.join(tokens))

# Turn every record into an XYZ-format string: atom count, title line, atom lines.
for name,atom_lines in mol_dict_xyz.items():
    mol_dict_xyz[name]=f'{len(atom_lines)}\n{name}\n'+'\n'.join(atom_lines)

print(f'Total DATA_molecules Found = {len(mol_dict_xyz)}')
print('Key = '+next(iter(mol_dict_xyz.keys())))
mol_dict_xyz['PHENAZINE_neutral']

Total DATA_molecules Found = 370
Key = PHENAZINE_neutral


'22\nPHENAZINE_neutral\nN\t0.00000\t1.42349\t0.00005\nN\t0.00000\t-1.42349\t-0.00005\nC\t1.14528\t0.72406\t-0.00000\nC\t1.14528\t-0.72406\t-0.00006\nC\t-1.14528\t-0.72406\t0.00000\nC\t-1.14528\t0.72406\t0.00006\nC\t2.39571\t-1.41739\t-0.00011\nC\t2.39571\t1.41739\t-0.00001\nC\t-2.39571\t-1.41739\t0.00001\nC\t-2.39571\t1.41739\t0.00011\nC\t3.57334\t0.71483\t-0.00006\nC\t3.57334\t-0.71483\t-0.00012\nC\t-3.57334\t0.71483\t0.00012\nC\t-3.57334\t-0.71483\t0.00006\nH\t2.37152\t-2.50248\t-0.00015\nH\t2.37152\t2.50248\t0.00003\nH\t-2.37152\t-2.50248\t-0.00003\nH\t-2.37152\t2.50248\t0.00015\nH\t4.52148\t-1.24448\t-0.00016\nH\t4.52148\t1.24448\t-0.00007\nH\t-4.52148\t-1.24448\t0.00007\nH\t-4.52148\t1.24448\t0.00016'

In [6]:
from openbabel import pybel

import os

output_dir='DATA_molecules'
# Make sure both target folders exist so the export also works on a fresh checkout.
for state_dir in ('neutral','anion'):
    os.makedirs(os.path.join(output_dir,state_dir),exist_ok=True)

# Convert every XYZ string to a .mol file, sorted into a folder by charge state.
# The state is taken from the key itself; 'neutral' is checked first, as before.
for k,v in mol_dict_xyz.items():
    mol=pybel.readstring('xyz',v)
    if 'neutral' in k.lower():
        mol.write('mol', f'{output_dir}/neutral/{k}.mol',overwrite=True)
    elif 'anion' in k.lower():
        mol.write('mol', f'{output_dir}/anion/{k}.mol',overwrite=True)

In [7]:
# Expected key stem for each table row: "<FG_name>_R<positions without spaces>".
mol_filenames=data['FG_name']+data['FG Position'].apply(lambda x:'_R'+''.join(x.split()))
# Row 0 is the unsubstituted parent compound and has no position suffix.
mol_filenames[0]='Phenazine'

# Case-insensitive lookup table (keys differ in case, e.g. "_Anion" vs "_anion").
# setdefault keeps the FIRST key for a given lowercase spelling, which is what
# the original first-match linear scan returned.
keys_by_lowercase={}
for key in mol_dict_xyz:
    keys_by_lowercase.setdefault(key.lower(),key)

mol_filename_neutral=[]
mol_filename_anion=[]

for filename in mol_filenames:
    # Neutral is resolved before anion so the error messages keep their original order.
    for state,matched_names in (('neutral',mol_filename_neutral),('anion',mol_filename_anion)):
        matched_key=keys_by_lowercase.get((filename+'_'+state).lower())
        if matched_key is None:
            # No structure for this row/state: record NaN so the row can be filtered later.
            matched_names.append(np.nan)
            print('error ('+state+')- '+filename)
        else:
            matched_names.append(matched_key)

data['mol_filename_neutral']=mol_filename_neutral
data['mol_filename_anion']=mol_filename_anion
data

error (anion)- CN_R12
error (neutral)- CN_R28
error (anion)- CN_R28
error (neutral)- CN_R1268
error (anion)- CN_R1268
error (neutral)- NH2_R23
error (anion)- NH2_R23
error (neutral)- OH_R236


Unnamed: 0,Sr. No.,FG_name,FG Position,Redox Potential,mol_filename_neutral,mol_filename_anion
0,Phenazine,-,-,-1.74,PHENAZINE_neutral,PHENAZINE_anion
1,1,2CH3N,1,-1.85,2CH3N_R1_neutral,2CH3N_R1_Anion
2,2,2CH3N,2,-1.98,2CH3N_R2_neutral,2CH3N_R2_Anion
3,3,NH2,1,-1.85,NH2_R1_neutral,NH2_R1_anion
4,4,NH2,2,-1.92,NH2_R2_neutral,NH2_R2_anion
...,...,...,...,...,...,...
184,185,NO2,1 2 6 7,-0.35,NO2_R1267_neutral,NO2_R1267_anion
185,186,NO2,1 4 6 9,-0.52,NO2_R1469_neutral,NO2_R1469_anion
186,187,NO2,2 3 7 8,-0.52,NO2_R2378_neutral,NO2_R2378_anion
187,188,NO2,1 2 3 4 6 9,0.16,NO2_R123469_neutral,NO2_R123469_anion


In [8]:
# Rows for which no neutral-state structure key was matched.
missing_neutral=data['mol_filename_neutral'].isna()
data.loc[missing_neutral]

Unnamed: 0,Sr. No.,FG_name,FG Position,Redox Potential,mol_filename_neutral,mol_filename_anion
50,51,CN,2 8,-1.15,,
75,76,CN,1 2 6 8,-0.69,,
97,98,NH2,2 3,-2.04,,
150,151,OH,2 3 6,-1.85,,OH_R236_anion


In [9]:
# Keep only rows that have a neutral structure and renumber the index from 0.
# Rows that merely lack an anion structure are kept.
has_neutral=data['mol_filename_neutral'].notna()
data=data.loc[has_neutral].reset_index(drop=True)
data

Unnamed: 0,Sr. No.,FG_name,FG Position,Redox Potential,mol_filename_neutral,mol_filename_anion
0,Phenazine,-,-,-1.74,PHENAZINE_neutral,PHENAZINE_anion
1,1,2CH3N,1,-1.85,2CH3N_R1_neutral,2CH3N_R1_Anion
2,2,2CH3N,2,-1.98,2CH3N_R2_neutral,2CH3N_R2_Anion
3,3,NH2,1,-1.85,NH2_R1_neutral,NH2_R1_anion
4,4,NH2,2,-1.92,NH2_R2_neutral,NH2_R2_anion
...,...,...,...,...,...,...
180,185,NO2,1 2 6 7,-0.35,NO2_R1267_neutral,NO2_R1267_anion
181,186,NO2,1 4 6 9,-0.52,NO2_R1469_neutral,NO2_R1469_anion
182,187,NO2,2 3 7 8,-0.52,NO2_R2378_neutral,NO2_R2378_anion
183,188,NO2,1 2 3 4 6 9,0.16,NO2_R123469_neutral,NO2_R123469_anion


In [10]:
from rdkit import Chem
# Parse the neutral-state .mol file of every row into an RDKit molecule.
# NOTE(review): MolFromMolFile is called with its defaults; in RDKit that is
# expected to strip explicit hydrogens — confirm this is intended for the 3D
# descriptors computed later.
mols=[]
unreadable=[]
for filename in data['mol_filename_neutral']:
    mol=Chem.MolFromMolFile(f'DATA_molecules/neutral/{filename}.mol')
    if mol is None:
        # RDKit signals a parse/sanitisation failure by returning None.
        unreadable.append(filename)
    mols.append(mol)
# Hide the per-file RDKit log output, as before ...
clear_output()
# ... but fail here with the offending names instead of storing None and
# crashing later inside a descriptor function.
if unreadable:
    raise ValueError(f'RDKit could not parse: {unreadable}')
data['rdkit_mol']=mols
data

Unnamed: 0,Sr. No.,FG_name,FG Position,Redox Potential,mol_filename_neutral,mol_filename_anion,rdkit_mol
0,Phenazine,-,-,-1.74,PHENAZINE_neutral,PHENAZINE_anion,<rdkit.Chem.rdchem.Mol object at 0x7f561a50c300>
1,1,2CH3N,1,-1.85,2CH3N_R1_neutral,2CH3N_R1_Anion,<rdkit.Chem.rdchem.Mol object at 0x7f561a95f3f0>
2,2,2CH3N,2,-1.98,2CH3N_R2_neutral,2CH3N_R2_Anion,<rdkit.Chem.rdchem.Mol object at 0x7f561a50c3a0>
3,3,NH2,1,-1.85,NH2_R1_neutral,NH2_R1_anion,<rdkit.Chem.rdchem.Mol object at 0x7f561a9413a0>
4,4,NH2,2,-1.92,NH2_R2_neutral,NH2_R2_anion,<rdkit.Chem.rdchem.Mol object at 0x7f561a50c350>
...,...,...,...,...,...,...,...
180,185,NO2,1 2 6 7,-0.35,NO2_R1267_neutral,NO2_R1267_anion,<rdkit.Chem.rdchem.Mol object at 0x7f561a514ad0>
181,186,NO2,1 4 6 9,-0.52,NO2_R1469_neutral,NO2_R1469_anion,<rdkit.Chem.rdchem.Mol object at 0x7f561a514b20>
182,187,NO2,2 3 7 8,-0.52,NO2_R2378_neutral,NO2_R2378_anion,<rdkit.Chem.rdchem.Mol object at 0x7f561a514b70>
183,188,NO2,1 2 3 4 6 9,0.16,NO2_R123469_neutral,NO2_R123469_anion,<rdkit.Chem.rdchem.Mol object at 0x7f561a514bc0>


# Calculate Descriptors

## Basic Descriptors

In [11]:
# data_basic=data.copy()

In [12]:
# from sklearn.preprocessing import LabelEncoder
# # Removing: /home/vanka/anaconda2/share/jupyter/nbextensions/jupyter-js-widgets
# # https://github.com/jupyter-widgets/ipywidgets/issues/1146#issuecomment-287964101

In [13]:
from sklearn.preprocessing import LabelEncoder

# Integer code per functional-group label.
data['FG_no_2d_basic']=LabelEncoder().fit_transform(data['FG_name'])

# One 0/1 indicator vector per ring position 1..9 (one entry per row of `data`).
pos_dict={i+1:np.zeros(data.shape[0]) for i in range(9)}
# enumerate() gives the positional row number, which is what indexing the NumPy
# vectors needs; the DataFrame index label only coincides with it after a reset_index.
for row_pos,pos_string in enumerate(data['FG Position']):
    if pos_string=='-':
        # Unsubstituted parent compound: no position is set.
        continue
    for token in pos_string.split():
        # Every position is a single digit, so a token whose spaces were lost
        # ('27', '28', '146') is split digit by digit. This replaces the
        # hand-written special cases, one of which mapped '28' to '2 9' —
        # treated here as a typo for '2 8'.
        # NOTE(review): confirm '28' was not deliberately recoded to positions 2 and 9.
        for digit in token:
            pos_dict[int(digit)][row_pos]=1
for k,v in pos_dict.items():
    data[f'FG_position_{k}_2d_basic']=v
data

Unnamed: 0,Sr. No.,FG_name,FG Position,Redox Potential,mol_filename_neutral,mol_filename_anion,rdkit_mol,FG_no_2d_basic,FG_position_1_2d_basic,FG_position_2_2d_basic,FG_position_3_2d_basic,FG_position_4_2d_basic,FG_position_5_2d_basic,FG_position_6_2d_basic,FG_position_7_2d_basic,FG_position_8_2d_basic,FG_position_9_2d_basic
0,Phenazine,-,-,-1.74,PHENAZINE_neutral,PHENAZINE_anion,<rdkit.Chem.rdchem.Mol object at 0x7f561a50c300>,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2CH3N,1,-1.85,2CH3N_R1_neutral,2CH3N_R1_Anion,<rdkit.Chem.rdchem.Mol object at 0x7f561a95f3f0>,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,2CH3N,2,-1.98,2CH3N_R2_neutral,2CH3N_R2_Anion,<rdkit.Chem.rdchem.Mol object at 0x7f561a50c3a0>,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,NH2,1,-1.85,NH2_R1_neutral,NH2_R1_anion,<rdkit.Chem.rdchem.Mol object at 0x7f561a9413a0>,14,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,NH2,2,-1.92,NH2_R2_neutral,NH2_R2_anion,<rdkit.Chem.rdchem.Mol object at 0x7f561a50c350>,14,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,185,NO2,1 2 6 7,-0.35,NO2_R1267_neutral,NO2_R1267_anion,<rdkit.Chem.rdchem.Mol object at 0x7f561a514ad0>,15,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
181,186,NO2,1 4 6 9,-0.52,NO2_R1469_neutral,NO2_R1469_anion,<rdkit.Chem.rdchem.Mol object at 0x7f561a514b20>,15,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
182,187,NO2,2 3 7 8,-0.52,NO2_R2378_neutral,NO2_R2378_anion,<rdkit.Chem.rdchem.Mol object at 0x7f561a514b70>,15,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
183,188,NO2,1 2 3 4 6 9,0.16,NO2_R123469_neutral,NO2_R123469_anion,<rdkit.Chem.rdchem.Mol object at 0x7f561a514bc0>,15,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0


In [14]:
# Drop every column that contains a missing value. This also removes
# `mol_filename_anion`, which is NaN for rows whose anion structure was not matched.
data=data.dropna(axis='columns')
# Keep a column only if at least one row differs from the first row,
# i.e. discard columns that are constant over the whole data set.
reference_row=data.iloc[0]
column_varies=(data != reference_row).any()
data=data.loc[:,column_varies]
data

Unnamed: 0,Sr. No.,FG_name,FG Position,Redox Potential,mol_filename_neutral,rdkit_mol,FG_no_2d_basic,FG_position_1_2d_basic,FG_position_2_2d_basic,FG_position_3_2d_basic,FG_position_4_2d_basic,FG_position_6_2d_basic,FG_position_7_2d_basic,FG_position_8_2d_basic,FG_position_9_2d_basic
0,Phenazine,-,-,-1.74,PHENAZINE_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a50c300>,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2CH3N,1,-1.85,2CH3N_R1_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a95f3f0>,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,2CH3N,2,-1.98,2CH3N_R2_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a50c3a0>,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,NH2,1,-1.85,NH2_R1_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a9413a0>,14,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,NH2,2,-1.92,NH2_R2_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a50c350>,14,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,185,NO2,1 2 6 7,-0.35,NO2_R1267_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a514ad0>,15,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
181,186,NO2,1 4 6 9,-0.52,NO2_R1469_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a514b20>,15,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
182,187,NO2,2 3 7 8,-0.52,NO2_R2378_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a514b70>,15,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0
183,188,NO2,1 2 3 4 6 9,0.16,NO2_R123469_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a514bc0>,15,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0


In [15]:
# data_basic.to_csv('DATA_basic.csv',index=False)

## RDKit Descriptors

<img src="https://pubchem.ncbi.nlm.nih.gov/image/imgsrv.fcgi?cid=4757&t=l" width=200px height=200px>

In [16]:
# data_rdkit=data.copy()

In [17]:
from rdkit import Chem
from rdkit.Chem import Descriptors
from IPython.display import clear_output
from rdkit.Chem.Descriptors3D import *
from rdkit.Chem.rdMolDescriptors import *

# Reuse the molecules already parsed into data['rdkit_mol'] instead of reading
# every .mol file again; these are the objects the descriptor loops operate on.
mols=list(data['rdkit_mol'])

# Scalar descriptors: every (name, function) entry of Descriptors.descList,
# plus a fixed set of 3D shape descriptors. Suffixes mark the feature family.
rdkit_desc_2d_list=[(desc_name+'_2d_rdkit',desc_func) for desc_name,desc_func in Descriptors.descList]
rdkit_desc_3d_list=[('Asphericity_3d_rdkit',Asphericity),
            ('Eccentricity_3d_rdkit',Eccentricity),
            ('InertialShapeFactor_3d_rdkit',InertialShapeFactor),
            ('NPR1_3d_rdkit',NPR1),
            ('NPR2_3d_rdkit',NPR2),
            ('PMI1_3d_rdkit',PMI1),
            ('PMI2_3d_rdkit',PMI2),
            ('PMI3_3d_rdkit',PMI3),
            ('RadiusOfGyration_3d_rdkit',RadiusOfGyration),
            ('SpherocityIndex_3d_rdkit',SpherocityIndex),
           ]
# Vector descriptors: each function returns a sequence of values per molecule,
# which is spread over numbered columns "<name>_<i>".
rdkit_vec_desc_3d_list=[('Autocorr3D_3d_rdkit',CalcAUTOCORR3D),
                        ('RDF_3d_rdkit',CalcRDF),
                        ('MORSE_3d_rdkit',CalcMORSE),
                        ('WHIM_3d_rdkit',CalcWHIM),
                        ('GETAWAY_3d_rdkit',CalcGETAWAY)
                       ]

desc_list=rdkit_desc_2d_list+rdkit_desc_3d_list

# Collect all new columns first and attach them in one go; inserting more than
# a thousand columns one at a time fragments the DataFrame.
new_columns={}
for desc_name,desc_func in desc_list:
    new_columns[desc_name]=[desc_func(mol) for mol in mols]

for desc_name,desc_func in rdkit_vec_desc_3d_list:
    vec_array=np.array([desc_func(mol) for mol in mols])
    for i in range(vec_array.shape[1]):
        new_columns[f'{desc_name}_{i}']=vec_array[:,i]

descriptor_frame=pd.DataFrame(new_columns,index=data.index)
# Drop same-named columns left over from a previous run of this cell so a
# re-run replaces them instead of creating duplicate column labels.
stale_columns=[col for col in descriptor_frame.columns if col in data.columns]
data=pd.concat([data.drop(columns=stale_columns),descriptor_frame],axis=1)

clear_output()
data

Unnamed: 0,Sr. No.,FG_name,FG Position,Redox Potential,mol_filename_neutral,rdkit_mol,FG_no_2d_basic,FG_position_1_2d_basic,FG_position_2_2d_basic,FG_position_3_2d_basic,...,GETAWAY_3d_rdkit_263,GETAWAY_3d_rdkit_264,GETAWAY_3d_rdkit_265,GETAWAY_3d_rdkit_266,GETAWAY_3d_rdkit_267,GETAWAY_3d_rdkit_268,GETAWAY_3d_rdkit_269,GETAWAY_3d_rdkit_270,GETAWAY_3d_rdkit_271,GETAWAY_3d_rdkit_272
0,Phenazine,-,-,-1.74,PHENAZINE_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a50c300>,0,0.0,0.0,0.0,...,95.523,2.268,1.277,1.107,0.655,0.564,0.491,0.423,0.715,2.268
1,1,2CH3N,1,-1.85,2CH3N_R1_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a95f3f0>,1,1.0,0.0,0.0,...,80.075,2.264,1.081,0.811,0.533,0.490,0.494,0.438,0.224,2.264
2,2,2CH3N,2,-1.98,2CH3N_R2_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a50c3a0>,1,0.0,1.0,0.0,...,80.144,2.259,1.118,0.946,0.558,0.565,0.501,0.392,0.180,2.259
3,3,NH2,1,-1.85,NH2_R1_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a9413a0>,14,1.0,0.0,0.0,...,102.753,2.938,1.559,1.447,0.728,0.679,0.685,0.551,0.016,2.938
4,4,NH2,2,-1.92,NH2_R2_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a50c350>,14,0.0,1.0,0.0,...,95.078,2.317,1.357,1.229,0.725,0.616,0.539,0.416,0.196,2.317
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,185,NO2,1 2 6 7,-0.35,NO2_R1267_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a514ad0>,15,1.0,1.0,0.0,...,293.011,2.053,8.363,1.268,1.200,3.194,1.012,0.833,0.362,8.363
181,186,NO2,1 4 6 9,-0.52,NO2_R1469_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a514b20>,15,1.0,0.0,0.0,...,300.798,1.865,5.524,1.362,0.929,0.278,0.359,1.951,3.113,5.524
182,187,NO2,2 3 7 8,-0.52,NO2_R2378_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a514b70>,15,0.0,1.0,1.0,...,291.371,2.053,5.734,1.052,0.879,5.253,0.361,0.434,0.417,5.734
183,188,NO2,1 2 3 4 6 9,0.16,NO2_R123469_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a514bc0>,15,1.0,1.0,1.0,...,386.683,1.620,4.211,1.107,0.767,2.823,1.580,1.453,2.320,4.211


In [18]:
# Same clean-up as after the basic features: first remove descriptor columns
# containing any missing value ...
data=data.dropna(axis='columns')
# ... then remove columns whose value never differs from the first row (constant columns).
first_row=data.iloc[0]
is_informative=(data != first_row).any()
data=data.loc[:,is_informative]
data

Unnamed: 0,Sr. No.,FG_name,FG Position,Redox Potential,mol_filename_neutral,rdkit_mol,FG_no_2d_basic,FG_position_1_2d_basic,FG_position_2_2d_basic,FG_position_3_2d_basic,...,GETAWAY_3d_rdkit_263,GETAWAY_3d_rdkit_264,GETAWAY_3d_rdkit_265,GETAWAY_3d_rdkit_266,GETAWAY_3d_rdkit_267,GETAWAY_3d_rdkit_268,GETAWAY_3d_rdkit_269,GETAWAY_3d_rdkit_270,GETAWAY_3d_rdkit_271,GETAWAY_3d_rdkit_272
0,Phenazine,-,-,-1.74,PHENAZINE_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a50c300>,0,0.0,0.0,0.0,...,95.523,2.268,1.277,1.107,0.655,0.564,0.491,0.423,0.715,2.268
1,1,2CH3N,1,-1.85,2CH3N_R1_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a95f3f0>,1,1.0,0.0,0.0,...,80.075,2.264,1.081,0.811,0.533,0.490,0.494,0.438,0.224,2.264
2,2,2CH3N,2,-1.98,2CH3N_R2_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a50c3a0>,1,0.0,1.0,0.0,...,80.144,2.259,1.118,0.946,0.558,0.565,0.501,0.392,0.180,2.259
3,3,NH2,1,-1.85,NH2_R1_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a9413a0>,14,1.0,0.0,0.0,...,102.753,2.938,1.559,1.447,0.728,0.679,0.685,0.551,0.016,2.938
4,4,NH2,2,-1.92,NH2_R2_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a50c350>,14,0.0,1.0,0.0,...,95.078,2.317,1.357,1.229,0.725,0.616,0.539,0.416,0.196,2.317
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,185,NO2,1 2 6 7,-0.35,NO2_R1267_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a514ad0>,15,1.0,1.0,0.0,...,293.011,2.053,8.363,1.268,1.200,3.194,1.012,0.833,0.362,8.363
181,186,NO2,1 4 6 9,-0.52,NO2_R1469_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a514b20>,15,1.0,0.0,0.0,...,300.798,1.865,5.524,1.362,0.929,0.278,0.359,1.951,3.113,5.524
182,187,NO2,2 3 7 8,-0.52,NO2_R2378_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a514b70>,15,0.0,1.0,1.0,...,291.371,2.053,5.734,1.052,0.879,5.253,0.361,0.434,0.417,5.734
183,188,NO2,1 2 3 4 6 9,0.16,NO2_R123469_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a514bc0>,15,1.0,1.0,1.0,...,386.683,1.620,4.211,1.107,0.767,2.823,1.580,1.453,2.320,4.211


In [19]:
# data.to_csv('data.csv',index=False)

## Mordred Descriptors

In [20]:
# data_mordred=data.copy()

In [21]:
from collections import defaultdict
from mordred import Calculator, descriptors

# Fresh RDKit molecules for this section; `mols` is also what the fingerprint
# cells further down featurize.
mols=[Chem.MolFromMolFile(f'DATA_molecules/neutral/{filename}.mol')
      for filename in data['mol_filename_neutral']]
clear_output()

# A descriptor is labelled "3d" if it only appears when 3D descriptors are enabled.
calc_2d = Calculator(descriptors, ignore_3D=True)
calc_2d_3d = Calculator(descriptors, ignore_3D=False)
mordred_desc_2d_3d_list=list(map(str,calc_2d_3d.descriptors))
mordred_desc_2d_list=list(map(str,calc_2d.descriptors))
mordred_desc_3d_list=list(set(mordred_desc_2d_3d_list)-set(mordred_desc_2d_list))

# Sets give constant-time membership tests; the lists above are kept for the
# summary printed at the end of the cell.
names_2d=set(mordred_desc_2d_list)
names_3d=set(mordred_desc_3d_list)

# descriptor column name -> one value per molecule, in row order.
desc_dict=defaultdict(list)
# Iterate over the molecules loaded above instead of re-reading each file.
for mol in tqdm(mols):
    d=calc_2d_3d(mol)
    # fill_missing() substitutes a fill value (NaN is what the filter below
    # looks for) for descriptors that could not be computed.
    for k,v in d.fill_missing().items():
        name=str(k)
        if name in names_2d:
            desc_dict[name+'_2d_mordred'].append(v)
        elif name in names_3d:
            desc_dict[name+'_3d_mordred'].append(v)
        else:
            print('Error')

# Keep only descriptors that were computed for every molecule and attach them
# with a single concat rather than one column insert per descriptor.
complete_columns={k:v for k,v in desc_dict.items() if not np.any(np.isnan(v))}
mordred_frame=pd.DataFrame(complete_columns,index=data.index)
# Drop same-named columns from a previous run so a re-run does not duplicate labels.
stale_columns=[col for col in mordred_frame.columns if col in data.columns]
data=pd.concat([data.drop(columns=stale_columns),mordred_frame],axis=1)

clear_output()
# Descriptor counts: all, 2D-only, 3D-only.
print(len(mordred_desc_2d_3d_list))
print(len(mordred_desc_2d_list))
print(len(mordred_desc_3d_list))
data

1826
1613
213


Unnamed: 0,Sr. No.,FG_name,FG Position,Redox Potential,mol_filename_neutral,rdkit_mol,FG_no_2d_basic,FG_position_1_2d_basic,FG_position_2_2d_basic,FG_position_3_2d_basic,...,SRW10_2d_mordred,TSRW10_2d_mordred,MW_2d_mordred,AMW_2d_mordred,WPath_2d_mordred,WPol_2d_mordred,Zagreb1_2d_mordred,Zagreb2_2d_mordred,mZagreb1_2d_mordred,mZagreb2_2d_mordred
0,Phenazine,-,-,-1.74,PHENAZINE_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a50c300>,0,0.0,0.0,0.0,...,9.657139,46.247824,180.068748,8.184943,279,21,76.0,90.0,2.944444,3.055556
1,1,2CH3N,1,-1.85,2CH3N_R1_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a95f3f0>,1,1.0,0.0,0.0,...,9.903937,50.295123,223.110947,7.437032,476,28,92.0,110.0,4.916667,3.694444
2,2,2CH3N,2,-1.98,2CH3N_R2_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a50c3a0>,1,0.0,1.0,0.0,...,9.860006,50.217934,223.110947,7.437032,500,27,92.0,109.0,4.916667,3.666667
3,3,NH2,1,-1.85,NH2_R1_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a9413a0>,14,1.0,0.0,0.0,...,9.783859,47.718582,195.079647,8.128319,334,24,82.0,98.0,3.805556,3.250000
4,4,NH2,2,-1.92,NH2_R2_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a50c350>,14,0.0,1.0,0.0,...,9.748295,47.650681,195.079647,8.128319,342,23,82.0,97.0,3.805556,3.222222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,185,NO2,1 2 6 7,-0.35,NO2_R1267_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a514ad0>,15,1.0,1.0,0.0,...,10.416820,61.544380,360.009061,12.000302,1529,49,140.0,170.0,10.833333,5.611111
181,186,NO2,1 4 6 9,-0.52,NO2_R1469_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a514b20>,15,1.0,0.0,0.0,...,10.410817,61.535641,360.009061,12.000302,1463,49,140.0,170.0,10.833333,5.611111
182,187,NO2,2 3 7 8,-0.52,NO2_R2378_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a514b70>,15,0.0,1.0,1.0,...,10.362715,61.448683,360.009061,12.000302,1595,47,140.0,168.0,10.833333,5.555556
183,188,NO2,1 2 3 4 6 9,0.16,NO2_R123469_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a514bc0>,15,1.0,1.0,1.0,...,10.682560,68.670486,449.979218,13.234683,2436,64,172.0,211.0,14.777778,6.916667


In [22]:
# Post-Mordred clean-up: discard columns with missing values ...
data=data.dropna(axis='columns')
# ... and columns in which every row equals the first row (no variation across molecules).
baseline_row=data.iloc[0]
has_variation=(data != baseline_row).any()
data=data.loc[:,has_variation]
data

Unnamed: 0,Sr. No.,FG_name,FG Position,Redox Potential,mol_filename_neutral,rdkit_mol,FG_no_2d_basic,FG_position_1_2d_basic,FG_position_2_2d_basic,FG_position_3_2d_basic,...,SRW10_2d_mordred,TSRW10_2d_mordred,MW_2d_mordred,AMW_2d_mordred,WPath_2d_mordred,WPol_2d_mordred,Zagreb1_2d_mordred,Zagreb2_2d_mordred,mZagreb1_2d_mordred,mZagreb2_2d_mordred
0,Phenazine,-,-,-1.74,PHENAZINE_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a50c300>,0,0.0,0.0,0.0,...,9.657139,46.247824,180.068748,8.184943,279,21,76.0,90.0,2.944444,3.055556
1,1,2CH3N,1,-1.85,2CH3N_R1_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a95f3f0>,1,1.0,0.0,0.0,...,9.903937,50.295123,223.110947,7.437032,476,28,92.0,110.0,4.916667,3.694444
2,2,2CH3N,2,-1.98,2CH3N_R2_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a50c3a0>,1,0.0,1.0,0.0,...,9.860006,50.217934,223.110947,7.437032,500,27,92.0,109.0,4.916667,3.666667
3,3,NH2,1,-1.85,NH2_R1_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a9413a0>,14,1.0,0.0,0.0,...,9.783859,47.718582,195.079647,8.128319,334,24,82.0,98.0,3.805556,3.250000
4,4,NH2,2,-1.92,NH2_R2_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a50c350>,14,0.0,1.0,0.0,...,9.748295,47.650681,195.079647,8.128319,342,23,82.0,97.0,3.805556,3.222222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,185,NO2,1 2 6 7,-0.35,NO2_R1267_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a514ad0>,15,1.0,1.0,0.0,...,10.416820,61.544380,360.009061,12.000302,1529,49,140.0,170.0,10.833333,5.611111
181,186,NO2,1 4 6 9,-0.52,NO2_R1469_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a514b20>,15,1.0,0.0,0.0,...,10.410817,61.535641,360.009061,12.000302,1463,49,140.0,170.0,10.833333,5.611111
182,187,NO2,2 3 7 8,-0.52,NO2_R2378_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a514b70>,15,0.0,1.0,1.0,...,10.362715,61.448683,360.009061,12.000302,1595,47,140.0,168.0,10.833333,5.555556
183,188,NO2,1 2 3 4 6 9,0.16,NO2_R123469_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a514bc0>,15,1.0,1.0,1.0,...,10.682560,68.670486,449.979218,13.234683,2436,64,172.0,211.0,14.777778,6.916667


In [23]:
# from mordred import Calculator, descriptors

# calc = Calculator(descriptors, ignore_3D=True)


# filename=data.iloc[1,:]['mol_filename_neutral']
# mol=Chem.MolFromMolFile(f'DATA_molecules/neutral/{filename}.mol')
# len(calc(mol))
# # calc.pandas(mol)
# # clear_output()

# calc = Calculator(descriptors, ignore_3D=False)
# for index,row in data.iterrows():
#     filename=row['mol_filename_neutral']
#     print(filename)
#     mol=Chem.MolFromMolFile(f'DATA_molecules/neutral/{filename}.mol')
#     d=calc(mol)
#     print(len(d))

# calc.descriptors[0]

# d[-1]

# calc2d = Calculator(descriptors, ignore_3D=True)
# calc3d = Calculator(descriptors, ignore_3D=False)

# set(calc3d.descriptors)-set(calc2d.descriptors)

# from rdkit.Chem import Descriptors3D
# from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator

# descList=[d[0] for d in Descriptors.descList ]
# calc=MolecularDescriptorCalculator(descList)
# calc.CalcDescriptors(m)
# #
# list(rdMolDescriptors.Properties().GetPropertyNames())

# Fingerprints

In [24]:
import deepchem as dc

In [25]:
# Circular fingerprint folded to 1024 entries, one output column per entry.
# NOTE(review): the columns are labelled "ecfp4", but ECFP4 conventionally
# means diameter 4 (radius 2) while radius=4 is requested here — confirm which
# one is intended before changing either the radius or the label.
featurizer = dc.feat.CircularFingerprint(radius=4,size=1024)
ecfp = featurizer.featurize(mols)
n_ecfp_bits=ecfp.shape[1]
for bit_index in range(n_ecfp_bits):
    column_name=f'ecfp4_fp_{bit_index}'
    data[column_name]=ecfp[:,bit_index]
data

Unnamed: 0,Sr. No.,FG_name,FG Position,Redox Potential,mol_filename_neutral,rdkit_mol,FG_no_2d_basic,FG_position_1_2d_basic,FG_position_2_2d_basic,FG_position_3_2d_basic,...,ecfp4_fp_1014,ecfp4_fp_1015,ecfp4_fp_1016,ecfp4_fp_1017,ecfp4_fp_1018,ecfp4_fp_1019,ecfp4_fp_1020,ecfp4_fp_1021,ecfp4_fp_1022,ecfp4_fp_1023
0,Phenazine,-,-,-1.74,PHENAZINE_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a50c300>,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2CH3N,1,-1.85,2CH3N_R1_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a95f3f0>,1,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,2CH3N,2,-1.98,2CH3N_R2_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a50c3a0>,1,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,NH2,1,-1.85,NH2_R1_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a9413a0>,14,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,NH2,2,-1.92,NH2_R2_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a50c350>,14,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,185,NO2,1 2 6 7,-0.35,NO2_R1267_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a514ad0>,15,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
181,186,NO2,1 4 6 9,-0.52,NO2_R1469_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a514b20>,15,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
182,187,NO2,2 3 7 8,-0.52,NO2_R2378_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a514b70>,15,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
183,188,NO2,1 2 3 4 6 9,0.16,NO2_R123469_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a514bc0>,15,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# MACCS keys for every molecule; each key becomes its own `maccs_keys_fp_<j>` column.
featurizer = dc.feat.MACCSKeysFingerprint()
maccs_keys = featurizer.featurize(mols)
n_maccs_keys=maccs_keys.shape[1]
for key_index in range(n_maccs_keys):
    column_name=f'maccs_keys_fp_{key_index}'
    data[column_name]=maccs_keys[:,key_index]
data

Unnamed: 0,Sr. No.,FG_name,FG Position,Redox Potential,mol_filename_neutral,rdkit_mol,FG_no_2d_basic,FG_position_1_2d_basic,FG_position_2_2d_basic,FG_position_3_2d_basic,...,maccs_keys_fp_157,maccs_keys_fp_158,maccs_keys_fp_159,maccs_keys_fp_160,maccs_keys_fp_161,maccs_keys_fp_162,maccs_keys_fp_163,maccs_keys_fp_164,maccs_keys_fp_165,maccs_keys_fp_166
0,Phenazine,-,-,-1.74,PHENAZINE_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a50c300>,0,0.0,0.0,0.0,...,0,0,0,0,1,1,1,0,1,0
1,1,2CH3N,1,-1.85,2CH3N_R1_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a95f3f0>,1,1.0,0.0,0.0,...,0,1,0,1,1,1,1,0,1,0
2,2,2CH3N,2,-1.98,2CH3N_R2_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a50c3a0>,1,0.0,1.0,0.0,...,0,1,0,1,1,1,1,0,1,0
3,3,NH2,1,-1.85,NH2_R1_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a9413a0>,14,1.0,0.0,0.0,...,0,1,0,0,1,1,1,0,1,0
4,4,NH2,2,-1.92,NH2_R2_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a50c350>,14,0.0,1.0,0.0,...,0,1,0,0,1,1,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,185,NO2,1 2 6 7,-0.35,NO2_R1267_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a514ad0>,15,1.0,1.0,0.0,...,0,1,1,0,1,1,1,1,1,0
181,186,NO2,1 4 6 9,-0.52,NO2_R1469_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a514b20>,15,1.0,0.0,0.0,...,0,1,1,0,1,1,1,1,1,0
182,187,NO2,2 3 7 8,-0.52,NO2_R2378_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a514b70>,15,0.0,1.0,1.0,...,0,1,1,0,1,1,1,1,1,0
183,188,NO2,1 2 3 4 6 9,0.16,NO2_R123469_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a514bc0>,15,1.0,1.0,1.0,...,0,1,1,0,1,1,1,1,1,0


In [27]:
# RDKit fingerprint (default settings) per molecule. The stacked result is
# indexed as a 2-D array below: one row per molecule, one column per bit.
rdkit_fingerprint=np.array(list(map(Chem.RDKFingerprint,mols)))
n_rdkit_bits=rdkit_fingerprint.shape[1]
for bit_index in range(n_rdkit_bits):
    column_name=f'rdkit_fp_{bit_index}'
    data[column_name]=rdkit_fingerprint[:,bit_index]
data

Unnamed: 0,Sr. No.,FG_name,FG Position,Redox Potential,mol_filename_neutral,rdkit_mol,FG_no_2d_basic,FG_position_1_2d_basic,FG_position_2_2d_basic,FG_position_3_2d_basic,...,rdkit_fp_2038,rdkit_fp_2039,rdkit_fp_2040,rdkit_fp_2041,rdkit_fp_2042,rdkit_fp_2043,rdkit_fp_2044,rdkit_fp_2045,rdkit_fp_2046,rdkit_fp_2047
0,Phenazine,-,-,-1.74,PHENAZINE_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a50c300>,0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
1,1,2CH3N,1,-1.85,2CH3N_R1_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a95f3f0>,1,1.0,0.0,0.0,...,0,0,0,1,1,0,1,0,0,1
2,2,2CH3N,2,-1.98,2CH3N_R2_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a50c3a0>,1,0.0,1.0,0.0,...,0,0,0,1,0,0,0,0,0,1
3,3,NH2,1,-1.85,NH2_R1_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a9413a0>,14,1.0,0.0,0.0,...,0,0,0,1,1,0,1,0,0,0
4,4,NH2,2,-1.92,NH2_R2_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a50c350>,14,0.0,1.0,0.0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,185,NO2,1 2 6 7,-0.35,NO2_R1267_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a514ad0>,15,1.0,1.0,0.0,...,0,0,0,1,1,0,1,0,0,0
181,186,NO2,1 4 6 9,-0.52,NO2_R1469_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a514b20>,15,1.0,0.0,0.0,...,1,0,0,1,1,0,1,0,0,0
182,187,NO2,2 3 7 8,-0.52,NO2_R2378_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a514b70>,15,0.0,1.0,1.0,...,0,0,0,1,0,0,0,0,0,0
183,188,NO2,1 2 3 4 6 9,0.16,NO2_R123469_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a514bc0>,15,1.0,1.0,1.0,...,1,0,0,1,1,0,1,0,0,0


In [28]:
# Final clean-up after the fingerprint blocks: remove columns with missing values ...
data=data.dropna(axis='columns')
# ... and fingerprint bits (or any other column) that have the same value for every molecule.
template_row=data.iloc[0]
differs_somewhere=(data != template_row).any()
data=data.loc[:,differs_somewhere]
data

Unnamed: 0,Sr. No.,FG_name,FG Position,Redox Potential,mol_filename_neutral,rdkit_mol,FG_no_2d_basic,FG_position_1_2d_basic,FG_position_2_2d_basic,FG_position_3_2d_basic,...,rdkit_fp_2035,rdkit_fp_2036,rdkit_fp_2037,rdkit_fp_2038,rdkit_fp_2039,rdkit_fp_2042,rdkit_fp_2043,rdkit_fp_2044,rdkit_fp_2046,rdkit_fp_2047
0,Phenazine,-,-,-1.74,PHENAZINE_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a50c300>,0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,1,2CH3N,1,-1.85,2CH3N_R1_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a95f3f0>,1,1.0,0.0,0.0,...,0,0,1,0,0,1,0,1,0,1
2,2,2CH3N,2,-1.98,2CH3N_R2_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a50c3a0>,1,0.0,1.0,0.0,...,0,0,1,0,0,0,0,0,0,1
3,3,NH2,1,-1.85,NH2_R1_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a9413a0>,14,1.0,0.0,0.0,...,0,0,1,0,0,1,0,1,0,0
4,4,NH2,2,-1.92,NH2_R2_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a50c350>,14,0.0,1.0,0.0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,185,NO2,1 2 6 7,-0.35,NO2_R1267_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a514ad0>,15,1.0,1.0,0.0,...,0,0,1,0,0,1,0,1,0,0
181,186,NO2,1 4 6 9,-0.52,NO2_R1469_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a514b20>,15,1.0,0.0,0.0,...,1,0,1,1,0,1,0,1,0,0
182,187,NO2,2 3 7 8,-0.52,NO2_R2378_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a514b70>,15,0.0,1.0,1.0,...,0,0,1,0,0,0,0,0,0,0
183,188,NO2,1 2 3 4 6 9,0.16,NO2_R123469_neutral,<rdkit.Chem.rdchem.Mol object at 0x7f561a514bc0>,15,1.0,1.0,1.0,...,1,0,1,1,0,1,0,1,0,0


In [29]:
# Number of surviving columns per feature family, identified by the tag
# embedded in the column names. Headings carry their own leading newline so the
# printed layout stays unchanged.
feature_families=[
    ('basic',['2d_basic']),
    ('\nrdkit',['2d_rdkit','3d_rdkit']),
    ('\nmordred',['2d_mordred','3d_mordred']),
    ('\nfingerprint',['ecfp4_fp','maccs_keys_fp','rdkit_fp']),
]
for heading,name_tags in feature_families:
    print(heading)
    for name_tag in name_tags:
        print(sum(1 for col in data.columns if name_tag in col))

basic
9

rdkit
142
869

mordred
1153
56

fingerprint
792
67
1631


In [30]:
import pickle

# Persist the complete feature table (it still contains the `rdkit_mol` object
# column) as a pickle next to the notebook.
pickle_path='DATA.pkl'
with open(pickle_path,mode='wb') as outfile:
    pickle.dump(data,outfile)

In [31]:
# from rdkit.Chem.AtomPairs import Pairs

# filename=data.iloc[1,:]['mol_filename_neutral']
# mol=Chem.MolFromMolFile(f'DATA_molecules/neutral/{filename}.mol')
# len(list((Pairs.GetAtomPairFingerprint(mol))))

In [32]:
# from rdkit.Chem.AtomPairs import Pairs

# filename=data.iloc[70,:]['mol_filename_neutral']
# mol=Chem.MolFromMolFile(f'DATA_molecules/neutral/{filename}.mol')
# len(list((Pairs.GetAtomPairFingerprint(mol))))

In [33]:
# from rdkit.Chem.AtomPairs import Pairs

# filename=data.iloc[70,:]['mol_filename_neutral']
# mol=Chem.MolFromMolFile(f'DATA_molecules/neutral/{filename}.mol')
# len(list((Chem.RDKFingerprint(mol))))

In [34]:
# filename=data.iloc[1,:]['mol_filename_neutral']
# mol=Chem.MolFromMolFile(f'DATA_molecules/neutral/{filename}.mol')
# featurizer = dc.feat.CircularFingerprint(size=1024)
# ecfp = featurizer.featurize(mols)
# ecfp.shape