In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

# Extract Data

In [2]:
import tabula
supp_file_path='d0se00687d1.pdf'
# The keyword tabula-py understands is `multiple_tables`; the previous spelling
# (`multiple_datas`) is not a tabula option, so the request was never applied.
extracted = tabula.read_pdf(supp_file_path, pages = "7-10", multiple_tables = False)
# NOTE(review): depending on the installed tabula-py release the result is either
# a DataFrame or a list of DataFrames; both shapes are accepted here.
data = pd.concat(extracted, ignore_index=True) if isinstance(extracted, list) else extracted
data.columns=['Sr. No.','FG','FG Position', 'Redox Potential']

def mapFG(x):
    """Return the functional-group label used for file matching.

    'N(CH3)2' is rewritten to '2CH3N'; every other value is returned unchanged.
    """
    return '2CH3N' if x=='N(CH3)2' else x

data['FG']=data['FG'].map(mapFG)
# Rebind instead of renaming in place so re-running the cell stays predictable.
data=data.rename(columns={'FG':'FG_name'})
data

Unnamed: 0,Sr. No.,FG_name,FG Position,Redox Potential
0,Phenazine,-,-,-1.74
1,1,2CH3N,1,-1.85
2,2,2CH3N,2,-1.98
3,3,NH2,1,-1.85
4,4,NH2,2,-1.92
...,...,...,...,...
184,185,NO2,1 2 6 7,-0.35
185,186,NO2,1 4 6 9,-0.52
186,187,NO2,2 3 7 8,-0.52
187,188,NO2,1 2 3 4 6 9,0.16


In [3]:
cords_file_path='SI_coordinates_vs1_20200617.txt'
# Read through a context manager so the file handle is closed deterministically.
with open(cords_file_path,'r') as cords_file:
    lines=cords_file.readlines()

# Map "<NAME>_<state>" -> list of tab-separated atom rows.
mol_dict_xyz={}
key=None
for line in lines:
    lowered=line.lower()
    if 'neutral' in lowered or 'anion' in lowered:
        # A header line starts a new molecule; its whitespace-separated words
        # are joined with underscores to form the dictionary key.
        key='_'.join(line.strip().split())
        mol_dict_xyz[key]=[]
    elif not line.strip():
        # Skip blank AND whitespace-only lines so they are never counted as atoms.
        continue
    elif key is None:
        # Coordinates before any header cannot be attributed to a molecule.
        raise ValueError(f'Coordinate row found before any molecule header: {line!r}')
    else:
        mol_dict_xyz[key].append('\t'.join(line.split()))

# Turn each atom list into an XYZ-format string: atom count, title line, atom rows.
for key,value in mol_dict_xyz.items():
    mol_dict_xyz[key]=str(len(value))+'\n'+key+'\n'+'\n'.join(value)

print(f'Total Molecules Found = {len(mol_dict_xyz)}')
print('Key = '+next(iter(mol_dict_xyz.keys())))
mol_dict_xyz['PHENAZINE_neutral']

Total Molecules Found = 370
Key = PHENAZINE_neutral


'22\nPHENAZINE_neutral\nN\t0.00000\t1.42349\t0.00005\nN\t0.00000\t-1.42349\t-0.00005\nC\t1.14528\t0.72406\t-0.00000\nC\t1.14528\t-0.72406\t-0.00006\nC\t-1.14528\t-0.72406\t0.00000\nC\t-1.14528\t0.72406\t0.00006\nC\t2.39571\t-1.41739\t-0.00011\nC\t2.39571\t1.41739\t-0.00001\nC\t-2.39571\t-1.41739\t0.00001\nC\t-2.39571\t1.41739\t0.00011\nC\t3.57334\t0.71483\t-0.00006\nC\t3.57334\t-0.71483\t-0.00012\nC\t-3.57334\t0.71483\t0.00012\nC\t-3.57334\t-0.71483\t0.00006\nH\t2.37152\t-2.50248\t-0.00015\nH\t2.37152\t2.50248\t0.00003\nH\t-2.37152\t-2.50248\t-0.00003\nH\t-2.37152\t2.50248\t0.00015\nH\t4.52148\t-1.24448\t-0.00016\nH\t4.52148\t1.24448\t-0.00007\nH\t-4.52148\t-1.24448\t0.00007\nH\t-4.52148\t1.24448\t0.00016'

In [4]:
import os

from openbabel import pybel

output_dir='molecules'
# Create the per-state folders up front; otherwise writing fails on a fresh checkout.
for state_dir in ('neutral','anion'):
    os.makedirs(f'{output_dir}/{state_dir}',exist_ok=True)

# Convert every XYZ string to a .mol file, routed by the charge state in its key.
for k,v in mol_dict_xyz.items():
    mol=pybel.readstring('xyz',v)
    if 'neutral' in k.lower():
        mol.write('mol', f'{output_dir}/neutral/{k}.mol',overwrite=True)
    elif 'anion' in k.lower():
        mol.write('mol', f'{output_dir}/anion/{k}.mol',overwrite=True)

In [5]:
# Build the expected base name of each molecule: "<FG_name>_R<positions without spaces>".
mol_filenames=data['FG_name']+data['FG Position'].apply(lambda x:'_R'+''.join(x.split()))
# The first row is the unsubstituted parent compound and has no FG/position.
mol_filenames.iloc[0]='Phenazine'

# Case-insensitive lookup of the coordinate keys. `setdefault` keeps the FIRST
# key for a given lowercase form, matching the previous first-match scan.
keys_by_lowercase={}
for key in mol_dict_xyz.keys():
    keys_by_lowercase.setdefault(key.lower(),key)

mol_filename_neutral=[]
mol_filename_anion=[]

for filename in mol_filenames:
    # Resolve the neutral key first, then the anion key, so messages keep their order.
    for state,matched_names in (('neutral',mol_filename_neutral),('anion',mol_filename_anion)):
        matched_key=keys_by_lowercase.get((filename+'_'+state).lower())
        if matched_key is None:
            # No geometry for this molecule/state: record NaN and report it.
            matched_names.append(np.nan)
            print(f'error ({state})- '+filename)
        else:
            matched_names.append(matched_key)

data['mol_filename_neutral']=mol_filename_neutral
data['mol_filename_anion']=mol_filename_anion
data

error (anion)- CN_R12
error (neutral)- CN_R28
error (anion)- CN_R28
error (neutral)- CN_R1268
error (anion)- CN_R1268
error (neutral)- NH2_R23
error (anion)- NH2_R23
error (neutral)- OH_R236


Unnamed: 0,Sr. No.,FG_name,FG Position,Redox Potential,mol_filename_neutral,mol_filename_anion
0,Phenazine,-,-,-1.74,PHENAZINE_neutral,PHENAZINE_anion
1,1,2CH3N,1,-1.85,2CH3N_R1_neutral,2CH3N_R1_Anion
2,2,2CH3N,2,-1.98,2CH3N_R2_neutral,2CH3N_R2_Anion
3,3,NH2,1,-1.85,NH2_R1_neutral,NH2_R1_anion
4,4,NH2,2,-1.92,NH2_R2_neutral,NH2_R2_anion
...,...,...,...,...,...,...
184,185,NO2,1 2 6 7,-0.35,NO2_R1267_neutral,NO2_R1267_anion
185,186,NO2,1 4 6 9,-0.52,NO2_R1469_neutral,NO2_R1469_anion
186,187,NO2,2 3 7 8,-0.52,NO2_R2378_neutral,NO2_R2378_anion
187,188,NO2,1 2 3 4 6 9,0.16,NO2_R123469_neutral,NO2_R123469_anion


In [6]:
# Show the rows for which no neutral-state geometry key was matched.
neutral_is_missing=data['mol_filename_neutral'].isna()
data[neutral_is_missing]

Unnamed: 0,Sr. No.,FG_name,FG Position,Redox Potential,mol_filename_neutral,mol_filename_anion
50,51,CN,2 8,-1.15,,
75,76,CN,1 2 6 8,-0.69,,
97,98,NH2,2 3,-2.04,,
150,151,OH,2 3 6,-1.85,,OH_R236_anion


In [7]:
# Keep only molecules that have a neutral-state geometry, then renumber the rows.
has_neutral_geometry=data['mol_filename_neutral'].notna()
data=data[has_neutral_geometry].reset_index(drop=True)
data

Unnamed: 0,Sr. No.,FG_name,FG Position,Redox Potential,mol_filename_neutral,mol_filename_anion
0,Phenazine,-,-,-1.74,PHENAZINE_neutral,PHENAZINE_anion
1,1,2CH3N,1,-1.85,2CH3N_R1_neutral,2CH3N_R1_Anion
2,2,2CH3N,2,-1.98,2CH3N_R2_neutral,2CH3N_R2_Anion
3,3,NH2,1,-1.85,NH2_R1_neutral,NH2_R1_anion
4,4,NH2,2,-1.92,NH2_R2_neutral,NH2_R2_anion
...,...,...,...,...,...,...
180,185,NO2,1 2 6 7,-0.35,NO2_R1267_neutral,NO2_R1267_anion
181,186,NO2,1 4 6 9,-0.52,NO2_R1469_neutral,NO2_R1469_anion
182,187,NO2,2 3 7 8,-0.52,NO2_R2378_neutral,NO2_R2378_anion
183,188,NO2,1 2 3 4 6 9,0.16,NO2_R123469_neutral,NO2_R123469_anion


# Calculate Descriptors

## Basic Descriptors

In [9]:
# data_basic=data.copy()

In [11]:
from sklearn.preprocessing import LabelEncoder

# Integer code for the functional-group label.
data['FG_no_2d_basic']=LabelEncoder().fit_transform(data['FG_name'])

# Position strings that appear without separating spaces in the table.
# '28' previously expanded to '2 9', which disagreed with the other entries
# ('27' -> '2 7', '146' -> '1 4 6'); it now expands to '2 8'.
compact_position_fixes={'27':'2 7','28':'2 8','146':'1 4 6'}

# One indicator vector per ring position 1..9 (1 = substituted at that position).
pos_dict={i+1:np.zeros(data.shape[0]) for i in range(9)}
# enumerate() gives the positional row number, which is what the NumPy vectors
# need; this no longer relies on the frame's index labels being 0..n-1.
for row_number,pos_string in enumerate(data['FG Position']):
    if pos_string=='-':
        # Unsubstituted parent compound: every indicator stays 0.
        continue
    pos_string=compact_position_fixes.get(pos_string,pos_string)
    for pos in pos_string.split():
        pos_dict[int(pos)][row_number]=1
for k,v in pos_dict.items():
    data[f'FG_position_{k}_2d_basic']=v
data

Unnamed: 0,Sr. No.,FG_name,FG Position,Redox Potential,mol_filename_neutral,mol_filename_anion,FG_no_2d_basic,FG_position_1_2d_basic,FG_position_2_2d_basic,FG_position_3_2d_basic,FG_position_4_2d_basic,FG_position_5_2d_basic,FG_position_6_2d_basic,FG_position_7_2d_basic,FG_position_8_2d_basic,FG_position_9_2d_basic
0,Phenazine,-,-,-1.74,PHENAZINE_neutral,PHENAZINE_anion,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2CH3N,1,-1.85,2CH3N_R1_neutral,2CH3N_R1_Anion,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,2CH3N,2,-1.98,2CH3N_R2_neutral,2CH3N_R2_Anion,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,NH2,1,-1.85,NH2_R1_neutral,NH2_R1_anion,14,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,NH2,2,-1.92,NH2_R2_neutral,NH2_R2_anion,14,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,185,NO2,1 2 6 7,-0.35,NO2_R1267_neutral,NO2_R1267_anion,15,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
181,186,NO2,1 4 6 9,-0.52,NO2_R1469_neutral,NO2_R1469_anion,15,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
182,187,NO2,2 3 7 8,-0.52,NO2_R2378_neutral,NO2_R2378_anion,15,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
183,188,NO2,1 2 3 4 6 9,0.16,NO2_R123469_neutral,NO2_R123469_anion,15,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0


In [13]:
# Discard every column that contains a missing value, then every column whose
# entries all equal the first row's entry (i.e. columns that never vary).
data=data.dropna(axis='columns')
differs_from_first_row=data.ne(data.iloc[0])
data=data.loc[:, differs_from_first_row.any()]
data

Unnamed: 0,Sr. No.,FG_name,FG Position,Redox Potential,mol_filename_neutral,FG_no_2d_basic,FG_position_1_2d_basic,FG_position_2_2d_basic,FG_position_3_2d_basic,FG_position_4_2d_basic,FG_position_6_2d_basic,FG_position_7_2d_basic,FG_position_8_2d_basic,FG_position_9_2d_basic
0,Phenazine,-,-,-1.74,PHENAZINE_neutral,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2CH3N,1,-1.85,2CH3N_R1_neutral,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,2CH3N,2,-1.98,2CH3N_R2_neutral,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,NH2,1,-1.85,NH2_R1_neutral,14,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,NH2,2,-1.92,NH2_R2_neutral,14,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,185,NO2,1 2 6 7,-0.35,NO2_R1267_neutral,15,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
181,186,NO2,1 4 6 9,-0.52,NO2_R1469_neutral,15,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
182,187,NO2,2 3 7 8,-0.52,NO2_R2378_neutral,15,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0
183,188,NO2,1 2 3 4 6 9,0.16,NO2_R123469_neutral,15,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0


In [14]:
# data_basic.to_csv('DATA_basic.csv',index=False)

# RDKit

<img src="https://pubchem.ncbi.nlm.nih.gov/image/imgsrv.fcgi?cid=4757&t=l" width=200px height=200px>

In [12]:
# data_rdkit=data.copy()

In [14]:
from rdkit import Chem
from rdkit.Chem import Descriptors
from IPython.display import clear_output
# Explicit names instead of a wildcard import, so it is clear where each
# descriptor function comes from.
from rdkit.Chem.Descriptors3D import (
    Asphericity,
    Eccentricity,
    InertialShapeFactor,
    NPR1,
    NPR2,
    PMI1,
    PMI2,
    PMI3,
    RadiusOfGyration,
    SpherocityIndex,
)

# (column name, function) pairs; the suffix records dimensionality and source library.
rdkit_desc_2d_list=[(desc_name+'_2d_rdkit',desc_func) for desc_name,desc_func in Descriptors.descList]
rdkit_desc_3d_list=[('Asphericity_3d_rdkit',Asphericity),
            ('Eccentricity_3d_rdkit',Eccentricity),
            ('InertialShapeFactor_3d_rdkit',InertialShapeFactor),
            ('NPR1_3d_rdkit',NPR1),
            ('NPR2_3d_rdkit',NPR2),
            ('PMI1_3d_rdkit',PMI1),
            ('PMI2_3d_rdkit',PMI2),
            ('PMI3_3d_rdkit',PMI3),
            ('RadiusOfGyration_3d_rdkit',RadiusOfGyration),
            ('SpherocityIndex_3d_rdkit',SpherocityIndex)
           ]

desc_list=rdkit_desc_2d_list+rdkit_desc_3d_list

# Parse each .mol file exactly once; previously every file was re-read from
# disk for every single descriptor.
neutral_filenames=list(data['mol_filename_neutral'])
neutral_mols=[Chem.MolFromMolFile(f'molecules/neutral/{filename}.mol') for filename in neutral_filenames]

# MolFromMolFile returns None when a file cannot be parsed; fail with the
# offending names instead of an opaque error inside a descriptor function.
unreadable=[filename for filename,m in zip(neutral_filenames,neutral_mols) if m is None]
if unreadable:
    raise ValueError(f'Could not parse .mol files for: {unreadable}')

for desc_name,desc_func in desc_list:
    data[desc_name]=[desc_func(m) for m in neutral_mols]
# Wipe the warnings/log output produced while the descriptors were computed.
clear_output()

In [15]:
# Display the current state of the descriptor table.
data

Unnamed: 0,Sr. No.,FG_name,FG Position,Redox Potential,mol_filename_neutral,mol_filename_anion,FG_no_2d_basic,FG_position_1_2d_basic,FG_position_2_2d_basic,FG_position_3_2d_basic,...,Asphericity_3d_rdkit,Eccentricity_3d_rdkit,InertialShapeFactor_3d_rdkit,NPR1_3d_rdkit,NPR2_3d_rdkit,PMI1_3d_rdkit,PMI2_3d_rdkit,PMI3_3d_rdkit,RadiusOfGyration_3d_rdkit,SpherocityIndex_3d_rdkit
0,Phenazine,-,-,-1.74,PHENAZINE_neutral,PHENAZINE_anion,0,0.0,0.0,0.0,...,0.565426,0.984436,0.004060,0.175744,0.824256,203.025746,952.211108,1155.236854,2.590521,3.469194e-10
1,1,2CH3N,1,-1.85,2CH3N_R1_neutral,2CH3N_R1_Anion,1,1.0,0.0,0.0,...,0.414850,0.964725,0.001641,0.263258,0.754145,459.636664,1316.701291,1745.953265,2.894724,2.616881e-02
2,2,2CH3N,2,-1.98,2CH3N_R2_neutral,2CH3N_R2_Anion,1,0.0,1.0,0.0,...,0.652689,0.991067,0.003116,0.133363,0.880800,282.693153,1867.057219,2119.729166,3.186999,2.147342e-02
3,3,NH2,1,-1.85,NH2_R1_neutral,NH2_R1_anion,14,1.0,0.0,0.0,...,0.480865,0.974914,0.002627,0.222582,0.777534,296.015256,1034.054046,1329.914576,2.672942,1.638895e-04
4,4,NH2,2,-1.92,NH2_R2_neutral,NH2_R2_anion,14,0.0,1.0,0.0,...,0.620758,0.988920,0.003874,0.148451,0.851582,219.819135,1260.983509,1480.753866,2.820395,4.615913e-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,185,NO2,1 2 6 7,-0.35,NO2_R1267_neutral,NO2_R1267_anion,15,1.0,1.0,0.0,...,0.573428,0.985310,0.000926,0.170773,0.857589,925.652115,4648.439587,5420.359453,3.928670,3.827094e-02
181,186,NO2,1 4 6 9,-0.52,NO2_R1469_neutral,NO2_R1469_anion,15,1.0,0.0,0.0,...,0.215618,0.852354,0.000234,0.522965,0.525738,2250.280048,2262.214379,4302.929844,3.517875,6.432619e-02
182,187,NO2,2 3 7 8,-0.52,NO2_R2378_neutral,NO2_R2378_anion,15,0.0,1.0,1.0,...,0.654730,0.991202,0.001018,0.132357,0.891029,874.926498,5890.018831,6610.353674,4.333218,3.206787e-02
183,188,NO2,1 2 3 4 6 9,0.16,NO2_R123469_neutral,NO2_R123469_anion,15,1.0,1.0,1.0,...,0.258234,0.916344,0.000251,0.400392,0.652081,2601.278722,4236.464502,6496.833550,3.857006,7.038922e-02


In [16]:
# Remove columns holding any NaN, then remove columns that never deviate from
# the value found in the first row.
data=data.dropna(axis='columns')
first_row=data.iloc[0]
varying_columns=data.ne(first_row).any()
data=data.loc[:, varying_columns]
data

Unnamed: 0,Sr. No.,FG_name,FG Position,Redox Potential,mol_filename_neutral,FG_no_2d_basic,FG_position_1_2d_basic,FG_position_2_2d_basic,FG_position_3_2d_basic,FG_position_4_2d_basic,...,Asphericity_3d_rdkit,Eccentricity_3d_rdkit,InertialShapeFactor_3d_rdkit,NPR1_3d_rdkit,NPR2_3d_rdkit,PMI1_3d_rdkit,PMI2_3d_rdkit,PMI3_3d_rdkit,RadiusOfGyration_3d_rdkit,SpherocityIndex_3d_rdkit
0,Phenazine,-,-,-1.74,PHENAZINE_neutral,0,0.0,0.0,0.0,0.0,...,0.565426,0.984436,0.004060,0.175744,0.824256,203.025746,952.211108,1155.236854,2.590521,3.469194e-10
1,1,2CH3N,1,-1.85,2CH3N_R1_neutral,1,1.0,0.0,0.0,0.0,...,0.414850,0.964725,0.001641,0.263258,0.754145,459.636664,1316.701291,1745.953265,2.894724,2.616881e-02
2,2,2CH3N,2,-1.98,2CH3N_R2_neutral,1,0.0,1.0,0.0,0.0,...,0.652689,0.991067,0.003116,0.133363,0.880800,282.693153,1867.057219,2119.729166,3.186999,2.147342e-02
3,3,NH2,1,-1.85,NH2_R1_neutral,14,1.0,0.0,0.0,0.0,...,0.480865,0.974914,0.002627,0.222582,0.777534,296.015256,1034.054046,1329.914576,2.672942,1.638895e-04
4,4,NH2,2,-1.92,NH2_R2_neutral,14,0.0,1.0,0.0,0.0,...,0.620758,0.988920,0.003874,0.148451,0.851582,219.819135,1260.983509,1480.753866,2.820395,4.615913e-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,185,NO2,1 2 6 7,-0.35,NO2_R1267_neutral,15,1.0,1.0,0.0,0.0,...,0.573428,0.985310,0.000926,0.170773,0.857589,925.652115,4648.439587,5420.359453,3.928670,3.827094e-02
181,186,NO2,1 4 6 9,-0.52,NO2_R1469_neutral,15,1.0,0.0,0.0,1.0,...,0.215618,0.852354,0.000234,0.522965,0.525738,2250.280048,2262.214379,4302.929844,3.517875,6.432619e-02
182,187,NO2,2 3 7 8,-0.52,NO2_R2378_neutral,15,0.0,1.0,1.0,0.0,...,0.654730,0.991202,0.001018,0.132357,0.891029,874.926498,5890.018831,6610.353674,4.333218,3.206787e-02
183,188,NO2,1 2 3 4 6 9,0.16,NO2_R123469_neutral,15,1.0,1.0,1.0,1.0,...,0.258234,0.916344,0.000251,0.400392,0.652081,2601.278722,4236.464502,6496.833550,3.857006,7.038922e-02


In [17]:
# data.to_csv('data.csv',index=False)

# Mordred

In [18]:
# data_mordred=data.copy()

1826
1613
213


In [21]:
from collections import defaultdict
from mordred import Calculator, descriptors

# Parse every neutral-state .mol file once; `mols` is reused further down
# (and by the fingerprint cells), so the files are not read a second time.
mols=[Chem.MolFromMolFile(f'molecules/neutral/{filename}.mol')
      for filename in data['mol_filename_neutral']]

calc_2d = Calculator(descriptors, ignore_3D=True)
calc_2d_3d = Calculator(descriptors, ignore_3D=False)
mordred_desc_2d_3d_list=list(map(str,calc_2d_3d.descriptors))
mordred_desc_2d_list=list(map(str,calc_2d.descriptors))
# Descriptors only present when 3D is enabled are treated as the 3D ones.
mordred_desc_3d_list=list(set(mordred_desc_2d_3d_list)-set(mordred_desc_2d_list))
print(len(mordred_desc_2d_3d_list))
print(len(mordred_desc_2d_list))
print(len(mordred_desc_3d_list))

# Sets give constant-time membership tests inside the per-descriptor loop.
mordred_names_2d=set(mordred_desc_2d_list)
mordred_names_3d=set(mordred_desc_3d_list)

desc_dict=defaultdict(list)
for mol in tqdm(mols):
    # Work on a copy so the molecules kept in `mols` are guaranteed untouched.
    d=calc_2d_3d(Chem.Mol(mol))
    for k,v in d.fill_missing().items():
        desc_name=str(k)
        if desc_name in mordred_names_2d:
            desc_dict[desc_name+'_2d_mordred'].append(v)
        elif desc_name in mordred_names_3d:
            desc_dict[desc_name+'_3d_mordred'].append(v)
        else:
            print('Error')

# Clears the counts printed above together with any progress/warning output.
clear_output()

# Only keep descriptors that could be computed for every molecule.
for k,v in desc_dict.items():
    if np.any(np.isnan(v)):
        continue
    else:
        data[k]=v

data

Unnamed: 0,Sr. No.,FG_name,FG Position,Redox Potential,mol_filename_neutral,FG_no_2d_basic,FG_position_1_2d_basic,FG_position_2_2d_basic,FG_position_3_2d_basic,FG_position_4_2d_basic,...,SRW10_2d_mordred,TSRW10_2d_mordred,MW_2d_mordred,AMW_2d_mordred,WPath_2d_mordred,WPol_2d_mordred,Zagreb1_2d_mordred,Zagreb2_2d_mordred,mZagreb1_2d_mordred,mZagreb2_2d_mordred
0,Phenazine,-,-,-1.74,PHENAZINE_neutral,0,0.0,0.0,0.0,0.0,...,9.657139,46.247824,180.068748,8.184943,279,21,76.0,90.0,2.944444,3.055556
1,1,2CH3N,1,-1.85,2CH3N_R1_neutral,1,1.0,0.0,0.0,0.0,...,9.903937,50.295123,223.110947,7.437032,476,28,92.0,110.0,4.916667,3.694444
2,2,2CH3N,2,-1.98,2CH3N_R2_neutral,1,0.0,1.0,0.0,0.0,...,9.860006,50.217934,223.110947,7.437032,500,27,92.0,109.0,4.916667,3.666667
3,3,NH2,1,-1.85,NH2_R1_neutral,14,1.0,0.0,0.0,0.0,...,9.783859,47.718582,195.079647,8.128319,334,24,82.0,98.0,3.805556,3.250000
4,4,NH2,2,-1.92,NH2_R2_neutral,14,0.0,1.0,0.0,0.0,...,9.748295,47.650681,195.079647,8.128319,342,23,82.0,97.0,3.805556,3.222222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,185,NO2,1 2 6 7,-0.35,NO2_R1267_neutral,15,1.0,1.0,0.0,0.0,...,10.416820,61.544380,360.009061,12.000302,1529,49,140.0,170.0,10.833333,5.611111
181,186,NO2,1 4 6 9,-0.52,NO2_R1469_neutral,15,1.0,0.0,0.0,1.0,...,10.410817,61.535641,360.009061,12.000302,1463,49,140.0,170.0,10.833333,5.611111
182,187,NO2,2 3 7 8,-0.52,NO2_R2378_neutral,15,0.0,1.0,1.0,0.0,...,10.362715,61.448683,360.009061,12.000302,1595,47,140.0,168.0,10.833333,5.555556
183,188,NO2,1 2 3 4 6 9,0.16,NO2_R123469_neutral,15,1.0,1.0,1.0,1.0,...,10.682560,68.670486,449.979218,13.234683,2436,64,172.0,211.0,14.777778,6.916667


In [22]:
# Drop NaN-containing columns first, then the columns in which every row
# matches the first row (no variation across molecules).
data=data.dropna(axis='columns')
column_varies=data.ne(data.iloc[0]).any(axis=0)
data=data.loc[:, column_varies]
data

Unnamed: 0,Sr. No.,FG_name,FG Position,Redox Potential,mol_filename_neutral,FG_no_2d_basic,FG_position_1_2d_basic,FG_position_2_2d_basic,FG_position_3_2d_basic,FG_position_4_2d_basic,...,SRW10_2d_mordred,TSRW10_2d_mordred,MW_2d_mordred,AMW_2d_mordred,WPath_2d_mordred,WPol_2d_mordred,Zagreb1_2d_mordred,Zagreb2_2d_mordred,mZagreb1_2d_mordred,mZagreb2_2d_mordred
0,Phenazine,-,-,-1.74,PHENAZINE_neutral,0,0.0,0.0,0.0,0.0,...,9.657139,46.247824,180.068748,8.184943,279,21,76.0,90.0,2.944444,3.055556
1,1,2CH3N,1,-1.85,2CH3N_R1_neutral,1,1.0,0.0,0.0,0.0,...,9.903937,50.295123,223.110947,7.437032,476,28,92.0,110.0,4.916667,3.694444
2,2,2CH3N,2,-1.98,2CH3N_R2_neutral,1,0.0,1.0,0.0,0.0,...,9.860006,50.217934,223.110947,7.437032,500,27,92.0,109.0,4.916667,3.666667
3,3,NH2,1,-1.85,NH2_R1_neutral,14,1.0,0.0,0.0,0.0,...,9.783859,47.718582,195.079647,8.128319,334,24,82.0,98.0,3.805556,3.250000
4,4,NH2,2,-1.92,NH2_R2_neutral,14,0.0,1.0,0.0,0.0,...,9.748295,47.650681,195.079647,8.128319,342,23,82.0,97.0,3.805556,3.222222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,185,NO2,1 2 6 7,-0.35,NO2_R1267_neutral,15,1.0,1.0,0.0,0.0,...,10.416820,61.544380,360.009061,12.000302,1529,49,140.0,170.0,10.833333,5.611111
181,186,NO2,1 4 6 9,-0.52,NO2_R1469_neutral,15,1.0,0.0,0.0,1.0,...,10.410817,61.535641,360.009061,12.000302,1463,49,140.0,170.0,10.833333,5.611111
182,187,NO2,2 3 7 8,-0.52,NO2_R2378_neutral,15,0.0,1.0,1.0,0.0,...,10.362715,61.448683,360.009061,12.000302,1595,47,140.0,168.0,10.833333,5.555556
183,188,NO2,1 2 3 4 6 9,0.16,NO2_R123469_neutral,15,1.0,1.0,1.0,1.0,...,10.682560,68.670486,449.979218,13.234683,2436,64,172.0,211.0,14.777778,6.916667


In [23]:
# # from rdkit.Chem.Descriptors3D import *

# # Autocorr3D

# import rdkit
# from rdkit.Chem import Descriptors3D

# help(rdkit.Chem.Descriptors3D)

In [24]:
# from mordred import Calculator, descriptors

# calc = Calculator(descriptors, ignore_3D=True)


# filename=data.iloc[1,:]['mol_filename_neutral']
# mol=Chem.MolFromMolFile(f'molecules/neutral/{filename}.mol')
# len(calc(mol))
# # calc.pandas(mol)
# # clear_output()

# calc = Calculator(descriptors, ignore_3D=False)
# for index,row in data.iterrows():
#     filename=row['mol_filename_neutral']
#     print(filename)
#     mol=Chem.MolFromMolFile(f'molecules/neutral/{filename}.mol')
#     d=calc(mol)
#     print(len(d))

# calc.descriptors[0]

# d[-1]

# calc2d = Calculator(descriptors, ignore_3D=True)
# calc3d = Calculator(descriptors, ignore_3D=False)

# set(calc3d.descriptors)-set(calc2d.descriptors)

# from rdkit.Chem import Descriptors3D
# from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator

# descList=[d[0] for d in Descriptors.descList ]
# calc=MolecularDescriptorCalculator(descList)
# calc.CalcDescriptors(m)
# #
# list(rdMolDescriptors.Properties().GetPropertyNames())

# Fingerprints

In [25]:
import deepchem as dc

In [40]:
# Circular fingerprint of every molecule in `mols`, one column per bit.
# NOTE(review): radius=4 is normally referred to as ECFP8 (diameter 8), while
# the columns are prefixed 'ecfp4' — confirm which one is intended.
featurizer = dc.feat.CircularFingerprint(radius=4,size=1024)
ecfp = featurizer.featurize(mols)
for bit_index,bit_values in enumerate(ecfp.T):
    data[f'ecfp4_fp_{bit_index}']=bit_values
data

Unnamed: 0,Sr. No.,FG_name,FG Position,Redox Potential,mol_filename_neutral,FG_no_2d_basic,FG_position_1_2d_basic,FG_position_2_2d_basic,FG_position_3_2d_basic,FG_position_4_2d_basic,...,ecfp4_fp_1014,ecfp4_fp_1015,ecfp4_fp_1016,ecfp4_fp_1017,ecfp4_fp_1018,ecfp4_fp_1019,ecfp4_fp_1020,ecfp4_fp_1021,ecfp4_fp_1022,ecfp4_fp_1023
0,Phenazine,-,-,-1.74,PHENAZINE_neutral,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2CH3N,1,-1.85,2CH3N_R1_neutral,1,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,2CH3N,2,-1.98,2CH3N_R2_neutral,1,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,NH2,1,-1.85,NH2_R1_neutral,14,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,NH2,2,-1.92,NH2_R2_neutral,14,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,185,NO2,1 2 6 7,-0.35,NO2_R1267_neutral,15,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
181,186,NO2,1 4 6 9,-0.52,NO2_R1469_neutral,15,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
182,187,NO2,2 3 7 8,-0.52,NO2_R2378_neutral,15,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
183,188,NO2,1 2 3 4 6 9,0.16,NO2_R123469_neutral,15,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# MACCS-keys fingerprint of every molecule in `mols`, one column per key.
featurizer = dc.feat.MACCSKeysFingerprint()
maccs_keys = featurizer.featurize(mols)
for key_index,key_values in enumerate(maccs_keys.T):
    data[f'maccs_keys_fp_{key_index}']=key_values
data

Unnamed: 0,Sr. No.,FG_name,FG Position,Redox Potential,mol_filename_neutral,FG_no_2d_basic,FG_position_1_2d_basic,FG_position_2_2d_basic,FG_position_3_2d_basic,FG_position_4_2d_basic,...,maccs_keys_fp_157,maccs_keys_fp_158,maccs_keys_fp_159,maccs_keys_fp_160,maccs_keys_fp_161,maccs_keys_fp_162,maccs_keys_fp_163,maccs_keys_fp_164,maccs_keys_fp_165,maccs_keys_fp_166
0,Phenazine,-,-,-1.74,PHENAZINE_neutral,0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,1,1,0,1,0
1,1,2CH3N,1,-1.85,2CH3N_R1_neutral,1,1.0,0.0,0.0,0.0,...,0,1,0,1,1,1,1,0,1,0
2,2,2CH3N,2,-1.98,2CH3N_R2_neutral,1,0.0,1.0,0.0,0.0,...,0,1,0,1,1,1,1,0,1,0
3,3,NH2,1,-1.85,NH2_R1_neutral,14,1.0,0.0,0.0,0.0,...,0,1,0,0,1,1,1,0,1,0
4,4,NH2,2,-1.92,NH2_R2_neutral,14,0.0,1.0,0.0,0.0,...,0,1,0,0,1,1,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,185,NO2,1 2 6 7,-0.35,NO2_R1267_neutral,15,1.0,1.0,0.0,0.0,...,0,1,1,0,1,1,1,1,1,0
181,186,NO2,1 4 6 9,-0.52,NO2_R1469_neutral,15,1.0,0.0,0.0,1.0,...,0,1,1,0,1,1,1,1,1,0
182,187,NO2,2 3 7 8,-0.52,NO2_R2378_neutral,15,0.0,1.0,1.0,0.0,...,0,1,1,0,1,1,1,1,1,0
183,188,NO2,1 2 3 4 6 9,0.16,NO2_R123469_neutral,15,1.0,1.0,1.0,1.0,...,0,1,1,0,1,1,1,1,1,0


In [28]:
# RDKit fingerprint of every molecule in `mols`, expanded to one column per bit.
rdkit_fingerprint=np.array(list(map(Chem.RDKFingerprint,mols)))
for bit_index,bit_values in enumerate(rdkit_fingerprint.T):
    data[f'rdkit_fp_{bit_index}']=bit_values
data

Unnamed: 0,Sr. No.,FG_name,FG Position,Redox Potential,mol_filename_neutral,FG_no_2d_basic,FG_position_1_2d_basic,FG_position_2_2d_basic,FG_position_3_2d_basic,FG_position_4_2d_basic,...,rdkit_fp_2038,rdkit_fp_2039,rdkit_fp_2040,rdkit_fp_2041,rdkit_fp_2042,rdkit_fp_2043,rdkit_fp_2044,rdkit_fp_2045,rdkit_fp_2046,rdkit_fp_2047
0,Phenazine,-,-,-1.74,PHENAZINE_neutral,0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
1,1,2CH3N,1,-1.85,2CH3N_R1_neutral,1,1.0,0.0,0.0,0.0,...,0,0,0,1,1,0,1,0,0,1
2,2,2CH3N,2,-1.98,2CH3N_R2_neutral,1,0.0,1.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,1
3,3,NH2,1,-1.85,NH2_R1_neutral,14,1.0,0.0,0.0,0.0,...,0,0,0,1,1,0,1,0,0,0
4,4,NH2,2,-1.92,NH2_R2_neutral,14,0.0,1.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,185,NO2,1 2 6 7,-0.35,NO2_R1267_neutral,15,1.0,1.0,0.0,0.0,...,0,0,0,1,1,0,1,0,0,0
181,186,NO2,1 4 6 9,-0.52,NO2_R1469_neutral,15,1.0,0.0,0.0,1.0,...,1,0,0,1,1,0,1,0,0,0
182,187,NO2,2 3 7 8,-0.52,NO2_R2378_neutral,15,0.0,1.0,1.0,0.0,...,0,0,0,1,0,0,0,0,0,0
183,188,NO2,1 2 3 4 6 9,0.16,NO2_R123469_neutral,15,1.0,1.0,1.0,1.0,...,1,0,0,1,1,0,1,0,0,0


In [29]:
# Eliminate columns with missing values, followed by columns that are
# identical to the first row's value in every row.
data=data.dropna(axis='columns')
reference_row=data.iloc[0]
keep_column=data.ne(reference_row).any()
data=data.loc[:, keep_column]
data

Unnamed: 0,Sr. No.,FG_name,FG Position,Redox Potential,mol_filename_neutral,FG_no_2d_basic,FG_position_1_2d_basic,FG_position_2_2d_basic,FG_position_3_2d_basic,FG_position_4_2d_basic,...,rdkit_fp_2035,rdkit_fp_2036,rdkit_fp_2037,rdkit_fp_2038,rdkit_fp_2039,rdkit_fp_2042,rdkit_fp_2043,rdkit_fp_2044,rdkit_fp_2046,rdkit_fp_2047
0,Phenazine,-,-,-1.74,PHENAZINE_neutral,0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,1,2CH3N,1,-1.85,2CH3N_R1_neutral,1,1.0,0.0,0.0,0.0,...,0,0,1,0,0,1,0,1,0,1
2,2,2CH3N,2,-1.98,2CH3N_R2_neutral,1,0.0,1.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,1
3,3,NH2,1,-1.85,NH2_R1_neutral,14,1.0,0.0,0.0,0.0,...,0,0,1,0,0,1,0,1,0,0
4,4,NH2,2,-1.92,NH2_R2_neutral,14,0.0,1.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,185,NO2,1 2 6 7,-0.35,NO2_R1267_neutral,15,1.0,1.0,0.0,0.0,...,0,0,1,0,0,1,0,1,0,0
181,186,NO2,1 4 6 9,-0.52,NO2_R1469_neutral,15,1.0,0.0,0.0,1.0,...,1,0,1,1,0,1,0,1,0,0
182,187,NO2,2 3 7 8,-0.52,NO2_R2378_neutral,15,0.0,1.0,1.0,0.0,...,0,0,1,0,0,0,0,0,0,0
183,188,NO2,1 2 3 4 6 9,0.16,NO2_R123469_neutral,15,1.0,1.0,1.0,1.0,...,1,0,1,1,0,1,0,1,0,0


In [37]:
def count_columns(tag):
    """Return how many column names of `data` contain the substring `tag`."""
    return sum(tag in col for col in data.columns)

print('basic')
print(count_columns('2d_basic'))
print('\nrdkit')
print(count_columns('2d_rdkit'))
print(count_columns('3d_rdkit'))
print('\nmordred')
print(count_columns('2d_mordred'))
print(count_columns('3d_mordred'))
print('\nfingerprint')
# The circular-fingerprint columns are created as 'ecfp4_fp_<n>'; the previous
# tag 'ecfp_fp' never matched those names.
print(count_columns('ecfp4_fp'))
print(count_columns('maccs_keys_fp'))
print(count_columns('rdkit_fp'))

basic
9

rdkit
134
10

mordred
1153
56

fingerprint
262
67
1631


In [39]:
# Write the assembled feature table to disk without the row index.
output_csv_path='DATA.csv'
data.to_csv(output_csv_path,index=False)

In [32]:
# from rdkit.Chem.AtomPairs import Pairs

# filename=data.iloc[1,:]['mol_filename_neutral']
# mol=Chem.MolFromMolFile(f'molecules/neutral/{filename}.mol')
# len(list((Pairs.GetAtomPairFingerprint(mol))))

In [33]:
# from rdkit.Chem.AtomPairs import Pairs

# filename=data.iloc[70,:]['mol_filename_neutral']
# mol=Chem.MolFromMolFile(f'molecules/neutral/{filename}.mol')
# len(list((Pairs.GetAtomPairFingerprint(mol))))

In [34]:
# from rdkit.Chem.AtomPairs import Pairs

# filename=data.iloc[70,:]['mol_filename_neutral']
# mol=Chem.MolFromMolFile(f'molecules/neutral/{filename}.mol')
# len(list((Chem.RDKFingerprint(mol))))

In [35]:
# filename=data.iloc[1,:]['mol_filename_neutral']
# mol=Chem.MolFromMolFile(f'molecules/neutral/{filename}.mol')
# featurizer = dc.feat.CircularFingerprint(size=1024)
# ecfp = featurizer.featurize(mols)
# ecfp.shape

In [36]:
# NOTE(review): shell call that fetches www.google.com; nothing else in the
# visible notebook uses its output — looks like a leftover connectivity check.
!curl www.google.com

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>Notification: Proxy Authorization Required</title>
<style type="text/css">
body {
  font-family: Arial, Helvetica, sans-serif;
  font-size: 14px;
  color:#333333;
  background-color: #ffffff;
}
h1 {
  font-size: 18px;
  font-weight: bold;
  text-decoration: none;
  padding-top: 0px;
  color: #2970A6;
}
a:link {
    color: #2970A6;
  text-decoration: none;
}
a:hover {
    color: #2970A6;
  text-decoration: underline;
}
p.buttonlink {
  margin-bottom: 24px;
}
.copyright {
  font-size: 12px;
  color: #666666;
  margin: 5px 5px 0px 30px;

}
.details {
  font-size: 14px;
  color: #969696;
  border: none;
  padding: 20px 20px 20px 20px;
  margin: 0px 10px 10px 35px;
}

.shadow {
  border: 3px solid #9f9f9f;
  padding: 10px 25px 10px 25px;
  margin: 10px 35