In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Extract Data

In [27]:
import tabula
# Supplementary-information PDF; pages 7-10 are read as a single table.
supp_file_path='d0se00687d1.pdf'
# `multiple_tables` is the keyword tabula-py defines for this option; the
# earlier spelling `multiple_datas` is not a read_pdf parameter.
tables = tabula.read_pdf(supp_file_path, pages="7-10", multiple_tables=False)
# NOTE(review): depending on the installed tabula-py version, read_pdf hands
# back either a DataFrame or a list of DataFrames -- accept both shapes.
if isinstance(tables, list):
    if not tables:
        raise ValueError(f'No table could be extracted from {supp_file_path}')
    data = tables[0]
else:
    data = tables
data.columns=['Sr. No.','FG','FG Position', 'Redox Potential']

def mapFG(x):
    """Return the label to use for functional group ``x``.

    The spelling ``'N(CH3)2'`` is rewritten to ``'2CH3N'``; any other value
    is passed through unchanged.
    """
    return '2CH3N' if x=='N(CH3)2' else x
# Rewrite the functional-group labels through mapFG, then rename the column
# to 'FG_name'. The bare `data` on the last line is the cell's displayed output.
fg_labels=data['FG'].map(mapFG)
data['FG']=fg_labels
column_renames={'FG':'FG_name'}
data.rename(columns=column_renames,inplace=True)
data

Unnamed: 0,Sr. No.,FG_name,FG Position,Redox Potential
0,Phenazine,-,-,-1.74
1,1,2CH3N,1,-1.85
2,2,2CH3N,2,-1.98
3,3,NH2,1,-1.85
4,4,NH2,2,-1.92
...,...,...,...,...
184,185,NO2,1 2 6 7,-0.35
185,186,NO2,1 4 6 9,-0.52
186,187,NO2,2 3 7 8,-0.52
187,188,NO2,1 2 3 4 6 9,0.16


In [28]:
cords_file_path='SI_coordinates_vs1_20200617.txt'
# Read the whole file up front; the context manager guarantees the handle is
# closed, which a bare open(...).readlines() does not.
with open(cords_file_path,'r') as cords_file:
    lines=cords_file.readlines()

# First pass: header text (whitespace joined by '_') -> list of atom records.
# A header is any line mentioning 'neutral' or 'anion' (case-insensitive);
# every following non-blank line is an atom record of the current molecule.
mol_dict_xyz={}
key=None
for line in lines:
    fields=line.split()
    if not fields:
        # Blank or whitespace-only line: skip it so it is not counted as an atom.
        continue
    lowered=line.lower()
    if 'neutral' in lowered or 'anion' in lowered:
        key='_'.join(fields)
        mol_dict_xyz[key]=[]
    elif key is None:
        # Coordinates with no preceding header cannot be attributed to a molecule.
        raise ValueError(
            f'{cords_file_path}: atom record found before any neutral/anion header: {line!r}'
        )
    else:
        mol_dict_xyz[key].append('\t'.join(fields))

# Second pass: turn each record list into an XYZ-format string
# (atom count, title line, then one tab-separated record per line).
mol_dict_xyz={
    name: str(len(atoms))+'\n'+name+'\n'+'\n'.join(atoms)
    for name,atoms in mol_dict_xyz.items()
}

print(f'Total Molecules Found = {len(mol_dict_xyz)}')
print('Key = '+next(iter(mol_dict_xyz.keys())))
mol_dict_xyz['PHENAZINE_neutral']

Total Molecules Found = 370
Key = PHENAZINE_neutral


'22\nPHENAZINE_neutral\nN\t0.00000\t1.42349\t0.00005\nN\t0.00000\t-1.42349\t-0.00005\nC\t1.14528\t0.72406\t-0.00000\nC\t1.14528\t-0.72406\t-0.00006\nC\t-1.14528\t-0.72406\t0.00000\nC\t-1.14528\t0.72406\t0.00006\nC\t2.39571\t-1.41739\t-0.00011\nC\t2.39571\t1.41739\t-0.00001\nC\t-2.39571\t-1.41739\t0.00001\nC\t-2.39571\t1.41739\t0.00011\nC\t3.57334\t0.71483\t-0.00006\nC\t3.57334\t-0.71483\t-0.00012\nC\t-3.57334\t0.71483\t0.00012\nC\t-3.57334\t-0.71483\t0.00006\nH\t2.37152\t-2.50248\t-0.00015\nH\t2.37152\t2.50248\t0.00003\nH\t-2.37152\t-2.50248\t-0.00003\nH\t-2.37152\t2.50248\t0.00015\nH\t4.52148\t-1.24448\t-0.00016\nH\t4.52148\t1.24448\t-0.00007\nH\t-4.52148\t-1.24448\t0.00007\nH\t-4.52148\t1.24448\t0.00016'

In [29]:
from openbabel import pybel

import os

output_dir='molecules'
# Make sure both target folders exist before writing, so the cell also works
# on a fresh checkout where 'molecules/' has not been created yet.
for charge_state in ('neutral','anion'):
    os.makedirs(os.path.join(output_dir,charge_state),exist_ok=True)

# Convert every XYZ string to a MOL file, sorted into a sub-folder according
# to whether the key mentions 'neutral' or 'anion' (neutral is checked first).
for k,v in mol_dict_xyz.items():
    mol=pybel.readstring('xyz',v)
    k_lower=k.lower()
    if 'neutral' in k_lower:
        mol.write('mol', f'{output_dir}/neutral/{k}.mol',overwrite=True)
    elif 'anion' in k_lower:
        mol.write('mol', f'{output_dir}/anion/{k}.mol',overwrite=True)

In [30]:
# Expected coordinate-block name for each row:
# <FG_name>_R<position digits with the spaces removed>.
mol_filenames=data['FG_name']+data['FG Position'].apply(lambda x:'_R'+''.join(x.split()))
# Row 0 has no functional group ('-'), so give it its name explicitly.
mol_filenames[0]='Phenazine'

# Case-insensitive lookup table over the coordinate keys. setdefault keeps the
# first key seen for each lower-cased name, i.e. the same key a front-to-back
# scan of mol_dict_xyz would have matched.
keys_by_lower={}
for key in mol_dict_xyz:
    keys_by_lower.setdefault(key.lower(),key)

def _find_mol_key(filename, state):
    """Return the mol_dict_xyz key matching ``<filename>_<state>``, ignoring case.

    Prints ``error (<state>)- <filename>`` and returns ``np.nan`` when no key
    matches, so the affected rows can be filtered out with ``dropna`` later.
    """
    match=keys_by_lower.get(f'{filename}_{state}'.lower())
    if match is None:
        print(f'error ({state})- {filename}')
        return np.nan
    return match

mol_filename_neutral=[]
mol_filename_anion=[]
# Look up neutral first, then anion, for each row so the messages appear in
# row order with the neutral message ahead of the anion one.
for filename in mol_filenames:
    mol_filename_neutral.append(_find_mol_key(filename,'neutral'))
    mol_filename_anion.append(_find_mol_key(filename,'anion'))

data['mol_filename_neutral']=mol_filename_neutral
data['mol_filename_anion']=mol_filename_anion
data

error (anion)- CN_R12
error (neutral)- CN_R28
error (anion)- CN_R28
error (neutral)- CN_R1268
error (anion)- CN_R1268
error (neutral)- NH2_R23
error (anion)- NH2_R23
error (neutral)- OH_R236


Unnamed: 0,Sr. No.,FG_name,FG Position,Redox Potential,mol_filename_neutral,mol_filename_anion
0,Phenazine,-,-,-1.74,PHENAZINE_neutral,PHENAZINE_anion
1,1,2CH3N,1,-1.85,2CH3N_R1_neutral,2CH3N_R1_Anion
2,2,2CH3N,2,-1.98,2CH3N_R2_neutral,2CH3N_R2_Anion
3,3,NH2,1,-1.85,NH2_R1_neutral,NH2_R1_anion
4,4,NH2,2,-1.92,NH2_R2_neutral,NH2_R2_anion
...,...,...,...,...,...,...
184,185,NO2,1 2 6 7,-0.35,NO2_R1267_neutral,NO2_R1267_anion
185,186,NO2,1 4 6 9,-0.52,NO2_R1469_neutral,NO2_R1469_anion
186,187,NO2,2 3 7 8,-0.52,NO2_R2378_neutral,NO2_R2378_anion
187,188,NO2,1 2 3 4 6 9,0.16,NO2_R123469_neutral,NO2_R123469_anion


# Calculate Descriptors

In [31]:
from sklearn.preprocessing import LabelEncoder

# Give every distinct functional-group label an integer code and store it
# next to the label; the trailing `data` displays the updated table.
fg_encoder=LabelEncoder()
fg_codes=fg_encoder.fit_transform(data['FG_name'])
data['FG_no']=fg_codes
data

Unnamed: 0,Sr. No.,FG_name,FG Position,Redox Potential,mol_filename_neutral,mol_filename_anion,FG_no
0,Phenazine,-,-,-1.74,PHENAZINE_neutral,PHENAZINE_anion,0
1,1,2CH3N,1,-1.85,2CH3N_R1_neutral,2CH3N_R1_Anion,1
2,2,2CH3N,2,-1.98,2CH3N_R2_neutral,2CH3N_R2_Anion,1
3,3,NH2,1,-1.85,NH2_R1_neutral,NH2_R1_anion,14
4,4,NH2,2,-1.92,NH2_R2_neutral,NH2_R2_anion,14
...,...,...,...,...,...,...,...
184,185,NO2,1 2 6 7,-0.35,NO2_R1267_neutral,NO2_R1267_anion,15
185,186,NO2,1 4 6 9,-0.52,NO2_R1469_neutral,NO2_R1469_anion,15
186,187,NO2,2 3 7 8,-0.52,NO2_R2378_neutral,NO2_R2378_anion,15
187,188,NO2,1 2 3 4 6 9,0.16,NO2_R123469_neutral,NO2_R123469_anion,15


<img src="https://pubchem.ncbi.nlm.nih.gov/image/imgsrv.fcgi?cid=4757&t=l" alt="PubChem structure image for CID 4757" width="200" height="200">

In [32]:
from rdkit import Chem

In [33]:
# Keep only the rows that have a neutral-state structure file. The explicit
# copy makes `data` an independent frame, so columns can be added to it later
# without pandas treating it as a slice of the unfiltered table.
data=data.dropna(subset=['mol_filename_neutral']).copy()

In [34]:
from rdkit.Chem import Descriptors
from IPython.display import clear_output
from rdkit.Chem.Descriptors3D import *

# (column name, function) pairs for the 3D shape descriptors, in the same
# form as the entries of Descriptors.descList.
desc_3d_list=[('Asphericity',Asphericity),
            ('Eccentricity',Eccentricity),
            ('InertialShapeFactor',InertialShapeFactor),
            ('NPR1',NPR1),
            ('NPR2',NPR2),
            ('PMI1',PMI1),
            ('PMI2',PMI2),
            ('PMI3',PMI3),
            ('RadiusOfGyration',RadiusOfGyration),
            ('SpherocityIndex',SpherocityIndex)
           ]

desc_list=Descriptors.descList+desc_3d_list

# Parse each neutral structure exactly once instead of once per descriptor.
# NOTE(review): MolFromMolFile is called with its default options -- confirm
# the default hydrogen handling is what the 3D descriptors should see.
neutral_mols=[]
for filename in data['mol_filename_neutral']:
    mol_path=f'molecules/neutral/{filename}.mol'
    m=Chem.MolFromMolFile(mol_path)
    if m is None:
        # Fail here with the offending path rather than inside a descriptor function.
        raise ValueError(f'RDKit could not parse {mol_path}')
    neutral_mols.append(m)

# One column per descriptor, rows in the same order (and index) as `data`.
desc_values={
    desc_name:[desc_func(mol) for mol in neutral_mols]
    for desc_name,desc_func in desc_list
}
desc_frame=pd.DataFrame(desc_values,index=data.index)

# Attach all descriptor columns in one concat rather than inserting them one
# by one; descriptor columns left over from an earlier run are replaced.
data=pd.concat([data.drop(columns=desc_frame.columns,errors='ignore'),desc_frame],axis=1)
clear_output()

In [35]:
# Show the table, which now includes one column per computed descriptor.
data

Unnamed: 0,Sr. No.,FG_name,FG Position,Redox Potential,mol_filename_neutral,mol_filename_anion,FG_no,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,...,Asphericity,Eccentricity,InertialShapeFactor,NPR1,NPR2,PMI1,PMI2,PMI3,RadiusOfGyration,SpherocityIndex
0,Phenazine,-,-,-1.74,PHENAZINE_neutral,PHENAZINE_anion,0,4.515185,0.950185,4.515185,...,0.565426,0.984436,0.004060,0.175744,0.824256,203.025746,952.211108,1155.236854,2.590521,3.469194e-10
1,1,2CH3N,1,-1.85,2CH3N_R1_neutral,2CH3N_R1_Anion,1,4.694722,0.939167,4.694722,...,0.414850,0.964725,0.001641,0.263258,0.754145,459.636664,1316.701291,1745.953265,2.894724,2.616881e-02
2,2,2CH3N,2,-1.98,2CH3N_R2_neutral,2CH3N_R2_Anion,1,4.631574,0.939167,4.631574,...,0.652689,0.991067,0.003116,0.133363,0.880800,282.693153,1867.057219,2119.729166,3.186999,2.147342e-02
3,3,NH2,1,-1.85,NH2_R1_neutral,NH2_R1_anion,14,5.844735,0.671994,5.844735,...,0.480865,0.974914,0.002627,0.222582,0.777534,296.015256,1034.054046,1329.914576,2.672942,1.638895e-04
4,4,NH2,2,-1.92,NH2_R2_neutral,NH2_R2_anion,14,5.700164,0.714890,5.700164,...,0.620758,0.988920,0.003874,0.148451,0.851582,219.819135,1260.983509,1480.753866,2.820395,4.615913e-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184,185,NO2,1 2 6 7,-0.35,NO2_R1267_neutral,NO2_R1267_anion,15,11.230655,-1.013112,11.230655,...,0.573428,0.985310,0.000926,0.170773,0.857589,925.652115,4648.439587,5420.359453,3.928670,3.827094e-02
185,186,NO2,1 4 6 9,-0.52,NO2_R1469_neutral,NO2_R1469_anion,15,11.151935,-0.893097,11.151935,...,0.215618,0.852354,0.000234,0.522965,0.525738,2250.280048,2262.214379,4302.929844,3.517875,6.432619e-02
186,187,NO2,2 3 7 8,-0.52,NO2_R2378_neutral,NO2_R2378_anion,15,10.985181,-0.970500,10.985181,...,0.654730,0.991202,0.001018,0.132357,0.891029,874.926498,5890.018831,6610.353674,4.333218,3.206787e-02
187,188,NO2,1 2 3 4 6 9,0.16,NO2_R123469_neutral,NO2_R123469_anion,15,11.524990,-1.915802,11.524990,...,0.258234,0.916344,0.000251,0.400392,0.652081,2601.278722,4236.464502,6496.833550,3.857006,7.038922e-02


In [36]:
# Drop every column whose values never differ from the first row (constant
# columns). NaN != NaN evaluates to True, so a plain inequality test would
# keep an all-NaN column; treat "both missing" as equal to remove it as well.
first_row=data.iloc[0]
both_missing=data.isna() & first_row.isna()
differs=data.ne(first_row) & ~both_missing
# .copy() keeps the result independent of the wider frame it was cut from.
data=data.loc[:, differs.any()].copy()
data

Unnamed: 0,Sr. No.,FG_name,FG Position,Redox Potential,mol_filename_neutral,mol_filename_anion,FG_no,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,...,Asphericity,Eccentricity,InertialShapeFactor,NPR1,NPR2,PMI1,PMI2,PMI3,RadiusOfGyration,SpherocityIndex
0,Phenazine,-,-,-1.74,PHENAZINE_neutral,PHENAZINE_anion,0,4.515185,0.950185,4.515185,...,0.565426,0.984436,0.004060,0.175744,0.824256,203.025746,952.211108,1155.236854,2.590521,3.469194e-10
1,1,2CH3N,1,-1.85,2CH3N_R1_neutral,2CH3N_R1_Anion,1,4.694722,0.939167,4.694722,...,0.414850,0.964725,0.001641,0.263258,0.754145,459.636664,1316.701291,1745.953265,2.894724,2.616881e-02
2,2,2CH3N,2,-1.98,2CH3N_R2_neutral,2CH3N_R2_Anion,1,4.631574,0.939167,4.631574,...,0.652689,0.991067,0.003116,0.133363,0.880800,282.693153,1867.057219,2119.729166,3.186999,2.147342e-02
3,3,NH2,1,-1.85,NH2_R1_neutral,NH2_R1_anion,14,5.844735,0.671994,5.844735,...,0.480865,0.974914,0.002627,0.222582,0.777534,296.015256,1034.054046,1329.914576,2.672942,1.638895e-04
4,4,NH2,2,-1.92,NH2_R2_neutral,NH2_R2_anion,14,5.700164,0.714890,5.700164,...,0.620758,0.988920,0.003874,0.148451,0.851582,219.819135,1260.983509,1480.753866,2.820395,4.615913e-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184,185,NO2,1 2 6 7,-0.35,NO2_R1267_neutral,NO2_R1267_anion,15,11.230655,-1.013112,11.230655,...,0.573428,0.985310,0.000926,0.170773,0.857589,925.652115,4648.439587,5420.359453,3.928670,3.827094e-02
185,186,NO2,1 4 6 9,-0.52,NO2_R1469_neutral,NO2_R1469_anion,15,11.151935,-0.893097,11.151935,...,0.215618,0.852354,0.000234,0.522965,0.525738,2250.280048,2262.214379,4302.929844,3.517875,6.432619e-02
186,187,NO2,2 3 7 8,-0.52,NO2_R2378_neutral,NO2_R2378_anion,15,10.985181,-0.970500,10.985181,...,0.654730,0.991202,0.001018,0.132357,0.891029,874.926498,5890.018831,6610.353674,4.333218,3.206787e-02
187,188,NO2,1 2 3 4 6 9,0.16,NO2_R123469_neutral,NO2_R123469_anion,15,11.524990,-1.915802,11.524990,...,0.258234,0.916344,0.000251,0.400392,0.652081,2601.278722,4236.464502,6496.833550,3.857006,7.038922e-02


In [37]:
# Write the current table to disk, leaving out the row index.
output_csv='DATA.csv'
data.to_csv(output_csv,index=False)

In [38]:
# from rdkit.Chem import Descriptors3D
# from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator

# descList=[d[0] for d in Descriptors.descList ]
# calc=MolecularDescriptorCalculator(descList)
# calc.CalcDescriptors(m)
# #
# list(rdMolDescriptors.Properties().GetPropertyNames())