# DESCRIPTOR GENERATION

In [85]:
from mlchem.chem.calculator import descriptors
from mlchem.chem.manipulation import create_molecule
from mlchem.helper import dfs_to_excel
import pandas as pd
import numpy as np
# Load the input table; the cells below read the molecules from its `SMILES` column.
data = pd.read_csv(filepath_or_buffer='../data/data.csv')

### RDKit 

In [87]:
# Compute the RDKit descriptor table with 3D descriptors disabled
# (include_3D=False), then drop rows whose column values exactly repeat an
# earlier row. drop_duplicates() returns a new frame, so the object handed
# back by the library is never mutated in place.
desc_rdkit = descriptors.get_rdkitDesc(data.SMILES, include_3D=False).drop_duplicates()

# Total number of missing values before any cleaning.
print(desc_rdkit.isna().sum().sum())

# Keep only the columns that pandas can parse as numbers. This is a
# deliberate best-effort filter: a column that fails conversion for any
# reason is skipped rather than aborting the cell.
columns_to_keep = []
for column_name in desc_rdkit.columns:
    try:
        pd.to_numeric(desc_rdkit[column_name])
    except Exception:
        continue
    columns_to_keep.append(column_name)

# Build the cleaned table as a single non-mutating chain. The previous
# version called replace/dropna with inplace=True on a frame obtained by
# column selection, which is the chained-assignment pattern pandas warns
# about (SettingWithCopyWarning).
desc_rdkit = (
    desc_rdkit[columns_to_keep]
    .replace([np.inf, -np.inf], np.nan)  # treat +/-inf as missing values
    .dropna(axis=1)                      # axis=1: drop every column that contains a NaN
    .rename_axis(index='SMILES')         # name the row index 'SMILES'
)

# Total number of missing values after cleaning (0 once NaN columns are dropped).
print(desc_rdkit.isna().sum().sum())

desc_rdkit.to_csv('../data/data_rdkit.csv')
desc_rdkit

116
0


Unnamed: 0_level_0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
SMILES,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CCCC=C=NCCCCO,8.437125,8.437125,0.272018,0.272018,0.460034,8.909091,155.241,138.105,155.131014,64.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
CCCCCOCCCC=C=NCCCCO,8.559624,8.559624,0.258536,0.258536,0.421419,10.000000,241.375,214.159,241.204179,100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
CCCOCC(C)O,8.653994,8.653994,0.318009,-0.318009,0.552991,13.875000,118.176,104.064,118.099380,50.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CC(CS#CCO)OCCCC=CN,8.460126,8.460126,0.001308,-0.001308,0.659917,12.714286,217.334,198.182,217.113650,82.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
CS#CCCNCCO,8.334665,8.334665,0.221496,0.221496,0.559815,8.666667,147.243,134.139,147.071785,54.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C=NOC(CC=O)C[SH]=COCCCCC,10.310577,10.310577,0.187816,-0.187816,0.151666,13.062500,247.360,226.192,247.124215,94.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
CCCC=C=CCC(C)CC,3.201528,3.201528,0.817154,0.817154,0.523454,11.909091,152.281,132.121,152.156501,64.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
C1=CCCC1,2.236111,2.236111,1.319444,1.319444,0.379082,20.800000,68.119,60.055,68.062600,28.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CCCCCCCCCOCC,5.261202,5.261202,0.870868,0.870868,0.480948,10.500000,172.312,148.120,172.182715,74.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0


### Mordred

In [14]:
# Mordred descriptor table for every molecule in the input table.
desc_mordred = descriptors.get_mordredDesc(data['SMILES'])

##### Some descriptors return an error; keep only the descriptors that can be converted to numbers

In [60]:
# Collect the names of the Mordred columns that pandas can parse as numbers.
# Any column whose conversion raises is skipped (best-effort filter).
columns_to_keep = []
for column_name in desc_mordred.columns:
    try:
        pd.to_numeric(desc_mordred[column_name])
    except Exception:
        continue
    columns_to_keep.append(column_name)

# Restrict the table to the convertible columns and display it.
desc_mordred_filtered = desc_mordred[columns_to_keep]
desc_mordred_filtered

Unnamed: 0,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,VE1_A,VE2_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
OOC1=CC=CC=C1,0,0,10.424292,2.135779,4.271558,10.424292,1.303037,2.969338,2.718002,0.33975,...,8.298291,35.247635,110.036779,7.85977,64,7,34.0,36.0,2.611111,2.0
CCCC1=CC=CC=C1,0,0,11.384646,2.148961,4.297922,11.384646,1.264961,3.08002,2.824369,0.313819,...,8.379998,36.722228,120.0939,5.718757,94,8,38.0,40.0,2.861111,2.25
CCC=CCCC1=CC=CC=C1,0,0,15.459017,2.157647,4.315295,15.459017,1.288251,3.353323,3.033222,0.252768,...,8.541886,40.851862,160.125201,5.718757,242,11,50.0,52.0,3.611111,3.0
COCCCCC1=CC=CC=C1,0,0,15.459017,2.157647,4.315295,15.459017,1.288251,3.353323,3.033222,0.252768,...,8.541886,40.851862,164.120115,5.861433,242,11,50.0,52.0,3.611111,3.0
C1=CC=CC=C1,0,0,8.0,2.0,4.0,8.0,1.333333,2.687624,2.44949,0.408248,...,7.627057,30.941317,78.04695,6.503913,27,3,24.0,24.0,1.5,1.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
O=C(O)CC1=CC=CC=C1CC1C=C1,1,0,17.787521,2.347719,4.629396,17.787521,1.270537,3.602532,3.418916,0.244208,...,9.332646,63.559958,188.08373,7.23399,312,16,70.0,79.0,4.444444,3.111111
CCCOC1=CC=CC=C1CC(=O)O,1,0,17.070407,2.279983,4.559965,17.070407,1.219315,3.514345,3.281147,0.234368,...,9.05357,44.461636,194.094294,6.931939,324,16,62.0,67.0,5.333333,3.361111
C=NCCCCCCC1=CC=CC=C1CC(=O)O,1,0,22.15957,2.281196,4.562393,22.15957,1.231087,3.754486,3.371139,0.187285,...,9.165029,49.367938,247.157229,6.337365,746,20,78.0,83.0,6.333333,4.361111
CCCC#SCC1=CC=CC=C1CC(=O)O,1,0,19.61339,2.281065,4.56213,19.61339,1.225837,3.641607,3.344961,0.20906,...,9.110851,46.936734,236.087101,7.377722,503,18,70.0,75.0,5.833333,3.861111


### Fingerprints

In [70]:
# Fingerprint table for fp_type='m' (written later to the 'fp_morgan' sheet).
desc_fp_m = descriptors.get_fingerprint_df(data['SMILES'], fp_type='m')
desc_fp_m

Unnamed: 0,m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,...,m2039,m2040,m2041,m2042,m2043,m2044,m2045,m2046,m2047,m2048
OOC1=CC=CC=C1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CCCC1=CC=CC=C1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CCC=CCCC1=CC=CC=C1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
COCCCCC1=CC=CC=C1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C1=CC=CC=C1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
O=C(O)CC1=CC=CC=C1CC1C=C1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
CCCOC1=CC=CC=C1CC(=O)O,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C=NCCCCCCC1=CC=CC=C1CC(=O)O,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CCCC#SCC1=CC=CC=C1CC(=O)O,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [69]:
# Fingerprint table for fp_type='ap' (written later to the 'fp_atompair' sheet).
desc_fp_ap = descriptors.get_fingerprint_df(data['SMILES'], fp_type='ap')
desc_fp_ap

Unnamed: 0,ap1,ap2,ap3,ap4,ap5,ap6,ap7,ap8,ap9,ap10,...,ap2039,ap2040,ap2041,ap2042,ap2043,ap2044,ap2045,ap2046,ap2047,ap2048
OOC1=CC=CC=C1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CCCC1=CC=CC=C1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CCC=CCCC1=CC=CC=C1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
COCCCCC1=CC=CC=C1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C1=CC=CC=C1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
O=C(O)CC1=CC=CC=C1CC1C=C1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CCCOC1=CC=CC=C1CC(=O)O,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C=NCCCCCCC1=CC=CC=C1CC(=O)O,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CCCC#SCC1=CC=CC=C1CC(=O)O,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [68]:
# Fingerprint table for fp_type='tt' (written later to the 'fp_toptorsion' sheet).
desc_fp_tt = descriptors.get_fingerprint_df(data['SMILES'], fp_type='tt')
desc_fp_tt

Unnamed: 0,tt1,tt2,tt3,tt4,tt5,tt6,tt7,tt8,tt9,tt10,...,tt2039,tt2040,tt2041,tt2042,tt2043,tt2044,tt2045,tt2046,tt2047,tt2048
OOC1=CC=CC=C1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CCCC1=CC=CC=C1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CCC=CCCC1=CC=CC=C1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
COCCCCC1=CC=CC=C1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C1=CC=CC=C1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
O=C(O)CC1=CC=CC=C1CC1C=C1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CCCOC1=CC=CC=C1CC(=O)O,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C=NCCCCCCC1=CC=CC=C1CC(=O)O,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CCCC#SCC1=CC=CC=C1CC(=O)O,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [66]:
# Fingerprint table for fp_type='rk' (written later to the 'fp_rdkit' sheet).
desc_fp_rk = descriptors.get_fingerprint_df(data['SMILES'], fp_type='rk')
desc_fp_rk

Unnamed: 0,rk1,rk2,rk3,rk4,rk5,rk6,rk7,rk8,rk9,rk10,...,rk2039,rk2040,rk2041,rk2042,rk2043,rk2044,rk2045,rk2046,rk2047,rk2048
OOC1=CC=CC=C1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CCCC1=CC=CC=C1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CCC=CCCC1=CC=CC=C1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
COCCCCC1=CC=CC=C1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C1=CC=CC=C1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
O=C(O)CC1=CC=CC=C1CC1C=C1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CCCOC1=CC=CC=C1CC(=O)O,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C=NCCCCCCC1=CC=CC=C1CC(=O)O,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
CCCC#SCC1=CC=CC=C1CC(=O)O,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [67]:
# Fingerprint table for fp_type='mac' (written later to the 'fp_maccs' sheet).
desc_fp_maccs = descriptors.get_fingerprint_df(data['SMILES'], fp_type='mac')
desc_fp_maccs

Unnamed: 0,mac1,mac2,mac3,mac4,mac5,mac6,mac7,mac8,mac9,mac10,...,mac158,mac159,mac160,mac161,mac162,mac163,mac164,mac165,mac166,mac167
OOC1=CC=CC=C1,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,1,1,1,1,0
CCCC1=CC=CC=C1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,1,0,1,0
CCC=CCCC1=CC=CC=C1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,1,0,1,0
COCCCCC1=CC=CC=C1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,1,1,1,1,0
C1=CC=CC=C1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
O=C(O)CC1=CC=CC=C1CC1C=C1,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,1,1,1,1,0
CCCOC1=CC=CC=C1CC(=O)O,0,0,0,0,0,0,0,0,0,0,...,1,0,1,1,0,1,1,1,1,0
C=NCCCCCCC1=CC=CC=C1CC(=O)O,0,0,0,0,0,0,0,0,0,0,...,1,1,1,0,1,1,1,1,1,0
CCCC#SCC1=CC=CC=C1CC(=O)O,0,0,0,0,0,0,0,0,0,0,...,1,0,1,1,0,1,1,1,1,0


### Chemotypes

In [71]:
# Chemotype flags for every molecule in the input table.
desc_chemotypes = descriptors.get_chemotypes(data['SMILES'])
desc_chemotypes

Unnamed: 0,Carbon,Carbanion,Carbocation,Carbon > 40% tot atoms,Carbon > 60% tot atoms,Carbon > 80% tot atoms,Aromatic Carbon,Aromatic Carbon > 10% tot atoms,Aromatic Carbon > 30% tot atoms,Aromatic Carbon > 50% tot atoms,...,O-Heterocycle (Aromatic),S-Heterocycle,S-Heterocycle (Aromatic),Zwitterion,H-Bond acceptors,H-Bond acceptors > 2,H-Bond acceptors > 5,H-Bond donors,H-Bond donors > 2,H-Bond donors > 5
0,True,False,False,False,False,False,True,True,True,False,...,False,False,False,False,True,False,False,True,False,False
1,True,False,False,False,False,False,True,True,True,False,...,False,False,False,False,False,False,False,False,False,False
2,True,False,False,False,False,False,True,True,False,False,...,False,False,False,False,False,False,False,False,False,False
3,True,False,False,False,False,False,True,True,False,False,...,False,False,False,False,True,False,False,False,False,False
4,True,False,False,True,False,False,True,True,True,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
368,True,False,False,False,False,False,True,True,False,False,...,False,False,False,False,True,False,False,True,False,False
369,True,False,False,False,False,False,True,True,False,False,...,False,False,False,False,True,True,False,True,False,False
370,True,False,False,False,False,False,True,True,False,False,...,False,False,False,False,True,True,False,True,False,False
371,True,False,False,False,False,False,True,True,False,False,...,False,False,False,False,True,False,False,True,False,False


In [73]:
# Multiply every cell by 1 so the True/False flags become 1/0.
desc_chemotypes_binary = desc_chemotypes.mul(1)
desc_chemotypes_binary

Unnamed: 0,Carbon,Carbanion,Carbocation,Carbon > 40% tot atoms,Carbon > 60% tot atoms,Carbon > 80% tot atoms,Aromatic Carbon,Aromatic Carbon > 10% tot atoms,Aromatic Carbon > 30% tot atoms,Aromatic Carbon > 50% tot atoms,...,O-Heterocycle (Aromatic),S-Heterocycle,S-Heterocycle (Aromatic),Zwitterion,H-Bond acceptors,H-Bond acceptors > 2,H-Bond acceptors > 5,H-Bond donors,H-Bond donors > 2,H-Bond donors > 5
0,1,0,0,0,0,0,1,1,1,0,...,0,0,0,0,1,0,0,1,0,0
1,1,0,0,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,1,1,0,0,...,0,0,0,0,1,0,0,0,0,0
4,1,0,0,1,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
368,1,0,0,0,0,0,1,1,0,0,...,0,0,0,0,1,0,0,1,0,0
369,1,0,0,0,0,0,1,1,0,0,...,0,0,0,0,1,1,0,1,0,0
370,1,0,0,0,0,0,1,1,0,0,...,0,0,0,0,1,1,0,1,0,0
371,1,0,0,0,0,0,1,1,0,0,...,0,0,0,0,1,0,0,1,0,0


### Save molecular descriptors, fingerprints and chemotypes in a single file

In [76]:
# Pair each sheet name with the table written to it; dict insertion order
# fixes the order of the two lists passed to dfs_to_excel.
descriptor_sheets = {
    'rdkit': desc_rdkit,
    'mordred': desc_mordred_filtered,
    'fp_morgan': desc_fp_m,
    'fp_atompair': desc_fp_ap,
    'fp_toptorsion': desc_fp_tt,
    'fp_rdkit': desc_fp_rk,
    'fp_maccs': desc_fp_maccs,
    'chemotypes': desc_chemotypes_binary,
}
dfs_to_excel(
    'all_descriptors.xlsx',
    dfs=list(descriptor_sheets.values()),
    sheet_names=list(descriptor_sheets.keys()),
)

### Atomic

In [None]:
# Atom-level descriptors for a single atom of the molecule 'CCCO'.
# NOTE(review): the second argument looks like a zero-based atom index
# (3 -> the O atom, matching SYMBOL 'O' in the recorded output) - confirm
# against get_atomicDesc.
descriptors.get_atomicDesc('CCCO',3)

Unnamed: 0,SMILES,SMILES_H,SYMBOL,total_degree,total_valence,formal_charge,is_SP,is_SP2,is_SP3,tot_single_b,...,min_gasteiger_charge_neighbours,avg_gasteiger_charge_neighbours2,tot_gasteiger_charge_neighbours2,max_gasteiger_charge_neighbours2,min_gasteiger_charge_neighbours2,avg_gasteiger_charge_neighbours3,total_gasteiger_charge_neighbours3,max_gasteiger_charge_neighbours3,min_gasteiger_charge_neighbours3,average_eucl_dist_in_mol
CCCO,CCCO,[H]OC([H])([H])C([H])([H])C([H])([H])[H],O,2,2,0,0,0,1,1,...,0.042789,0.042789,0.042789,0.042789,0.042789,0.042789,0.042789,0.042789,0.042789,2.326693


In [61]:
# Atom-level descriptors for a single atom of 'CCCC[N+](=O)[O-]'.
# NOTE(review): the second argument looks like a zero-based atom index
# (4 -> the charged N atom, matching SYMBOL 'N' and formal_charge 1 in the
# recorded output) - confirm against get_atomicDesc.
descriptors.get_atomicDesc('CCCC[N+](=O)[O-]',4)

Unnamed: 0,SMILES,SMILES_H,SYMBOL,total_degree,total_valence,formal_charge,is_SP,is_SP2,is_SP3,tot_single_b,...,min_gasteiger_charge_neighbours,avg_gasteiger_charge_neighbours2,tot_gasteiger_charge_neighbours2,max_gasteiger_charge_neighbours2,min_gasteiger_charge_neighbours2,avg_gasteiger_charge_neighbours3,total_gasteiger_charge_neighbours3,max_gasteiger_charge_neighbours3,min_gasteiger_charge_neighbours3,average_eucl_dist_in_mol
CCCC[N+](=O)[O-],CCCC[N+](=O)[O-],[H]C([H])([H])C([H])([H])C([H])([H])C([H])([H]...,N,3,4,1,0,1,0,2,...,-0.264549,-0.108563,-0.325689,0.203408,-0.264549,-0.108563,-0.325689,0.203408,-0.264549,3.073122


### Quantum chemical descriptors

In [59]:
# Build a molecule object from the SMILES string.
# NOTE(review): is_3d=True presumably requests 3D coordinates - confirm in create_molecule.
mol = create_molecule('c1ccccc1CCCO',is_3d=True)
# Displays a dict; the recorded output shows keys such as 'AtomicCharges',
# 'Hamiltonian' and 'OrbitalEnergies'.
descriptors.get_EHT_descriptors(mol)

{'AtomicCharges': array([-0.34463776, -0.20701019,  0.07269296, -0.22915957, -0.30144556,
         0.42780534,  0.42540695,  0.1474089 ,  1.12137599, -1.11243708]),
 'Hamiltonian': array([[-2.14000000e+01, -0.00000000e+00, -0.00000000e+00, ...,
          3.30342147e-03, -3.85947647e-04, -1.45563368e-04],
        [-0.00000000e+00, -1.14000000e+01, -0.00000000e+00, ...,
          3.92689415e-03, -4.93958766e-04, -1.86300661e-04],
        [-0.00000000e+00, -0.00000000e+00, -1.14000000e+01, ...,
         -4.93958766e-04, -2.43310690e-04,  2.17660090e-05],
        ...,
        [-0.00000000e+00, -0.00000000e+00, -0.00000000e+00, ...,
         -1.48000000e+01, -0.00000000e+00, -0.00000000e+00],
        [-0.00000000e+00, -0.00000000e+00, -0.00000000e+00, ...,
         -0.00000000e+00, -1.48000000e+01, -0.00000000e+00],
        [-0.00000000e+00, -0.00000000e+00, -0.00000000e+00, ...,
         -0.00000000e+00, -0.00000000e+00, -1.48000000e+01]]),
 'OrbitalEnergies': array([-34.02113492, -29.8156