In [4]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, SaltRemover, rdmolfiles
from mordred import Calculator, descriptors as all_descriptors

In [47]:
# Load the odorant table and keep a single Series: isomeric SMILES strings
# labelled by the 'CID' column.
smiles = (
    pd.read_csv('data/snitz-odorant-info.csv')
    .set_index('CID')
    .loc[:, 'IsomericSMILES']
)

In [49]:
# Descriptor calculator built from every descriptor module mordred exposes.
calc = Calculator(all_descriptors)

# Seed for RDKit's conformer embedding. EmbedMolecule starts from random
# coordinates; without an explicit non-negative seed the 3D geometry (and any
# value derived from it) changes from run to run.
EMBED_SEED = 42

print("Convering SMILES string to Mol format...")
# MolFromSmiles returns None (it does not raise) for SMILES it cannot parse;
# those entries are reported and dropped in the loop below.
mols = {cid: Chem.MolFromSmiles(smi) for cid, smi in smiles.items()}

print("Computing 3D coordinates...")
s = SaltRemover.SaltRemover()
# Only values of existing keys are reassigned inside the loop, so the dict
# never changes size while it is being iterated.
for i, (cid, mol) in enumerate(mols.items()):
    if i > 0 and i % 100 == 0:
        print("Finished %d" % i)
    try:
        if mol is None:
            raise ValueError("could not parse SMILES %r" % smiles[cid])
        mol.SetProp("_Name", "%d: %s" % (cid, smiles[cid]))
        # Strip counter-ions but never strip the molecule down to nothing.
        mol = s.StripMol(mol, dontRemoveEverything=True)
        mol = Chem.AddHs(mol)
        # NOTE(review): EmbedMolecule replaces existing conformers by default,
        # so this 2D step is probably redundant - confirm before removing.
        AllChem.Compute2DCoords(mol)
        # EmbedMolecule signals failure with a negative conformer id instead
        # of raising, so check it explicitly rather than relying on the
        # optimizer to fail on a molecule without a conformer.
        if AllChem.EmbedMolecule(mol, randomSeed=EMBED_SEED) < 0:
            raise ValueError("3D embedding failed")
        # The optimizer's return code is ignored, as in the original cell.
        AllChem.UFFOptimizeMolecule(mol)
    except Exception as e:
        # Best-effort pipeline: report why this molecule failed and drop it.
        print('Exception for %d: %s' % (cid, e))
        mols[cid] = None
    else:
        mols[cid] = mol

# Keep only the molecules that made it through every preparation step.
mols = {cid: mol for cid, mol in mols.items() if mol is not None}

Convering SMILES string to Mol format...
Computing 3D coordinates...


In [50]:
# Number of distinct CIDs in the SMILES index; compare against the row count
# to spot duplicated compounds.
len({cid for cid in smiles.index})

86

In [51]:
# Compute the descriptors for every prepared molecule and label each row with
# its CID. A dict yields .keys() and .values() in the same order, so the new
# index lines up with the rows.
results = (
    calc.pandas(mols.values())
    .set_index(pd.Index(mols.keys(), name='CID'))
)
# Preview the first rows.
results.head()

100%|██████████| 86/86 [00:02<00:00, 34.22it/s]


Unnamed: 0_level_0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
CID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5634,8.704061,7.968331,1,0,15.322595,1.98289,3.965779,15.322595,1.178661,3.379464,...,8.014666,40.74339,184.14633,5.580192,354,10,48.0,46.0,5.361111,3.333333
7685,9.589507,8.788034,0,0,15.688441,2.243107,4.486214,15.688441,1.206803,3.450509,...,9.034438,43.3681,178.09938,6.596273,268,15,60.0,65.0,5.694444,2.944444
31252,5.875634,5.525875,0,0,9.924777,2.170086,4.340173,9.924777,1.240597,2.97973,...,8.463159,35.730685,108.068748,6.754297,62,7,36.0,38.0,3.222222,1.833333
5283349,7.071068,6.765664,0,0,13.191508,1.931852,3.863703,13.191508,1.199228,3.202455,...,7.601402,37.236738,152.120115,5.634078,220,8,38.0,36.0,4.25,3.0
7710,7.887564,7.597369,0,0,13.522018,2.237474,4.261308,13.522018,1.229274,3.28152,...,8.547334,53.261417,156.11503,5.782038,181,9,48.0,51.0,3.972222,2.666667


In [52]:
# (rows, columns) of the descriptor table right after computation, before any
# cleaning or column filtering.
results.shape

(86, 1825)

In [53]:
def fix(x):
    """Coerce a single value to ``float``, mapping failures to ``None``.

    Parameters
    ----------
    x : object
        Any value; it is handed to ``float()`` unchanged.

    Returns
    -------
    float or None
        ``float(x)`` when the conversion succeeds, otherwise ``None``.
    """
    try:
        return float(x)
    except Exception:
        # Any ordinary conversion failure means "no usable number here".
        # Unlike a bare ``except:``, this still lets KeyboardInterrupt and
        # SystemExit propagate, so a long element-wise pass can be stopped.
        return None

# Run fix() over every cell so each entry becomes a float or a missing value.
# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the old
# name now emits a FutureWarning; use the new name when this pandas has it.
# The check is made on the class, not the instance, because attribute lookup
# on a frame instance can also resolve to a column named 'map'.
results = results.map(fix) if hasattr(pd.DataFrame, 'map') else results.applymap(fix)

In [54]:
# Share of missing entries in each descriptor column.
frac_bad = results.isna().mean()
# Keep the columns where fewer than 30% of the values are missing.
good = frac_bad.loc[frac_bad.lt(0.3)].index
results = results[good]

In [55]:
# Fill the remaining missing values with fancyimpute's KNN imputer.
from fancyimpute import KNN
# k=5 is presumably the number of neighbours used per imputed entry -
# confirm against the fancyimpute documentation.
knn = KNN(k=5)
# Slice assignment writes the imputed array back into the existing frame, so
# the index and column labels of `results` are kept.
results[:] = knn.fit_transform(results.values)

Imputing row 1/86 with 2 missing, elapsed time: 0.044


In [56]:
# Write the filtered, imputed descriptor table (index included) to CSV.
results.to_csv('data/snitz-mordred.csv')

In [57]:
# (rows, columns) of the table that was written out, after the column filter
# and imputation.
results.shape

(86, 1605)