# SMILES to HDF5

In [1]:
import zipfile
import tempfile
import glob

from rdkit import Chem
from rdkit.Chem import PandasTools as pdt
from rdkit.Chem import AllChem


In [2]:
import pandas as pd

In [17]:
# Space-separated text file holding one "<smiles> <SA number>" record per line.
# NOTE(review): the name below is taken from the directory listing at the end
# of this notebook -- replace it with the path to your own input file.
filename = 'sigma_aldrich_export_prod_20181201.smi'

In [3]:

# Read both columns as text. Some 'SA number' fields are digit strings far too
# long for int64 (see row 0 of the preview below), so leaving the dtype to
# chunk-by-chunk inference can produce a column with mixed int/str values.
data = pd.read_csv(filename, sep=' ',
                   names=['smiles', 'SA number'],
                   dtype={'smiles': str, 'SA number': str})
data.head()

Unnamed: 0,smiles,SA number
0,c1c([nH]cn1)CCN,2792727428711615287607702876230328811120288214303943982084773919356187781992612325
1,c1ccc(cc1)/C=C/C(=O)Nc2ccc3c(c2)c(=O)cc(o3)c4ccccc4,100863698
2,CC(C(=O)Nc1ccc(cc1)NC(=O)C)Sc2nnnn2c3ccccc3,29949812
3,c1ccc(cc1)S(=O)(=O)N2CCCc3c2ccc(c3)C(=O)NC4CCCCCCC4,111189952
4,Cc1cc(=O)[nH]c(n1)N2CCC(CC2)NC(=O)CCc3[nH]c4ccccc4n3,458947156


In [4]:
# Total number of rows read from the input file.
len(data)

8264787

In [5]:
def smi2mol(smiles):
    """Parse a SMILES string with RDKit.

    Parameters
    ----------
    smiles : str
        SMILES string to parse.

    Returns
    -------
    object
        Whatever ``Chem.MolFromSmiles`` returns for ``smiles``
        (NOTE(review): RDKit presumably returns None, rather than raising,
        for SMILES it cannot parse -- confirm), or NaN if the call raises.
    """
    try:
        return Chem.MolFromSmiles(smiles)
    except Exception:
        # ``except Exception`` instead of a bare ``except`` so that
        # KeyboardInterrupt/SystemExit still stop a long-running ``apply``.
        # ``float("nan")`` replaces ``pd.np.nan``: the ``pd.np`` alias is
        # deprecated and no longer exists in current pandas releases.
        return float("nan")
    
def smi2inchi(smiles):
    """Convert a SMILES string to an InChIKey.

    Parameters
    ----------
    smiles : str
        SMILES string to convert.

    Returns
    -------
    str or float
        The value returned by ``Chem.MolToInchiKey`` for the parsed molecule,
        or NaN when ``smiles`` is not a string or RDKit yields no molecule,
        so that failures can be found afterwards with ``pd.isna``.
    """
    # Blank fields in a text file are read by pandas as float NaN, not str;
    # report them as failed conversions instead of handing them to RDKit.
    if not isinstance(smiles, str):
        return float("nan")
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        # ``float("nan")`` replaces ``pd.np.nan``: the ``pd.np`` alias is
        # deprecated and no longer exists in current pandas releases.
        return float("nan")
    return Chem.MolToInchiKey(mol)
    
# Compute one InChIKey (or NaN on failure) per SMILES row.
# Despite its name, `romol` holds InChIKey strings, not RDKit molecules.
romol = data["smiles"].apply(smi2inchi)

In [6]:
# Attach the per-row results of smi2inchi as the `inchi_key` column, then
# release the temporary Series. Because `romol` is deleted here, this cell
# cannot be re-run on its own without first recomputing `romol`.
data["inchi_key"] = romol
del romol

In [7]:
# Separate rows without an InChIKey (failed conversions) from the rest:
# `failed` keeps them for inspection, `data` keeps only the successes.
mask = data["inchi_key"].isna()
failed = data.loc[mask]
data = data.loc[~mask]

In [11]:
# Print how many rows failed conversion, then preview the first few of them.
print(failed.shape[0])
failed.head(5)

431


Unnamed: 0,smiles,SA number,inchi_key
61808,c1ccc(cc1)[P+](c2ccccc2)(c3ccccc3)C4C5C(C=C(C4=O)c6cccc(c6)F)[N](=O)c7ccccc7[N]5=O,128036082,
62163,CCc1cc2c(cc1OCC(=O)N[C@@H](C(C)C)C(=O)O)occ(c2=O)C3=[N](C=NN3)c4ccccc4,150455978,
62228,c1ccc(cc1)CC[N]2=CNc3c(c4c(n3/N=C/c5ccc(cc5)F)nc6ccccc6n4)C2=O,234133637,
63992,COCC[N]1=CNc2c(c3c(n2/N=C/c4ccc(cc4)F)nc5ccccc5n3)C1=O,234133623,
65322,CC1=CC(=C)NC2=[N]1NC(=CN3C(=O)c4ccccc4C3=O)N2,410509234,


In [10]:
# Preview the frame now that the `inchi_key` column is attached.
data.head()

Unnamed: 0,smiles,SA number,inchi_key
0,c1c([nH]cn1)CCN,2792727428711615287607702876230328811120288214303943982084773919356187781992612325,NTYJJOPFIAHURM-UHFFFAOYSA-N
1,c1ccc(cc1)/C=C/C(=O)Nc2ccc3c(c2)c(=O)cc(o3)c4ccccc4,100863698,OPLJSNOMUAQEHB-SDNWHVSQSA-N
2,CC(C(=O)Nc1ccc(cc1)NC(=O)C)Sc2nnnn2c3ccccc3,29949812,VRSROEQQOQWWRP-UHFFFAOYSA-N
3,c1ccc(cc1)S(=O)(=O)N2CCCc3c2ccc(c3)C(=O)NC4CCCCCCC4,111189952,HWHPXVZTRRVKDG-UHFFFAOYSA-N
4,Cc1cc(=O)[nH]c(n1)N2CCC(CC2)NC(=O)CCc3[nH]c4ccccc4n3,458947156,KUBBSKDNILBLHX-UHFFFAOYSA-N


## Count duplicate InChIKeys and time key lookups (duplicates are not removed here)

In [9]:
# Number of distinct InChIKeys that occur on more than one row.
# The Series' own .sum() is vectorised; the builtin sum() walks the
# boolean Series one element at a time in Python.
(data.inchi_key.value_counts() > 1).sum()

8526

In [13]:
import time

# Time a single membership test against the raw values array.
# perf_counter() is the clock meant for measuring short intervals;
# time.time() is wall-clock time and can jump if the system clock changes.
time0 = time.perf_counter()
"DGEKNIYQAIAEGO-VKKIDBQXSA-N" in data.inchi_key.values
print(time.perf_counter() - time0)

0.18393349647521973


In [14]:
# Build a set of all InChIKeys once (outside the timed region), then time the
# same membership test against it for comparison with the array lookup.
s = set(data.inchi_key.values)
# perf_counter() is the clock meant for measuring short intervals;
# time.time() is wall-clock time and can jump if the system clock changes.
time0 = time.perf_counter()
"DGEKNIYQAIAEGO-VKKIDBQXSA-N" in s
print(time.perf_counter() - time0)

6.985664367675781e-05


In [15]:
# Number of distinct InChIKey values collected in the set `s`.
len(s)

8254810

In [18]:
# Write the cleaned table next to the input file as "<filename>.h5",
# stored under the key 'table' with the highest compression level.
data.to_hdf(path_or_buf='%s.h5' % filename, key='table', complevel=9)

In [19]:
# List the folder containing the output file to confirm it was written
!ls -ltr # Path to folder

total 1134464
-rw-r----- 1 kfjb203 xem-scp-cc 448551863 Feb  7 15:58 sigma_aldrich_export_prod_20181201.smi
-rw-rw---- 1 kfxl284 xem-scp-cc 730898620 Feb  8 10:41 sigma_aldrich_export_prod_20181201.smi.h5
