# Preprocessing for Leffingwell data

### Substantial preprocessing previously done by the team at Google; see leffingwell_readme.pdf
### Preprocessing here is to convert this to the Pyrfume standard format

In [1]:
from itertools import chain
import numpy as np
import pandas as pd
import pyrfume
from pyrfume.odorants import get_cids, from_cids, canonical_smiles, smiles_to_mol
from rdkit.Chem.Descriptors import MolWt
from tqdm.auto import tqdm

In [2]:
# Load the data previously processed by Google from the Leffingwell raw source file
# (the raw source file itself is not available here); rows are indexed by SMILES string.
raw = (
    pd.read_csv('leffingwell_data.csv')
    .set_index('smiles')
)

In [3]:
# Obtain the PubChem IDs (CIDs) for every SMILES string in the index of `raw`.
# ~100 of these ~3500 molecules cannot be found in PubChem; each failed lookup is
# reported as a "Could not find ..." line in the cell output.
# Later cells use the result as a mapping (`cids.get`, `cids.values()`); presumably it is
# keyed by the SMILES strings passed in here -- TODO confirm against the pyrfume API.
cids = pyrfume.get_cids(raw.index, kind='smiles')

  0%|          | 0/3523 [00:00<?, ?it/s]

Could not find CCCCOC(C)OC(C)CCC
Could not find CCCC(C)OC(C)OCCC(C)C
Could not find CCCOC(C)OC(C)CCC
Could not find CC(=O)N1CCC=CS1
Could not find Cc1coc(SSSSc2occ(C)c2C)c1C
Could not find CCCC(OCC)OCCC(C)C
Could not find CC(C)CC1(CC(C)C)CSSS1
Could not find CC(C)C1(C(C)C)CSSS1
Could not find CCCN1SSC=C(C)C1(C)C
Could not find CCCC(C)(C)OC(=O)C(O)CC(=O)O
Could not find CCCCCC1C(=O)OC(C)=C1C
Could not find CC1=C(C)C(N2CCCC2)=NSS1
Could not find Cc1coc(C(=O)c2cccs2)c1C
Could not find CCCC(C)C(=S)OCC
Could not find CCN1CCN=C2C=CC=C21
Could not find CCCCCCCCC=CC=CC=CC=CC=O
Could not find CCCCCC(OCC)OCCC
Could not find CCCCC=COC(=O)C(CC)CC
Could not find CCC(C)C(=S)OCC(C)C
Could not find CC(C)CC1(C)C=CSSN1C
Could not find CC(C)CS1=CNCC1
Could not find CCCC(CS)OC=O
Could not find SCC1CC=CS1
Could not find CCCSC(O)(CS)CCC
Could not find CCC(OC)c1cnccn1
Could not find COC(=S)C(C)Cc1ccco1
Could not find CSSC(C)c1ccco1
Could not find CCC(=O)C(C)SSc1ccco1
Could not find CC=CC1(C)CCOSC1(C)C
Could 

In [4]:
# Attach the PubChem ID of each molecule as a new `CID` column, looked up by the
# SMILES string in the index.
# 0 will be used to indicate molecules that could not be found in PubChem
raw = raw.assign(CID=raw.index.map(cids.get))

In [5]:
# Replace every SMILES string in the index with its canonical form.
# This must run after the `CID` column has been added, because `cids` was built
# from the index values as they were before canonicalization.
raw.index = [canonical_smiles(smiles) for smiles in raw.index]

In [6]:
# Get standard information from PubChem for the CIDs that were found.
# 0 is only the placeholder for "not found in PubChem", so it is left out of the query;
# sorting makes the order of the request (and of the rows in `info`) reproducible
# instead of depending on set iteration order.
found_cids = sorted(cid for cid in set(cids.values()) if cid)
info_dict = from_cids(found_cids)
# Convert to a DataFrame indexed by CID so it can be joined onto `raw` via its `CID` column
info = pd.DataFrame(info_dict).set_index('CID')

  0%|          | 0/35 [00:00<?, ?it/s]

In [8]:
# Join the PubChem standard information with the original data
df = raw.join(info, on='CID', how='left')

# Rows with no CID (0 is the "not found in PubChem" placeholder).
# A positional boolean mask is used for all writes below instead of the SMILES labels:
# label-based selection would also hit any other row sharing the same canonical SMILES,
# and breaks outright when the selected labels themselves contain duplicates.
no_cid = (df['CID'] == 0).to_numpy()

# Those smiles associated with no CID
empty_smiles = df.index[no_cid]

# Fill 'IsomericSMILES' column for molecules with no CID using the SMILES from the index
# (the index was canonicalized earlier, so these are canonical SMILES strings)
df.loc[no_cid, 'IsomericSMILES'] = empty_smiles.to_numpy()

# Fill 'name' column for molecules with no CID using original `chemical_name`
df.loc[no_cid, 'name'] = df.loc[no_cid, 'chemical_name'].to_numpy()

# No `IUPACName` will be computed for molecules with no CID

# Fill 'MolecularWeight' column for molecules with no CID using SMILES-based MW calculation
mols = smiles_to_mol(empty_smiles, max_attempts=1000)
mws = pd.Series({smiles: MolWt(mol) for smiles, mol in mols.items()})
# Look the weights up by SMILES (raises if a molecule could not be built), then write
# them positionally so the values line up one-to-one with the masked rows.
df.loc[no_cid, 'MolecularWeight'] = mws.loc[empty_smiles].to_numpy()
df['MolecularWeight'] = df['MolecularWeight'].astype(float)

  0%|          | 0/93 [00:00<?, ?it/s]

In [9]:
# Build the `molecules` dataframe with the basic information about each molecule:
# every column that came from PubChem (`info`) plus the CAS number, indexed and sorted by CID.
molecules = (
    df.loc[:, [*info.columns, 'CID', 'cas']]
    .set_index('CID')
    .sort_index()
)
molecules.head()

Unnamed: 0_level_0,MolecularWeight,IsomericSMILES,IUPACName,name,cas
CID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,188.336,CC(C)CC(C)(O)C1CCCS1,,Isopropylmethyltetrahydrothiophenyl-ethanol,1612888-42-2
0,194.23,CCCCC=COC(=O)c1ccco1,,Hexenyl furoate,
0,198.306,CCCCC=COC(=O)C(C)CCC,,Hexenyl methylvalerate,
0,240.387,CCCCC=COC(=O)CCCCCCCC,,Hexenyl nonanoate,88191-46-2
0,223.319,CSCC[C@H](N)C(=O)OC(=O)CS,,Methionyl thioglycolate,852997-30-9


In [10]:
# Build the sparse `behavior` dataframe holding the label data, indexed and sorted by CID;
# all applicable labels of a molecule are contained in its `Labels` column.
behavior_sparse = (
    df.loc[:, ['IsomericSMILES', 'odor_data', 'odor_labels_filtered', 'CID']]
    .set_index('CID')
    .sort_index()
)
# Rename the three columns. The names are installed as a single-level MultiIndex
# (which is what assigning a list nested inside a list produces); the dense-table cell
# depends on that through its `.squeeze()` calls and `drop(..., level=0)`.
behavior_sparse.columns = pd.MultiIndex.from_arrays(
    [['IsomericSMILES', 'Raw Labels', 'Labels']]
)

In [11]:
# Create a dense version of the above; each label will have its own binary-valued column
behavior = behavior_sparse.copy()
# Parse the string form of each row's label list exactly once and reuse the result,
# instead of re-parsing every row again for every label inside the loop.
# NOTE(review): `eval` executes whatever expression the CSV contains; if the label column
# only ever holds plain list literals, `ast.literal_eval` would be the safer parser.
row_labels = behavior['Labels'].squeeze().map(eval)
# All the labels in the dataset
all_labels = set(chain.from_iterable(row_labels))
for label in tqdm(all_labels):
    # 1 if this row carries the label, 0 otherwise
    behavior[label] = row_labels.map(lambda labels: label in labels).astype(int)
# Keep only the SMILES and the per-label indicator columns, in sorted column order
behavior = behavior.drop(['Raw Labels', 'Labels'], axis=1, level=0).sort_index(axis=1)

  0%|          | 0/113 [00:00<?, ?it/s]

In [12]:
# Write files to disk
molecules.to_csv('molecules.csv')

# The two behavior tables carry their column names as a single-level MultiIndex (the
# columns of `behavior_sparse` are assigned from a nested list, and `behavior` is derived
# from it). Written as-is, `to_csv` emits an extra header row holding only the index name.
# Flatten the columns on a copy first so each file gets one ordinary header row;
# `get_level_values(0)` is also valid on already-flat columns.
for frame, filename in ((behavior_sparse, 'behavior_sparse.csv'), (behavior, 'behavior.csv')):
    flat = frame.copy()
    flat.columns = flat.columns.get_level_values(0)
    flat.to_csv(filename)