In [1]:
# External library.
import pandas as pd
from rdkit import Chem
from mordred import Calculator, descriptors

In [86]:
# Load raw data: the bitter and sweet train/test splits (tab-separated files).
bitter_train = pd.read_csv('../data/bitter-train.tsv', sep='\t')
bitter_test = pd.read_csv('../data/bitter-test.tsv', sep='\t')
sweet_train = pd.read_csv('../data/sweet-train.tsv', sep='\t')
sweet_test = pd.read_csv('../data/sweet-test.tsv', sep='\t')

# Concatenate train and test sets in a single data frame per taste.
# Useful for k-fold cross validation. Only Name, SMILES and Taste are kept.
df_bitter = pd.concat([bitter_train[['Name','SMILES','Taste']], bitter_test[['Name','SMILES','Taste']]])
df_sweet = pd.concat([sweet_train[['Name','SMILES','Taste']], sweet_test[['Name','SMILES','Taste']]])

# Concatenate bitter and sweet dataframes into a single dataframe.
# This data frame may have redundancies which will be removed after
# canonicalising the SMILES because even the same molecule fetched
# from different sources may have different SMILES.
# The original row labels of the four input files are kept, so index
# labels repeat in the combined frame.
df = pd.concat([df_bitter, df_sweet], sort=False)

# Rename taste column as Target (returns a new frame instead of mutating in place).
df = df.rename(columns={'Taste':'Target'})

# Replace tasteless and non-bitter with ambiguous.
# The nested mapping restricts the replacement to the Target column and the
# result is assigned back explicitly. The previous form,
# `df.Target.replace(..., inplace=True)`, mutates a temporary column object
# and does not update `df` when pandas Copy-on-Write is active.
df = df.replace({'Target': {"Tasteless": "Ambiguous", "Non-bitter": "Ambiguous"}})

# Save the dataframe at this stage in case we may need it in this form
# in the future analysis.
# NOTE: the file is gzip-compressed although its name ends in ".tsv";
# readers will presumably need to pass compression='gzip' explicitly.
df.to_csv("../data/all_data_pre_descriptor.tsv", sep='\t', compression='gzip')

In [87]:
# Convert SMILES into Canonical SMILES using RDKit.
# Molecules whose SMILES cannot be canonicalised are skipped and counted.
length = df.shape[0]

name = []
smiles = []
canon_smiles = []
target = []
exceptions = 0
# Iterate over the three columns directly instead of building a row object
# with df.iloc[i] for every field access.
for raw_name, raw_smiles, raw_target in zip(df['Name'], df['SMILES'], df['Target']):
    smi = str(raw_smiles)
    try:
        c = Chem.CanonSmiles(smi)
    except Exception:
        # `except Exception` (not a bare `except:`) so that KeyboardInterrupt
        # and SystemExit still stop the cell instead of being counted as a
        # failed molecule.
        exceptions += 1
        continue
    canon_smiles.append(c)
    smiles.append(smi)
    target.append(str(raw_target))
    name.append(str(raw_name))

# Every input row must be either converted or counted as a failure.
assert len(canon_smiles) + exceptions == length

print("Total number molecules that failed with exceptions: ", exceptions)
dict_canon_smiles = {'Name':name, 'SMILES':smiles, 'Canonical SMILES':canon_smiles, 'Target':target}
df_canon_smiles = pd.DataFrame(data=dict_canon_smiles)

Total number molecules that failed with exceptions:  48


In [88]:
# Check for redundancy: keep the first row for each canonical SMILES.
# The index is reset afterwards so the rows are labelled 0..n-1 again.
# Without this the surviving rows keep their old labels (with gaps), and the
# later column-wise pd.concat with the descriptor table aligns on index
# labels rather than on row position.
df_canon_smiles = (
    df_canon_smiles
    .drop_duplicates(subset="Canonical SMILES")
    .reset_index(drop=True)
)
assert df_canon_smiles["Canonical SMILES"].is_unique

# Report target stats.
print(df_canon_smiles.groupby("Target").nunique())

print("Total number of data points:", df_canon_smiles.shape[0])

           Name  SMILES  Canonical SMILES  Target
Target                                           
Ambiguous   237     273               273       1
Bitter      806     892               892       1
Sweet      1172    1228              1228       1
Total number of data points: 2393


In [89]:
# Persist the de-duplicated table in two formats.
# Archive copy: tab-separated text written with gzip compression.
# NOTE: the file name ends in ".tsv" even though the content is gzipped;
# readers will presumably need to pass compression='gzip' explicitly.
df_canon_smiles.to_csv(
    "../data/df_canon_smiles.tsv",
    sep='\t',
    compression='gzip',
)
# Working copy: pickle, intended for faster execution on reload.
df_canon_smiles.to_pickle('../data/df_canon_smiles.pkl')

In [91]:
# Set up the 2D descriptor run: a mordred calculator with 3D descriptors
# ignored, plus one RDKit molecule per canonical SMILES.
calc = Calculator(descriptors, ignore_3D=True)
print("Total number of 2D descriptors:", len(calc.descriptors))

# Parse each canonical SMILES string into an RDKit molecule object.
mols = list(map(Chem.MolFromSmiles, df_canon_smiles['Canonical SMILES']))
print("Total number of molecules for descriptor calculation:", len(mols))

Total number of 2D descriptors: 1613
Total number of molecules for descriptor calculation: 2393


In [92]:
# Run the 2D calculator over all molecules; the result is a dataframe of
# descriptor values (a progress bar is printed while it runs).
df_desc = calc.pandas(mols)

# Cache the raw descriptor table as a gzip-compressed pickle.
df_desc.to_pickle(
    '../data/df_2d_descriptors.pkl.gz',
    compression='gzip',
)

  1%|          | 23/2393 [00:02<06:45,  5.85it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 26%|██▌       | 620/2393 [00:15<02:36, 11.31it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 38%|███▊      | 915/2393 [00:19<00:35, 41.74it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 39%|███▉      | 937/2393 [00:20<01:27, 16.70it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 39%|███▉      | 941/2393 [00:22<03:03,  7.92it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 40%|███▉      | 949/2393 [00:22<02:49,  8.54it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 41%|████      | 982/2393 [00:25<03:22,  6.97it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 41%|████      | 985/2393 [00:25<02:35,  9.05it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 61%|██████    | 1458/2393 [00:46<00:25, 36.36it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 73%|███████▎  | 1741/2393 [00:53<00:22, 29.37it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 97%|█████████▋| 2323/2393 [01:09<00:05, 12.19it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 2393/2393 [01:10<00:00, 34.03it/s]


In [93]:
# Consolidate name, SMILES, canonical SMILES, descriptors and target in the same dataframe.
# Save the dataframe as a compressed .tsv and as a compressed pickle.
#
# pd.concat(..., axis=1) aligns rows on index labels, not on position.
# df_canon_smiles keeps the labels left over from drop_duplicates, while
# df_desc carries its own index, so every piece is re-labelled 0..n-1 first
# to guarantee that row k of the descriptors belongs to molecule k.
df_ = df_canon_smiles[['Name', 'SMILES', 'Canonical SMILES']].reset_index(drop=True)
df_target = df_canon_smiles[['Target']].reset_index(drop=True)

# The descriptors were computed from df_canon_smiles row by row, so the
# two tables must have the same number of rows.
assert len(df_desc) == len(df_), "descriptor table and molecule table differ in length"

df_save = pd.concat([df_, df_desc.reset_index(drop=True), df_target], axis=1)
assert len(df_save) == len(df_)

df_save.to_csv('../data/bitter_sweet_2d_descriptors.tsv.gz', sep='\t', compression='gzip')
df_save.to_pickle('../data/bitter_sweet_2d_descriptors.pkl.gz', compression='gzip')

In [97]:
# Prepare for 2D + 3D descriptor calculation.
# With ignore_3D=False the calculator holds the 2D and the 3D descriptors
# together, so the count printed below is the combined total and is labelled
# as such (it was previously labelled "3D descriptors").
calc = Calculator(descriptors, ignore_3D=False)
print("Total number of 2D + 3D descriptors:", len(calc.descriptors))

# NOTE(review): the molecules are parsed straight from SMILES and no
# conformer is generated in this cell; 3D descriptors presumably need 3D
# coordinates -- confirm whether they come out as missing values.
mols = [Chem.MolFromSmiles(smi) for smi in df_canon_smiles['Canonical SMILES']]
print("Total number of molecules for descriptor calculation:", len(mols))

Total number of 3D descriptors: 1826
Total number of molecules for descriptor calculation: 2393


In [98]:
# Run the 2D + 3D calculator over all molecules; the result is a dataframe
# of descriptor values (a progress bar is printed while it runs).
# This rebinds df_desc, replacing the table from the 2D-only run.
df_desc = calc.pandas(mols)

# Cache the raw descriptor table as a gzip-compressed pickle.
df_desc.to_pickle(
    '../data/df_2d_plus_3d_descriptors.pkl.gz',
    compression='gzip',
)

  1%|          | 23/2393 [00:02<06:28,  6.10it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 39%|███▊      | 922/2393 [00:20<00:35, 41.45it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 39%|███▉      | 935/2393 [00:21<01:31, 15.86it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 40%|███▉      | 949/2393 [00:23<02:17, 10.49it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 41%|████      | 982/2393 [00:25<03:05,  7.63it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 41%|████      | 985/2393 [00:25<02:24,  9.74it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 55%|█████▌    | 1327/2393 [00:43<00:52, 20.26it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 56%|█████▌    | 1342/2393 [00:44<01:06, 15.72it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 61%|██████    | 1457/2393 [00:46<00:22, 40.91it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 67%|██████▋   | 1609/2393 [00:50<00:26, 29.81it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 2393/2393 [01:12<00:00, 33.05it/s]


In [99]:
# Consolidate name, SMILES, canonical SMILES, descriptors and target in the same dataframe.
# Save the dataframe as a compressed .tsv and as a compressed pickle.
#
# pd.concat(..., axis=1) aligns rows on index labels, not on position.
# df_canon_smiles keeps the labels left over from drop_duplicates, while
# df_desc carries its own index, so every piece is re-labelled 0..n-1 first
# to guarantee that row k of the descriptors belongs to molecule k.
df_ = df_canon_smiles[['Name', 'SMILES', 'Canonical SMILES']].reset_index(drop=True)
df_target = df_canon_smiles[['Target']].reset_index(drop=True)

# The descriptors were computed from df_canon_smiles row by row, so the
# two tables must have the same number of rows.
assert len(df_desc) == len(df_), "descriptor table and molecule table differ in length"

df_save = pd.concat([df_, df_desc.reset_index(drop=True), df_target], axis=1)
assert len(df_save) == len(df_)

df_save.to_csv('../data/bitter_sweet_2d_plus_3d_descriptors.tsv.gz', sep='\t', compression='gzip')
df_save.to_pickle('../data/bitter_sweet_2d_plus_3d_descriptors.pkl.gz', compression='gzip')