In [1]:
# External library.
import pandas as pd
from rdkit import Chem
from mordred import Calculator, descriptors

In [2]:
# Load raw data.
bitter_train = pd.read_csv('../data/bitter-train.tsv', sep='\t')
bitter_test = pd.read_csv('../data/bitter-test.tsv', sep='\t')
sweet_train = pd.read_csv('../data/sweet-train.tsv', sep='\t')
sweet_test = pd.read_csv('../data/sweet-test.tsv', sep='\t')

df_bitter = pd.DataFrame()
df_sweet = pd.DataFrame()

# Concatenate train and test sets in a single data frame. Useful for k-fold cross validation.
df_bitter = pd.concat([bitter_train[['Name','SMILES','Taste']], bitter_test[['Name','SMILES','Taste']]])
df_sweet = pd.concat([sweet_train[['Name','SMILES','Taste']], sweet_test[['Name','SMILES','Taste']] ])

# Concatenate bitter and sweet dataframes into a single dataframe.
# This data frame may have redundancies which will be removed after
# canonicalising the SMILES because even the same molecule fetched
# from different sources may have different SMILES. 
df = pd.DataFrame()
df = pd.concat([df_bitter, df_sweet], sort=False)

# Rename taste column as Target.
df.rename(columns={'Taste':'Target'}, inplace=True)

# Replace tasteless and non-bitter with ambiguous. 
df.Target.replace(to_replace="Tasteless", value="Ambiguous", inplace=True)
df.Target.replace(to_replace="Non-bitter", value="Ambiguous", inplace=True)

# Save the dataframe at this stage in case we may need it in this form
# in the future analysis.
df.to_csv("../output/all_data_pre_descriptor.tsv", sep='\t', compression='gzip')

In [3]:
print(df.count())
df[df['SMILES'].isna()] # check if any of the SMILES is a NaN.

Name      4557
SMILES    4794
Target    4794
dtype: int64


Unnamed: 0,Name,SMILES,Target


In [4]:
# Convert SMILES into Canonical SMILES using RDKit.
length = df.shape[0]

name = []
smiles = []
canon_smiles = []
target = []
exceptions = 0
for i in range(length):
    try:
        c = Chem.CanonSmiles(str(df.iloc[i]['SMILES']))
        canon_smiles.append(c)
        smiles.append(str(df.iloc[i]['SMILES']))
        target.append(str(df.iloc[i]['Target']))
        name.append(str(df.iloc[i]['Name']))
    except:
        exceptions += 1
        continue

print("Total number molecules that failed with exceptions: ", exceptions)
dict_canon_smiles = {'Name':name, 'SMILES':smiles, 'Canonical SMILES':canon_smiles, 'Target':target}
df_canon_smiles = pd.DataFrame(data=dict_canon_smiles)

Total number molecules that failed with exceptions:  48


In [5]:
df_canon_smiles[df_canon_smiles['Canonical SMILES'].isna()] # Check if any of the Canonical SMILES is a NaN.

Unnamed: 0,Name,SMILES,Canonical SMILES,Target


In [6]:
# Check for redundancy.
df_canon_smiles.drop_duplicates(subset="Canonical SMILES", inplace=True)

# Reset index of the dataframe.
df_canon_smiles.reset_index(drop=True, inplace = True)

# Report target stats.
print(df_canon_smiles.groupby("Target").nunique())

print("Total number of molecules:", df_canon_smiles.shape[0])

           Name  SMILES  Canonical SMILES  Target
Target                                           
Ambiguous   237     273               273       1
Bitter      806     892               892       1
Sweet      1172    1228              1228       1
Total number of molecules: 2393


In [7]:
df_canon_smiles[df_canon_smiles['Canonical SMILES'].isna()] # Check if any of the canonical SMILES is a NaN 
                                                            # after redundancy check and filter.

Unnamed: 0,Name,SMILES,Canonical SMILES,Target


In [8]:
# Save data after redundancy check and filtering.
df_canon_smiles.to_csv("../output/df_canon_smiles.tsv", sep='\t', compression='gzip') # For archiving.

In [9]:
# Prepare for 2D descriptor calculation.
calc = Calculator(descriptors, ignore_3D=True)
print("Total number of 2D descriptors:", len(calc.descriptors))
mols = [Chem.MolFromSmiles(smi) for smi in list(df_canon_smiles['Canonical SMILES'])]
print("Total number of molecules for descriptor calculation:", len(mols))

Total number of 2D descriptors: 1613
Total number of molecules for descriptor calculation: 2393


In [10]:
# Calculate 2D descriptors and save them in a dataframe.
df_desc = calc.pandas(mols)

  1%|          | 23/2393 [00:02<06:47,  5.82it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 26%|██▌       | 623/2393 [00:16<02:01, 14.56it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 39%|███▊      | 923/2393 [00:19<00:32, 45.12it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 39%|███▉      | 936/2393 [00:21<01:31, 15.86it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 39%|███▉      | 941/2393 [00:22<02:42,  8.94it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 40%|███▉      | 949/2393 [00:22<02:27,  9.81it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 41%|████      | 985/2393 [00:25<02:26,  9.64it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 50%|█████     | 1208/2393 [00:41<25:59,  1.32s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 61%|██████    | 1457/2393 [00:45<00:28, 33.12it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 67%|██████▋   | 1615/2393 [00:49<00:20, 37.32it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 83%|████████▎ | 1982/2393 [00:59<00:10, 39.89it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 2393/2393 [01:09<00:00, 34.48it/s]


In [11]:
# Consolidate name, SMILES, canonical SMILES, descriptors and target in the same dataframe.
# Save the dataframe as a compressed .tsv.
df_ = df_canon_smiles[['Name', 'SMILES', 'Canonical SMILES']]
df_save = pd.concat([df_, df_desc, df_canon_smiles[['Target']]], axis=1)
df_save.to_csv('../output/bitter_sweet_2d_descriptors.tsv.gz', sep='\t', compression='gzip')
df_save.to_pickle('../output/bitter_sweet_2d_descriptors.pkl.gz', compression='gzip')

In [12]:
# Prepare for 2D + 3D descriptor calculation.
calc = Calculator(descriptors, ignore_3D=False)
print("Total number of 3D descriptors:", len(calc.descriptors))
mols = [Chem.MolFromSmiles(smi) for smi in list(df_canon_smiles['Canonical SMILES'])]
print("Total number of molecules for descriptor calculation:", len(mols))

Total number of 3D descriptors: 1826
Total number of molecules for descriptor calculation: 2393


In [13]:
# Calculate 2D + 3D descriptors and save them in a dataframe.
df_desc = calc.pandas(mols)

  1%|          | 23/2393 [00:02<07:03,  5.59it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 26%|██▌       | 620/2393 [00:17<03:08,  9.38it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 38%|███▊      | 914/2393 [00:23<00:35, 42.08it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 39%|███▉      | 940/2393 [00:25<03:08,  7.69it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 40%|████      | 959/2393 [00:26<02:01, 11.80it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 41%|████      | 985/2393 [00:29<02:39,  8.84it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 49%|████▉     | 1173/2393 [00:50<54:42,  2.69s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 61%|██████    | 1459/2393 [00:56<00:26, 35.55it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 64%|██████▎   | 1520/2393 [00:57<00:18, 46.66it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 67%|██████▋   | 1615/2393 [00:59<00:21, 35.83it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 2393/2393 [01:19<00:00, 30.11it/s]


In [14]:
# Consolidate name, SMILES, canonical SMILES, descriptors and target in the same dataframe.
# Save the dataframe as a compressed .tsv.
df_ = df_canon_smiles[['Name', 'SMILES', 'Canonical SMILES']]
df_save = pd.concat([df_, df_desc, df_canon_smiles[['Target']]], axis=1)
df_save.to_csv('../output/bitter_sweet_2d_plus_3d_descriptors.tsv.gz', sep='\t', compression='gzip')
df_save.to_pickle('../output/bitter_sweet_2d_plus_3d_descriptors.pkl.gz', compression='gzip')