In [None]:
# Third-party libraries (pandas, RDKit, mordred) plus the standard-library warnings module.
import pandas as pd
from rdkit import Chem
from mordred import Calculator, descriptors
import warnings
warnings.filterwarnings('ignore') # Silences every warning category, including runtime and deprecation warnings.

In [None]:
# Load the raw train/test splits for both taste classes.
bitter_train = pd.read_csv('../data/bitter-train.tsv', sep='\t')
bitter_test = pd.read_csv('../data/bitter-test.tsv', sep='\t')
sweet_train = pd.read_csv('../data/sweet-train.tsv', sep='\t')
sweet_test = pd.read_csv('../data/sweet-test.tsv', sep='\t')

# Concatenate train and test sets in a single data frame per class.
# Useful for k-fold cross validation.
df_bitter = pd.concat([bitter_train[['Name', 'SMILES', 'Taste']],
                       bitter_test[['Name', 'SMILES', 'Taste']]])
df_sweet = pd.concat([sweet_train[['Name', 'SMILES', 'Taste']],
                      sweet_test[['Name', 'SMILES', 'Taste']]])

# Concatenate bitter and sweet dataframes into a single dataframe and rename
# the taste column to Target.
# This data frame may have redundancies which will be removed after
# canonicalising the SMILES because even the same molecule fetched
# from different sources may have different SMILES.
# The original row labels are kept, so the index contains repeated values.
df = pd.concat([df_bitter, df_sweet], sort=False).rename(columns={'Taste': 'Target'})

# Replace tasteless and non-bitter with Non_Bitter_Sweet.
# The result is assigned back to the column rather than calling
# replace(inplace=True) on df.Target: that form mutates a Series obtained by
# attribute access, which is deprecated chained mutation and leaves df
# untouched when pandas Copy-on-Write is active.
df['Target'] = df['Target'].replace({
    "Tasteless": "Non_Bitter_Sweet",
    "Non-bitter": "Non_Bitter_Sweet",
})

# Save the dataframe at this stage in case we may need it in this form
# in the future analysis.
# NOTE: the first file is gzip-compressed even though its name ends in .tsv.
df.to_csv("../output/all_data_pre_descriptor.tsv", sep='\t', compression='gzip')
df.to_csv("../output/all_data_pre_descriptor_nogzip.tsv", sep='\t')

In [None]:
# Non-null counts per column, then every row whose SMILES entry is missing.
print(df.count())
df.loc[df['SMILES'].isnull()]

In [None]:
# Convert SMILES into canonical SMILES using RDKit.
# Molecules that RDKit cannot canonicalise are collected separately in the
# failed_* lists instead of aborting the run.
length = df.shape[0]

name = []
smiles = []
canon_smiles = []
target = []
exceptions = 0
failed_smiles = []
failed_name = []
failed_target = []

# Walk the three columns in lockstep; this avoids rebuilding a row Series
# with df.iloc[i] several times per molecule.
for raw_name, raw_smiles, raw_target in zip(df['Name'], df['SMILES'], df['Target']):
    # Everything is stored as str, so a missing value becomes the text 'nan'.
    mol_name = str(raw_name)
    mol_smiles = str(raw_smiles)
    mol_target = str(raw_target)
    try:
        c = Chem.CanonSmiles(mol_smiles)
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still stop the loop.
        exceptions += 1
        failed_smiles.append(mol_smiles)
        # Name goes to failed_name and Target to failed_target
        # (these two were previously swapped).
        failed_name.append(mol_name)
        failed_target.append(mol_target)
    else:
        canon_smiles.append(c)
        smiles.append(mol_smiles)
        target.append(mol_target)
        name.append(mol_name)



In [None]:
# Report how many molecules could not be canonicalised.
print("Total number molecules that failed with exceptions: ", exceptions)

# Table of the molecules that were canonicalised successfully.
dict_canon_smiles = {
    'Name': name,
    'SMILES': smiles,
    'Canonical SMILES': canon_smiles,
    'Target': target,
}
df_canon_smiles = pd.DataFrame(dict_canon_smiles)

# Table of the molecules that raised during canonicalisation.
failed_dict_canon_smiles = {
    'Name': failed_name,
    'SMILES': failed_smiles,
    'Target': failed_target,
}
failed_df_canon_smiles = pd.DataFrame(failed_dict_canon_smiles)

In [None]:
# Preview the first rows of the molecules that failed canonicalisation.
# head must be called; the bare attribute only shows the bound-method repr.
failed_df_canon_smiles.head()

In [None]:
# Show any rows whose canonical SMILES is missing (NaN).
df_canon_smiles.loc[df_canon_smiles['Canonical SMILES'].isnull()]

In [None]:
# Remove redundancy: keep the first row for each canonical SMILES, then
# renumber the rows from zero.
df_canon_smiles = (
    df_canon_smiles
    .drop_duplicates(subset="Canonical SMILES")
    .reset_index(drop=True)
)

# Per-target count of distinct values in each remaining column.
print(df_canon_smiles.groupby("Target").nunique())

print("Total number of molecules:", len(df_canon_smiles))

In [None]:
# After the redundancy check and filter, show any rows whose canonical
# SMILES is missing (NaN).
df_canon_smiles.loc[df_canon_smiles['Canonical SMILES'].isnull()]

In [None]:
# Archive the de-duplicated table (gzip-compressed and plain) together with
# the table of molecules that failed canonicalisation.
df_canon_smiles.to_csv(
    path_or_buf="../output/df_canon_smiles.tsv",
    sep='\t',
    compression='gzip',
)
df_canon_smiles.to_csv(
    path_or_buf="../output/df_canon_smiles_no_gzip.tsv",
    sep='\t',
)
failed_df_canon_smiles.to_csv(
    path_or_buf="../output/failed_df_canon_smiles_no_gzip.tsv",
    sep='\t',
)

In [None]:
# Set up the descriptor calculator with 3D descriptors excluded.
calc = Calculator(descriptors, ignore_3D=True)
print("Total number of 2D descriptors:", len(calc.descriptors))

# Parse every canonical SMILES into an RDKit molecule object.
mols = [
    Chem.MolFromSmiles(canonical)
    for canonical in df_canon_smiles['Canonical SMILES'].tolist()
]
print("Total number of molecules for descriptor calculation:", len(mols))

In [None]:
# Run the calculator over all molecules; the result is a dataframe of
# descriptor values.
df_desc = calc.pandas(
    mols,
    quiet=True,
    ipynb=True,
)

In [None]:
# Assemble one table: identifier columns first, then the descriptors, and
# the target label last. Write it out as a gzip-compressed .tsv and pickle.
df_ = df_canon_smiles.loc[:, ['Name', 'SMILES', 'Canonical SMILES']]
df_save = pd.concat(
    [df_, df_desc, df_canon_smiles.loc[:, ['Target']]],
    axis='columns',
)
df_save.to_csv(
    '../output/bitter_sweet_2d_descriptors.tsv.gz',
    sep='\t',
    compression='gzip',
)
df_save.to_pickle(
    '../output/bitter_sweet_2d_descriptors.pkl.gz',
    compression='gzip',
)

In [None]:
# Prepare for 2D + 3D descriptor calculation.
calc = Calculator(descriptors, ignore_3D=False)
# With ignore_3D=False the calculator is built for the combined 2D + 3D
# calculation, so this count is the combined total, not the 3D-only count.
print("Total number of 2D + 3D descriptors:", len(calc.descriptors))
# NOTE(review): the molecules below are parsed from SMILES and no conformer is
# generated in this notebook. mordred's 3D descriptors presumably need 3D
# coordinates and may come back as missing values -- TODO confirm, and embed
# conformers before this step if real 3D values are required.
mols = [Chem.MolFromSmiles(smi) for smi in list(df_canon_smiles['Canonical SMILES'])]
print("Total number of molecules for descriptor calculation:", len(mols))

In [None]:
# Run the 2D + 3D calculator over all molecules; the result is a dataframe
# of descriptor values.
df_desc = calc.pandas(
    mols,
    quiet=True,
    ipynb=True,
)

In [None]:
# Assemble one table: identifier columns first, then the descriptors, and
# the target label last. Write it out as a gzip-compressed .tsv and pickle.
df_ = df_canon_smiles.loc[:, ['Name', 'SMILES', 'Canonical SMILES']]
df_save = pd.concat(
    [df_, df_desc, df_canon_smiles.loc[:, ['Target']]],
    axis='columns',
)
df_save.to_csv(
    '../output/bitter_sweet_2d_plus_3d_descriptors.tsv.gz',
    sep='\t',
    compression='gzip',
)
df_save.to_pickle(
    '../output/bitter_sweet_2d_plus_3d_descriptors.pkl.gz',
    compression='gzip',
)

In [None]:
# Also keep an uncompressed copy of the combined 2D + 3D descriptor table.
df_save.to_csv(
    path_or_buf='../output/bitter_sweet_2d_plus_3d_descriptors_nogzip.tsv',
    sep='\t',
)