In [None]:
# External library.
import pandas as pd
from rdkit import Chem
from mordred import Calculator, descriptors

In [None]:
# Load raw data.
# Each taste dataset comes as a train/test pair of tab-separated files.
bitter_train = pd.read_csv('../data/bitter-train.tsv', sep='\t')
bitter_test = pd.read_csv('../data/bitter-test.tsv', sep='\t')
sweet_train = pd.read_csv('../data/sweet-train.tsv', sep='\t')
sweet_test = pd.read_csv('../data/sweet-test.tsv', sep='\t')

# Pool the train and test splits of each dataset and give the label column a
# common name ('Target') so both datasets can be stacked into one frame.
df_bitter = pd.concat(
    [bitter_train[['Name', 'SMILES', 'Bitter']], bitter_test[['Name', 'SMILES', 'Bitter']]]
).rename(columns={'Bitter': 'Target'})
df_sweet = pd.concat(
    [sweet_train[['Name', 'SMILES', 'Sweet']], sweet_test[['Name', 'SMILES', 'Sweet']]]
).rename(columns={'Sweet': 'Target'})

# Turn the boolean flags into class labels. The result is assigned back to the
# column instead of calling Series.replace(..., inplace=True) on `df.Target`:
# the in-place form acts on an intermediate Series and does not reliably
# update the parent frame (it is deprecated under pandas copy-on-write).
# NOTE(review): rows flagged False are labelled with the *other* taste
# (non-bitter -> 'Sweet', non-sweet -> 'Bitter'), exactly as before; confirm
# that this is the intended meaning of the negative class.
df_bitter['Target'] = df_bitter['Target'].replace(True, 'Bitter').replace(False, 'Sweet')
df_sweet['Target'] = df_sweet['Target'].replace(True, 'Sweet').replace(False, 'Bitter')

# Stack both datasets. ignore_index=True gives the pooled frame a unique
# 0..n-1 index instead of repeating the row labels of the four input files.
df = pd.concat([df_bitter, df_sweet], ignore_index=True)

In [None]:
# Convert SMILES into Canonical SMILES using RDKit.
# Rows whose SMILES cannot be canonicalised are dropped; the number of dropped
# rows is reported so the loss is visible.
name = []
smiles = []
canon_smiles = []
target = []
n_skipped = 0
for raw_name, raw_smiles, raw_target in zip(df['Name'], df['SMILES'], df['Target']):
    try:
        # Convert every field before touching the output lists, so a failure
        # on any field skips the whole row and the four lists stay aligned.
        smi = str(raw_smiles)
        row = (str(raw_name), smi, Chem.CanonSmiles(smi), str(raw_target))
    except Exception:
        # Exception (not a bare except) lets KeyboardInterrupt / SystemExit
        # propagate while still skipping rows that fail conversion.
        n_skipped += 1
        continue
    name.append(row[0])
    smiles.append(row[1])
    canon_smiles.append(row[2])
    target.append(row[3])

print("Skipped {} of {} rows during SMILES canonicalisation.".format(n_skipped, len(df)))

dict_canon_smiles = {'Name':name, 'SMILES':smiles, 'Canonical SMILES':canon_smiles, 'Target':target}
df_canon_smiles = pd.DataFrame(data=dict_canon_smiles)
df_canon_smiles.to_pickle('../data/df_canon_smiles.pkl')

In [None]:
# Prepare for 2D descriptor calculation.
# ignore_3D=True restricts the mordred calculator to its 2D descriptors.
calc = Calculator(descriptors, ignore_3D=True)
# Single-argument print(...) with str.format produces the same text on
# Python 2 and Python 3; the former print statements fail to parse on Python 3.
print("Total number of 2D descriptors: {}".format(len(calc.descriptors)))
# One RDKit molecule per canonical SMILES, in dataframe row order.
mols = [Chem.MolFromSmiles(smi) for smi in df_canon_smiles['Canonical SMILES']]
print("Total number of molecules for descriptor calculation: {}".format(len(mols)))

In [None]:
# Run the descriptor calculator over every molecule; the result is a dataframe
# of descriptor values, which is then stored as a gzip-compressed pickle.
df_desc = calc.pandas(mols)
df_desc.to_pickle(
    path='../data/df_2d_descriptors.pkl.gz',
    compression='gzip',
)

In [None]:
# Build one wide table: identifier columns first (name, SMILES, canonical
# SMILES), then the descriptor columns, then the target label last.
# The table is written twice: as a gzip-compressed .tsv and as a
# gzip-compressed pickle.
df_ = df_canon_smiles.loc[:, ['Name', 'SMILES', 'Canonical SMILES']]
df_save = pd.concat(
    [
        df_,
        df_desc,
        df_canon_smiles.loc[:, ['Target']],
    ],
    axis=1,
)
df_save.to_csv(
    '../data/bitter_sweet_2d_descriptors.tsv.gz',
    sep='\t',
    compression='gzip',
)
df_save.to_pickle(
    '../data/bitter_sweet_2d_descriptors.pkl.gz',
    compression='gzip',
)

In [None]:
# Prepare for 2D + 3D descriptor calculation.
# ignore_3D=False keeps the 3D descriptors in addition to the 2D ones, so the
# count printed below is the combined 2D + 3D total (the message previously
# called it the number of "3D descriptors").
calc = Calculator(descriptors, ignore_3D=False)
# Single-argument print(...) with str.format produces the same text on
# Python 2 and Python 3; the former print statements fail to parse on Python 3.
print("Total number of 2D + 3D descriptors: {}".format(len(calc.descriptors)))
# NOTE(review): these molecules are parsed straight from SMILES and no
# conformer is generated here; 3D descriptors presumably need 3D coordinates,
# so their values may come back as missing -- confirm against the mordred docs.
mols = [Chem.MolFromSmiles(smi) for smi in df_canon_smiles['Canonical SMILES']]
print("Total number of molecules for descriptor calculation: {}".format(len(mols)))

In [None]:
# Run the descriptor calculator over every molecule and store the resulting
# dataframe as a gzip-compressed pickle (2D + 3D descriptor file).
df_desc = calc.pandas(mols)
df_desc.to_pickle(
    path='../data/df_2d_plus_3d_descriptors.pkl.gz',
    compression='gzip',
)

In [None]:
# Build one wide table: identifier columns first (name, SMILES, canonical
# SMILES), then the descriptor columns, then the target label last.
# The table is written twice: as a gzip-compressed .tsv and as a
# gzip-compressed pickle.
df_ = df_canon_smiles.loc[:, ['Name', 'SMILES', 'Canonical SMILES']]
df_save = pd.concat(
    [
        df_,
        df_desc,
        df_canon_smiles.loc[:, ['Target']],
    ],
    axis=1,
)
df_save.to_csv(
    '../data/bitter_sweet_2d_plus_3d_descriptors.tsv.gz',
    sep='\t',
    compression='gzip',
)
df_save.to_pickle(
    '../data/bitter_sweet_2d_plus_3d_descriptors.pkl.gz',
    compression='gzip',
)