In [2]:
import pandas as pd
import csv
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import numpy as np

### First dataset

In [3]:
# Load the first raw dataset; the path is resolved relative to the working directory.
data = pd.read_csv(filepath_or_buffer='molecules.csv')

In [4]:
# (rows, columns) of the table as loaded, before any filtering.
data.shape

(25595, 43)

Keep only the columns holding the SMILES string (`smile`) and the flavor profile (`flavor_profile`).

In [5]:
# Narrow the table down to the molecule string and its flavor annotation.
clean_columns = ['smile', 'flavor_profile']
data = data.loc[:, clean_columns]
data.shape

(25595, 2)

Drop rows with missing values (NaNs).

In [6]:
# Remove every row in which at least one of the two kept columns is missing.
data = data.dropna(axis='index', how='any')
data.shape

(25106, 2)

In [7]:
# Preview the first five cleaned rows.
data.head(n=5)

Unnamed: 0,smile,flavor_profile
0,CC(CN)O,fishy
1,C1C(C(C(OC1O)CO)O)O,sweet
3,CC(C)C(=O)C(=O)O,fruity
4,C(CC(=O)O)C(=O)C(=O)O,odorless
5,CCC(=O)C(=O)O,brown@caramel@lactonic@sweet@creamy


In [9]:
# Frequency of each distinct flavor-profile string, most common first.
data['flavor_profile'].value_counts()

sweet-like                                                        13782
sweet                                                              8172
bitter                                                              603
odorless                                                            120
fruity                                                               31
                                                                  ...  
butter@green bean@vegetable@earthy@beany@fruity@metallic@green        1
acrylic@vegetable@pungent                                             1
sulfur@cabbage@sulfurous@onion@fish@savory@meaty@cooked               1
caramel@strawberry@grape@cotton@sweet@sugar@candy@almond              1
wood@woody@green                                                      1
Name: flavor_profile, Length: 2101, dtype: int64

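Keep only the 3 most common flavors: sweet, bitter and odorless. If one of these flavors appears within a multi-flavor profile (flavors joined by `@`), replace the whole profile with that single flavor. When several of them match, the code below picks odorless first, then bitter, then sweet; matching is by substring, so `sweet-like` is counted as sweet. Profiles containing none of the three are discarded.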

In [10]:
# Collapse every flavor profile into one of three labels and collect the
# result in `db2` (columns: 'smile', 'flavor').
#
# * Matching is a plain substring test on the profile text, so a label also
#   matches inside a longer token (e.g. 'sweet' matches 'sweet-like').
# * Labels are tried in the order listed below and the first hit wins.
# * Rows whose profile contains none of the labels are left out.
#
# Rows are gathered in a list and converted to a DataFrame once at the end:
# enlarging a DataFrame one row at a time with `.loc` copies data on every
# insert and gets very slow for tens of thousands of rows.
flavor_priority = ('odorless', 'bitter', 'sweet')

labelled_rows = []
for smile, profile in zip(data['smile'], data['flavor_profile']):
    # First label (in priority order) found inside the profile, else None.
    label = next((name for name in flavor_priority if name in profile), None)
    if label is not None:
        labelled_rows.append((smile, label))

db2 = pd.DataFrame(labelled_rows, columns=['smile', 'flavor'])

In [11]:
# (rows, columns) of the relabelled table built from the first dataset.
db2.shape

(23418, 2)

In [12]:
# Preview the first five relabelled rows.
db2.head(n=5)

Unnamed: 0,smile,flavor
0,C1C(C(C(OC1O)CO)O)O,sweet
1,C(CC(=O)O)C(=O)C(=O)O,odorless
2,CCC(=O)C(=O)O,sweet
3,C1=NC2=C(C(=N1)N)N=CN2C3C(C(C(O3)COP(=O)(O)O)O...,sweet
4,C1=CC=C(C=C1)CCC(=O)O,sweet


In [13]:
# Class balance after relabelling the first dataset.
db2['flavor'].value_counts()

sweet       22527
bitter        728
odorless      163
Name: flavor, dtype: int64

### Second dataset

In [14]:
# Load the second raw dataset; the path is resolved relative to the working directory.
chem_data = pd.read_excel(io='ChemTastesDB_database.xlsx')

In [15]:
# (rows, columns) of the second table as loaded, before any filtering.
chem_data.shape

(2947, 8)

Keep only the SMILES and taste-class columns, then drop rows containing NaNs.

In [16]:
# Keep the molecule string and its taste class, then discard incomplete rows.
kept_columns = ['canonical SMILES', 'Class taste']
chem_data = chem_data.loc[:, kept_columns].dropna()
chem_data.shape

(2944, 2)

In [17]:
# Preview the first five rows of the cleaned second dataset.
chem_data.head(n=5)

Unnamed: 0,canonical SMILES,Class taste
0,Oc1cc2c(cc1O)C1c3ccc(c(c3OCC1(O)C2)O)O,Sweetness
1,CC(C)=CCCC(C)(O)C1CC(O)C(=CC1=O)C,Sweetness
2,CC(=O)OC1C(Oc2cc(cc(c2C1=O)O)O)c1ccc(c(c1)O)O,Sweetness
3,Oc1cc2c(cc1O)C1c3ccc(c(c3OCC1(O)C2)O)O,Sweetness
4,OC1C(O)C(O)C(O)C(O)C1O,Sweetness


In [18]:
# Frequency of each taste class, most common first.
chem_data['Class taste'].value_counts()

Bitterness       1183
Sweetness         977
Non-sweetness     233
Tastelessness     203
Multitaste        113
Umaminess          98
Miscellaneous      87
Sourness           38
Saltiness          12
Name: Class taste, dtype: int64

Keep only three taste classes, mapped to the same labels as before (Tastelessness → odorless, Bitterness → bitter, Sweetness → sweet), and append these rows to the first dataset.

In [19]:
# Translate taste classes from the second dataset into the labels already used
# in `db2` and append the matching rows to it.
#
# * Matching is a case-sensitive substring test, tried in the order listed
#   below; the first hit wins. 'Non-sweetness' is therefore not picked up by
#   'Sweetness' (lowercase 's').
# * Rows whose class matches none of the keys are left out.
#
# Rows are gathered in a list and appended with a single concat: enlarging
# `db2` one row at a time with `.loc` copies data on every insert.
taste_to_flavor = (
    ('Tastelessness', 'odorless'),
    ('Bitterness', 'bitter'),
    ('Sweetness', 'sweet'),
)

chem_rows = []
for smiles, taste in zip(chem_data['canonical SMILES'], chem_data['Class taste']):
    # Label of the first key (in priority order) found in the class, else None.
    flavor = next((label for key, label in taste_to_flavor if key in taste), None)
    if flavor is not None:
        chem_rows.append((smiles, flavor))

# Values are assigned to db2's columns by position, as the row-wise version did.
db2 = pd.concat(
    [db2, pd.DataFrame(chem_rows, columns=db2.columns)],
    ignore_index=True,
)

In [20]:
# (rows, columns) of the combined table after appending the second dataset.
db2.shape

(25781, 2)

In [21]:
# Number of distinct SMILES strings in the combined table (missing values, if any, count as one).
db2['smile'].nunique(dropna=False)

13821

In [22]:
# Class balance of the combined table, duplicates still included.
db2['flavor'].value_counts()

sweet       23504
bitter       1911
odorless      366
Name: flavor, dtype: int64

Drop duplicated molecules (rows sharing the same SMILES string), keeping the first occurrence.

In [23]:
# One row per distinct SMILES string; when a molecule occurs several times,
# the first row (and therefore its label) is the one that is kept.
# NOTE(review): the comparison is on the raw string, so the same molecule
# written with two different SMILES spellings is not merged — confirm whether
# canonicalisation is needed before deduplicating.
db2_unique = db2.drop_duplicates(subset='smile', keep='first')
db2_unique.shape

(13821, 2)

In [24]:
# Class balance of the deduplicated table.
db2_unique['flavor'].value_counts()

sweet       11710
bitter       1775
odorless      336
Name: flavor, dtype: int64

Save final dataset

In [25]:
# Write the deduplicated table to disk without the index column.
db2_unique.to_csv(path_or_buf='dataset.csv', index=False)