In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, multilabel_confusion_matrix, f1_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder



Load dataset

In [3]:
data = pd.read_csv('dataset.csv')

In [4]:
data.shape

(13821, 2)

In [4]:
data.head()

Unnamed: 0,smile,flavor
0,C1C(C(C(OC1O)CO)O)O,sweet
1,C(CC(=O)O)C(=O)C(=O)O,odorless
2,CCC(=O)C(=O)O,sweet
3,C1=NC2=C(C(=N1)N)N=CN2C3C(C(C(O3)COP(=O)(O)O)O...,sweet
4,C1=CC=C(C=C1)CCC(=O)O,sweet


In [5]:
max_l = 0
for column, row in data.iterrows():
    smiles = row['smile']
    l = len(smiles)
    if l > max_l:
        max_l = l
print(max_l)

801


Define functions for SMILES encoding and decoding.

In [6]:
# define SMILES characters ----------------------------------------------------
SMILES_CHARS = [' ',
                '#', '%', '(', ')', '+', '-', '.', '/',
                '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                '=', '@',
                'A', 'B', 'C', 'F', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P',
                'R', 'S', 'T', 'V', 'X', 'Z',
                '[', '\\', ']',
                'a', 'b', 'c', 'e', 'g', 'i', 'l', 'n', 'o', 'p', 'r', 's',
                't', 'u']
                
# define encoder and decoder --------------------------------------------------
smi2index = dict( (c,i) for i,c in enumerate( SMILES_CHARS ) )
index2smi = dict( (i,c) for i,c in enumerate( SMILES_CHARS ) )

def smiles_encoder( smiles, maxlen=1000 ):
    X = np.zeros( ( maxlen, len( SMILES_CHARS ) ) )
    for i, c in enumerate( smiles ):
        X[i, smi2index[c] ] = 1
    return X

def smiles_decoder( X ):
    smi = ''
    X = X.argmax( axis=-1 )
    for i in X:
        smi += index2smi[ i ]
    return smi


See an example

In [7]:
example_smile = data['smile'][520]
print(example_smile)
print(len(example_smile))

example_encoding = smiles_encoder(example_smile)

print(example_encoding.shape)
print(example_encoding[0])

CCCOC1=C(C=C(C=C1)[N+](=O)[O-])N
32
(1000, 56)
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0.]


Create dataframe with one-hot encodings and flavors.

In [8]:
one_hot_df = pd.DataFrame(columns=['one_hot', 'flavor'])
for column, row in data.iterrows():
    smiles = row['smile']
    flavor = row['flavor']
    try:
        encoding = smiles_encoder(smiles)
        one_hot_df.loc[len(one_hot_df.index)] = [encoding, flavor]
    except:
        continue

one_hot_df.head()

Unnamed: 0,one_hot,flavor
0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",sweet
1,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",odorless
2,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",sweet
3,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",sweet
4,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",sweet


In [9]:
one_hot_df.shape

(13788, 2)

In [10]:
one_hot_df['flavor'].value_counts()

sweet       11677
bitter       1775
odorless      336
Name: flavor, dtype: int64

Define train and test sets.

In [None]:
# get 2/10 of each flavor as test set
test_df = pd.DataFrame(columns=['one_hot', 'flavor'])
for flavor in one_hot_df['flavor'].unique():
    flavor_df = one_hot_df[one_hot_df['flavor'] == flavor]
    test_len = int(len(flavor_df.index) * 0.2)
    test_df = pd.concat([test_df, flavor_df.sample(n=test_len)])

print(test_df.shape)
print(test_df['flavor'].value_counts())

# get rest of data for training
train_df = one_hot_df.drop(test_df.index)

# undersample sweet flavor rows in train_df
sweet_df = train_df[train_df['flavor'] == 'sweet']
train_df = train_df.drop(sweet_df.sample(n=5000).index)

# oversample bitter flavor rows in train_df
bitter_df = train_df[train_df['flavor'] == 'bitter']
for i in range(2):
    train_df = pd.concat([train_df, bitter_df])

# oversample odorless flavor rows in train_df
odorless_df = train_df[train_df['flavor'] == 'odorless']
for i in range(5):
    train_df = pd.concat([train_df, odorless_df])

# shuffle train_df
train_df = train_df.sample(frac=1).reset_index(drop=True)

print(train_df.shape)
print(train_df['flavor'].value_counts())

print(test_df.head())
print(train_df.head())

(2757, 2)
sweet       2335
bitter       355
odorless      67
Name: flavor, dtype: int64
(10216, 2)
sweet       4342
bitter      4260
odorless    1614
Name: flavor, dtype: int64
                                                 one_hot flavor
11178  [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...  sweet
7107   [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...  sweet
12345  [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...  sweet
10339  [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...  sweet
8483   [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...  sweet
                                             one_hot    flavor
0  [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...  odorless
1  [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...    bitter
2  [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...     sweet
3  [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...  odorless
4  [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...    bitter


Transform sets to arrays.

In [None]:
le = LabelEncoder()
#le.fit(['odorless', 'bitter', 'sweet'])
X_train = np.array(train_df['one_hot'].tolist())
y_train = np.array(le.fit_transform(train_df['flavor'].tolist()))

X_test = np.array(test_df['one_hot'].tolist())
y_test = np.array(le.fit_transform(test_df['flavor'].tolist()))

3 dimensions to 2

In [None]:
nsamples, nx, ny = X_train.shape
print(nsamples, nx, ny)
X_train = X_train.reshape((nsamples,nx*ny))
print(X_train.shape)

10216 1000 56
(10216, 56000)


In [None]:
nsamples, nx, ny = X_test.shape
print(nsamples, nx, ny)
X_test = X_test.reshape((nsamples,nx*ny))
print(X_test.shape)

2757 1000 56
(2757, 56000)


### Random forest

In [None]:
random_forest = RFC(criterion='gini',
    n_estimators=100, 
    min_samples_split=2, 
    min_samples_leaf=1,
    max_depth=30,
    max_features='auto', 
    oob_score=True, 
    random_state=1,
    n_jobs=-1,
    verbose=1)


random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test) 

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   18.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   48.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.1s finished


Test performance

In [None]:
# View accuracy score
acc_random_forest = accuracy_score(y_test.reshape(-1, 1), y_pred) 
print('Accuracy: ', round(acc_random_forest*100, 2), '%')

# # View confusion matrix for test data and predictions
cm = multilabel_confusion_matrix(y_test, y_pred)
print('Confusion Matrix:\n', cm)
# multilabel_confusion_matrix = multilabel_confusion_matrix(y_test, y_pred)
# print('Multilabel Confusion Matrix:', multilabel_confusion_matrix)
print()
# View the classification report for test data and predictions
cr = classification_report(y_test, y_pred)
print('Classification Report:\n', cr)

Accuracy:  84.33 %
Confusion Matrix:
 [[[2060  342]
  [  58  297]]

 [[2664   26]
  [  48   19]]

 [[ 358   64]
  [ 326 2009]]]

Classification Report:
               precision    recall  f1-score   support

           0       0.46      0.84      0.60       355
           1       0.42      0.28      0.34        67
           2       0.97      0.86      0.91      2335

    accuracy                           0.84      2757
   macro avg       0.62      0.66      0.62      2757
weighted avg       0.89      0.84      0.86      2757



### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=3)

In [None]:
y_pred = neigh.predict(X_test)

In [None]:
# View accuracy score
acc_random_forest = accuracy_score(y_test.reshape(-1, 1), y_pred) 
print('Accuracy: ', round(acc_random_forest*100, 2), '%')

# # View confusion matrix for test data and predictions
cm = multilabel_confusion_matrix(y_test, y_pred)
print('Confusion Matrix:\n', cm)
# multilabel_confusion_matrix = multilabel_confusion_matrix(y_test, y_pred)
# print('Multilabel Confusion Matrix:', multilabel_confusion_matrix)
print()
# View the classification report for test data and predictions
cr = classification_report(y_test, y_pred)
print('Classification Report:\n', cr)

Accuracy:  83.68 %
Confusion Matrix:
 [[[2118  284]
  [  73  282]]

 [[2588  102]
  [  37   30]]

 [[ 358   64]
  [ 340 1995]]]

Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.79      0.61       355
           1       0.23      0.45      0.30        67
           2       0.97      0.85      0.91      2335

    accuracy                           0.84      2757
   macro avg       0.56      0.70      0.61      2757
weighted avg       0.89      0.84      0.86      2757



## Additional 2 classes test:

In [11]:
one_hot_df = one_hot_df[one_hot_df['flavor'] != "odorless"]

In [12]:
one_hot_df['flavor'].value_counts()

sweet     11677
bitter     1775
Name: flavor, dtype: int64

Train and test df

In [13]:
# get 2/10 of each flavor as test set
test_df = pd.DataFrame(columns=['one_hot', 'flavor'])
for flavor in one_hot_df['flavor'].unique():
    flavor_df = one_hot_df[one_hot_df['flavor'] == flavor]
    test_len = int(len(flavor_df.index) * 0.2)
    test_df = pd.concat([test_df, flavor_df.sample(n=test_len)])

print(test_df.shape)
print(test_df['flavor'].value_counts())

# get rest of data for training
train_df = one_hot_df.drop(test_df.index)

# undersample sweet flavor rows in train_df
sweet_df = train_df[train_df['flavor'] == 'sweet']
train_df = train_df.drop(sweet_df.sample(n=5000).index)

# oversample bitter flavor rows in train_df
bitter_df = train_df[train_df['flavor'] == 'bitter']
for i in range(2):
    train_df = pd.concat([train_df, bitter_df])

# oversample odorless flavor rows in train_df
odorless_df = train_df[train_df['flavor'] == 'odorless']
for i in range(5):
    train_df = pd.concat([train_df, odorless_df])

# shuffle train_df
train_df = train_df.sample(frac=1).reset_index(drop=True)

print(train_df.shape)
print(train_df['flavor'].value_counts())

print(test_df.head())
print(train_df.head())

(2690, 2)
sweet     2335
bitter     355
Name: flavor, dtype: int64
(8602, 2)
sweet     4342
bitter    4260
Name: flavor, dtype: int64
                                                 one_hot flavor
3887   [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...  sweet
11723  [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...  sweet
3823   [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...  sweet
1979   [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...  sweet
1374   [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...  sweet
                                             one_hot  flavor
0  [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...   sweet
1  [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...   sweet
2  [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...  bitter
3  [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...   sweet
4  [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...  bitter


In [14]:
le = LabelEncoder()
#le.fit(['odorless', 'bitter', 'sweet'])
X_train = np.array(train_df['one_hot'].tolist())
y_train = np.array(le.fit_transform(train_df['flavor'].tolist()))

X_test = np.array(test_df['one_hot'].tolist())
y_test = np.array(le.fit_transform(test_df['flavor'].tolist()))

In [15]:
nsamples, nx, ny = X_train.shape
print(nsamples, nx, ny)
X_train = X_train.reshape((nsamples,nx*ny))
print(X_train.shape)

8602 1000 56
(8602, 56000)


In [16]:
nsamples, nx, ny = X_test.shape
print(nsamples, nx, ny)
X_test = X_test.reshape((nsamples,nx*ny))
print(X_test.shape)

2690 1000 56
(2690, 56000)


## Random Forest Classifier

In [17]:
random_forest = RFC(criterion='gini',
    n_estimators=100, 
    min_samples_split=2, 
    min_samples_leaf=1,
    max_depth=30,
    max_features='auto', 
    oob_score=True, 
    random_state=1,
    n_jobs=-1,
    verbose=1)


random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test) 

  warn(
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    4.4s finished
[Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=24)]: Done 100 out of 100 | elapsed:    0.0s finished


In [18]:
# View accuracy score
acc_random_forest = accuracy_score(y_test.reshape(-1, 1), y_pred) 
print('Accuracy: ', round(acc_random_forest*100, 2), '%')

# # View confusion matrix for test data and predictions
cm = multilabel_confusion_matrix(y_test, y_pred)
print('Confusion Matrix:\n', cm)
# multilabel_confusion_matrix = multilabel_confusion_matrix(y_test, y_pred)
# print('Multilabel Confusion Matrix:', multilabel_confusion_matrix)
print()
# View the classification report for test data and predictions
cr = classification_report(y_test, y_pred)
print('Classification Report:\n', cr)

Accuracy:  86.25 %
Confusion Matrix:
 [[[2019  316]
  [  54  301]]

 [[ 301   54]
  [ 316 2019]]]

Classification Report:
               precision    recall  f1-score   support

           0       0.49      0.85      0.62       355
           1       0.97      0.86      0.92      2335

    accuracy                           0.86      2690
   macro avg       0.73      0.86      0.77      2690
weighted avg       0.91      0.86      0.88      2690



## KNClassifier

In [19]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train, y_train)
y_pred = neigh.predict(X_test)

In [20]:
# View accuracy score
acc_random_forest = accuracy_score(y_test.reshape(-1, 1), y_pred) 
print('Accuracy: ', round(acc_random_forest*100, 2), '%')

# # View confusion matrix for test data and predictions
cm = multilabel_confusion_matrix(y_test, y_pred)
print('Confusion Matrix:\n', cm)
# multilabel_confusion_matrix = multilabel_confusion_matrix(y_test, y_pred)
# print('Multilabel Confusion Matrix:', multilabel_confusion_matrix)
print()
# View the classification report for test data and predictions
cr = classification_report(y_test, y_pred)
print('Classification Report:\n', cr)

Accuracy:  87.43 %
Confusion Matrix:
 [[[2057  278]
  [  60  295]]

 [[ 295   60]
  [ 278 2057]]]

Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.83      0.64       355
           1       0.97      0.88      0.92      2335

    accuracy                           0.87      2690
   macro avg       0.74      0.86      0.78      2690
weighted avg       0.91      0.87      0.89      2690

