# Library Imports

In [1]:
import os
import numpy as np
import pandas as pd
from scipy import io
# from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from src.helper_functions import load_data

In [2]:
import keras
from keras.callbacks import Callback
from keras.layers import Dense, Activation, Dropout
keras.__version__

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


'2.2.5'

# Load Data

In [3]:
x_tr, y_tr, x_te, y_te = load_data()

# Choose A Target

In [4]:
y_tr.columns

Index(['NR.AhR', 'NR.AR', 'NR.AR.LBD', 'NR.Aromatase', 'NR.ER', 'NR.ER.LBD',
       'NR.PPAR.gamma', 'SR.ARE', 'SR.ATAD5', 'SR.HSE', 'SR.MMP', 'SR.p53'],
      dtype='object')

The Random Forest example loops through all the targets.  I'll pick only the first one for the DNN MVP:

In [8]:
# for target in y_tr.columns:
target = 'NR.AhR'
rows_tr = np.isfinite(y_tr[target]).values
rows_te = np.isfinite(y_te[target]).values
x,y = x_tr[rows_tr], y_tr[target][rows_tr]
x.shape

(8441, 1644)

# Address Class Imbalance
Oversampling Documentation:
https://imbalanced-learn.readthedocs.io/en/stable/over_sampling.html

"While the `RandomOverSampler` is over-sampling by duplicating some of the original samples of the minority class, `SMOTE` and `ADASYN` generate new samples in by interpolation. However, the samples used to interpolate/generate new synthetic samples differ. In fact, `ADASYN` focuses on generating samples next to the original samples which are wrongly classified using a k-Nearest Neighbors classifier while the basic implementation of `SMOTE` will not make any distinction between easy and hard samples to be classified using the nearest neighbors rule. Therefore, the decision function found during training will be different among the algorithms."

**I decided that over-sampling using synthetic methods is probably not legitimate because it is creating new "samples", i.e. chemicals with properties (feature values) that do not represent real chemical structures.  Though I tried using SMOTE and got reasonably similar results, I think the approach is technically dubious.**

In [9]:
from imblearn.over_sampling import RandomOverSampler #, SMOTE, ADASYN

In [10]:
y.value_counts()

0.0    7460
1.0     981
Name: NR.AhR, dtype: int64

To keep the class proportions the same use the stratify parameter: [source](https://stats.stackexchange.com/questions/394056/splitting-into-train-and-test-sets-keeping-class-proportions)

In [11]:
x_train, x_val, y_train, y_val = train_test_split(x, y, stratify=y, test_size=0.2, random_state=42)

In [12]:
y_val.value_counts()

0.0    1493
1.0     196
Name: NR.AhR, dtype: int64

In [13]:
ros = RandomOverSampler(random_state=0)
# ros = SMOTE(random_state=42)   # See comment above - I don't believe using SMOTE is legitimate.
x_resampled, y_resampled = ros.fit_sample(x_train,y_train)

In [14]:
pd.Series(y_resampled).value_counts()

1.0    5967
0.0    5967
dtype: int64

In [15]:
x_resampled.shape

(11934, 1644)

# Build Neural Network

Following the desciption in section 2.2.4 of the [DeepTox article](https://www.frontiersin.org/articles/10.3389/fenvs.2015.00080/full), I tried to use intermediate values in [Table 2](https://www.frontiersin.org/articles/10.3389/fenvs.2015.00080/full#T2) to build the neural network:

Following [this question/answer](https://datascience.stackexchange.com/questions/45165/how-to-get-accuracy-f1-precision-and-recall-for-a-keras-model) to implement usage of recall in model training:

In [13]:
from keras import backend as K

def recall_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

In [14]:
drop_out=0.5    # DeepTox range: 0.5, 0.2, 0
L2_reg = 0.0001 # Default = 0.01
layers = 3      # DeepTox range: 1, 2, 3, 4
act = 'sigmoid' # Consider sigmoid and tanh
neurons = 1024  # DeepTox range: 1024, 2048, 4096, 8192, 16384
decay = 0       # DeepTox range: 10^-4, 10^-5, 10^-6
learn_rate = 0.1  #Research appropriate range
DNN = keras.Sequential([
    keras.layers.InputLayer(input_shape=x.shape[1:],name='Input_Layer')
])
for i in range(1,layers+1):
    DNN.add(Dense(units=neurons, activation=act,\
                  name='h'+str(i)+'_'+act+'_activation',\
                  kernel_regularizer=keras.regularizers.l2(L2_reg)))
    DNN.add(Dropout(rate=drop_out,name='Dropout'+str(i)))
DNN.add(Dense(units=1, activation='sigmoid'))
keras.optimizers.Adam(lr=learn_rate, beta_1=0.9,\
                      beta_2=0.999, decay=decay, amsgrad=False)
# Info on decay: https://datascience.stackexchange.com/questions/26112/decay-parameter-in-keras-optimizers
DNN.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy',recall_m])
DNN.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
h1_sigmoid_activation (Dense (None, 1024)              1684480   
_________________________________________________________________
Dropout1 (Dropout)           (None, 1024)              0         
_________________________________________________________________
h2_sigmoid_activation (Dense (None, 1024)              1049600   
_________________________________________________________________
Dropout2 (Dropout)           (None, 1024)              0         
_________________________________________________________________
h3_sigmoid_activation (Dense (None, 1024)              1049600   
_________________________________________________________________

In [15]:
DNN.fit(
    x_resampled, y_resampled, batch_size=512, epochs=100,\
    validation_data=(x_val,y_val), verbose=1,
    callbacks=[
        keras.callbacks.EarlyStopping(monitor='recall_m',mode='max',\
                                      patience=16,verbose=1,\
                                      restore_best_weights=True),
        keras.callbacks.ReduceLROnPlateau(factor=0.5,patience=3,verbose=1)
    ])

Instructions for updating:
Use tf.cast instead.
Train on 11934 samples, validate on 1689 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100

Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100

Epoch 00014: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100

Epoch 00018: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 19/100
Epoch 20/100
Epoch 21/100

Epoch 00021: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.
Epoch 22/100
Epoch 23/100
Epoch 24/100

Epoch 00024: ReduceLROnPlateau reducing learning rate to 3.125000148429535e-05.
Epoch 25/100
Epoch 26/100
Epoch 27/100

Epoch 00027: ReduceLROnPlateau reducing learning rate to 1.5625000742147677e-05.
Epoch 28/100
Epoch 29/100
Epoch 30/100

Epoch 00030: ReduceLROnPlateau redu

Epoch 41/100
Epoch 42/100

Epoch 00042: ReduceLROnPlateau reducing learning rate to 4.882812731921149e-07.
Epoch 43/100
Epoch 44/100
Epoch 45/100

Epoch 00045: ReduceLROnPlateau reducing learning rate to 2.4414063659605745e-07.
Epoch 46/100
Epoch 47/100
Epoch 48/100

Epoch 00048: ReduceLROnPlateau reducing learning rate to 1.2207031829802872e-07.
Epoch 49/100
Epoch 50/100
Epoch 51/100

Epoch 00051: ReduceLROnPlateau reducing learning rate to 6.103515914901436e-08.
Epoch 52/100
Restoring model weights from the end of the best epoch
Epoch 00052: early stopping


<keras.callbacks.History at 0x12a956f98>

In [16]:
auc_te = roc_auc_score(y_te[target][rows_te], DNN.predict(x_te[rows_te]))
print("%15s: %3.5f" % (target, auc_te))

         NR.AhR: 0.86810


In [17]:
y_testing=y_te[target][~np.isnan(y_te[target])]
y_hat_testing=DNN.predict_classes(x_te[rows_te])
print(np.array([['TN','FP'],['FN','TP']]))
print(confusion_matrix(y_testing,y_hat_testing))

[['TN' 'FP']
 ['FN' 'TP']]
[[437 100]
 [ 16  57]]


In [18]:
print('f1:',f1_score(y_testing,y_hat_testing))
print('recall:',recall_score(y_testing,y_hat_testing))
print('precision:',precision_score(y_testing,y_hat_testing))

f1: 0.49565217391304345
recall: 0.7808219178082192
precision: 0.3630573248407643


In [19]:
y_te[target][rows_te].value_counts()

0.0    537
1.0     73
Name: NR.AhR, dtype: int64

In [20]:
537/(537+73)

0.8803278688524591

Uncomment to save model.  Last ROC_UAC = 0.86068

In [21]:
# DNN.save('./models/first_model.h5')