# Library Imports

In [1]:
import os
import numpy as np
import pandas as pd
from scipy import io
# from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

In [2]:
import keras
from keras.callbacks import Callback
keras.__version__

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


'2.2.5'

class Metrics(Callback):
    """Keras callback that tracks validation F1, precision and recall per epoch.

    After each epoch the model's rounded predictions on the validation inputs
    are scored against the validation targets; the scores are appended to
    ``val_f1s``, ``val_precisions`` and ``val_recalls`` and printed.
    """

    def on_train_begin(self, logs=None):
        """Reset the per-epoch score histories when training starts."""
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs=None):
        """Score the current model on the validation data and record the results."""
        # Keras 2.x attaches the validation arrays to the callback itself;
        # fall back to the model attribute for setups that expose it there.
        val_data = getattr(self, 'validation_data', None)
        if val_data is None:
            val_data = getattr(self.model, 'validation_data', None)
        if val_data is None or len(val_data) < 2:
            # No validation data was supplied to fit(): nothing to score.
            return

        # Round the predicted probabilities to hard 0/1 labels before scoring.
        val_predict = np.asarray(self.model.predict(val_data[0])).round()
        val_targ = val_data[1]
        _val_f1 = f1_score(val_targ, val_predict)
        _val_recall = recall_score(val_targ, val_predict)
        _val_precision = precision_score(val_targ, val_predict)
        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        print(' — val_f1: {0:f} — val_precision: {1:f} — val_recall {2:f}'.format(_val_f1, _val_precision, _val_recall))

# Callback instance; it only takes effect when passed to model.fit via callbacks=[...].
metrics = Metrics()

In [3]:
os.chdir('/Users/sean/CloudStation/Metis/projects/project5')
!pwd

/Users/sean/CloudStation/Metis/projects/project5


# Load Data

In [4]:
# load data: labels, dense feature tables and sparse feature matrices (train/test)
raw_data = './data/raw/tox21/'
y_tr = pd.read_csv(raw_data + 'tox21_labels_train.csv.gz', index_col=0, compression="gzip")
y_te = pd.read_csv(raw_data + 'tox21_labels_test.csv.gz', index_col=0, compression="gzip")
x_tr_dense = pd.read_csv(raw_data + 'tox21_dense_train.csv.gz', index_col=0, compression="gzip").values
x_te_dense = pd.read_csv(raw_data + 'tox21_dense_test.csv.gz', index_col=0, compression="gzip").values
x_tr_sparse = io.mmread(raw_data + 'tox21_sparse_train.mtx.gz').tocsc()
x_te_sparse = io.mmread(raw_data + 'tox21_sparse_test.mtx.gz').tocsc()
# filter out very sparse features: keep only columns that are non-zero in more
# than 5% of the training rows (the mask is derived from the training set only
# and then applied to both train and test).
# np.asarray / .toarray() replace the deprecated `.A` shorthand.
sparse_col_idx = np.asarray((x_tr_sparse > 0).mean(0) > 0.05).ravel()
x_tr = np.hstack([x_tr_dense, x_tr_sparse[:, sparse_col_idx].toarray()])
x_te = np.hstack([x_te_dense, x_te_sparse[:, sparse_col_idx].toarray()])

# Choose A Target

In [5]:
# Show the label columns available in y_tr (the candidate targets).
y_tr.columns

Index(['NR.AhR', 'NR.AR', 'NR.AR.LBD', 'NR.Aromatase', 'NR.ER', 'NR.ER.LBD',
       'NR.PPAR.gamma', 'SR.ARE', 'SR.ATAD5', 'SR.HSE', 'SR.MMP', 'SR.p53'],
      dtype='object')

The Random Forest example loops through all the targets.  I'll pick only the first one for the DNN MVP:

In [6]:
# The loop over every target is left commented out; a single target is used here.
# for target in y_tr.columns:
target = 'NR.AhR'
# Boolean masks selecting the rows whose label for this target is finite.
rows_tr = np.isfinite(y_tr[target].values)
rows_te = np.isfinite(y_te[target].values)
# Features and labels restricted to the labelled training rows.
x = x_tr[rows_tr]
y = y_tr.loc[rows_tr, target]
x.shape

(8441, 1644)

# Address Class Imbalance
Oversampling Documentation:
https://imbalanced-learn.readthedocs.io/en/stable/over_sampling.html

"While the `RandomOverSampler` is over-sampling by duplicating some of the original samples of the minority class, `SMOTE` and `ADASYN` generate new samples in by interpolation. However, the samples used to interpolate/generate new synthetic samples differ. In fact, `ADASYN` focuses on generating samples next to the original samples which are wrongly classified using a k-Nearest Neighbors classifier while the basic implementation of `SMOTE` will not make any distinction between easy and hard samples to be classified using the nearest neighbors rule. Therefore, the decision function found during training will be different among the algorithms."

## TODO
I need to upsample the **training** data **only** without the validation portion!

In [7]:
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN

In [8]:
# Class counts for the selected target before splitting or resampling.
y.value_counts()

0.0    7460
1.0     981
Name: NR.AhR, dtype: int64

To keep the class proportions the same use the stratify parameter: [source](https://stats.stackexchange.com/questions/394056/splitting-into-train-and-test-sets-keeping-class-proportions)

In [9]:
# Hold out 20% of the rows for validation, stratified on the label so both
# parts keep the same class proportions; the seed makes the split repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [10]:
# Class counts in the validation split.
y_val.value_counts()

0.0    1493
1.0     196
Name: NR.AhR, dtype: int64

In [11]:
# Balance the classes by randomly duplicating minority-class rows. Only the
# training split is resampled; the validation split is left untouched.
ros = RandomOverSampler(random_state=0)
# `fit_sample` is the deprecated name of `fit_resample`; prefer the new name
# and fall back to the old one on imbalanced-learn releases that predate it.
_resample = getattr(ros, 'fit_resample', None) or ros.fit_sample
x_resampled, y_resampled = _resample(x_train, y_train)

In [12]:
# Class counts after oversampling the training split.
pd.Series(y_resampled).value_counts()

1.0    5967
0.0    5967
dtype: int64

In [13]:
# Shape (rows, columns) of the oversampled training features.
x_resampled.shape

(11934, 1644)

# Build Neural Network

Following the description in section 2.2.4 of the [DeepTox article](https://www.frontiersin.org/articles/10.3389/fenvs.2015.00080/full), I tried to use intermediate values in [Table 2](https://www.frontiersin.org/articles/10.3389/fenvs.2015.00080/full#T2) to build the neural network:

In [14]:
def f1(y_true, y_pred):
    """Return the F1 score of `y_pred` against `y_true`.

    Thin wrapper that forwards both arguments unchanged to `f1_score`.
    """
    score = f1_score(y_true, y_pred)
    return score

In [15]:
# Hyperparameters.
drop_out = 0.5
L2_reg = 0.0001
# Adam step size. An `Adam(lr=0.1)` object used to be created here but was never
# passed to `compile` (which received the string 'adam'), so training actually
# ran at Adam's default of 0.001. That effective value is made explicit so the
# configured optimizer is the one really used.
learning_rate = 0.001

# Three identical hidden blocks (1024 sigmoid units, L2 weight penalty, dropout)
# followed by a single sigmoid output unit for binary classification.
model = keras.Sequential([
    keras.layers.InputLayer(input_shape=x.shape[1:], name='Input_Layer'),
    keras.layers.Dense(units=1024, activation="sigmoid", name='h1_sigmoid_activation',
                       kernel_regularizer=keras.regularizers.l2(L2_reg)),
    keras.layers.Dropout(rate=drop_out, name='Dropout1'),
    keras.layers.Dense(units=1024, activation="sigmoid", name='h2_sigmoid_activation',
                       kernel_regularizer=keras.regularizers.l2(L2_reg)),
    keras.layers.Dropout(rate=drop_out, name='Dropout2'),
    keras.layers.Dense(units=1024, activation="sigmoid", name='h3_sigmoid_activation',
                       kernel_regularizer=keras.regularizers.l2(L2_reg)),
    keras.layers.Dropout(rate=drop_out, name='Dropout3'),
    keras.layers.Dense(units=1, activation='sigmoid')
])
# Pass the optimizer instance (not the string 'adam') so its settings take effect.
optimizer = keras.optimizers.Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, amsgrad=False)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
h1_sigmoid_activation (Dense (None, 1024)              1684480   
_________________________________________________________________
Dropout1 (Dropout)           (None, 1024)              0         
_________________________________________________________________
h2_sigmoid_activation (Dense (None, 1024)              1049600   
_________________________________________________________________
Dropout2 (Dropout)           (None, 1024)              0         
_________________________________________________________________
h3_sigmoid_activation (Dense (None, 1024)              1049600   
_________________________________________________________________

In [16]:
# Stop after 16 epochs without improvement of the monitored quantity (Keras
# default) and roll the model back to the weights of the best epoch.
early_stopping = keras.callbacks.EarlyStopping(patience=16, verbose=1, restore_best_weights=True)
# Halve the learning rate after 3 epochs without improvement.
lr_on_plateau = keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=3, verbose=1)

# Train on the oversampled training split; validate on the untouched hold-out split.
model.fit(
    x_resampled,
    y_resampled,
    batch_size=512,
    epochs=100,
    verbose=1,
    validation_data=(x_val, y_val),
    callbacks=[early_stopping, lr_on_plateau],
)

Instructions for updating:
Use tf.cast instead.
Train on 11934 samples, validate on 1689 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100

Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100

Epoch 00015: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 16/100
Epoch 17/100
Epoch 18/100

Epoch 00018: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 19/100
Epoch 20/100
Epoch 21/100

Epoch 00021: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.
Epoch 22/100
Epoch 23/100
Epoch 24/100

Epoch 00024: ReduceLROnPlateau reducing learning rate to 3.125000148429535e-05.
Epoch 25/100
Epoch 26/100
Epoch 27/100

Epoch 00027: ReduceLROnPlateau reducing learning rate to 1.5625000742147677e-05.
Epoch 28/100
Restoring model weights from the end of the best epoch
Epoch 0

<keras.callbacks.History at 0x132ef4e10>

In [17]:
# ROC AUC on the test rows that have a finite label for this target.
y_true_te = y_te[target][rows_te]
y_score_te = model.predict(x_te[rows_te])
auc_te = roc_auc_score(y_true_te, y_score_te)
print("%15s: %3.5f" % (target, auc_te))

         NR.AhR: 0.85897


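Well, that's about as bad a prediction as you can get! (Note: this remark looks left over from an earlier run — the ROC AUC of 0.85897 reported above is well above the 0.5 chance level.)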

In [18]:
# Class counts in the test set for this target (rows with a finite label only).
y_te[target][rows_te].value_counts()

0.0    537
1.0     73
Name: NR.AhR, dtype: int64

In [19]:
# Accuracy of the trivial "always predict class 0" baseline on the test set,
# computed from the labels instead of hand-copied counts so it stays correct
# if the target or the data changes.
(y_te[target][rows_te] == 0).mean()

0.8803278688524591

In [20]:
# How many test rows the model assigns to each class:
# first printed row = class labels, second row = their counts.
pred_classes_te = model.predict_classes(x_te[rows_te])
unique_elements, counts_elements = np.unique(pred_classes_te, return_counts=True)
print(np.asarray((unique_elements, counts_elements)))

[[  0   1]
 [472 138]]


In [21]:
# Confusion matrix on the test set (rows = true class, columns = predicted class).
# The same `rows_te` mask filters labels and features so both sides are
# guaranteed to cover exactly the same rows.
confusion_matrix(y_te[target][rows_te], model.predict_classes(x_te[rows_te]))

array([[452,  85],
       [ 20,  53]])

Uncomment to save model.  Last ROC AUC = 0.85897

In [23]:
# model.save('./models/saves/first_model.h5')