In [1]:
import os

import numpy as np
import tensorflow as tf
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils import class_weight
from tensorflow import keras

from data_handling import load_user_data, split_features_labels, \
    user_train_test_split
from gmc_loss import GmcLoss, get_coexist_counts
from metrics import balanced_accuracy_score

In [2]:
def sensitivity(y_true, y_pred):
    """Keras metric: true-positive ratio over entries with a known label.

    Positions where `y_true` is NaN are removed from both tensors before
    anything is counted, so they contribute to neither the numerator nor
    the denominator.

    Args:
        y_true: tensor of targets; may contain NaN entries.
        y_pred: tensor of predictions, same shape as `y_true`.

    Returns:
        Scalar tensor: rounded true positives divided by rounded actual
        positives (plus the backend epsilon).
    """
    backend = keras.backend

    # keep only the positions that carry a label
    labelled = tf.logical_not(tf.math.is_nan(y_true))
    truth = tf.boolean_mask(y_true, labelled)
    preds = tf.boolean_mask(y_pred, labelled)

    # clip to [0, 1] and round so every entry counts as either 0 or 1
    hit_flags = tf.math.round(backend.clip(truth * preds, 0, 1))
    positive_flags = tf.math.round(backend.clip(truth, 0, 1))

    true_positives = backend.sum(hit_flags)
    possible_positives = backend.sum(positive_flags)
    # epsilon keeps the ratio finite when there are no positives at all
    denominator = possible_positives + backend.epsilon()
    return true_positives / denominator


def specificity(y_true, y_pred):
    """Keras metric: true-negative ratio over entries with a known label.

    Positions where `y_true` is NaN are removed from both tensors before
    anything is counted, so they contribute to neither the numerator nor
    the denominator.

    Args:
        y_true: tensor of targets; may contain NaN entries.
        y_pred: tensor of predictions, same shape as `y_true`.

    Returns:
        Scalar tensor: rounded true negatives divided by rounded actual
        negatives (plus the backend epsilon).
    """
    backend = keras.backend

    # keep only the positions that carry a label
    labelled = tf.logical_not(tf.math.is_nan(y_true))
    truth = tf.boolean_mask(y_true, labelled)
    preds = tf.boolean_mask(y_pred, labelled)

    # work on the complements so that "negative" becomes the counted class
    negative_truth = 1 - truth
    negative_preds = 1 - preds

    # clip to [0, 1] and round so every entry counts as either 0 or 1
    correct_reject_flags = tf.math.round(
        backend.clip(negative_truth * negative_preds, 0, 1))
    negative_flags = tf.math.round(backend.clip(negative_truth, 0, 1))

    true_negatives = backend.sum(correct_reject_flags)
    possible_negatives = backend.sum(negative_flags)
    # epsilon keeps the ratio finite when there are no negatives at all
    denominator = possible_negatives + backend.epsilon()
    return true_negatives / denominator

In [3]:
# Enable memory growth on every visible GPU before any model is built.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
keras.backend.set_floatx('float32')

# Directory holding the user data. The default is the original author's
# local path; set the MU_DATA_DIR environment variable to run the notebook
# on another machine without editing this cell.
DATA_DIR = os.environ.get(
    "MU_DATA_DIR",
    "/home/joschi/Documents/Studium/SS19/mu_practical_work/data")

# load data and reset index
data = load_user_data(DATA_DIR)
data.reset_index(inplace=True)
X, y = split_features_labels(data)
# NOTE(review): these are built from `.index`, i.e. row labels. If feature
# and label *names* were intended, `.columns` is presumably what is wanted --
# confirm against split_features_labels before relying on attrs/labels.
attrs = list(X.index)
labels = list(y.index)
# Continue with plain arrays; labels are cast to float32 (NaN entries are
# representable in that dtype).
X = X.values
y = y.values.astype(np.float32)

In [4]:
# Stage 1: separate the test split from everything else (test_size=0.2).
X_train, X_test, y_train, y_test = user_train_test_split(
    X, y, test_size=0.2, random_state=42)

# Stage 2: split the remainder again to obtain the validation split
# (test_size=0.25 of what stage 1 left as training data).
X_train, X_valid, y_train, y_valid = user_train_test_split(
    X_train, y_train, test_size=0.25, random_state=42)

In [5]:
# drop uuid column, the timestamps, and the label source
# The last-column index is computed from each split's *own* width. The
# previous version trimmed X_valid using X_train's width after X_train had
# already lost four columns, which removed the wrong column from X_valid
# and left its label-source column in place.
X_train, X_valid, X_test = (
    np.delete(split, [0, 1, 2, split.shape[1] - 1], axis=1)
    for split in (X_train, X_valid, X_test)
)
# Sanity check: all splits must expose the same number of features.
assert X_train.shape[1] == X_valid.shape[1] == X_test.shape[1], \
    "train/valid/test feature counts differ after dropping columns"

# Mean-impute missing feature values, then standardize. The pipeline is
# fitted on the training split only and merely applied to the other splits.
preprocess_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="mean")),
    ('std_scaler', StandardScaler()),
])
X_train = preprocess_pipeline.fit_transform(X_train)
X_valid = preprocess_pipeline.transform(X_valid)
X_test = preprocess_pipeline.transform(X_test)

In [6]:
# Pool every labelled (non-NaN) entry of y_train into one flat vector; the
# boolean mask flattens the array, so all label columns are mixed together.
is_nan = np.isnan(y_train)
y_train_clean = y_train[~is_nan]
# `classes` and `y` are passed by keyword: recent scikit-learn releases
# accept them only as keyword arguments, and older releases accept the
# keywords as well. The result is an array with one weight per unique value
# in y_train_clean, ordered like np.unique's output.
class_weights = class_weight.compute_class_weight(
    "balanced",
    classes=np.unique(y_train_clean),
    y=y_train_clean,
)

In [7]:
# Cast all three feature matrices to float32 in one pass.
X_train, X_valid, X_test = (
    features.astype(np.float32)
    for features in (X_train, X_valid, X_test)
)

In [44]:
# Fully connected network: three SELU hidden layers (150-100-100) with
# lecun_normal initialization, AlphaDropout (rate 0.2) in front of every
# Dense layer, and one sigmoid output unit per label column of y_train.
model = keras.models.Sequential([
    keras.layers.AlphaDropout(rate=0.2,
                              input_shape=X_train.shape[1:]),
    keras.layers.Dense(150, activation="selu",
                       kernel_initializer="lecun_normal"),
    keras.layers.AlphaDropout(rate=0.2),
    keras.layers.Dense(100, activation="selu",
                       kernel_initializer="lecun_normal"),
    keras.layers.AlphaDropout(rate=0.2),
    keras.layers.Dense(100, activation="selu",
                       kernel_initializer="lecun_normal"),
    keras.layers.AlphaDropout(rate=0.2),
    keras.layers.Dense(y_train.shape[1], activation="sigmoid",
                       kernel_initializer="glorot_uniform")
])

# Project-specific loss, constructed from the training labels.
gmc_loss = GmcLoss(y_train, alpha=1e-4)
# `learning_rate` replaces the deprecated `lr` alias, which newer Keras
# releases no longer accept.
optimizer = keras.optimizers.Nadam(learning_rate=0.001)

model.compile(loss=gmc_loss, optimizer=optimizer,
              metrics=[specificity, sensitivity])

In [45]:
# Callbacks (both rely on Keras' default `monitor` setting):
#   * multiply the learning rate by 0.5 after 5 epochs on a plateau
#   * stop after 10 epochs without improvement and restore the best weights
lr_scheduler = keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=5)
early_stopping_cb = keras.callbacks.EarlyStopping(
    patience=10, restore_best_weights=True)

# NOTE(review): `class_weights` is the array returned by sklearn's
# compute_class_weight, while y_train has y_train.shape[1] label columns
# with NaN entries. Confirm that Keras actually applies these weights as
# intended (Keras documents `class_weight` as a dict keyed by class index).
history = model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    epochs=100, batch_size=1024,
    callbacks=[lr_scheduler, early_stopping_cb],
    class_weight=class_weights,
)

Train on 184596 samples, validate on 46149 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100


Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [48]:
# Predicted probabilities for the test split; show the first sample's raw
# sigmoid outputs before they are thresholded.
y_pred = model.predict(X_test)
print(y_pred[0])
# Round probabilities to hard 0/1 decisions.
y_pred = y_pred.round()
# Hard predictions on the training split, used later to gauge the fit on
# data the model has already seen.
y_pred_bias = model.predict(X_train).round()

[9.3058193e-01 9.9833846e-01 4.1857362e-04 0.0000000e+00 3.2782555e-07
 9.2885876e-01 7.9948342e-01 8.5069448e-02 5.5083036e-03 9.9966609e-01
 9.9999911e-01 2.7579069e-04 2.7418137e-06 1.1920929e-07 8.9406967e-08
 2.0861626e-07 9.3445063e-01 4.0531158e-06 9.8754358e-01 7.7486038e-07
 4.0233135e-06 1.4901161e-07 0.0000000e+00 5.9604645e-08 4.8875809e-06
 1.6093254e-06 0.0000000e+00 0.0000000e+00 9.7705162e-01 7.6503438e-01
 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 8.0129993e-01
 9.5763600e-01 3.8137794e-01 1.4853477e-04 2.7716160e-06 8.9406967e-06
 0.0000000e+00 8.3446503e-07 7.4505806e-07 0.0000000e+00 7.0060933e-01
 9.8983657e-01 3.7070394e-01 1.1479259e-03 9.9622166e-01 5.7381392e-04
 7.4502480e-01]


In [49]:
# Ground-truth label vector of the first test sample, displayed for
# comparison with the first predicted row; NaN entries have no label.
y_test[0]

array([ 0.,  1.,  0., nan, nan,  0., nan,  0.,  1.,  1., nan, nan,  0.,
        0.,  0., nan,  0.,  0.,  0., nan,  0.,  0., nan, nan, nan, nan,
       nan,  0., nan, nan, nan, nan, nan,  0.,  1., nan,  0.,  0., nan,
       nan, nan, nan, nan, nan,  0.,  1., nan,  0.,  1.,  1.,  1.],
      dtype=float32)

In [47]:
# `balanced_accuracy_score` is imported in the notebook's top import cell.
# NOTE(review): this cell needs `y_pred` and `y_pred_bias` from the
# prediction cell; the recorded execution counts show it was last run
# *before* that cell, so re-run the notebook top to bottom before trusting
# the numbers below.

# Macro-averaged balanced accuracy on the test split. Arrays are transposed
# before being handed to the project metric.
ba_score = balanced_accuracy_score(y_test.T, y_pred.T, average="macro")
print("Balanced accuracy: {:.2f}".format(ba_score))
# Same metric on the training split, for comparison with the test value.
ba_bias_score = balanced_accuracy_score(y_train.T, y_pred_bias.T, average="macro")
print("Balanced accuracy bias: {:.2f}".format(ba_bias_score))

0 
 [[    0 36867]
 [    0 20690]]
1 
 [[    2 32327]
 [   20 25208]]
2 
 [[52817  1042]
 [ 2130  1568]]
3 
 [[32172    67]
 [  272     3]]
4 
 [[21469    96]
 [  218   147]]
5 
 [[    0 37409]
 [    0 14551]]
6 
 [[4757   16]
 [  21    0]]
7 
 [[25119    22]
 [ 1229    27]]
8 
 [[51507   107]
 [ 1693    20]]
9 
 [[11037 37910]
 [ 1018 11295]]
10 
 [[    0  3102]
 [    0 32464]]
11 
 [[24815  4026]
 [  604  2498]]
12 
 [[40741   363]
 [ 1201   596]]
13 
 [[31092     0]
 [  251     0]]
14 
 [[35869   288]
 [ 1513   930]]
15 
 [[27002   505]
 [  192    27]]
16 
 [[    1 42980]
 [    0 26413]]
17 
 [[25332     3]
 [  396     3]]
18 
 [[ 3394 26120]
 [   68  4787]]
19 
 [[48696   337]
 [  712   225]]
20 
 [[48812     0]
 [  985     0]]
21 
 [[30387     0]
 [  422     0]]
22 
 [[13114     0]
 [   79     0]]
23 
 [[2653    0]
 [  28    0]]
24 
 [[34987     0]
 [  471     0]]
25 
 [[27739     0]
 [  518     0]]
26 
 [[16158     0]
 [  121     0]]
27 
 [[30978     0]
 [  603     0]]
28 
 [[172