In [1]:
from __future__ import print_function

import numpy as np
import pickle
import random
import tensorflow as tf
tf.__version__

'2.0.0'

In [2]:
from tensorflow.keras import backend as K
from tensorflow.keras import optimizers
from tensorflow.keras import regularizers

from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import BatchNormalization, Add, Subtract, Concatenate, SpatialDropout1D
from tensorflow.keras.layers import Dense, Dropout, Activation, Input, LSTM, Embedding, Bidirectional, Flatten
from tensorflow.keras.layers import Lambda

from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential

from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from tensorflow.python.keras.constraints import maxnorm
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras import regularizers


from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split

from selectivnet_utils import *

In [3]:
class SnBilstm:
    """Selective classifier built on a stacked bidirectional LSTM.

    The network shares one trunk between three heads: a classification head
    (f) and a selection head (g), concatenated into the ``selective_head``
    output, plus an auxiliary classification head (h) exposed as
    ``classification_head``.

    Constructing an instance loads the data, builds the model and then either
    trains it or loads previously saved weights.

    Args:
        train: If True, train the model and write weights and training
            history under ``checkpoints/``; otherwise load weights from
            ``checkpoints/<filename>``.
        filename: Name of the weights file inside ``checkpoints/``.
        coverage: Target coverage ``c`` used by the selective-loss penalty.
        alpha: Loss weight of the selective head; the auxiliary head gets
            ``1 - alpha``.
        baseline: If True, ``alpha`` is forced to 0 so the selective loss
            gets zero weight during training.
    """

    def __init__(self, train=True, filename="weight-enigma.h5", coverage=0.8, alpha=0.5, baseline=False):
        self.lamda = coverage  # target coverage (attribute name kept for compatibility)
        self.alpha = alpha
        # Dropout level of the final Lambda layer; 0 disables it.
        self.mc_dropout_rate = K.variable(value=0)
        self.num_classes = 3
        self.weight_decay = 0.0005
        self.valid_per = 0.2  # fraction of the samples held out for validation

        self.x_shape = 100  # padded sequence length
        self._load_data()

        self.filename = filename

        self.model = self.build_model()
        if baseline:
            self.alpha = 0

        if train:
            self.model = self.train(self.model)
        else:
            self.model.load_weights("checkpoints/{}".format(self.filename))

    def build_model(self):
        """Build the three-headed BiLSTM network.

        Also stores the input tensor in ``self.input`` and a trunk-only model
        in ``self.model_embeding``.

        Returns:
            A Keras ``Model`` whose outputs are ``[selective_output,
            auxiliary_output]``; ``selective_output`` holds the
            ``num_classes`` class probabilities followed by the selection
            score in its last column.
        """
        weight_decay = self.weight_decay
        basic_dropout_rate = 0.1
        # Keras documents ``shape`` as a tuple; a bare int is not accepted by
        # every version.
        inputs = Input(shape=(self.x_shape,))

        embedding_size = 128
        print("Vocab Size:{}".format(self.vocab_size))
        print("Embedding Size:{}".format(embedding_size))

        # Trainable embedding layer (no pretrained weights are passed in).
        curr = Embedding(input_dim=self.vocab_size, output_dim=embedding_size)(inputs)
        curr = Bidirectional(LSTM(128, return_sequences=True, dropout=basic_dropout_rate + 0.2, recurrent_dropout=0.1, kernel_regularizer=regularizers.l2(weight_decay)))(curr)
        curr = Activation('relu')(curr)
        curr = BatchNormalization()(curr)

        curr = Bidirectional(LSTM(128, return_sequences=False, dropout=basic_dropout_rate + 0.2, recurrent_dropout=0.1, kernel_regularizer=regularizers.l2(weight_decay)))(curr)
        curr = Activation('relu')(curr)
        curr = BatchNormalization()(curr)

        curr = Dense(128, kernel_regularizer=regularizers.l2(weight_decay))(curr)
        curr = Activation('relu')(curr)
        curr = Dropout(basic_dropout_rate + 0.2)(curr)
        curr = BatchNormalization()(curr)

        curr = Dense(64, kernel_regularizer=regularizers.l2(weight_decay))(curr)
        curr = Activation('relu')(curr)
        curr = BatchNormalization()(curr)
        # Dropout driven by a backend variable so its level can be changed
        # after the model has been built.
        curr = Lambda(lambda x: K.dropout(x, level=self.mc_dropout_rate))(curr)

        # classification head (f)
        curr1 = Dense(self.num_classes, activation='softmax')(curr)

        # selection head (g)
        curr2 = Dense(64, kernel_regularizer=regularizers.l2(weight_decay))(curr)
        curr2 = Activation('relu')(curr2)
        curr2 = BatchNormalization()(curr2)
        # this normalization is identical to initialization of batchnorm gamma to 1/10
        curr2 = Lambda(lambda x: x / 10)(curr2)
        curr2 = Dense(1, activation='sigmoid')(curr2)
        selective_output = Concatenate(axis=1, name="selective_head")([curr1, curr2])

        # auxiliary head (h)
        auxiliary_output = Dense(self.num_classes, activation='softmax', name="classification_head")(curr)

        model = Model(inputs=inputs, outputs=[selective_output, auxiliary_output])

        self.input = inputs
        self.model_embeding = Model(inputs=inputs, outputs=curr)
        # summary() prints by itself and returns None, so it is not wrapped
        # in print().
        model.summary()
        return model

    def predict(self, x=None, batch_size=128):
        """Run the full model; defaults to the held-out split ``self.x_test``.

        Returns:
            Whatever ``self.model.predict`` returns, i.e. the selective-head
            and auxiliary-head outputs.
        """
        if x is None:
            x = self.x_test
        return self.model.predict(x, batch_size)

    def predict_embedding(self, x=None, batch_size=128):
        """Return the trunk output (input of the heads) for ``x``.

        Defaults to the held-out split ``self.x_test``.
        """
        if x is None:
            x = self.x_test
        return self.model_embeding.predict(x, batch_size)

    def selective_risk_at_coverage(self, coverage, mc=False):
        """Accuracy on the most confident ``coverage`` fraction of ``x_test``.

        Args:
            coverage: Fraction in (0, 1] of the held-out samples to keep.
            mc: If False (default) confidence is the softmax response, i.e.
                the maximum auxiliary-head probability. If True confidence
                comes from ``self.mc_dropout()``, which this class does not
                define and must be supplied separately.

        Returns:
            Mean accuracy of the auxiliary head over the covered samples.

        Raises:
            NotImplementedError: If ``mc`` is True and no ``mc_dropout``
                attribute is available.
        """
        _, pred = self.predict()

        # The two branches used to be swapped: the default path called the
        # undefined mc_dropout() and mc=True used the softmax response.
        if mc:
            mc_dropout = getattr(self, "mc_dropout", None)
            if mc_dropout is None:
                raise NotImplementedError(
                    "mc=True requires an 'mc_dropout' method, which SnBilstm does not define")
            sr = mc_dropout()
        else:
            sr = np.max(pred, 1)

        n_samples = pred.shape[0]
        sr_sorted = np.sort(sr)
        # Confidence of the k-th most confident sample, k = int(coverage * n).
        threshold = sr_sorted[n_samples - int(coverage * n_samples)]
        # ">=" keeps the threshold sample itself, so coverage=1.0 covers
        # every sample instead of dropping the least confident one.
        covered_idx = sr >= threshold
        selective_acc = np.mean(np.argmax(pred[covered_idx], 1) == np.argmax(self.y_test[covered_idx], 1))
        return selective_acc

    def _load_data(self):
        """Load tokenizer and tokenized samples, pad, and split train/validation.

        Sets ``vocab_size``, ``x_train``, ``y_train``, ``x_test`` and
        ``y_test``. Labels are one-hot encoded with ``num_classes + 1``
        columns; the losses and metrics strip the last column again.
        """
        # NOTE: pickle.load executes arbitrary code from the file; only use
        # these paths with files from a trusted source.
        with open('models/tokeniser_2020_7_14.pickle', 'rb') as handle:
            tokenizer = pickle.load(handle)

        with open('train_data/tokenized_data_2020_7_14.pickle', 'rb') as handle:
            data = pickle.load(handle)

        self.vocab_size = len(tokenizer.word_index) + 1

        # Each record unpacks into (token sequence, label).
        sequences = []
        labels = []
        for tokens, label in data:
            sequences.append(tokens)
            labels.append(label)
        y = np.array(labels)

        data_padded = pad_sequences(sequences, maxlen=self.x_shape, padding='pre', truncating='pre')

        total_samples = data_padded.shape[0]
        # Validation size rounded up to a multiple of 128 (the batch size
        # used in train()).
        n_val = int((self.valid_per * total_samples) / 128)*128 + 128
        n_train = total_samples - n_val

        # NOTE(review): the shuffle is unseeded, so the split differs between
        # runs; with train=False the held-out split may overlap the samples
        # the loaded weights were trained on.
        random_i = random.sample(range(total_samples), total_samples)
        self.x_train = data_padded[random_i[:n_train]]
        self.y_train = tf.keras.utils.to_categorical(y[random_i[:n_train]], self.num_classes + 1)
        self.x_test = data_padded[random_i[n_train:]]
        self.y_test = tf.keras.utils.to_categorical(y[random_i[n_train:]], self.num_classes + 1)
        print("Train Shapes - X: {} - Y: {}".format(self.x_train.shape, self.y_train.shape))
        print("Val Shapes - X: {} - Y: {}".format(self.x_test.shape, self.y_test.shape))

    def train(self, model):
        """Compile and fit ``model``, then save its history and weights.

        Writes ``checkpoints/<filename minus last 3 chars>_history.pkl`` and
        ``checkpoints/<filename>``.

        Args:
            model: The Keras model returned by ``build_model``.

        Returns:
            The same model after training.
        """
        c = self.lamda  # target coverage
        lamda = 32      # weight of the coverage penalty

        def selective_loss(y_true, y_pred):
            # Cross-entropy with targets scaled by the selection score (last
            # column of y_pred), plus a quadratic penalty whenever the mean
            # selection score of the batch falls below the target coverage.
            loss = K.categorical_crossentropy(
                K.repeat_elements(y_pred[:, -1:], self.num_classes, axis=1) * y_true[:, :-1],
                y_pred[:, :-1]) + lamda * K.maximum(-K.mean(y_pred[:, -1]) + c, 0) ** 2
            return loss

        def selective_acc(y_true, y_pred):
            # Accuracy restricted to samples whose selection score is > 0.5.
            g = K.cast(K.greater(y_pred[:, -1], 0.5), K.floatx())
            hits = K.sum(
                (g) * K.cast(K.equal(K.argmax(y_true[:, :-1], axis=-1), K.argmax(y_pred[:, :-1], axis=-1)), K.floatx()))
            # Clamp the denominator so a batch with no selected sample gives
            # 0 instead of NaN; unchanged whenever at least one is selected.
            acc = hits / K.maximum(K.sum(g), K.epsilon())
            return K.cast(acc, K.floatx())

        def coverage(y_true, y_pred):
            # Fraction of samples whose selection score is > 0.5.
            g = K.cast(K.greater(y_pred[:, -1], 0.5), K.floatx())
            return K.mean(g)

        # training parameters
        batch_size = 128
        maxepoches = 10
        learning_rate = 0.1

        lr_decay = 1e-6

        lr_drop = 25

        def lr_scheduler(epoch):
            # Halve the learning rate every lr_drop epochs.
            return learning_rate * (0.5 ** (epoch // lr_drop))

        reduce_lr = tf.keras.callbacks.LearningRateScheduler(lr_scheduler)

        # optimization details ("learning_rate" replaces the deprecated "lr" alias)
        sgd = optimizers.SGD(learning_rate=learning_rate, decay=lr_decay, momentum=0.9, nesterov=True)

        model.compile(loss=[selective_loss, 'categorical_crossentropy'], loss_weights=[self.alpha, 1 - self.alpha],
                      optimizer=sgd, metrics=['accuracy', selective_acc, coverage], experimental_run_tf_function=False)

        # The auxiliary head is trained on the labels without the extra
        # last column.
        historytemp = model.fit(self.x_train, [self.y_train, self.y_train[:, :-1]], batch_size=batch_size,
                                epochs=maxepoches, callbacks=[reduce_lr],
                                validation_data=(self.x_test, [self.y_test, self.y_test[:, :-1]]))

        # filename[:-3] drops a three-character extension such as ".h5".
        with open("checkpoints/{}_history.pkl".format(self.filename[:-3]), 'wb') as handle:
            pickle.dump(historytemp.history, handle, protocol=pickle.HIGHEST_PROTOCOL)

        model.save_weights("checkpoints/{}".format(self.filename))

        return model



In [None]:
# Instantiate with the defaults: train=True, so the constructor loads the data,
# builds the model, trains it and saves weights/history under checkpoints/.
init_model = SnBilstm()

Train Shapes - X: (75036, 100) - Y: (75036, 4)
Val Shapes - X: (18816, 100) - Y: (18816, 4)
Vocab Size:114557
Embedding Size:128
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 100)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 100, 128)     14663296    input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 100, 256)     263168      embedding[0][0]                  
__________________________________________________________________________________________________
activation (Activation)         (None, 100, 256)     0          