In [1]:
%tensorflow_version 2.x

TensorFlow 2.x selected.


In [0]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import os
import tensorflow_datasets as tfds

In [0]:
# Seed both global random generators with the same value so weight
# initialisation and any NumPy sampling start from a fixed state.
SEED = 1228
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [4]:
# Load the IMDB review dataset bundled with Keras as train/test splits of
# (sequences, labels); `num_words=80000` is the vocabulary cap passed to the loader.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(num_words=80000)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [5]:
# Fetch the IMDB word index that ships with the dataset (downloaded on first use).
word_index = tf.keras.datasets.imdb.get_word_index()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [6]:
# Number of entries in the word index (for comparison with the 80000 cap used when loading).
len(word_index)

88584

In [0]:
def length_check(x):
  """Return the length of the longest item in `x`.

  Args:
    x: iterable of sized items (e.g. a collection of token-id lists).

  Returns:
    The largest `len(item)` found in `x`, or 0 when `x` is empty.
  """
  # A generator avoids building a temporary list, and `default=0` keeps an
  # empty collection from raising ValueError.
  return max((len(seq) for seq in x), default=0)

In [8]:
# Show the length of the longest training sequence before any padding is applied.
length_check(x_train)

2494

In [9]:
# Keep the longest training-sequence length for reference.
# NOTE(review): the padding cell below pads to a fixed 1000, not to `maxlength`.
maxlength = length_check(x_train)

maxlength

2494

In [0]:
# Bring both splits to a fixed length of 1000 tokens, padding at the end
# ('post'). Truncation of longer sequences is left at the pad_sequences default.
x_train, x_test = (
    tf.keras.preprocessing.sequence.pad_sequences(split, maxlen=1000, padding='post')
    for split in (x_train, x_test)
)

In [0]:
class layer_parameters:
  """Container for settings shared by the custom attention layers below."""

  # Glorot-normal initializer with a fixed seed. It is a class attribute, so
  # every layer that reads it receives the same initializer object.
  kernel_initializer = tf.keras.initializers.GlorotNormal(seed=1228)


# Module-level instance through which the layers access the shared settings.
layer_param = layer_parameters()

In [0]:
class My_Attention(tf.keras.layers.Layer):
  """Additive attention pooling over axis 1 (the step axis) of its input.

  Every step is scored with `u_w(tanh(w(inputs)))`, the scores are
  softmax-normalised along axis 1, and the inputs are summed with those
  weights.

  Args:
    units: width of the hidden `tanh` projection used for scoring.
    **kwargs: forwarded to `tf.keras.layers.Layer`.

  Call args:
    inputs: tensor indexed as (batch, steps, features).
    mask: optional (batch, steps) mask, e.g. the one produced by
      `Embedding(mask_zero=True)`. Masked steps receive (numerically) zero
      attention weight. With `mask=None` the layer behaves as before.

  Returns:
    A tuple `(si, ait)`: the attention-weighted sum over axis 1 and the
    attention weights with a trailing axis of size 1.
  """

  def __init__(self, units, **kwargs):
    super(My_Attention, self).__init__(**kwargs)
    self.units = units
    # Declare mask support so an incoming padding mask is an expected input.
    self.supports_masking = True

    self.w = tf.keras.layers.Dense(units, kernel_initializer=layer_param.kernel_initializer)
    self.u_w = tf.keras.layers.Dense(1, kernel_initializer=layer_param.kernel_initializer, use_bias=False)

  def compute_mask(self, inputs, mask=None):
    # The step axis is reduced away, so the per-step mask must not be
    # attached to the outputs and passed on to later layers.
    return None

  def call(self, inputs, mask=None):
    uit = tf.nn.tanh(self.w(inputs))
    scores = self.u_w(uit)

    if mask is not None:
      # Push the scores of masked steps to a large negative value so the
      # softmax assigns them ~0 weight and padding cannot dilute the result.
      keep = tf.cast(tf.expand_dims(mask, axis=-1), scores.dtype)
      scores += (1.0 - keep) * -1e9

    ait = tf.nn.softmax(scores, axis=1)
    si = tf.reduce_sum(ait * inputs, axis=1)
    return si, ait


In [0]:
def dot_product(x, kernel):
    """
    Multiply `x` by `kernel` through the Keras backend.

    On the TensorFlow backend the kernel is given a trailing axis before the
    dot product and that axis is squeezed out of the result again; on any
    other backend a plain backend `dot` is returned.
    Args:
        x (): input
        kernel (): weights
    Returns:
        The backend tensor produced by the dot product.
    """
    K = tf.keras.backend

    # Guard clause for the non-TensorFlow path.
    if K.backend() != 'tensorflow':
        return K.dot(x, kernel)

    expanded_kernel = K.expand_dims(kernel)
    product = K.dot(x, expanded_kernel)
    return K.squeeze(product, axis=-1)

class AttentionWithContext(tf.keras.layers.Layer):
    """
    Attention operation, with a context/query vector, for temporal data.
    Supports Masking.
    Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf]
    "Hierarchical Attention Networks for Document Classification"
    by using a context vector to assist the attention
    # Arguments
        W_regularizer, u_regularizer, b_regularizer: optional regularizers
            (anything accepted by `tf.keras.regularizers.get`) for the
            projection matrix `W`, the context vector `u` and the bias `b`.
        W_constraint, u_constraint, b_constraint: optional constraints
            (anything accepted by `tf.keras.constraints.get`) for the same weights.
        bias: whether to add the bias `b` before the `tanh`.
    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.
    How to use:
    Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
    The dimensions are inferred based on the output shape of the RNN.
    Note: The layer has been tested with Keras 2.0.6
    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(AttentionWithContext())
        # next add a Dense layer (for classification/regression) or whatever...
    """

    def __init__(self,
                 W_regularizer=None, u_regularizer=None, b_regularizer=None,
                 W_constraint=None, u_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        # Initialise the base layer first: it sets up attribute tracking and
        # its own defaults, so values assigned before this call can be reset.
        super(AttentionWithContext, self).__init__(**kwargs)

        self.supports_masking = True
        # Weight initializer shared through the module-level `layer_param`.
        self.init = layer_param.kernel_initializer

        self.W_regularizer = tf.keras.regularizers.get(W_regularizer)
        self.u_regularizer = tf.keras.regularizers.get(u_regularizer)
        self.b_regularizer = tf.keras.regularizers.get(b_regularizer)

        self.W_constraint = tf.keras.constraints.get(W_constraint)
        self.u_constraint = tf.keras.constraints.get(u_constraint)
        self.b_constraint = tf.keras.constraints.get(b_constraint)

        self.bias = bias

    def build(self, input_shape):
        """Create `W` (features x features), optional `b` and `u` (features,)."""
        assert len(input_shape) == 3

        self.W = self.add_weight(shape=(input_shape[-1], input_shape[-1],),
                                 initializer=self.init,
                                 name='W',
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight(shape=(input_shape[-1],),
                                     initializer='zeros',
                                     name='b',
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)

        self.u = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='u',
                                 regularizer=self.u_regularizer,
                                 constraint=self.u_constraint)

        super(AttentionWithContext, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over axis 1.

        `mask`, when given, zeroes the un-normalised attention weights of
        masked steps before they are normalised.
        """
        uit = dot_product(x, self.W)

        if self.bias:
            uit += self.b

        uit = tf.keras.backend.tanh(uit)
        ait = dot_product(uit, self.u)

        a = tf.keras.backend.exp(ait)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX so it can be multiplied with the weights
            a *= tf.keras.backend.cast(mask, tf.keras.backend.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        a /= tf.keras.backend.cast(tf.keras.backend.sum(a, axis=1, keepdims=True) + tf.keras.backend.epsilon(), tf.keras.backend.floatx())

        # Add a trailing axis so the weights broadcast over the feature axis.
        a = tf.keras.backend.expand_dims(a)
        weighted_input = x * a
        return tf.keras.backend.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super(AttentionWithContext, self).get_config()
        config.update({
            'W_regularizer': tf.keras.regularizers.serialize(self.W_regularizer),
            'u_regularizer': tf.keras.regularizers.serialize(self.u_regularizer),
            'b_regularizer': tf.keras.regularizers.serialize(self.b_regularizer),
            'W_constraint': tf.keras.constraints.serialize(self.W_constraint),
            'u_constraint': tf.keras.constraints.serialize(self.u_constraint),
            'b_constraint': tf.keras.constraints.serialize(self.b_constraint),
            'bias': self.bias,
        })
        return config

In [0]:
# Re-check the word-index size; the embedding below is sized 80000+1 rather than from this value.
len(word_index)

88584

In [0]:
# Baseline comparison model (no masking): one shared encoder, three heads.
# Encoder: 1000-token input -> 256-d embedding -> bidirectional LSTM that
# returns the full sequence.
text_input = tf.keras.layers.Input(shape=(1000,))
emb = tf.keras.layers.Embedding(80000+1, 256)(text_input)
bi_lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256, return_sequences=True))(emb)

my_attention = My_Attention(512)

# Three ways of reducing the encoder sequence to one vector per review:
# the custom My_Attention layer, AttentionWithContext, and a plain Flatten.
matt, weight = my_attention(bi_lstm)
or_attention = AttentionWithContext()(bi_lstm)
plane = tf.keras.layers.Flatten()(bi_lstm)

# One sigmoid unit per head; the layer names are the keys used by compile().
ma_out = tf.keras.layers.Dense(1, activation='sigmoid', name='Brief_Att')(matt)
or_out = tf.keras.layers.Dense(1, activation='sigmoid', name='Full_Att')(or_attention)
pl_out = tf.keras.layers.Dense(1, activation='sigmoid', name='Plane_No_Att')(plane)


# Single model with three outputs so all heads train on the same encoder.
model = tf.keras.models.Model(text_input, [ma_out, or_out, pl_out])

In [0]:
# Print the layer-by-layer summary of the baseline three-head model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1000)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1000, 256)    20480256    input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 1000, 512)    1050624     embedding[0][0]                  
__________________________________________________________________________________________________
my__attention (My_Attention)    ((None, 512), (None, 263168      bidirectional[0][0]              
______________________________________________________________________________________________

In [0]:
# Same loss (binary cross-entropy) and metric (accuracy) for every head;
# the dict keys are the `name=` values of the three output Dense layers.
model.compile(loss={'Brief_Att':'binary_crossentropy', 'Full_Att':'binary_crossentropy', 'Plane_No_Att':'binary_crossentropy'},
              optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
              metrics={'Brief_Att':'acc', 'Full_Att':'acc', 'Plane_No_Att':'acc'})

In [0]:
# Train all three heads against the same labels, validating on the test split.
# NOTE(review): epochs=100 with no callbacks, so the run ends only at 100 epochs or on manual interruption.
hist = model.fit(x_train, [y_train, y_train, y_train], batch_size=32, epochs=100, validation_data=(x_test, [y_test, y_test, y_test]))

Train on 25000 samples, validate on 25000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100

# Loss with masking (`mask_zero=True`)

In [0]:
# Same three-head architecture as the baseline cell; the only difference is
# `mask_zero=True` on the embedding, which enables masking of index 0 (the padding value).
text_input = tf.keras.layers.Input(shape=(1000,))
emb = tf.keras.layers.Embedding(80000+1, 256, mask_zero=True)(text_input)
bi_lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256, return_sequences=True))(emb)

my_attention = My_Attention(512)

# Three reductions of the encoder sequence: My_Attention, AttentionWithContext, Flatten.
matt, weight = my_attention(bi_lstm)
or_attention = AttentionWithContext()(bi_lstm)
plane = tf.keras.layers.Flatten()(bi_lstm)

# One sigmoid unit per head; the layer names are the keys used by compile().
ma_out = tf.keras.layers.Dense(1, activation='sigmoid', name='Brief_Att')(matt)
or_out = tf.keras.layers.Dense(1, activation='sigmoid', name='Full_Att')(or_attention)
pl_out = tf.keras.layers.Dense(1, activation='sigmoid', name='Plane_No_Att')(plane)


# Rebinds the global `model`; the baseline model object is no longer referenced by that name.
model = tf.keras.models.Model(text_input, [ma_out, or_out, pl_out])

In [0]:
# Print the layer-by-layer summary of the masked three-head model.
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 1000)]       0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1000, 256)    20480256    input_2[0][0]                    
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 1000, 512)    1050624     embedding_1[0][0]                
__________________________________________________________________________________________________
my__attention_1 (My_Attention)  ((None, 512), (None, 263168      bidirectional_1[0][0]            
____________________________________________________________________________________________

In [0]:
# Identical compile settings to the baseline so the two runs differ only in masking:
# binary cross-entropy and accuracy per head, Adam with learning rate 1e-4.
model.compile(loss={'Brief_Att':'binary_crossentropy', 'Full_Att':'binary_crossentropy', 'Plane_No_Att':'binary_crossentropy'},
              optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
              metrics={'Brief_Att':'acc', 'Full_Att':'acc', 'Plane_No_Att':'acc'})

In [0]:
# Train the masked model with the same data, batch size and epoch budget as the baseline.
# Rebinds `hist`, so the baseline training history is no longer available under that name.
hist = model.fit(x_train, [y_train, y_train, y_train], batch_size=32, epochs=100, validation_data=(x_test, [y_test, y_test, y_test]))

Train on 25000 samples, validate on 25000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/1

## With dropout

In [0]:
# Masked three-head model with regularisation added to the shared encoder:
# SpatialDropout1D(0.3) on the embeddings, plus dropout=0.3 and
# recurrent_dropout=0.3 inside the LSTM. The heads are unchanged.
text_input = tf.keras.layers.Input(shape=(1000,))
emb = tf.keras.layers.Embedding(80000+1, 256, mask_zero=True)(text_input)
emb = tf.keras.layers.SpatialDropout1D(0.3)(emb)
bi_lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256, return_sequences=True, recurrent_dropout=0.3, dropout=0.3))(emb)

my_attention = My_Attention(512)

# Three reductions of the encoder sequence: My_Attention, AttentionWithContext, Flatten.
matt, weight = my_attention(bi_lstm)
or_attention = AttentionWithContext()(bi_lstm)
plane = tf.keras.layers.Flatten()(bi_lstm)

# One sigmoid unit per head; the layer names are the keys used by compile().
ma_out = tf.keras.layers.Dense(1, activation='sigmoid', name='Brief_Att')(matt)
or_out = tf.keras.layers.Dense(1, activation='sigmoid', name='Full_Att')(or_attention)
pl_out = tf.keras.layers.Dense(1, activation='sigmoid', name='Plane_No_Att')(plane)


# Rebinds the global `model` to the dropout variant.
model = tf.keras.models.Model(text_input, [ma_out, or_out, pl_out])

In [0]:
# Same compile settings as the earlier runs: binary cross-entropy and accuracy
# per head, Adam with learning rate 1e-4.
model.compile(loss={'Brief_Att':'binary_crossentropy', 'Full_Att':'binary_crossentropy', 'Plane_No_Att':'binary_crossentropy'},
              optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
              metrics={'Brief_Att':'acc', 'Full_Att':'acc', 'Plane_No_Att':'acc'})

In [0]:
# Train the dropout variant with the same data, batch size and epoch budget as the earlier runs.
hist = model.fit(x_train, [y_train, y_train, y_train], batch_size=32, epochs=100, validation_data=(x_test, [y_test, y_test, y_test]))

Train on 25000 samples, validate on 25000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100

## **)

In [0]:
# The plain (no-attention) branch used a global max pool <- this is not the correct way to compare the results.
# NOTE(review): this cell trains whichever `model` is currently bound in the kernel; no cell
# visible here builds a global-max-pool variant, so this run cannot be reproduced from the cells shown.

hist = model.fit(x_train, [y_train, y_train, y_train], batch_size=32, epochs=100, validation_data=(x_test, [y_test, y_test, y_test]))

Train on 25000 samples, validate on 25000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100

KeyboardInterrupt: ignored