In [1]:
import tensorflow as tf
print(tf.__version__)

2.0.0


In [2]:
physical_devices = tf.config.experimental.list_physical_devices()
print(*physical_devices, sep='\n')

PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')
PhysicalDevice(name='/physical_device:XLA_CPU:0', device_type='XLA_CPU')
PhysicalDevice(name='/physical_device:XLA_GPU:0', device_type='XLA_GPU')
PhysicalDevice(name='/physical_device:XLA_GPU:1', device_type='XLA_GPU')
PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')


In [3]:
# List the physical GPUs TensorFlow can see and show them.
gpus = tf.config.experimental.list_physical_devices('GPU')
print(gpus)
if gpus:
    try:
        # Restrict TensorFlow to only use the first GPU (gpus[0])
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        # Logical devices reflect the visibility restriction applied above.
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Presumably raised when the visible-device list is changed after the
        # runtime was already initialized -- confirm against the TF docs.
        print(e)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]
2 Physical GPUs, 1 Logical GPUs


In [8]:
# test_var = tf.constant([1.0,2.0,3.0], name='test_var')
# print(test_var)
# print("Variable placed on device: ", test_var.device)

# # with tf.device('/device:XLA_GPU:1'):
# a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name='a')
# b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b')
# print(a.device)
# print(b.device)

# ?????
# print("Is GPU available: ", tf.test.is_gpu_available())

In [4]:
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
import numpy as np
from tensorflow.keras import backend as K
from time import time
import random

In [5]:
from tensorboard import version;
print(version.VERSION)
%load_ext tensorboard

2.0.2


In [6]:
# from google.colab import drive
# drive.mount('/content/gdrive')

data_dir_path = './datasets/'

In [7]:
# Weighted 6:4 sub-split of the TRAIN split into train / validation parts.
train_validation_split = tfds.Split.TRAIN.subsplit([6, 4])

# Load the pre-tokenised IMDB reviews from the local data directory (no download).
loaded_splits, info = tfds.load(
    'imdb_reviews/subwords8k',
    split=(train_validation_split, tfds.Split.TEST),
    data_dir=data_dir_path,
    download=False,
    as_supervised=True,
    with_info=True,
)
(train_dataset, validation_dataset), test_dataset = loaded_splits

In [8]:
print("info features: ", info.features)
encoder = info.features["text"].encoder
print("\n Vocabulary size: ", encoder.vocab_size)

info features:  FeaturesDict({
    'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
    'text': Text(shape=(None,), dtype=tf.int64, encoder=<SubwordTextEncoder vocab_size=8185>),
})

 Vocabulary size:  8185


In [9]:
BUFFER_SIZE = 10000
BATCH_SIZE = 64


def _pad_and_batch(dataset):
    """Batch `dataset` into groups of BATCH_SIZE, padding to the dataset's own output shapes."""
    # get_output_shapes returns the shape of each component of an element of the dataset.
    element_shapes = tf.compat.v1.data.get_output_shapes(dataset)
    return dataset.padded_batch(BATCH_SIZE, element_shapes)


# shuffle() fills a buffer with BUFFER_SIZE elements and samples from it at random,
# refilling as it goes. Perfect shuffling needs a buffer at least as large as the dataset.
train_dataset = _pad_and_batch(train_dataset.shuffle(BUFFER_SIZE))

validation_dataset = _pad_and_batch(validation_dataset)

test_dataset = _pad_and_batch(test_dataset)

The base class [RNN](https://www.tensorflow.org/api_docs/python/tf/keras/layers/RNN?version=stable) for recurrent layers inherits from the class `keras.layers.Layer`. Each RNN cell instance must have the following:

*   state_size attribute
*   output_size attribute
*   call(input_at_t, state_at_t) method, which returns output_at_t and state_at_t_plus_1.
*   get_initial_state(inputs=None, batch_size=None, dtype=None) method that creates a tensor meant to be fed to call() as the initial state, if the user didn't specify any initial state via other means.



In [10]:
class CustomRNNCell(tf.keras.layers.Layer):
    """Minimal linear recurrent cell meant to be wrapped by `tf.keras.layers.RNN`.

    At each step it computes
    `output_t = inputs_t . kernel + state_t . recurrent_kernel`
    with no bias and no activation. The output is also the next state.

    Args:
        units: Size of the hidden state and of the per-step output.
        **kwargs: Forwarded to `tf.keras.layers.Layer`. An optional
            `teacher_forcing` entry is consumed here and stored on the cell;
            `call` does not use it yet.
    """

    def __init__(self, units, **kwargs):
        # Remove our own option before delegating: the base Layer initializer
        # raises on keyword arguments it does not know about.
        teacher_forcing = kwargs.pop('teacher_forcing', False)

        # Initialise the base layer first so attribute tracking is set up
        # before any attribute is assigned on the instance.
        super(CustomRNNCell, self).__init__(**kwargs)

        self.units = units
        self.state_size = units
        # The output of a step has the same width as the state.
        self.output_size = units

        self.teacher_forcing = teacher_forcing
        self.teacher_forcing_ratio = 0.4

    def build(self, input_shape):
        """Create the input and recurrent weight matrices."""
        # add_weight comes from the base layer class and registers a new variable.

        # Weight matrix used for the linear transformation of the inputs.
        self.kernel = self.add_weight(shape=(input_shape[-1], self.units), initializer='uniform',
                                      name='kernel')

        # Weight matrix used for the linear transformation of the recurrent state.
        self.recurrent_kernel = self.add_weight(shape=(self.units, self.units), initializer='glorot_uniform',
                                                name='recurrent_kernel')
        self.built = True

    def call(self, inputs, states):
        """Run one time step.

        Args:
            inputs: Input at the current step.
            states: Sequence whose first element is the previous output/state.

        Returns:
            `(output, [output])` -- for this simple cell the step output and
            the next hidden state are the same tensor.
        """
        # TODO: teacher forcing (self.teacher_forcing / self.teacher_forcing_ratio)
        # is not implemented yet.
        prev_output = states[0]
        h = K.dot(inputs, self.kernel)
        output = h + K.dot(prev_output, self.recurrent_kernel)
        return output, [output]

In [11]:
class LossFunction:
    """Hand-written loss functions usable as Keras `loss=` callables."""

    @staticmethod
    def binary_crossentropy(y_true, y_pred, from_logits=False):
        """Mean binary cross-entropy over all elements.

        Args:
            y_true: Labels; cast to the dtype of `y_pred`.
            y_pred: Predicted probabilities, or raw logits when `from_logits`
                is True.
            from_logits: Whether `y_pred` holds logits instead of probabilities.

        Returns:
            Scalar tensor with the mean loss.
        """
        y_true = tf.cast(y_true, y_pred.dtype)

        if from_logits:
            # - x * z + log(1 + exp(x)), x = logits, z = labels.
            # softplus(x) equals log(1 + exp(x)) but does not overflow for large x.
            return tf.reduce_mean(tf.math.softplus(y_pred) - y_pred * y_true)

        # Probabilities: clip away from 0 and 1 so the logarithms stay finite.
        # epsilon is Keras' fuzz factor (1e-7 by default).
        epsilon = tf.keras.backend.epsilon()
        clipped_y_pred = tf.clip_by_value(y_pred, clip_value_min=epsilon, clip_value_max=(1. - epsilon))
        positive_term = y_true * tf.math.log(clipped_y_pred + epsilon)
        negative_term = (1. - y_true) * tf.math.log(epsilon + (1. - clipped_y_pred))
        return -tf.reduce_mean(positive_term + negative_term)

In [9]:
def generate_model(use_dropout=False):
    """Build the baseline classifier: Embedding -> custom RNN -> Dense -> sigmoid.

    Args:
        use_dropout: When True, insert a Dropout(0.5) layer after the RNN.

    Returns:
        The uncompiled `tf.keras.Sequential` model.
    """
    model = tf.keras.Sequential()

    model.add(tf.keras.layers.Embedding(input_dim=encoder.vocab_size, output_dim=64))
    model.add(tf.keras.layers.RNN([CustomRNNCell(8)]))
    if use_dropout:
        model.add(tf.keras.layers.Dropout(0.5))
    model.add(tf.keras.layers.Dense(16, activation='relu'))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

    # summary() prints the table itself and returns None, so it is not wrapped
    # in print() (that would emit a stray "None" line).
    model.summary()

    return model

def compile_model(model):
    """Compile `model` in place with the custom BCE loss, Adam and accuracy, then return it."""
    compile_options = dict(
        optimizer='adam',
        loss=LossFunction.binary_crossentropy,
        metrics=['accuracy'],
    )
    model.compile(**compile_options)
    return model

def train_model(model, train_dataset, validation_dataset, epochs=20):
    """Fit `model` and return the training history.

    Args:
        model: Compiled model exposing a Keras-style `fit`.
        train_dataset: Batched training data.
        validation_dataset: Batched validation data; only 5 batches are
            evaluated per epoch (`validation_steps=5`).
        epochs: Number of passes over the training data.

    Returns:
        Whatever `model.fit` returns (a `History` object for Keras models).
    """
    # To log to TensorBoard, add for example:
    #     tf.keras.callbacks.TensorBoard(log_dir='./tf_logs/rnn_1', histogram_freq=10, write_graph=True)
    callbacks = []

    history = model.fit(train_dataset, epochs=epochs, validation_data=validation_dataset, validation_steps=5,
                        callbacks=callbacks)
    # Previously the history was computed and dropped; return it so the caller
    # can inspect or plot the per-epoch metrics.
    return history

In [12]:
rm -rf ./tf_logs/rnn_1

In [15]:
K.clear_session()
model = generate_model()
model = compile_model(model)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          523840    
_________________________________________________________________
rnn (RNN)                    (None, 8)                 576       
_________________________________________________________________
dense (Dense)                (None, 16)                144       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 524,577
Trainable params: 524,577
Non-trainable params: 0
_________________________________________________________________
None


In [16]:
%time train_model(model, train_dataset, validation_dataset)

Epoch 1/20
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
CPU times: user 4h 44min 43s, sys: 31min 10s, total: 5h 15min 54s
Wall time: 1h 55min 16s


In [17]:
rnn_test_loss, rnn_test_acc = model.evaluate(test_dataset)

    391/Unknown - 128s 328ms/step - loss: 0.7559 - accuracy: 0.5006

In [21]:
model.save_weights('rnn_classification', save_format='tf')

In [22]:
new_model = generate_model()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 64)          523840    
_________________________________________________________________
rnn_1 (RNN)                  (None, 8)                 576       
_________________________________________________________________
dense_2 (Dense)              (None, 16)                144       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 524,577
Trainable params: 524,577
Non-trainable params: 0
_________________________________________________________________
None


In [27]:
new_model = compile_model(new_model)

In [30]:
# Take a single batch from the training set.
p = next(iter(train_dataset))

# This initializes the variables used by the optimizers, as well as any stateful metric variables
# The optimizer state is preserved as well, so we can resume training where we left off
new_model.train_on_batch(p[0], p[1])

[0.6932723, 0.40625]

In [31]:
# Load the state of the old model
new_model.load_weights('rnn_classification')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f935de62e80>

In [34]:
# Check that the model state has been preserved
old_predictions = model.predict(test_dataset)
new_predictions = new_model.predict(test_dataset)

In [38]:
np.testing.assert_allclose(old_predictions, new_predictions, rtol=1e-6, atol=1e-6)

## Using hyperparam tuning and regularization

Add dropout, a tunable dense layer size, 2 custom recurrent layers with teacher forcing, early stopping (with a patience of 4 epochs) and regularization.

In [12]:
from tensorboard.plugins.hparams import api as hp

# Clear logs from previous runs 
# rm -rf ./tf_logs/rnn_classification

In [13]:
class Reg:
    """Weight-penalty helpers used to regularize the loss."""

    @staticmethod
    def l1_reg(weight_matrix, rate=0.01):
        """Return `rate * sum(|w|)` for the given weights.

        Args:
            weight_matrix: Tensor/variable holding the weights.
            rate: Penalty strength; defaults to the previously hard-coded 0.01.
        """
        return rate * K.sum(K.abs(weight_matrix))

    @staticmethod
    def l2_reg(weight_matrix, rate=0.01):
        """Return `rate * 0.5 * sum(w^2)` for the given weights.

        Args:
            weight_matrix: Tensor/variable holding the weights.
            rate: Penalty strength; defaults to the previously hard-coded 0.01.
        """
        return rate * 0.5 * K.sum(K.square(weight_matrix))


In [14]:
import pdb

gradient_mean = {}

class GradHistory(tf.keras.callbacks.Callback):
    """Record the mean gradient of every trainable weight at the end of each epoch.

    Values are appended to the module-level `gradient_mean` dict, keyed by the
    position of the weight in `model.trainable_weights`.
    """

    def on_epoch_end(self, epoch, logs=None):
        """Compute gradients once and append their means to `gradient_mean`.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Metrics dict supplied by Keras (unused); defaults to None
                rather than a shared mutable `{}`.
        """
        weights = self.model.trainable_weights
        with tf.GradientTape() as tape:
            # NOTE(review): the model is called on its own weight list rather
            # than on a batch of inputs, so this is not the training loss.
            # Kept as in the original because the callback has no access to
            # data -- confirm what was intended.
            loss = self.model(weights)
        grads = tape.gradient(loss, weights)

        for i, grad in enumerate(grads):
            # tape.gradient yields None for weights not connected to `loss`;
            # skip them instead of crashing in reduce_mean.
            if grad is None:
                continue
            gradient_mean.setdefault(i, []).append(tf.reduce_mean(grad))

gradient_cb = GradHistory()

In [33]:
class MultiLayerRNN:
    """Two-cell custom-RNN classifier with a TensorBoard HParams random search.

    Relies on names defined elsewhere in the notebook: `hp`, `tf`, `K`, `time`,
    `random`, `encoder`, `CustomRNNCell`, `LossFunction`, `Reg`, `gradient_cb`,
    and (in `test_params` only) the global `train_dataset` / `validation_dataset`.
    """

    def __init__(self, teacher_forcing=False):
        """Declare the search space and write the HParams config to a new log dir.

        Args:
            teacher_forcing: Stored on the instance; not used by any method here.
        """
        # number of units in 1st and 2nd recurrent layer, and the next dense layer
        self.num_units_l1 = hp.HParam('num_units_l1', hp.Discrete([8, 16, 32, 64]))
        self.num_units_l2 = hp.HParam('num_units_l2', hp.Discrete([8, 16, 32, 64]))
        self.num_units_l3 = hp.HParam('num_units_l3', hp.Discrete([10, 25, 40]))
        self.dropout = hp.HParam('dropout', hp.Discrete([0.3, 0.4]))
        self.optimizer = hp.HParam('optimizer', hp.Discrete(['adam', 'sgd']))

        # Only the keys are ever iterated or looked up; each HParam maps to itself.
        self.hparams = {self.optimizer: self.optimizer, self.num_units_l1: self.num_units_l1,
                        self.num_units_l2: self.num_units_l2, self.num_units_l3: self.num_units_l3,
                        self.dropout: self.dropout}

        self.teacher_forcing = teacher_forcing

        self.model = None

        METRIC_ACCURACY = 'accuracy'

        self.init_timestamp = int(time())

        print("MODEL INIT TIMESTAMP: ", str(self.init_timestamp))

        # Every instance logs under its own timestamped directory (trailing slash included).
        self.log_dir = "./tf_logs/rnn_classification_" + str(self.init_timestamp) + "/"
        with tf.summary.create_file_writer(self.log_dir).as_default():
            hp.hparams_config(hparams=[self.optimizer, self.num_units_l1, self.num_units_l2,
                                       self.num_units_l3, self.dropout],
                              metrics=[hp.Metric(METRIC_ACCURACY, display_name='Accuracy')],)

    def loss_function(self, y_true, y_pred):
        """Binary cross-entropy plus an L2 penalty over all trainable weights of `self.model`."""
        r = 0.0
        for w in self.model.trainable_weights:
            r += Reg.l2_reg(w)
        return LossFunction.binary_crossentropy(y_true, y_pred) + r

    def generate_model(self, params):
        """Build a fresh (untrained) model for `params` and store it on `self.model`.

        Args:
            params: Mapping from this instance's HParam objects to chosen values.

        Returns:
            The newly built `tf.keras.Sequential` model.
        """
        self.model = tf.keras.Sequential()
        self.model.add(tf.keras.layers.Embedding(input_dim=encoder.vocab_size, output_dim=64))
        self.model.add(tf.keras.layers.RNN([CustomRNNCell(params[self.num_units_l1]),
                                            CustomRNNCell(params[self.num_units_l2])]))
        self.model.add(tf.keras.layers.Dropout(params[self.dropout]))
        self.model.add(tf.keras.layers.Dense(params[self.num_units_l3], activation='relu'))
        self.model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

        # summary() prints by itself and returns None; do not wrap it in print().
        self.model.summary()
        return self.model

    def get_model(self):
        """Return the current model (None until one has been generated)."""
        return self.model

    def save_model(self):
        """Save the current model's weights under `<log_dir>saved_models/model_<timestamp>`.

        Raises:
            RuntimeError: If no model has been generated yet.
        """
        if self.model is None:
            raise RuntimeError("No model to save; call generate_model() or setup_model() first.")
        # Integer timestamp, stringified so it can be concatenated into the path.
        cp = str(int(time()))
        self.model.save_weights(self.log_dir + 'saved_models/model_' + cp, save_format='tf')

    def compile_model(self, loss_function, optimizer):
        """Compile `self.model` with the given loss and optimizer, tracking accuracy."""
        self.model.compile(optimizer=optimizer, loss=loss_function, metrics=['accuracy'])
        return self.model

    def train_model(self, hparams, train_data, cross_validation_data, run_index, epochs=10):
        """Build, compile and fit a model for `hparams`; return validation accuracy.

        Args:
            hparams: Mapping from HParam objects to chosen values.
            train_data: Batched training dataset.
            cross_validation_data: Batched validation dataset.
            run_index: String suffix appended to `log_dir` for TensorBoard logs.
            epochs: Maximum number of epochs (early stopping may end sooner).

        Returns:
            Accuracy of the fitted model on `cross_validation_data`.
        """
        self.generate_model(hparams)
        self.compile_model(self.loss_function, hparams[self.optimizer])

        callbacks = [
            # Early stopping
            tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=4),
            # logging train and validation accuracy each epoch
            tf.keras.callbacks.TensorBoard(log_dir=self.log_dir + run_index),
            # gradient sum callback
            gradient_cb,
        ]

        self.model.fit(train_data, epochs=epochs, validation_data=cross_validation_data, callbacks=callbacks,
                       verbose=1)
        _, accuracy = self.model.evaluate(cross_validation_data)

        return accuracy

    def run(self, run_dir, hparams, train_data, cross_validation_data):
        """Run one trial and log its hyperparameters and accuracy under `run_dir`.

        Args:
            run_dir: Summary directory; must end in `-<integer>` (e.g. `.../run-3`).
            hparams: Mapping from HParam objects to chosen values.
            train_data: Batched training dataset.
            cross_validation_data: Batched validation dataset.

        Returns:
            Validation accuracy of the trial.
        """
        K.clear_session()
        # Take the text after the LAST hyphen so hyphens earlier in the path are harmless.
        run_index = run_dir.rsplit("-", 1)[-1]
        with tf.summary.create_file_writer(run_dir).as_default():
            # record the values used in this trial
            hp.hparams(hparams)
            # Use the datasets passed in, not the notebook globals.
            acc = self.train_model(hparams, train_data, cross_validation_data, run_index)
            tf.summary.scalar('accuracy', acc, step=int(run_index))
        return acc

    def random_search(self, train, cross_val, seed, total_points_explored=1):
        """Sample hyperparameters uniformly at random and train one model per sample.

        Args:
            train: Batched training dataset.
            cross_val: Batched validation dataset.
            seed: Seed for the sampling RNG.
            total_points_explored: Number of trials to run.

        Returns:
            `(total_points_explored, acc_params)` where `acc_params` is a list
            of `(accuracy, hparams)` tuples in trial order.
        """
        rng = random.Random(seed)

        acc_params = []

        for session_index in range(total_points_explored):
            hparams = {h: h.domain.sample_uniform(rng) for h in self.hparams}
            run_name = "run-%d" % session_index
            print('--- Starting trial: %s' % run_name)
            print({h.name: hparams[h] for h in hparams})
            acc = self.run(self.log_dir + "tune/" + run_name, hparams, train, cross_val)
            acc_params.append((acc, hparams))

        return total_points_explored, acc_params

    def setup_model(self, params):
        """Rebuild and compile a model for `params`.

        The resulting model has freshly initialized weights; nothing is trained
        or restored here.
        """
        K.clear_session()
        self.generate_model(params)
        self.compile_model(self.loss_function, params[self.optimizer])

    def test_params(self, hpa):
        """Train one model with explicitly chosen hyperparameters.

        Args:
            hpa: Dict keyed by hyperparameter NAME (e.g. 'dropout') with the
                value to use for each entry of `self.hparams`.
        """
        params = {h: hpa[h.name] for h in self.hparams}
        # run() clears the session and builds/compiles the model itself, so no
        # separate build is done here (it would only be thrown away).
        self.run(self.log_dir + "tune/testhpgrad-1", params, train_dataset, validation_dataset)

    def eval_test(self, test):
        """Evaluate the current model on `test` and print its accuracy."""
        _, acc = self.model.evaluate(test)
        print("Accuracy on test set: ", acc)
            

In [15]:
m = MultiLayerRNN()

MODEL INIT TIMESTAMP:  1577701799


In [16]:
start_time = time()
points_explored, acc_params = m.random_search(train_dataset, validation_dataset, 42)
randomized_search_time = time() - start_time

--- Starting trial: run-0
{'optimizer': 'adam', 'num_units_l1': 8, 'num_units_l2': 32, 'num_units_l3': 10, 'dropout': 0.3}
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          523840    
_________________________________________________________________
rnn (RNN)                    (None, 32)                1856      
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
dense (Dense)                (None, 10)                330       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 526,037
Trainable params: 526,037
Non-trainable params: 0
___________________________________________________________



Epoch 1/10


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "






Epoch 2/10
Epoch 3/10
Epoch 4/10
--- Starting trial: run-1
{'optimizer': 'adam', 'num_units_l1': 8, 'num_units_l2': 8, 'num_units_l3': 40, 'dropout': 0.4}
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          523840    
_________________________________________________________________
rnn (RNN)                    (None, 8)                 704       
_________________________________________________________________
dropout (Dropout)            (None, 8)                 0         
_________________________________________________________________
dense (Dense)                (None, 40)                360       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 41        
Total params: 524,945
Trainable params: 524,945
Non-trainable params: 0
___________________________



Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
--- Starting trial: run-2
{'optimizer': 'adam', 'num_units_l1': 8, 'num_units_l2': 8, 'num_units_l3': 10, 'dropout': 0.3}
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          523840    
_________________________________________________________________
rnn (RNN)                    (None, 8)                 704       
_________________________________________________________________
dropout (Dropout)            (None, 8)                 0         
_________________________________________________________________
dense (Dense)                (None, 10)                90        
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 524,645
Trainable 



Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
--- Starting trial: run-3
{'optimizer': 'adam', 'num_units_l1': 16, 'num_units_l2': 64, 'num_units_l3': 10, 'dropout': 0.4}
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          523840    
_________________________________________________________________
rnn (RNN)                    (None, 64)                6400      
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 10)                650       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 530,901
Trainabl



Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
--- Starting trial: run-4
{'optimizer': 'sgd', 'num_units_l1': 8, 'num_units_l2': 16, 'num_units_l3': 40, 'dropout': 0.4}
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          523840    
_________________________________________________________________
rnn (RNN)                    (None, 16)                960       
_________________________________________________________________
dropout (Dropout)            (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 40)                680       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 41        
Total params: 525,521
Trainable 



Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
--- Starting trial: run-5
{'optimizer': 'sgd', 'num_units_l1': 32, 'num_units_l2': 16, 'num_units_l3': 10, 'dropout': 0.4}
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          523840    
_________________________________________________________________
rnn (RNN)                    (None, 16)                3840      
_________________________________________________________________
dropout (Dropout)            (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 10)                170       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 527,861
Trainable params: 527,861
Non-trainable params: 0
__________________________



Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
--- Starting trial: run-6
{'optimizer': 'adam', 'num_units_l1': 8, 'num_units_l2': 64, 'num_units_l3': 10, 'dropout': 0.4}
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          523840    
_________________________________________________________________
rnn (RNN)                    (None, 64)                5184      
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 10)                650       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 529,685
Trainable params: 529,685
Non-trainable params: 0
__________________________



Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
--- Starting trial: run-7
{'optimizer': 'sgd', 'num_units_l1': 32, 'num_units_l2': 8, 'num_units_l3': 40, 'dropout': 0.4}
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          523840    
_________________________________________________________________
rnn (RNN)                    (None, 8)                 3392      
_________________________________________________________________
dropout (Dropout)            (None, 8)                 0         
_________________________________________________________________
dense (Dense)                (None, 40)                360       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 41        
Total params: 527,633
Trainable 



Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
--- Starting trial: run-8
{'optimizer': 'adam', 'num_units_l1': 64, 'num_units_l2': 8, 'num_units_l3': 40, 'dropout': 0.4}
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          523840    
_________________________________________________________________
rnn (RNN)                    (None, 8)                 8768      
_________________________________________________________________
dropout (Dropout)            (None, 8)                 0         
_________________________________________________________________
dense (Dense)                (None, 40)                360       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 41        
Total params: 533,009
Trainable params: 533,009
Non-trainable params: 0
__________________________



Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10


In [17]:
print("Points explored: ", points_explored)
print("Accuracy for hparameters: ", acc_params)

Points explored:  9
Accuracy for hparameters:  [(0.4982, {HParam(name='optimizer', domain=Discrete(['adam', 'sgd']), display_name=None, description=None): 'adam', HParam(name='num_units_l1', domain=Discrete([8, 16, 32, 64]), display_name=None, description=None): 8, HParam(name='num_units_l2', domain=Discrete([8, 16, 32, 64]), display_name=None, description=None): 32, HParam(name='num_units_l3', domain=Discrete([10, 25, 40]), display_name=None, description=None): 10, HParam(name='dropout', domain=Discrete([0.3, 0.4]), display_name=None, description=None): 0.3}), (0.4982, {HParam(name='optimizer', domain=Discrete(['adam', 'sgd']), display_name=None, description=None): 'adam', HParam(name='num_units_l1', domain=Discrete([8, 16, 32, 64]), display_name=None, description=None): 8, HParam(name='num_units_l2', domain=Discrete([8, 16, 32, 64]), display_name=None, description=None): 8, HParam(name='num_units_l3', domain=Discrete([10, 25, 40]), display_name=None, description=None): 40, HParam(nam

In [18]:
# Hyperparameters of the trial with the highest validation accuracy
# (first such trial on ties). max() avoids sorting the whole list.
opt_params = max(acc_params, key=lambda x: x[0])[1]

In [19]:
# NOTE(review): setup_model() builds and compiles a brand-new model from
# opt_params without calling fit(), so the evaluation below runs on freshly
# initialized weights, not on the weights trained during random_search().
# Retrain (or restore saved weights) first if a trained-model score is intended.
m.setup_model(opt_params)
m.eval_test(test_dataset)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          523840    
_________________________________________________________________
rnn (RNN)                    (None, 32)                1856      
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
dense (Dense)                (None, 10)                330       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 526,037
Trainable params: 526,037
Non-trainable params: 0
_________________________________________________________________
None
Accuracy on test set:  0.49988


In [20]:
print(randomized_search_time)

17247.224401712418


In [21]:
tensorboard --logdir ./tf_logs/rnn_classification_1577701799

![alt text](images/rnn_el.png "Loss plot")

![alt text](images/rnn_loss_nan.png "loss nan")

![alt text](images/rnn_hparams.png "hparams")

### Plotting the gradient 

In [25]:
hpa = {'optimizer': 'sgd', 'num_units_l1': 8, 'num_units_l2': 16, 'num_units_l3': 40, 'dropout': 0.4}

In [30]:
m1 = MultiLayerRNN()

MODEL INIT TIMESTAMP:  1577879738


In [47]:
m1.test_params(hpa)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          523840    
_________________________________________________________________
rnn (RNN)                    (None, 16)                960       
_________________________________________________________________
dropout (Dropout)            (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 40)                680       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 41        
Total params: 525,521
Trainable params: 525,521
Non-trainable params: 0
_________________________________________________________________
None
Model: "sequential"
_________________________________________________________________
Layer (type)             



Epoch 1/3




    235/Unknown - 267s 1s/step - loss: 0.6932 - accuracy: 0.5012> <ipython-input-44-468192c76b57>(12)on_epoch_end()
-> for i in range(0, len(x)):
(Pdb) continue
Epoch 2/3
-> for i in range(0, len(x)):
(Pdb) tf.reduce_mean(self.model.trainable_weights[0])
<tf.Tensor: id=22043, shape=(), dtype=float32, numpy=nan>
(Pdb) continue
Epoch 3/3
-> for i in range(0, len(x)):
(Pdb) continue


### Clipping the value of gradient

In [16]:
sgd_opt = tf.optimizers.SGD(clipvalue=5.0)
hpa = {'optimizer': sgd_opt, 'num_units_l1': 8, 'num_units_l2': 16, 'num_units_l3': 40, 'dropout': 0.4}

In [34]:
m2 = MultiLayerRNN()

MODEL INIT TIMESTAMP:  1577882151


In [35]:
m2.test_params(hpa)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          523840    
_________________________________________________________________
rnn (RNN)                    (None, 16)                960       
_________________________________________________________________
dropout (Dropout)            (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 40)                680       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 41        
Total params: 525,521
Trainable params: 525,521
Non-trainable params: 0
_________________________________________________________________
None
Model: "sequential"
_________________________________________________________________
Layer (type)             



Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10


![alt text](rnn_clipgradient.png "Gradient clipped")

The dark blue lines are the training accuracy and loss, and the light blue is the validation accuracy and loss.

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(3, 3)
fig.suptitle('Gradients with epochs')

# One panel per recorded weight index (at most 9 fit on the 3x3 grid).
# `gradient_mean` is the dict filled by the GradHistory callback; pairing the
# sorted indices with ax.flat gives every panel a distinct series.
for panel, weight_index in zip(ax.flat, sorted(gradient_mean)):
    # Convert scalar tensors to plain floats for matplotlib.
    means = [float(g) for g in gradient_mean[weight_index]]
    panel.plot(range(1, len(means) + 1), means)
    panel.set_title('weight %d' % weight_index)
plt.show()