In [None]:
!pip install --upgrade grpcio
!pip install tensorflow==2.0
!pip install tensorflow-gpu

In [1]:
import tensorflow as tf
print(tf.__version__)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


2.0.0-beta1


In [2]:
# Enumerate every physical device TensorFlow reports on this host and show one per line.
physical_devices = tf.config.experimental.list_physical_devices()
print('\n'.join(str(device) for device in physical_devices))

PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')
PhysicalDevice(name='/physical_device:XLA_GPU:0', device_type='XLA_GPU')
PhysicalDevice(name='/physical_device:XLA_GPU:1', device_type='XLA_GPU')
PhysicalDevice(name='/physical_device:XLA_CPU:0', device_type='XLA_CPU')


In [3]:
# GPU0 is busy, use GPU1.
# Select the device by name rather than by its position in `physical_devices`,
# so the cell does not depend on the order in which devices are enumerated.
TARGET_DEVICE_NAME = '/physical_device:XLA_GPU:1'
target_devices = [device for device in physical_devices if device.name == TARGET_DEVICE_NAME]

if not target_devices:
    print("Device not found: ", TARGET_DEVICE_NAME)
else:
    try:
        tf.config.experimental.set_visible_devices(target_devices[0], 'XLA_GPU')
        # Show the outcome; a bare call inside the `try` block would discard it.
        print(*tf.config.experimental.get_visible_devices(), sep='\n')
    except RuntimeError as e:
        # Visible devices can no longer be changed once the runtime has been initialized.
        print(e)

In [4]:
# Create a tensor without an explicit device scope and report where it was placed.
test_var = tf.constant([1.0,2.0,3.0], name='test_var')
print(test_var)
print("Variable placed on device: ", test_var.device)

# Create two tensors inside an explicit device scope and report their placement.
with tf.device('/device:XLA_GPU:1'):
    a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name='a')
    b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b')
    print(a.device)
    print(b.device)

# NOTE(review): in the recorded run this printed False even though the tensors above
# report XLA_GPU:1 as their device. Presumably only XLA_GPU devices (and no plain GPU
# device) are registered -- TODO confirm the CUDA/driver setup.
print("Is GPU available: ", tf.test.is_gpu_available())

tf.Tensor([1. 2. 3.], shape=(3,), dtype=float32)
Variable placed on device:  /job:localhost/replica:0/task:0/device:CPU:0
/job:localhost/replica:0/task:0/device:XLA_GPU:1
/job:localhost/replica:0/task:0/device:XLA_GPU:1
Is GPU available:  False


In [5]:
# List the logical devices created by the runtime. A logical device may correspond to a
# physical device or to a remote device in the cluster; operations and tensors can be
# placed on one by using the name of the LogicalDevice.
logical_devices = tf.config.experimental.list_logical_devices()
print("\n".join(str(device) for device in logical_devices))

LogicalDevice(name='/job:localhost/replica:0/task:0/device:CPU:0', device_type='CPU')
LogicalDevice(name='/job:localhost/replica:0/task:0/device:XLA_GPU:0', device_type='XLA_GPU')
LogicalDevice(name='/job:localhost/replica:0/task:0/device:XLA_GPU:1', device_type='XLA_GPU')
LogicalDevice(name='/job:localhost/replica:0/task:0/device:XLA_CPU:0', device_type='XLA_CPU')


In [6]:
# Show the names exposed by tf.config.optimizer, then the value currently returned by get_jit().
print(dir(tf.config.optimizer))
print(tf.config.optimizer.get_jit())

['__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', 'get_experimental_options', 'get_jit', 'set_experimental_options', 'set_jit']
False


In [2]:
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
import numpy as np
from tensorflow.keras import backend as K
from time import time
import random

In [3]:
from tensorboard import version;
print(version.VERSION)
%load_ext tensorboard

2.1.0


In [4]:
# from google.colab import drive
# drive.mount('/content/gdrive')

data_dir_path = './datasets/'

In [5]:
# Divide the official training split into two sub-splits weighted 7:3 (train : validation).
train_validation_split = tfds.Split.TRAIN.subsplit([7, 3])

# Load the 'imdb_reviews/subwords8k' dataset from `data_dir_path` without downloading,
# as (text, label) pairs, together with the dataset metadata.
(train_and_validation, test_dataset), info = tfds.load(
    'imdb_reviews/subwords8k',
    split=(train_validation_split, tfds.Split.TEST),
    data_dir=data_dir_path,
    download=False,
    as_supervised=True,
    with_info=True,
)
train_dataset, validation_dataset = train_and_validation

In [6]:
# The text feature carries the subword encoder; later cells read its vocabulary size.
encoder = info.features["text"].encoder

print("info features: ", info.features)
print("\n Vocabulary size: ", encoder.vocab_size)

info features:  FeaturesDict({
    'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
    'text': Text(shape=(None,), dtype=tf.int64, encoder=<SubwordTextEncoder vocab_size=8185>),
})

 Vocabulary size:  8185


In [7]:
BUFFER_SIZE = 10000
BATCH_SIZE = 64


def _pad_and_batch(dataset):
    """Group `dataset` into batches of BATCH_SIZE, padding to the dataset's own output shapes.

    `get_output_shapes` returns the shape of each component of an element of the dataset.
    """
    return dataset.padded_batch(BATCH_SIZE, tf.compat.v1.data.get_output_shapes(dataset))


# NOTE: this cell rebinds the dataset names, so running it twice batches already-batched data.

# shuffle() fills a buffer with BUFFER_SIZE elements and samples randomly from it, replacing
# each selected element with a new one. Perfect shuffling needs a buffer at least as large
# as the full dataset.
train_dataset = _pad_and_batch(train_dataset.shuffle(BUFFER_SIZE))

validation_dataset = _pad_and_batch(validation_dataset)

test_dataset = _pad_and_batch(test_dataset)

The base class [RNN](https://www.tensorflow.org/api_docs/python/tf/keras/layers/RNN?version=stable) for recurrent layers inherits from the class `keras.layers.Layer`. Each RNN cell instance must have the following:

*   state_size attribute
*   output_size attribute
*   call(input_at_t, state_at_t) method, which returns output_at_t and state_at_t_plus_1.
*   get_initial_state(inputs=None, batch_size=None, dtype=None) method that creates a tensor meant to be fed to call() as the initial state, if the user didn't specify any initial state via other means.



In [8]:
class CustomRNNCell(tf.keras.layers.Layer):
    """Minimal recurrent cell meant to be wrapped by `tf.keras.layers.RNN`.

    At every step the cell computes
    ``output = inputs . kernel + previous_output . recurrent_kernel``
    (no bias and no activation) and uses that output as the next state.

    Args:
        units: Size of the cell state and of the per-step output.
        **kwargs: Standard `tf.keras.layers.Layer` keyword arguments, plus the
            optional boolean `teacher_forcing` (default False). The flag is only
            stored; `call` does not use it yet.
    """

    def __init__(self, units, **kwargs):
        # Pop the cell-specific option before delegating: Layer.__init__ rejects
        # keyword arguments it does not know about.
        teacher_forcing = kwargs.pop('teacher_forcing', False)
        super(CustomRNNCell, self).__init__(**kwargs)

        self.units = units
        # The RNN wrapper reads these to size the initial state and the output.
        self.state_size = units
        self.output_size = units
        self.teacher_forcing = teacher_forcing

    def build(self, input_shape):
        """Create the cell weights; the last axis of `input_shape` is the input feature size."""
        # add_weight comes from the base layer class and registers a new variable on the layer.

        # Weight matrix for the linear transformation of the inputs.
        self.kernel = self.add_weight(shape=(input_shape[-1], self.units), initializer='uniform', name='kernel')

        # Weight matrix for the linear transformation of the recurrent state.
        self.recurrent_kernel = self.add_weight(shape=(self.units, self.units), initializer='uniform', name='recurrent_kernel')
        self.built = True

    def call(self, inputs, states):
        """Run one time step.

        Args:
            inputs: Input at the current step.
            states: Sequence whose first element is the previous step's output.

        Returns:
            Tuple `(output, [output])`: the step output and the next state list.
        """
        # TODO: teacher forcing -- figure out how to incorporate decay and send the
        # true label to the next hidden state when self.teacher_forcing is set.
        prev_output = states[0]
        h = K.dot(inputs, self.kernel)
        output = h + K.dot(prev_output, self.recurrent_kernel)
        # For a simple RNN, output_at_t and hidden_state_at_t_plus_1 are the same.
        return output, [output]

    def get_config(self):
        """Return the constructor arguments so the cell can be re-created from its config."""
        config = super(CustomRNNCell, self).get_config()
        config.update({'units': self.units, 'teacher_forcing': self.teacher_forcing})
        return config

In [9]:
def generate_model(use_dropout=False):
    """Build the baseline classifier: Embedding -> RNN(CustomRNNCell(8)) -> Dense(16) -> Dense(1).

    Uses the notebook-level `encoder` for the embedding's vocabulary size.

    Args:
        use_dropout: When True, insert a Dropout(0.5) layer after the recurrent layer.

    Returns:
        The uncompiled `tf.keras.Sequential` model.
    """
    model = tf.keras.Sequential()

    model.add(tf.keras.layers.Embedding(input_dim=encoder.vocab_size, output_dim=64))
    model.add(tf.keras.layers.RNN([CustomRNNCell(8)]))
    if use_dropout:
        model.add(tf.keras.layers.Dropout(0.5))
    model.add(tf.keras.layers.Dense(16, activation='relu'))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

    # summary() prints the table itself and returns None, so wrapping it in
    # print() only appends a stray "None" to the output.
    model.summary()

    return model

In [9]:
class LossFunction:
    """Hand-written loss functions usable as the `loss` argument of `model.compile`."""

    @staticmethod
    def binary_crossentropy(y_true, y_pred, from_logits=False):
        """Mean binary cross-entropy between labels and predictions.

        Args:
            y_true: Labels; cast to the dtype of `y_pred`.
            y_pred: Probabilities when `from_logits` is False, raw logits otherwise.
            from_logits: Whether `y_pred` holds logits instead of probabilities.

        Returns:
            Scalar tensor with the loss averaged over all elements.
        """
        y_true = tf.cast(y_true, y_pred.dtype)

        if from_logits:
            # With x = logits and z = labels the loss is -x * z + log(1 + exp(x)).
            # It is evaluated as max(x, 0) - x * z + log(1 + exp(-|x|)), which is
            # the same value but does not overflow for large |x|.
            per_element = tf.math.add(
                tf.math.subtract(tf.math.maximum(y_pred, 0.), tf.math.multiply(y_pred, y_true)),
                tf.math.log1p(tf.math.exp(tf.math.negative(tf.math.abs(y_pred)))))
            return tf.reduce_mean(per_element)

        # Probabilities: clip away from 0 and 1 so that log() stays finite.
        # (The previous check on `y_pred.op.type` was removed: its result was
        # discarded, and eager tensors do not expose `.op`.)
        epsilon = tf.keras.backend.epsilon()  # 1e-7 by default
        clipped_y_pred = tf.clip_by_value(y_pred, clip_value_min=epsilon, clip_value_max=(1. - epsilon))
        positive_term = tf.math.multiply(y_true, tf.math.log(tf.math.add(clipped_y_pred, epsilon)))
        negative_term = tf.math.multiply(
            tf.math.subtract(1., y_true),
            tf.math.log(tf.math.add(epsilon, tf.math.subtract(1., clipped_y_pred))))
        return tf.math.negative(tf.reduce_mean(tf.math.add(positive_term, negative_term)))

In [11]:
def compile_model(model):
    """Configure `model` for training and hand it back.

    The model is compiled in place with the Adam optimizer, the custom
    `LossFunction.binary_crossentropy` loss and the accuracy metric.
    """
    model.compile(
        optimizer='adam',
        loss=LossFunction.binary_crossentropy,
        metrics=['accuracy'],
    )
    return model

In [12]:
# Remove TensorBoard logs left over from a previous run. Pure Python, so the cell
# does not rely on IPython automagic or on a POSIX shell being available.
import shutil

shutil.rmtree('./tf_logs/rnn_1', ignore_errors=True)

In [13]:
def train_model(model, train_dataset, validation_dataset, epochs=20, callbacks=None, validation_steps=5):
    """Fit `model` and return the training history.

    Args:
        model: Compiled Keras model.
        train_dataset: Batched training data.
        validation_dataset: Batched validation data.
        epochs: Number of passes over the training data.
        callbacks: Optional iterable of Keras callbacks, e.g.
            `tf.keras.callbacks.TensorBoard(log_dir='./tf_logs/rnn_1',
            histogram_freq=10, write_graph=True)`. Defaults to no callbacks.
        validation_steps: Number of validation batches evaluated after each epoch.

    Returns:
        The object returned by `model.fit` (previously computed but dropped).
    """
    active_callbacks = list(callbacks) if callbacks is not None else []

    history = model.fit(
        train_dataset,
        epochs=epochs,
        validation_data=validation_dataset,
        validation_steps=validation_steps,
        callbacks=active_callbacks,
    )
    return history

In [14]:
K.clear_session()

In [15]:
model = generate_model()
model = compile_model(model)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          523840    
_________________________________________________________________
rnn (RNN)                    (None, 8)                 576       
_________________________________________________________________
dense (Dense)                (None, 16)                144       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 524,577
Trainable params: 524,577
Non-trainable params: 0
_________________________________________________________________
None


In [16]:
%time train_model(model, train_dataset, validation_dataset)

Epoch 1/20
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
CPU times: user 4h 44min 43s, sys: 31min 10s, total: 5h 15min 54s
Wall time: 1h 55min 16s


In [17]:
rnn_test_loss, rnn_test_acc = model.evaluate(test_dataset)

    391/Unknown - 128s 328ms/step - loss: 0.7559 - accuracy: 0.5006

In [21]:
model.save_weights('rnn_classification', save_format='tf')

In [22]:
new_model = generate_model()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 64)          523840    
_________________________________________________________________
rnn_1 (RNN)                  (None, 8)                 576       
_________________________________________________________________
dense_2 (Dense)              (None, 16)                144       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 524,577
Trainable params: 524,577
Non-trainable params: 0
_________________________________________________________________
None


In [27]:
new_model = compile_model(new_model)

In [30]:
# Take the first batch from the training pipeline.
p = next(iter(train_dataset))

# Running one training step initializes the variables used by the optimizer, as well as
# any stateful metric variables. The optimizer state is preserved as well, so training
# can resume where it left off.
new_model.train_on_batch(p[0], p[1])

[0.6932723, 0.40625]

In [31]:
# Load the state of the old model
new_model.load_weights('rnn_classification')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f935de62e80>

In [34]:
# Check that the model state has been preserved
old_predictions = model.predict(test_dataset)

In [35]:
new_predictions = new_model.predict(test_dataset)

In [38]:
np.testing.assert_allclose(old_predictions, new_predictions, rtol=1e-6, atol=1e-6)

## Using hyperparam tuning and regularization

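Tune the dropout rate, the dense layer size and the sizes of 2 stacked custom recurrent cells with random search, using early stopping (patience of 4 epochs) and regularization. Teacher forcing is planned but not yet implemented in `CustomRNNCell`.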

In [12]:
from tensorboard.plugins.hparams import api as hp

# Clear logs from previous runs 
# rm -rf ./tf_logs/rnn_classification

In [13]:
class Reg:
    """Weight penalties built on the Keras backend."""

    @staticmethod
    def l1_reg(weight_matrix, factor=0.01):
        """Return `factor` times the sum of absolute values of `weight_matrix`."""
        return factor * K.sum(K.abs(weight_matrix))

    @staticmethod
    def l2_reg(weight_matrix, factor=0.01):
        """Return `factor` times the sum of squared values of `weight_matrix`."""
        # The previous version referenced an undefined name `x` and raised NameError.
        return factor * K.sum(K.square(weight_matrix))


In [14]:
class MultiLayerRNN:
    """Random-search hyperparameter tuning for a stacked-RNN classifier.

    The model is Embedding -> RNN(two stacked CustomRNNCell) -> Dropout ->
    Dense(relu) -> Dense(1, sigmoid). Every trial is logged for the TensorBoard
    HParams plugin under `self.log_dir`.

    Relies on names defined elsewhere in the notebook: `encoder`,
    `CustomRNNCell`, `LossFunction` and `Reg`.
    """

    METRIC_ACCURACY = 'accuracy'
    MAX_EPOCHS = 15
    EARLY_STOPPING_PATIENCE = 4

    def __init__(self, teacher_forcing=False):
        """Declare the search space and write the HParams experiment configuration."""
        # Number of units in the 1st and 2nd recurrent cell, and in the following dense layer.
        self.num_units_l1 = hp.HParam('num_units_l1', hp.Discrete([16, 32]))
        self.num_units_l2 = hp.HParam('num_units_l2', hp.Discrete([16, 32]))
        self.num_units_l3 = hp.HParam('num_units_l3', hp.Discrete([20, 30]))
        self.dropout = hp.HParam('dropout', hp.Discrete([0.3, 0.4]))
        self.optimizer = hp.HParam('optimizer', hp.Discrete(['adam', 'sgd']))

        # The order is significant: random_search samples in this order, so it
        # determines which values a given seed produces.
        search_space = [self.optimizer, self.num_units_l1, self.num_units_l2, self.num_units_l3, self.dropout]
        # Kept as a dict keyed by HParam (each mapped to itself) for backward
        # compatibility; only its keys are iterated.
        self.hparams = {h: h for h in search_space}

        # Stored only; the flag is not used when building the model.
        self.teacher_forcing = teacher_forcing

        # Most recently generated model; None until generate_model() has run.
        self.model = None

        self.log_dir = "./tf_logs/rnn_classification/"
        with tf.summary.create_file_writer(self.log_dir).as_default():
            hp.hparams_config(
                hparams=search_space,
                metrics=[hp.Metric(self.METRIC_ACCURACY, display_name='Accuracy')],
            )

    def loss_function(self, y_true, y_pred):
        """Half of (binary cross-entropy + L1 penalty over all trainable weights of the model)."""
        penalty = 0.0
        for weight in self.model.trainable_weights:
            penalty += Reg.l1_reg(weight)
        total = LossFunction.binary_crossentropy(y_true, y_pred) + penalty
        return total / 2.0

    def generate_model(self, params):
        """Build a fresh model from `params` (a mapping HParam -> sampled value) and store it."""
        self.model = tf.keras.Sequential()
        self.model.add(tf.keras.layers.Embedding(input_dim=encoder.vocab_size, output_dim=64))
        self.model.add(tf.keras.layers.RNN([
            CustomRNNCell(params[self.num_units_l1]),
            CustomRNNCell(params[self.num_units_l2]),
        ]))
        self.model.add(tf.keras.layers.Dropout(params[self.dropout]))
        self.model.add(tf.keras.layers.Dense(params[self.num_units_l3], activation='relu'))
        self.model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

        # summary() prints the table itself and returns None; do not wrap it in print().
        self.model.summary()
        return self.model

    def get_model(self):
        """Return the most recently generated model (None before the first trial)."""
        return self.model

    def save_model(self):
        """Save the current model's weights to a timestamped TensorFlow checkpoint.

        Raises:
            ValueError: If no model has been generated yet.
        """
        if self.model is None:
            raise ValueError("No model to save: run train_model() or random_search() first.")
        # Save this instance's model (not a notebook global), and format the
        # timestamp as text: concatenating str and float raises TypeError.
        checkpoint_suffix = int(time())
        self.model.save_weights('rnn_classification_%d' % checkpoint_suffix, save_format='tf')
        return

    def compile_model(self, loss_function, optimizer):
        """Compile the current model with the given loss and optimizer, tracking accuracy."""
        self.model.compile(optimizer=optimizer, loss=loss_function, metrics=['accuracy'])
        return self.model

    def train_model(self, hparams, train_data, cross_validation_data):
        """Build, compile and fit a model for one trial.

        Returns:
            Accuracy of the trained model on `cross_validation_data`.
        """
        self.generate_model(hparams)
        self.compile_model(self.loss_function, hparams[self.optimizer])

        callbacks = [
            # Stop once val_loss has not improved for EARLY_STOPPING_PATIENCE epochs.
            tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=self.EARLY_STOPPING_PATIENCE)
        ]

        self.model.fit(
            train_data,
            epochs=self.MAX_EPOCHS,
            validation_data=cross_validation_data,
            callbacks=callbacks,
            verbose=1,
        )
        _, accuracy = self.model.evaluate(cross_validation_data)

        return accuracy

    def run(self, run_dir, hparams, train_data, cross_validation_data):
        """Execute one trial and log its hyperparameters and accuracy under `run_dir`."""
        with tf.summary.create_file_writer(run_dir).as_default():
            # Record the values used in this trial.
            hp.hparams(hparams)
            # Train on the datasets passed in, not on notebook-level globals.
            acc = self.train_model(hparams, train_data, cross_validation_data)
            tf.summary.scalar(self.METRIC_ACCURACY, acc, step=1)
        return acc

    def random_search(self, train, cross_val, seed, total_points_explored=15):
        """Sample hyperparameter combinations uniformly at random and train each one.

        Args:
            train: Training dataset handed to every trial.
            cross_val: Validation dataset handed to every trial.
            seed: Seed for the sampler, making the sampled points reproducible.
            total_points_explored: Number of trials to run.

        Returns:
            Tuple `(total_points_explored, acc_params)`, where `acc_params` is a
            list of `(accuracy, hparams)` pairs in trial order.
        """
        rng = random.Random(seed)
        acc_params = []

        for session_index in range(total_points_explored):
            hparams = {h: h.domain.sample_uniform(rng) for h in self.hparams}
            run_name = "run-%d" % session_index
            print('--- Starting trial: %s' % run_name)
            print({h.name: hparams[h] for h in hparams})
            acc = self.run(self.log_dir + "tune/" + run_name, hparams, train, cross_val)
            acc_params.append((acc, hparams))

        return total_points_explored, acc_params

    def setup_model(self):
        """Placeholder; currently does nothing."""
        return

    def eval_test(self, test):
        """Evaluate the current model on `test` and print the accuracy.

        The current model is the one built by the most recent trial, which is
        not necessarily the best-scoring one.
        """
        _, acc = self.model.evaluate(test)
        print(acc)
        return
            

In [15]:
m = MultiLayerRNN()

In [None]:
start_time = time()
points_explored, acc_params = m.random_search(train_dataset, validation_dataset, 42)
randomized_search_time = time() - start_time

--- Starting trial: run-0
{'optimizer': 'adam', 'num_units_l1': 16, 'num_units_l2': 32, 'num_units_l3': 20, 'dropout': 0.3}
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          523840    
_________________________________________________________________
rnn (RNN)                    (None, 32)                2816      
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
dense (Dense)                (None, 20)                660       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 21        
Total params: 527,337
Trainable params: 527,337
Non-trainable params: 0
__________________________________________________________

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
    118/Unknown - 36s 301ms/step - loss: 0.6762 - accuracy: 0.4976--- Starting trial: run-1
{'optimizer': 'adam', 'num_units_l1': 16, 'num_units_l2': 16, 'num_units_l3': 30, 'dropout': 0.3}
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 64)          523840    
_________________________________________________________________
rnn_1 (RNN)                  (None, 16)                1792      
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 30)                510       
_________________________________________________________________
dense_3 (Dense)              (None, 1)  

Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
    118/Unknown - 37s 313ms/step - loss: 0.6716 - accuracy: 0.4976--- Starting trial: run-5
{'optimizer': 'adam', 'num_units_l1': 16, 'num_units_l2': 32, 'num_units_l3': 20, 'dropout': 0.3}
Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, None, 64)          523840    
_________________________________________________________________
rnn_5 (RNN)                  (None, 32)                2816      
_________________________________________________________________
dropout_5 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 20)                660       
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 21        
Total p

In [17]:
print("Points explored: ", points_explored)
print("Accuracy for hparameters: ", acc_params)

Points explored:  15
Accuracy for hparameters:  [(0.4976, {HParam(name='optimizer', domain=Discrete(['adam', 'sgd']), display_name=None, description=None): 'adam', HParam(name='num_units_l1', domain=Discrete([16, 32]), display_name=None, description=None): 16, HParam(name='num_units_l2', domain=Discrete([16, 32]), display_name=None, description=None): 32, HParam(name='num_units_l3', domain=Discrete([20, 30]), display_name=None, description=None): 20, HParam(name='dropout', domain=Discrete([0.3, 0.4]), display_name=None, description=None): 0.3}), (0.4976, {HParam(name='optimizer', domain=Discrete(['adam', 'sgd']), display_name=None, description=None): 'adam', HParam(name='num_units_l1', domain=Discrete([16, 32]), display_name=None, description=None): 16, HParam(name='num_units_l2', domain=Discrete([16, 32]), display_name=None, description=None): 16, HParam(name='num_units_l3', domain=Discrete([20, 30]), display_name=None, description=None): 30, HParam(name='dropout', domain=Discrete([0.

In [18]:
sorted(acc_params, key=lambda x: x[0], reverse=True)

[(0.4976,
  {HParam(name='optimizer', domain=Discrete(['adam', 'sgd']), display_name=None, description=None): 'adam',
   HParam(name='num_units_l1', domain=Discrete([16, 32]), display_name=None, description=None): 16,
   HParam(name='num_units_l2', domain=Discrete([16, 32]), display_name=None, description=None): 32,
   HParam(name='num_units_l3', domain=Discrete([20, 30]), display_name=None, description=None): 20,
   HParam(name='dropout', domain=Discrete([0.3, 0.4]), display_name=None, description=None): 0.3}),
 (0.4976,
  {HParam(name='optimizer', domain=Discrete(['adam', 'sgd']), display_name=None, description=None): 'adam',
   HParam(name='num_units_l1', domain=Discrete([16, 32]), display_name=None, description=None): 16,
   HParam(name='num_units_l2', domain=Discrete([16, 32]), display_name=None, description=None): 16,
   HParam(name='num_units_l3', domain=Discrete([20, 30]), display_name=None, description=None): 30,
   HParam(name='dropout', domain=Discrete([0.3, 0.4]), display_n

In [19]:
m.eval_test(test_dataset)

    391/Unknown - 186s 475ms/step - loss: 0.6726 - accuracy: 0.50000.5


In [20]:
print(randomized_search_time)

52721.25437951088
