<a href="https://colab.research.google.com/github/ravitata/tensorflow2/blob/master/c2w3_Coding_Tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%tensorflow_version 2.x
import tensorflow as tf
print(tf.__version__)

print('GPU name: {}'.format(tf.test.gpu_device_name()))

2.3.0
GPU name: /device:GPU:0


# Sequence modelling 

## Coding tutorials
 #### 1.  The IMDb dataset
 #### 2. Padding and masking sequence data
 #### 3. The `Embedding` layer
 #### 4. The Embedding Projector
 #### 5. Recurrent neural network layers
 #### 6. Stacked RNNs and the `Bidirectional` wrapper

***
<a id="coding_tutorial_1"></a>
## The IMDb Dataset

#### Load the IMDB review sentiment dataset

In [None]:
# Import imdb
import tensorflow.keras.datasets.imdb as imdb

In [None]:
# Download and assign the data set using load_data()

(train_x, train_y), (test_x, test_y) = imdb.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


#### Inspect the dataset

In [None]:
# Inspect the type of the data

type(train_x)

numpy.ndarray

In [None]:
# Inspect the shape of the data
train_x.shape


(25000,)

In [None]:
# Display the first dataset element input
# Notice encoding
train_x[0]


In [None]:
# Display the first dataset element output
#0 is negative and 1 is positive
train_y[1]


0

#### Load dataset with different options

In [None]:
# Load the dataset with defaults


# ~/.keras/dataset/

In [None]:
# Limit the vocabulary to the top 500 words using num_words



In [None]:
# Ignore the top 10 most frequent words using skip_top



In [None]:
# Limit the sequence lengths to 500 using maxlen



In [None]:
 # Use '1' as the character that indicates the start of a sequence

 

#### Explore the dataset word index

In [None]:
# Load the imdb word index using get_word_index()



In [None]:
# View the word index as a dictionary,
# accounting for index_from.



In [None]:
# Retrieve a specific word's index



In [None]:
# View an input sentence



In [None]:
# Get the sentiment value



---
<a id="coding_tutorial_2"></a>
## Padding and Masking Sequence Data

In [None]:
# Load the imdb data set



#### Preprocess the data with padding

In [None]:
# Inspect the input data shape



In [None]:
# Pad the inputs to the maximum length using maxlen



In [None]:
# Inspect the output data shape



#### Create a Masking layer

In [None]:
# Import numpy 



In [None]:
# Masking expects to see (batch, sequence, features)
# Create a dummy feature dimension using expand_dims



In [None]:
# Create a Masking layer 



In [None]:
# Pass tf_x_train to it



In [None]:
# Look at the dataset



In [None]:
# Look at the ._keras_mask for the dataset



***
<a id="coding_tutorial_3"></a>
## The Embedding layer

#### Create and apply an `Embedding` layer

In [10]:
# Create an embedding layer using layers.Embedding
# Specify input_dim, output_dim, input_length
embedding_layer = tf.keras.layers.Embedding(input_dim=501, output_dim=16)


In [12]:
# Inspect an Embedding layer output for a fixed input
# Expects an input of shape (batch, sequence, feature)

sequence_of_indices = tf.constant([[[0], [2], [2], [100]]])
sequence_of_embeddings = embedding_layer(sequence_of_indices)
sequence_of_embeddings

<tf.Tensor: shape=(1, 4, 1, 16), dtype=float32, numpy=
array([[[[ 0.00810393,  0.04967042, -0.025184  ,  0.04747998,
           0.00199376, -0.01716508,  0.00439969,  0.0062418 ,
           0.01497436, -0.04275171,  0.04079243, -0.04360881,
           0.04392839,  0.00374924,  0.02813797, -0.0102082 ]],

        [[ 0.04484634, -0.01228765, -0.00107229,  0.01572226,
           0.02461927,  0.00312785, -0.00259401,  0.04195099,
           0.00334945, -0.01513531, -0.02064841, -0.00047464,
          -0.01123105, -0.00222614,  0.03835765, -0.03339886]],

        [[ 0.04484634, -0.01228765, -0.00107229,  0.01572226,
           0.02461927,  0.00312785, -0.00259401,  0.04195099,
           0.00334945, -0.01513531, -0.02064841, -0.00047464,
          -0.01123105, -0.00222614,  0.03835765, -0.03339886]],

        [[-0.03240397, -0.01391277,  0.00723523, -0.02849907,
          -0.01057415,  0.00429413, -0.03553858, -0.02087495,
          -0.0345932 , -0.01485723, -0.03385388, -0.03002881,
      

In [29]:
# Inspect the Embedding layer weights using get_weights()
embedding_layer.get_weights()[0]


array([[ 0.00810393,  0.04967042, -0.025184  , ...,  0.00374924,
         0.02813797, -0.0102082 ],
       [-0.00191615, -0.04229257,  0.02709929, ..., -0.01686269,
         0.02216927, -0.01182345],
       [ 0.04484634, -0.01228765, -0.00107229, ..., -0.00222614,
         0.03835765, -0.03339886],
       ...,
       [-0.04735955, -0.03957223,  0.00561972, ...,  0.00572225,
        -0.01913064, -0.01186786],
       [-0.03833694,  0.01652941, -0.00874736, ...,  0.02097174,
        -0.02062801,  0.01471228],
       [ 0.00407232,  0.00773715, -0.00947857, ..., -0.04266149,
         0.02619291,  0.01902616]], dtype=float32)

In [30]:
embedding_layer.get_weights()

[array([[ 0.00810393,  0.04967042, -0.025184  , ...,  0.00374924,
          0.02813797, -0.0102082 ],
        [-0.00191615, -0.04229257,  0.02709929, ..., -0.01686269,
          0.02216927, -0.01182345],
        [ 0.04484634, -0.01228765, -0.00107229, ..., -0.00222614,
          0.03835765, -0.03339886],
        ...,
        [-0.04735955, -0.03957223,  0.00561972, ...,  0.00572225,
         -0.01913064, -0.01186786],
        [-0.03833694,  0.01652941, -0.00874736, ...,  0.02097174,
         -0.02062801,  0.01471228],
        [ 0.00407232,  0.00773715, -0.00947857, ..., -0.04266149,
          0.02619291,  0.01902616]], dtype=float32)]

In [33]:
# Get the embedding for the 14th index
embedding_layer.get_weights()[0][2,:]


array([ 0.04484634, -0.01228765, -0.00107229,  0.01572226,  0.02461927,
        0.00312785, -0.00259401,  0.04195099,  0.00334945, -0.01513531,
       -0.02064841, -0.00047464, -0.01123105, -0.00222614,  0.03835765,
       -0.03339886], dtype=float32)

#### Create and apply an `Embedding` layer that uses `mask_zero=True`

In [41]:
# Create a layer that uses the mask_zero kwarg
masked_embedding_layer = tf.keras.layers.Embedding(input_dim=501, output_dim=16, mask_zero=True)


In [42]:
# Apply this layer to the sequence and see the _keras_mask property
masked_sequence_embeddings = masked_embedding_layer(sequence_of_indices)
masked_sequence_embeddings._keras_mask


<tf.Tensor: shape=(1, 4, 1), dtype=bool, numpy=
array([[[False],
        [ True],
        [ True],
        [ True]]])>

In [43]:
masked_sequence_embeddings

<tf.Tensor: shape=(1, 4, 1, 16), dtype=float32, numpy=
array([[[[ 0.01779293, -0.00584376,  0.02496178, -0.0145584 ,
          -0.03268683,  0.03629377,  0.04492816,  0.02570543,
           0.00900816, -0.00496691,  0.01168574, -0.00079956,
           0.02979052, -0.03977318, -0.04101504, -0.02854302]],

        [[ 0.00681609, -0.02622942, -0.03634335,  0.04097782,
           0.00318112, -0.0146475 ,  0.04030347,  0.00328743,
          -0.01674535, -0.03323547, -0.01198513,  0.01550057,
           0.03020797,  0.01508453, -0.00365441, -0.02159153]],

        [[ 0.00681609, -0.02622942, -0.03634335,  0.04097782,
           0.00318112, -0.0146475 ,  0.04030347,  0.00328743,
          -0.01674535, -0.03323547, -0.01198513,  0.01550057,
           0.03020797,  0.01508453, -0.00365441, -0.02159153]],

        [[ 0.03105916, -0.01998161,  0.00060954,  0.01184865,
           0.03457171,  0.00481207, -0.03060721, -0.04521412,
           0.02188123, -0.04405861, -0.01313891, -0.03282284,
      

---
<a id="coding_tutorial_4"></a>
## The Embedding Projector

#### Mount Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

#### Load and preprocess the IMDb data

In [None]:
# A function to load and preprocess the IMDB dataset

def get_and_pad_imdb_dataset(num_words=10000, maxlen=None, index_from=2):
    from tensorflow.keras.datasets import imdb

    # Load the reviews
    (x_train, y_train), (x_test, y_test) = imdb.load_data(path='imdb.npz',
                                                          num_words=num_words,
                                                          skip_top=0,
                                                          maxlen=maxlen,
                                                          start_char=1,
                                                          oov_char=2,
                                                          index_from=index_from)

    x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train,
                                                        maxlen=None,
                                                        padding='pre',
                                                        truncating='pre',
                                                        value=0)
    
    x_test = tf.keras.preprocessing.sequence.pad_sequences(x_test,
                                                           maxlen=None,
                                                           padding='pre',
                                                           truncating='pre',
                                                           value=0)
    return (x_train, y_train), (x_test, y_test)

In [None]:
# Load the dataset



In [None]:
# A function to get the dataset word index

def get_imdb_word_index(num_words=10000, index_from=2):
    imdb_word_index = tf.keras.datasets.imdb.get_word_index(
                                        path='imdb_word_index.json')
    imdb_word_index = {key: value + index_from for
                       key, value in imdb_word_index.items() if value <= num_words-index_from}
    return imdb_word_index

In [None]:
# Get the word index



In [None]:
# Swap the keys and values of the word index



In [None]:
# View the first dataset example sentence



#### Build an Embedding layer into a model

In [None]:
# Get the maximum token value



In [None]:
# Specify an embedding dimension



In [None]:
# Build a model using Sequential:
#     1. Embedding layer
#     2. GlobalAveragePooling1D
#     3. Dense



In [None]:
# Functional API refresher: use the Model to build the same model



In [None]:
model.summary()

#### Compile, train, and evaluate the model

In [None]:
# Compile the model with a binary cross-entropy loss



In [None]:
# Train the model using .fit(), savng its history



In [None]:
# Plot the training and validation accuracy

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

history_dict = history.history

acc      = history_dict['accuracy']
val_acc  = history_dict['val_accuracy']
loss     = history_dict['loss']
val_loss = history_dict['val_loss']

epochs = range(1, len(acc) + 1)

plt.figure(figsize=(14,5))
plt.plot(epochs, acc, marker='.', label='Training acc')
plt.plot(epochs, val_acc, marker='.', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epoch')
plt.ylabel('Classification accuracy')
plt.legend(loc='lower right')
plt.ylim(0, 1);

#### The TensorFlow embedding projector

The Tensorflow embedding projector can be found [here](https://projector.tensorflow.org/).

In [None]:
# Retrieve the embedding layer's weights from the trained model



In [None]:
# Save the word Embeddings to tsv files
# Two files: 
#     one contains the embedding labels (meta.tsv),
#     one contains the embeddings (vecs.tsv)

import io
from os import path

out_v = io.open(path.join('data', 'vecs.tsv'), 'w', encoding='utf-8')
out_m = io.open(path.join('data', 'meta.tsv'), 'w', encoding='utf-8')

k = 0

for word, token in word_index.items():
    if k != 0:
        out_m.write('\n')
        out_v.write('\n')
    
    out_v.write('\t'.join([str(x) for x in weights[token]]))
    out_m.write(word)
    k += 1
    
out_v.close()
out_m.close()
# beware large collections of embeddings!

---
<a id="coding_tutorial_5"></a>
## Recurrent neural network layers

#### Initialize and pass an input to a SimpleRNN layer

In [None]:
# Create a SimpleRNN layer and test it



In [None]:
# Note that only the final cell output is returned



#### Load and transform the IMDB review sentiment dataset

In [None]:
# A function to load and preprocess the IMDB dataset

def get_and_pad_imdb_dataset(num_words=10000, maxlen=None, index_from=2):
    from tensorflow.keras.datasets import imdb

    # Load the reviews
    (x_train, y_train), (x_test, y_test) = imdb.load_data(path='imdb.npz',
                                                          num_words=num_words,
                                                          skip_top=0,
                                                          maxlen=maxlen,
                                                          start_char=1,
                                                          oov_char=2,
                                                          index_from=index_from)

    x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train,
                                                        maxlen=None,
                                                        padding='pre',
                                                        truncating='pre',
                                                        value=0)
    
    x_test = tf.keras.preprocessing.sequence.pad_sequences(x_test,
                                                           maxlen=None,
                                                           padding='pre',
                                                           truncating='pre',
                                                           value=0)
    return (x_train, y_train), (x_test, y_test)

In [None]:
# Load the dataset




In [None]:
# A function to get the dataset word index

def get_imdb_word_index(num_words=10000, index_from=2):
    imdb_word_index = tf.keras.datasets.imdb.get_word_index(
                                        path='imdb_word_index.json')
    imdb_word_index = {key: value + index_from for
                       key, value in imdb_word_index.items() if value <= num_words-index_from}
    return imdb_word_index

In [None]:
# Get the word index using get_imdb_word_index()



#### Create a recurrent neural network model

In [None]:
# Get the maximum index value



In [None]:
# Using Sequential, build the model:
# 1. Embedding.
# 2. LSTM.
# 3. Dense.



#### Compile and fit the model

In [None]:
# Compile the model with binary cross-entropy loss



In [None]:
# Fit the model and save its training history



#### Plot learning curves

In [None]:
# Plot the training and validation accuracy

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

history_dict = history.history

acc      = history_dict['accuracy']
val_acc  = history_dict['val_accuracy']
loss     = history_dict['loss']
val_loss = history_dict['val_loss']

epochs = range(1, len(acc) + 1)

plt.figure(figsize=(14,5))
plt.plot(epochs, acc, marker='.', label='Training acc')
plt.plot(epochs, val_acc, marker='.', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epoch')
plt.ylabel('Classification accuracy')
plt.legend(loc='lower right')
plt.ylim(0, 1);

#### Make predictions with the model

In [None]:
# View the first test data example sentence
# (invert the word index)



In [None]:
# Get the model prediction using model.predict()



In [None]:
# Get the corresponding label



---
<a id="coding_tutorial_6"></a>
## Stacked RNNs and the Bidirectional wrapper

#### Load and transform the IMDb review sentiment dataset

In [None]:
# A function to load and preprocess the IMDB dataset

def get_and_pad_imdb_dataset(num_words=10000, maxlen=None, index_from=2):
    from tensorflow.keras.datasets import imdb

    # Load the reviews
    (x_train, y_train), (x_test, y_test) = imdb.load_data(path='imdb.npz',
                                                          num_words=num_words,
                                                          skip_top=0,
                                                          maxlen=maxlen,
                                                          start_char=1,
                                                          oov_char=2,
                                                          index_from=index_from)

    x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train,
                                                        maxlen=None,
                                                        padding='pre',
                                                        truncating='pre',
                                                        value=0)
    
    x_test = tf.keras.preprocessing.sequence.pad_sequences(x_test,
                                                           maxlen=None,
                                                           padding='pre',
                                                           truncating='pre',
                                                           value=0)
    return (x_train, y_train), (x_test, y_test)

In [None]:
# Load the dataset



In [None]:
# A function to get the dataset word index

def get_imdb_word_index(num_words=10000, index_from=2):
    imdb_word_index = tf.keras.datasets.imdb.get_word_index(
                                        path='imdb_word_index.json')
    imdb_word_index = {key: value + index_from for
                       key, value in imdb_word_index.items() if value <= num_words-index_from}
    return imdb_word_index

In [None]:
# Get the word index using get_imdb_word_index()



#### Build stacked and bidirectional recurrent models

In [None]:
# Get the maximum index value and specify an embedding dimension



In [None]:
# Using Sequential, build a stacked LSTM model via return_sequences=True



In [None]:
# Using Sequential, build a bidirectional RNN with merge_mode='sum'



In [None]:
# Create a model featuring both stacked recurrent layers and a bidirectional layer



#### Compile and fit the model

In [None]:
# Compile the model



In [None]:
# Train the model, saving its history



In [None]:
# Plot the training and validation accuracy

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

history_dict = history.history

acc      = history_dict['accuracy']
val_acc  = history_dict['val_accuracy']
loss     = history_dict['loss']
val_loss = history_dict['val_loss']

epochs = range(1, len(acc) + 1)

plt.figure(figsize=(14,5))
plt.plot(epochs, acc, marker='.', label='Training acc')
plt.plot(epochs, val_acc, marker='.', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epoch')
plt.ylabel('Classification accuracy')
plt.legend(loc='lower right')
plt.ylim(0, 1);