<table align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/jflanigan/handson-ml2/blob/master/16_nlp_with_rnns_and_attention.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
</table>

# Setup

First, let's import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures. We also check that Python 3.5 or later is installed (although Python 2.x may work, it is deprecated so we strongly recommend you use Python 3 instead), as well as Scikit-Learn ≥0.20 and TensorFlow ≥2.0.

In [1]:
# Ryan McCrory
# Assignment 2
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)


# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
    !pip install -q -U tensorflow-addons
    IS_COLAB = True
except Exception:
    IS_COLAB = False

# TensorFlow ≥2.0 is required
import tensorflow as tf
#import tensorflow_datasets as tfds
from tensorflow import keras
assert tf.__version__ >= "2.0"

if not tf.test.is_gpu_available():
    print("No GPU was detected. LSTMs and CNNs can be very slow without a GPU.")
    if IS_COLAB:
        print("Go to Runtime > Change runtime and select a GPU hardware accelerator.")

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "nlp"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

TensorFlow 2.x selected.
[K     |████████████████████████████████| 1.0MB 2.7MB/s 
[31mERROR: tensorflow-federated 0.11.0 requires enum34~=1.1, which is not installed.[0m
[31mERROR: tensorflow-federated 0.11.0 has requirement attrs~=18.2, but you'll have attrs 19.3.0 which is incompatible.[0m
[31mERROR: tensorflow-federated 0.11.0 has requirement cachetools~=3.1.1, but you'll have cachetools 4.0.0 which is incompatible.[0m
[31mERROR: tensorflow-federated 0.11.0 has requirement grpcio~=1.24.3, but you'll have grpcio 1.27.1 which is incompatible.[0m
[31mERROR: tensorflow-federated 0.11.0 has requirement tensorflow~=2.0.0, but you'll have tensorflow 2.1.0 which is incompatible.[0m
[31mERROR: tensorflow-federated 0.11.0 has requirement tensorflow-addons~=0.6.0, but you'll have tensorflow-addons 0.8.2 which is incompatible.[0m
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
No GPU was detected. LSTMs and CNNs can be very slow without a GPU.
Go to

# Sentiment Analysis

In [0]:
tf.random.set_seed(42)

You can load the IMDB dataset easily:

In [3]:
import tensorflow_datasets as tfds

datasets = {}

# Split the training set into 60% and 40%, so we'll end up with 15,000 examples
# for training, 10,000 examples for validation and 25,000 examples for testing.
train_data, validation_data, test_data = tfds.load(
    name="imdb_reviews", 
    split=('train[:60%]', 'train[60%:]', 'test'),
    as_supervised=True)

datasets["train"] = train_data
datasets["validation"] = validation_data
datasets["test"] = test_data

[1mDownloading and preparing dataset imdb_reviews (80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


HBox(children=(IntProgress(value=1, bar_style='info', description='Dl Completed...', max=1, style=ProgressStyl…

HBox(children=(IntProgress(value=1, bar_style='info', description='Dl Size...', max=1, style=ProgressStyle(des…






HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete8SX6VM/imdb_reviews-train.tfrecord


HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete8SX6VM/imdb_reviews-test.tfrecord


HBox(children=(IntProgress(value=0, max=25000), HTML(value='')))



HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete8SX6VM/imdb_reviews-unsupervised.tfrecord


HBox(children=(IntProgress(value=0, max=50000), HTML(value='')))

[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [4]:
datasets.keys()


dict_keys(['train', 'validation', 'test'])

In [0]:
for X_batch, y_batch in datasets["train"].batch(2).take(1):
    for review, label in zip(X_batch.numpy(), y_batch.numpy()):
        print("Review:", review.decode("utf-8")[:200], "...")
        print("Label:", label, "= Positive" if label else "= Negative")
        print()

In [0]:
# This is the preprocess function
def preprocess(X_batch, y_batch):
    X_batch = tf.strings.substr(X_batch, 0, 300)
    X_batch = tf.strings.regex_replace(X_batch, rb"<br\s*/?>", b" ")
    X_batch = tf.strings.regex_replace(X_batch, b"[^a-zA-Z']", b" ")
    X_batch = tf.strings.split(X_batch)
    return X_batch.to_tensor(default_value=b"<pad>"), y_batch

In [65]:
preprocess(X_batch, y_batch)

(<tf.Tensor: shape=(2, 53), dtype=string, numpy=
 array([[b'This', b'was', b'an', b'absolutely', b'terrible', b'movie',
         b"Don't", b'be', b'lured', b'in', b'by', b'Christopher',
         b'Walken', b'or', b'Michael', b'Ironside', b'Both', b'are',
         b'great', b'actors', b'but', b'this', b'must', b'simply', b'be',
         b'their', b'worst', b'role', b'in', b'history', b'Even',
         b'their', b'great', b'acting', b'could', b'not', b'redeem',
         b'this', b"movie's", b'ridiculous', b'storyline', b'This',
         b'movie', b'is', b'an', b'early', b'nineties', b'US',
         b'propaganda', b'pi', b'<pad>', b'<pad>', b'<pad>'],
        [b'I', b'have', b'been', b'known', b'to', b'fall', b'asleep',
         b'during', b'films', b'but', b'this', b'is', b'usually', b'due',
         b'to', b'a', b'combination', b'of', b'things', b'including',
         b'really', b'tired', b'being', b'warm', b'and', b'comfortable',
         b'on', b'the', b'sette', b'and', b'having', b'j

# TRAINING BATCH SIZE, 

*   MUST BE CHANGED BEFORE RUNNING PART 2 AND 3

In [0]:
from collections import Counter

# fill our vocabulary from the training set
vocabulary = Counter()          #Batch size was 32 by default
                                #For the SimpleRNN model, set batch size to 128
                                #For the GRU model, set batch size to 512
                                #For the GRU model with pretrained embeddings, set the batch size to 16
for X_batch, y_batch in datasets["train"].batch(128).map(preprocess):
    for review in X_batch:
        vocabulary.update(list(review.numpy()))

In [10]:
vocabulary.most_common()[:3]

[(b'<pad>', 110696), (b'the', 36691), (b'a', 22997)]

In [11]:
print(len(vocabulary))
vocabulary_size = len(vocabulary)

41624


In [0]:
vocab_size = 10000
truncated_vocabulary = [
    word for word, count in vocabulary.most_common()[:vocab_size]]

In [13]:
word_to_id = {word: index for index, word in enumerate(truncated_vocabulary)}
for word in b"This movie was faaaaaantastic".split():
    print(word_to_id.get(word) or vocab_size)

22
12
11
10000


In [0]:
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)
num_oov_buckets = 1000
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets)

In [15]:
table.lookup(tf.constant([b"This movie was faaaaaantastic".split()]))

<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[   22,    12,    11, 10053]])>

In [0]:
def encode_words(X_batch, y_batch):
    return table.lookup(X_batch), y_batch

# preprocess our train set
train_set = datasets["train"].batch(32).map(preprocess)
train_set = train_set.map(encode_words).prefetch(1)

# preprocess our validation set
validation_set = datasets["validation"].batch(32).map(preprocess)
validation_set = validation_set.map(encode_words).prefetch(1)

# preprocess our test set
test_set = datasets["test"].batch(32).map(preprocess)
test_set = test_set.map(encode_words).prefetch(1)

In [17]:
for X_batch, y_batch in train_set.take(1):
    print(X_batch)
    print(y_batch)

for X_batch, y_batch in validation_set.take(1):
    print(X_batch)
    print(y_batch)

for X_batch, y_batch in test_set.take(1):
    print(X_batch)
    print(y_batch)

tf.Tensor(
[[  22   11   28 ...    0    0    0]
 [   6   21   71 ...    0    0    0]
 [3278 6289    1 ...    0    0    0]
 ...
 [  22   12  120 ...  332 1030    0]
 [1810 3594  490 ...    0    0    0]
 [2997 5393    6 ...    0    0    0]], shape=(32, 60), dtype=int64)
tf.Tensor([0 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 1 1 1 0 1 1 1 1 1 0 0 0 1 0 0 0], shape=(32,), dtype=int64)
tf.Tensor(
[[   22    11     2 ...     0     0     0]
 [ 2963  1409  6707 ...     0     0     0]
 [  570 10374  2486 ...     0     0     0]
 ...
 [ 1991     9    12 ...     0     0     0]
 [   22    39     5 ...    49    71  3404]
 [  133    75     6 ...     0     0     0]], shape=(32, 63), dtype=int64)
tf.Tensor([1 1 1 1 1 1 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 1 0 0 1 0 1], shape=(32,), dtype=int64)
tf.Tensor(
[[  136    26    79 ...     0     0     0]
 [   73 10791   731 ...     0     0     0]
 [ 3242   745 10210 ...     0     0     0]
 ...
 [ 5507  8165  7329 ...     0     0     0]
 [  275     6    21 ...     0   

# *PART* 1

In [73]:
# Part 1.1
# Note, to get best results, change the training batch size to 128

embed_size = 128
model = keras.models.Sequential([
    # 1. Embed the input text as a sequence of vectors.
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                           mask_zero=True, # not shown in the book
                           input_shape=[None]),
    # 2. Transform the sequence of embeddings into a vector using a single-layer, simple RNN
    keras.layers.SimpleRNN(128, return_sequences=True, dropout=0.4),
    # 3. Apply a feed-forward layer on that vector to obtain a label.
    keras.layers.SimpleRNN(128),
    keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer="sgd", metrics=["accuracy"])

history = model.fit(train_set, epochs=5, validation_data=validation_set)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [0]:
# Part 1.2

# evaluate model on test data
results = model.evaluate(test_set, verbose=2)

for name, value in zip(model.metrics_names, results):
  print("%s: %.3f" % (name, value))

# *PART 2*

In [0]:
# Part 2, using GRU's

embed_size = 128
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                           mask_zero=True, # not shown in the book
                           input_shape=[None]),
    keras.layers.GRU(128, return_sequences=True, dropout=0.4),
    keras.layers.GRU(128),
    keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer="rmsprop", metrics=["accuracy"])
history = model.fit(train_set, epochs=3, validation_data=validation_set)

In [0]:
# Part 2

# evaluate model on test data
results = model.evaluate(test_set, verbose=2)

for name, value in zip(model.metrics_names, results):
  print("%s: %.3f" % (name, value))

# PART 3 


*   Some Code from https://medium.com/@sabber/classifying-yelp-review-comments-using-cnn-lstm-and-pre-trained-glove-word-embeddings-part-3-53fcea9a17fa


*   Pre Trained Embedding from : https://nlp.stanford.edu/projects/glove





In [18]:
# Import Libraries

# Keras
import tensorflow as tf
#import tensorflow_datasets as tfds
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
## Plotly
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)
# Others
import nltk
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.manifold import TSNE
import tensorflow_datasets as tfds

Using TensorFlow backend.


In [0]:
# Extract word embeddings from the Glove

embeddings_index = dict()
f = open('/content/glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [0]:
# Create a weight matrix
# matrix is correct, I double checked

# To choose the vocabulary size, and load the imdb data with that vocabulary_size:
imdb = keras.datasets.imdb
vocabulary_size = 100000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words = vocabulary_size)

# To create the embedding matrix:
word2id = imdb.get_word_index()   # dictionary from words to integers (the id of the word in the vocab)
id2word = {i: word for word, i in word2id.items()}
embedding_matrix = np.zeros((vocabulary_size, 100))
for word, index in word2id.items():
   # print(word2id)
   if index > vocabulary_size - 1:
       continue
   else:
       embedding_vector = embeddings_index.get(word)
       # print("embedding")
       # print(embedding_vector)
       if embedding_vector is not None:
           embedding_matrix[index] = embedding_vector

print(len(embedding_matrix))
for i in range(0, 3):
   print("The glove embedding for '{}' is {} ".format(list(word2id.keys())[i], embedding_matrix[i]))

In [0]:
# Below is for Part 3.4

count = -1
for x in word2id.keys():
  count += 1
  if x == 'good':
    good_at = count
    break
  
count = -1
for x in word2id.keys():
  count += 1
  if x == 'bad':
    bad_at = count
    break
  
print("The glove embedding for '{}' is {} ".format(list(word2id.keys())[good_at], embedding_matrix[good_at]))
print("The glove embedding for '{}' is {} ".format(list(word2id.keys())[bad_at], embedding_matrix[bad_at]))

In [38]:
# Part 3

model_glove = keras.models.Sequential([                                  
    keras.layers.Embedding(vocabulary_size, 100,
                           weights=[embedding_matrix],
                           mask_zero=True, # not shown in the book
                           input_shape=[None]),
    keras.layers.GRU(100, return_sequences=True, dropout=0.2),
    keras.layers.GRU(100),
    keras.layers.Dense(1, activation="sigmoid")
])
model_glove.compile(loss="binary_crossentropy", optimizer="rmsprop", metrics=["accuracy"])
history = model_glove.fit(train_set, epochs=4, validation_data=validation_set, verbose=1)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [0]:
# Part 3.4, See if substituting antonyms changes prediction on example sentence
  # Note, I could not get this section to work properly

# example sentences
ex1 = "i can not believe i have a good grade" 
ex2 = "i can not believe i have a bad grade"

# preprocess ex1 and ex2
#nmpy1 = preprocess(ex1, y_batch)
#nmpy2 = preprocess(ex2, y_batch)

# turn sentences into numpy arrays
# nmpy1 = nmpy1.numpy()
# nmpy2 = nmpy2.numpy()

# predict sentiment
#model_glove.predict(nmpy1)
#model_glove.predict(nmpy2)

In [72]:
# Part 3 evaulate on test set

# evaluate model on test data
results = model_glove.evaluate(test_set, verbose=2)

for name, value in zip(model_glove.metrics_names, results):
  print("%s: %.3f" % (name, value))

loss: 0.546
accuracy: 0.745
