In [1]:
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from scipy.spatial.distance import cdist

# Sklearn
from sklearn.model_selection import train_test_split

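# NOTE: `from tf.keras.models import Sequential` does not work, because `tf`
# is only the local alias created by `import tensorflow as tf`, not an
# importable module name. The public import path is `tensorflow.keras`.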
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

%matplotlib inline

In [2]:
# Record the library versions this notebook was executed with.
print("TensorFlow version: ", tf.__version__)
print("Tensorflow keras version: ", tf.keras.__version__)

TensorFlow version:  1.13.1
Tensorflow keras version:  2.2.4-tf


In [None]:
# Split the raw texts and labels into train and test partitions.
# NOTE(review): `X` (texts) and `y` (labels) are not defined anywhere in the
# notebook as shown -- they have to be loaded in a cell above this one.
# The text partitions are bound to `x_train_text` / `x_test_text` because
# those are the names every later cell reads.
x_train_text, x_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.3,
    random_state=42)  # fixed seed so Restart & Run All gives the same split

# Full corpus (train + test) used to fit the tokenizer's vocabulary.
# `list(...)` guarantees `+` concatenates even if the split returned arrays.
data_text = list(x_train_text) + list(x_test_text)


In [None]:
# Raw text of the training example at index 1 ...
x_train_text[1]

In [None]:
# ... and the label stored for that same example.
y_train[1]

### Tokenizer

In [None]:
# Restrict the tokenizer to the most frequent words of the data-set;
# `num_words` is also reused further down as the Embedding layer's input_dim.
num_words = 10000

tokenizer = Tokenizer(num_words=num_words)

In [None]:
%%time
# Build the vocabulary from the combined train + test texts.
tokenizer.fit_on_texts(data_text)

In [None]:
# `num_words=None` means "keep the whole vocabulary". The ids in
# `tokenizer.word_index` start at 1 (0 is never assigned to a word), so the
# largest id equals len(word_index). The Embedding layer's `input_dim` has to
# be one more than the largest id it will be asked to look up.
if num_words is None:
    num_words = len(tokenizer.word_index) + 1

In [None]:
# Peek at the first 20 (word, id) pairs of the vocabulary. The full mapping
# has one entry per distinct word in the corpus and is too large to display.
list(tokenizer.word_index.items())[:20]

In [None]:
# Convert every training text into a list of integer tokens.
x_train_tokens = tokenizer.texts_to_sequences(x_train_text)

In [None]:
# The training example at index 1 as raw text ...
x_train_text[1]

In [None]:
# ... and the same example as integer tokens.
np.array(x_train_tokens[1])

In [None]:
# Tokenize the test texts with the same fitted tokenizer.
x_test_tokens = tokenizer.texts_to_sequences(x_test_text)

### Padding and Truncating Data

In [None]:
# Number of tokens in every tokenized text, train and test together.
num_tokens = np.array([len(seq) for seq in x_train_tokens + x_test_tokens])

In [None]:
np.mean(num_tokens)

In [None]:
np.max(num_tokens)

In [None]:
max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
max_tokens = int(max_tokens)
max_tokens

In [None]:
# Fraction of texts that fit within `max_tokens` and so will not be truncated.
# `<=` because a sequence of exactly `max_tokens` tokens is kept whole when
# padding/truncating to `maxlen=max_tokens`.
np.sum(num_tokens <= max_tokens) / len(num_tokens)

In [None]:
# Pad and truncate at the front of each sequence: short texts get filler
# prepended, long texts lose their beginning. The same setting is reused
# later for new texts.
pad = 'pre'

x_train_pad = pad_sequences(
    x_train_tokens,
    maxlen=max_tokens,
    padding=pad,
    truncating=pad,
)

x_test_pad = pad_sequences(
    x_test_tokens,
    maxlen=max_tokens,
    padding=pad,
    truncating=pad,
)

In [None]:
# Shape of the padded training matrix.
x_train_pad.shape

In [None]:
# Shape of the padded test matrix.
x_test_pad.shape

In [None]:
# The training example at index 1 before padding ...
np.array(x_train_tokens[1])

In [None]:
# ... and after padding/truncating to `max_tokens`.
x_train_pad[1]

### Tokenizer Inverse Map

In [None]:
# Reverse lookup {integer token -> word}, built from the tokenizer's
# {word -> integer token} vocabulary. The vocabulary is read directly instead
# of through a temporary `idx` variable, because `idx` is used further down
# in this notebook for an integer row index.
inverse_map = {token: word for word, token in tokenizer.word_index.items()}

In [None]:
def tokens_to_string(tokens):
    """Turn a sequence of integer tokens back into readable text.

    Tokens equal to 0 are skipped; every other token is looked up in the
    notebook-level `inverse_map` and the resulting words are joined with
    single spaces.
    """
    return " ".join(inverse_map[tok] for tok in tokens if tok != 0)

In [None]:
# Original text of the training example at index 1 ...
x_train_text[1]

In [None]:
# ... compared with the text rebuilt from its tokens.
tokens_to_string(x_train_tokens[1])

### Create the Recurrent Neural Network

In [None]:
# Start with an empty layer stack; the cells below append layers in order.
model = Sequential()

In [None]:
# Length of the vector learned for each token.
embedding_size = 8

In [None]:
# Embedding layer: maps each of the `num_words` possible integer tokens in a
# sequence of length `max_tokens` to an `embedding_size`-dimensional vector.
# It is named so its weights can be fetched later with `model.get_layer`.
model.add(Embedding(input_dim=num_words,
                    output_dim=embedding_size,
                    input_length=max_tokens,
                    name='layer_embedding'))

In [None]:
# return_sequences=True makes the layer emit an output for every time step,
# which the following recurrent layer consumes as its input sequence.
model.add(GRU(units=16, return_sequences=True))

In [None]:
model.add(GRU(units=8, return_sequences=True))

In [None]:
# Last recurrent layer: return_sequences is left at its default, so only the
# final output is handed to the dense layer.
model.add(GRU(units=4))

In [None]:
# One sigmoid unit: a single value between 0 and 1 per input text.
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Adam optimizer with a learning rate of 1e-3.
# NOTE(review): newer Keras releases name this argument `learning_rate`;
# `lr` matches the TensorFlow 1.13 printed at the top of this notebook.
optimizer = Adam(lr=1e-3)

In [None]:
# Binary cross-entropy pairs with the single sigmoid output unit; accuracy is
# reported during training and validation.
model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

In [None]:
# Print the layer stack with output shapes and parameter counts.
model.summary()

### Train the Recurrent Neural Network

In [None]:
%%time
# Train for 3 epochs in mini-batches of 64, holding out 5% of the training
# data for validation.
model.fit(x_train_pad, y_train,
          validation_split=0.05, epochs=3, batch_size=64)

### Example of Mis-Classified Text

In [None]:
%%time
y_pred = model.predict(x=x_test_pad[0:1000])
y_pred = y_pred.T[0]

In [None]:
cls_pred = np.array([1.0 if p>0.5 else 0.0 for p in y_pred])

In [None]:
cls_true = np.array(y_test[0:1000])

In [None]:
incorrect = np.where(cls_pred != cls_true)
incorrect = incorrect[0]

In [None]:
# How many of the evaluated test texts were mis-classified.
len(incorrect)

In [None]:
# Position of the first mis-classified text.
idx = incorrect[0]
idx

In [None]:
# Raw text of that example.
text = x_test_text[idx]
text

In [None]:
# Model output for it ...
y_pred[idx]

In [None]:
# ... and its true class.
cls_true[idx]

### New Data

In [None]:
# Hand-written example reviews used to probe the trained model.
text1 = "This movie is fantastic! I really like it because it is so good!"
text2 = "Good movie!"
text3 = "Maybe I like this movie."
text4 = "Meh ..."
text5 = "If I were a drunk teenager then this movie might be good."
text6 = "Bad movie!"
text7 = "Not a good movie!"
text8 = "This movie really sucks! Can I get my money back please?"
texts = [text1, text2, text3, text4, text5, text6, text7, text8]

In [None]:
# Tokenize with the tokenizer fitted on the data-set.
tokens = tokenizer.texts_to_sequences(texts)

In [None]:
# Pad/truncate exactly like the training data so the input shape matches.
tokens_pad = pad_sequences(tokens, maxlen=max_tokens,
                           padding=pad, truncating=pad)
tokens_pad.shape

In [None]:
# One model output per example text.
model.predict(tokens_pad)

### Embeddings

In [None]:
# Fetch the embedding layer by the name it was given when it was added.
layer_embedding = model.get_layer('layer_embedding')

In [None]:
# First entry of the layer's weight list: the embedding matrix, indexed below
# by integer token.
weights_embedding = layer_embedding.get_weights()[0]

In [None]:
weights_embedding.shape

In [None]:
# Integer token for the word 'good'.
token_good = tokenizer.word_index['good']
token_good

In [None]:
# Integer token for the word 'great'.
token_great = tokenizer.word_index['great']
token_great

In [None]:
# Embedding vectors of the two positive words, for comparison.
weights_embedding[token_good]

In [None]:
weights_embedding[token_great]

In [None]:
# Same lookup for two negative words.
token_bad = tokenizer.word_index['bad']
token_horrible = tokenizer.word_index['horrible']

In [None]:
weights_embedding[token_bad]

In [None]:
weights_embedding[token_horrible]

### Sorted Words

In [None]:
def print_sorted_words(word, metric='cosine', k=10):
    """
    Print the words in the vocabulary sorted according to their
    embedding-distance to the given word.

    Reads the notebook-level `tokenizer`, `weights_embedding` and
    `inverse_map`.

    Args:
        word: Word to compare against; must be a key of
            `tokenizer.word_index`.
        metric: Distance metric name passed to `scipy`'s `cdist`,
            e.g. 'cosine' or 'euclidean'.
        k: Number of words to print from the top (closest) and from the
            bottom (farthest) of the ranking. Defaults to 10.

    Raises:
        KeyError: If `word` is not in `tokenizer.word_index`.
    """

    # Get the token (i.e. integer ID) for the given word.
    token = tokenizer.word_index[word]

    # The embedding-weight-matrix is indexed by the word-tokens,
    # so this row is the embedding of the given word.
    embedding = weights_embedding[token]

    # Distance between this embedding and every row of the matrix.
    distances = cdist(weights_embedding, [embedding],
                      metric=metric).T[0]

    # Row indices (i.e. tokens) ordered from closest to farthest.
    sorted_index = np.argsort(distances)

    # Pair each word with its own distance *while* skipping row 0, which has
    # no word. Filtering the words and the distances separately would shift
    # one list against the other and print wrong distances whenever row 0
    # ranks among the printed entries.
    ranked = [(inverse_map[tok], distances[tok])
              for tok in sorted_index if tok != 0]

    # Helper-function for printing words and embedding-distances.
    def _print_words(pairs):
        for other_word, distance in pairs:
            print("{0:.3f} - {1}".format(distance, other_word))

    print("Distance from '{0}':".format(word))

    # Print the words with smallest embedding-distance.
    _print_words(ranked[:k])

    print("...")

    # Print the words with highest embedding-distance.
    # (`ranked[-0:]` would be the whole list, hence the guard.)
    _print_words(ranked[-k:] if k > 0 else [])

In [None]:
# Words ranked by cosine distance to the embedding of 'great'.
print_sorted_words('great', metric='cosine')

In [None]:
# Same ranking relative to 'worst'.
print_sorted_words('worst', metric='cosine')