In [None]:
import tensorflow as tf
from tensorflow import keras
import numpy as np

# Download the Shakespeare corpus (keras.utils.get_file returns the local path).
shakespeare_url = "https://homl.info/shakespeare"
filepath = keras.utils.get_file("shakespeare.txt", shakespeare_url)
with open(filepath) as f:
    shakespeare_text = f.read()

# char_level=True: every distinct character (not every word) gets its own id.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts([shakespeare_text])

# Quick round trip to eyeball the encoding (shown only when run interactively).
tokenizer.texts_to_sequences(["First"])
tokenizer.sequences_to_texts([[20, 6, 9, 8, 3]])
max_id = len(tokenizer.word_index)  # number of distinct characters

# Encode the full text so each char is mapped to an id. The "- 1" shifts the
# ids down by one so that they can be used directly as one-hot indices with
# depth=max_id further down.
[encoded] = np.array(tokenizer.texts_to_sequences([shakespeare_text])) - 1

# Total number of characters. tokenizer.document_count cannot be used for this:
# fit_on_texts() received a list holding ONE text, so document_count is 1 and
# the 90% training split below would be empty.
dataset_size = len(encoded)

train_size = dataset_size * 90 // 100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

# Turn the flat stream of character ids into shuffled (input, target) batches.

n_steps = 100
window_length = n_steps + 1  # one extra char so the targets can be the inputs shifted by one
batch_size = 32

dataset = (
    dataset
    # overlapping windows, each starting one character after the previous one
    .window(window_length, shift=1, drop_remainder=True)
    # every window is itself a small dataset; batching it by its full length
    # turns it into a single tensor
    .flat_map(lambda chars: chars.batch(window_length))
    .shuffle(10000)
    .batch(batch_size)
    # inputs: all but the last char; targets: all but the first char
    .map(lambda chunk: (chunk[:, :-1], chunk[:, 1:]))
    # one-hot encode the inputs only; the targets stay as integer ids
    .map(lambda inputs, targets: (tf.one_hot(inputs, depth=max_id), targets))
    .prefetch(1)
)

# Char-RNN: two stacked GRU layers followed by a per-time-step softmax over
# the max_id possible characters.
model = keras.models.Sequential()
model.add(
    keras.layers.GRU(
        128,
        return_sequences=True,
        input_shape=[None, max_id],
        dropout=0.2,
        recurrent_dropout=0.2,
    )
)
model.add(
    keras.layers.GRU(
        128,
        return_sequences=True,
        dropout=0.2,
        recurrent_dropout=0.2,
    )
)
model.add(
    keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation="softmax"))
)

# Targets are integer character ids, hence the sparse loss.
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
history = model.fit(dataset, epochs=20)

In [None]:
def preprocess(texts):
    """One-hot encode a list of strings with the character tokenizer.

    The tokenizer ids are shifted down by one before encoding, matching the
    way the training data was encoded.

    NOTE(review): a later cell defines another function that is also called
    `preprocess` (for the IMDB pipeline); once that cell has run, this one is
    shadowed.
    """
    char_ids = np.array(tokenizer.texts_to_sequences(texts))
    return tf.one_hot(char_ids - 1, max_id)

# Predict the character that follows "How are yo".
x_new = preprocess(["How are yo"])
# Sequential.predict_classes() was removed in TF 2.6; taking the argmax over
# the class axis of predict() is what it did for a softmax output and works on
# every TF 2.x version.
y_pred = np.argmax(model.predict(x_new), axis=-1)
# "+ 1" undoes the id shift applied in preprocess(); [0][-1] picks the last
# character of the first (only) sequence.
tokenizer.sequences_to_texts(y_pred + 1)[0][-1]

def next_char(text, temperature=1):
    """Sample the next character for `text` from the model's predictions.

    The log-probabilities of the last time step are divided by `temperature`
    before sampling, so small values make the draw close to greedy and large
    values flatten the distribution.

    Returns the sampled character as a string.
    """
    encoded_text = preprocess([text])
    # Slicing with -1: (not -1) keeps a leading axis of size one, so the
    # result stays 2-D as tf.random.categorical requires.
    last_step_proba = model.predict(encoded_text)[0, -1:, :]
    scaled_logits = tf.math.log(last_step_proba) / temperature
    sampled = tf.random.categorical(scaled_logits, num_samples=1)
    # "+ 1" maps the zero-based class index back to a tokenizer id.
    return tokenizer.sequences_to_texts((sampled + 1).numpy())[0]

def complete_text(text, n_chars=50, temperature=1):
    """Extend `text` by `n_chars` characters, one sampled character at a time.

    Each new character is drawn by next_char() from the text generated so far
    and appended to it; the extended text is returned.
    """
    completed = text
    for _ in range(n_chars):
        completed = completed + next_char(completed, temperature)
    return completed

# Same generator at three temperatures to compare how adventurous the samples get.
for seed_text, temperature in (("t", 0.2), ("w", 1), ("w", 2)):
    print(complete_text(seed_text, temperature=temperature))

In [None]:
# The models above were stateless: they do not keep their state after
# processing a batch. A stateful RNN carries the state over to the next batch,
# so the batches must contain sequential, non-overlapping sequences.

# A stateful layer is built for one fixed batch size, so the dataset and the
# model's batch_input_shape must use the same value. The notebook-wide
# batch_size (32) does not apply here: each batch holds a single window.
stateful_batch_size = 1

dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])
# shift=n_steps (it was 1 for the stateless model): consecutive windows share
# only their boundary character, so the inputs do not overlap.
dataset = dataset.window(window_length, shift=n_steps, drop_remainder=True)
dataset = dataset.flat_map(lambda window: window.batch(window_length))
dataset = dataset.batch(stateful_batch_size)
# Same (inputs, targets) split and one-hot encoding as the stateless pipeline;
# without these steps fit() would receive bare integer windows and no targets.
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))
dataset = dataset.map(
    lambda x_batch, y_batch: (tf.one_hot(x_batch, depth=max_id), y_batch))
dataset = dataset.prefetch(1)

model = keras.models.Sequential([
    keras.layers.GRU(128,
                     return_sequences=True,
                     stateful=True,
                     dropout=0.2,
                     recurrent_dropout=0.2,
                     batch_input_shape=[stateful_batch_size, None, max_id]
                    ),
    keras.layers.GRU(128,
                     return_sequences=True,
                     stateful=True,
                     dropout=0.2,
                     recurrent_dropout=0.2
                    ),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation="softmax"))
])


class ResetStatesCallback(keras.callbacks.Callback):
    """Reset the RNN states at the start of every epoch.

    Each epoch goes back to the beginning of the text, so the state left over
    from the end of the previous epoch must not be carried across.
    """

    def on_epoch_begin(self, epoch, logs=None):
        self.model.reset_states()


model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
model.fit(dataset, epochs=50, callbacks=[ResetStatesCallback()])

In [None]:
# sentiment analysis on the IMDB reviews

# Loading the data. load_data() returns the (train, test) tuples;
# get_word_index() only returns the word -> id dictionary and cannot be
# unpacked this way.
(x_train, y_train), (x_test, y_test) = keras.datasets.imdb.load_data()

# Visualizing the words: rebuild an id -> word mapping to decode a review.
word_index = keras.datasets.imdb.get_word_index()
# With load_data()'s defaults the word ids are offset by 3, and ids 0, 1, 2
# stand for padding, start-of-sequence and unknown words respectively.
id_to_word = {id_ + 3: word for word, id_ in word_index.items()}
for id_, token in enumerate(("<pad>", "<sos>", "<unk>")):
    id_to_word[id_] = token
# First ten tokens of the first training review, as text.
" ".join([id_to_word[id_] for id_ in x_train[0][:10]])

In [None]:
import tensorflow_datasets as tfds

# Load the IMDB reviews through TensorFlow Datasets, together with the
# dataset metadata.
datasets, info = tfds.load(
    "imdb_reviews",
    with_info=True,
    as_supervised=True,
)
# NOTE(review): this rebinds train_size, which earlier cells use for the
# character-level split.
train_size = info.splits["train"].num_examples

# formatting the data

def preprocess(x_batch, y_batch):
    """Clean and tokenize a batch of review strings; labels pass through.

    Steps: keep the first 300 units of each string (tf.strings.substr),
    replace <br> tags and then every non-letter character with a space, and
    split on whitespace. The ragged result is padded with b"<pad>" into a
    dense tensor.

    NOTE(review): this redefines the `preprocess(texts)` helper from the
    char-RNN cell; once this cell has run, the earlier next_char() will pick
    up this version instead.
    """
    truncated = tf.strings.substr(x_batch, 0, 300)
    without_breaks = tf.strings.regex_replace(truncated, b"<br\\s*/?>", b" ")
    letters_only = tf.strings.regex_replace(without_breaks, b"[^a-zA-Z]", b" ")
    tokens = tf.strings.split(letters_only)
    return tokens.to_tensor(default_value=b"<pad>"), y_batch

from collections import Counter

# Count every token over the whole (preprocessed) training set.

vocabulary = Counter()
for x_batch, y_batch in datasets["train"].batch(32).map(preprocess):
    vocabulary.update(word for review in x_batch for word in review.numpy())

# Keep only the vocab_size most frequent tokens, most frequent first.

vocab_size = 10000
truncated_vocabulary = [word for word, _ in vocabulary.most_common(vocab_size)]

# Lookup table mapping each vocabulary word to its rank (0 = most frequent).
# Words outside the vocabulary are hashed into one of num_oov_buckets extra ids.

num_oov_buckets = 1000
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(keys=words, values=word_ids)
table = tf.lookup.StaticVocabularyTable(
    initializer=vocab_init,
    num_oov_buckets=num_oov_buckets,
)

def encode_words(x_batch, y_batch):
    """Replace every word in `x_batch` by its id from the lookup table.

    The labels in `y_batch` are returned unchanged.
    """
    encoded_batch = table.lookup(x_batch)
    return encoded_batch, y_batch

# Final training pipeline: batch -> clean/tokenize -> word ids -> prefetch.
train_set = (
    datasets["train"]
    .batch(32)
    .map(preprocess)
    .map(encode_words)
    .prefetch(1)
)

embed_size = 128
# mask_zero=True makes the Embedding layer emit a mask that hides id 0, and
# Keras passes that mask on to the following layers automatically. It is an
# Embedding-only argument: GRU and Dense do not accept it.
# NOTE(review): this relies on "<pad>" having id 0, i.e. being the most
# frequent token in the vocabulary - verify against truncated_vocabulary[0].
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                           mask_zero=True, input_shape=[None]),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.GRU(128),
    keras.layers.Dense(1, activation="sigmoid")
])

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
history = model.fit(train_set, epochs=5)

In [None]:
# Masking by hand with the functional API: compute the mask explicitly and
# pass it to each recurrent layer instead of relying on mask_zero.

K = keras.backend
inputs = keras.layers.Input(shape=[None])
# True wherever the token id is non-zero
mask = keras.layers.Lambda(lambda token_ids: K.not_equal(token_ids, 0))(inputs)
z = keras.layers.Embedding(
    input_dim=vocab_size + num_oov_buckets,
    output_dim=embed_size,
)(inputs)
z = keras.layers.GRU(units=128, return_sequences=True)(z, mask=mask)
z = keras.layers.GRU(units=128)(z, mask=mask)
outputs = keras.layers.Dense(units=1, activation="sigmoid")(z)
model = keras.Model(inputs=[inputs], outputs=[outputs])

In [None]:
# pre trained model, can cache by setting environment variable "TFHUB_CACHE_DIR"

import tensorflow_hub as hub

# Pretrained sentence encoder from TF Hub: it takes raw strings
# (dtype=tf.string, scalar input shape) and outputs one vector of size 50 per
# string, followed by a small dense classifier.
model = keras.Sequential()
model.add(
    hub.KerasLayer(
        "https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1",
        dtype=tf.string,
        input_shape=[],
        output_shape=[50],
    )
)
model.add(keras.layers.Dense(128, activation="relu"))
model.add(keras.layers.Dense(1, activation="sigmoid"))

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])


# Load the IMDB reviews; the hub layer consumes the raw text, so the only
# preparation is batching and prefetching.

datasets, info = tfds.load(
    "imdb_reviews",
    with_info=True,
    as_supervised=True,
)
train_size = info.splits["train"].num_examples
batch_size = 32
train_set = (
    datasets["train"]
    .batch(batch_size)
    .prefetch(1)
)
history = model.fit(train_set, epochs=5)

In [None]:
# machine translation with encoder-decoder

import tensorflow_addons as tfa

# Three inputs: source token ids, target token ids fed to the decoder, and the
# length of each target sequence.
encoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
decoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
sequence_lengths = keras.layers.Input(shape=[], dtype=np.int32)

# One embedding layer shared by the encoder and the decoder.
embeddings = keras.layers.Embedding(vocab_size, embed_size)
encoder_embeddings = embeddings(encoder_inputs)
decoder_embeddings = embeddings(decoder_inputs)

# return_state=True also returns the final hidden and cell states, which are
# used to initialize the decoder.
encoder = keras.layers.LSTM(512, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_embeddings)
encoder_state = [state_h, state_c]

# The class is TrainingSampler; tfa.seq2seq.sampler has no "TrainingSample".
sampler = tfa.seq2seq.sampler.TrainingSampler()

decoder_cell = keras.layers.LSTMCell(512)
output_layer = keras.layers.Dense(vocab_size)
decoder = tfa.seq2seq.basic_decoder.BasicDecoder(decoder_cell, sampler, output_layer=output_layer)
final_outputs, final_state, final_sequence_lengths = decoder(
    decoder_embeddings,
    initial_state=encoder_state,
    sequence_length=sequence_lengths
)
# output_layer has no activation, so rnn_output holds logits; softmax turns
# them into per-token probabilities.
y_proba = tf.nn.softmax(final_outputs.rnn_output)

model = keras.Model(inputs=[encoder_inputs, decoder_inputs, sequence_lengths], outputs=[y_proba])

In [None]:
# Bidirectional RNN: wraps a recurrent layer so the sequence is processed both
# forwards and backwards. Add the wrapped layer to a model like any other layer.

keras.layers.Bidirectional(
    layer=keras.layers.GRU(units=10, return_sequences=True)
)

In [None]:
# Beam search: keep the beam_width most promising partial sequences at every
# step instead of only the single best one.
# Substitute this decoder for the BasicDecoder used in the encoder-decoder above.

beam_width = 10
# beam_search_decoder is a module; the decoder class inside it is BeamSearchDecoder.
decoder = tfa.seq2seq.beam_search_decoder.BeamSearchDecoder(
    cell=decoder_cell,
    beam_width=beam_width,
    output_layer=output_layer
)
# Repeat the encoder state beam_width times so that every beam starts from it.
decoder_initial_state = tfa.seq2seq.beam_search_decoder.tile_batch(
    encoder_state,
    multiplier=beam_width
)
# NOTE(review): embedding_decoder, start_tokens and end_token are not defined
# anywhere in the visible notebook; they must be provided before this runs.
outputs, _, _ = decoder(
    embedding_decoder,
    start_tokens=start_tokens,  # token id(s) that signal the start of a sentence
    end_token=end_token,  # token id that marks the end of a sentence
    initial_state=decoder_initial_state
)

In [None]:
# Adding Luong attention to an encoder-decoder.
# Useful for long sequences, so the model doesn't forget important tokens.

# NOTE(review): units, n_units and encoder_sequence_length are not defined
# anywhere in the visible notebook; they must be set before this cell runs.
# NOTE(review): encoder_state (the encoder's final [state_h, state_c]) is
# passed as the attention memory here - confirm whether the encoder's
# per-step outputs were intended instead.
attention_mechanism = tfa.seq2seq.attention_wrapper.LuongAttention(
    units,
    encoder_state,
    memory_sequence_length=encoder_sequence_length
)
# The variable above was previously spelled "attention_mechanisms" while this
# call read "attention_mechanism", which raised a NameError.
attention_decoder_cell = tfa.seq2seq.attention_wrapper.AttentionWrapper(
    decoder_cell,
    attention_mechanism,
    attention_layer_size=n_units
)

In [None]:
# positional encoding

class PositionalEncoding(keras.layers.Layer):
    """Add fixed sinusoidal positional encodings to the inputs.

    A table of shape (1, max_steps, max_dims) is precomputed once: even
    feature indices hold sines, odd ones cosines. At call time the table is
    cropped to the last two dimensions of the inputs and added to them.

    Args:
        max_steps: number of positions to precompute.
        max_dims: number of features to precompute; an odd value is rounded
            up to the next even number so the sine and cosine halves have
            equal size.
        dtype: dtype of the layer and of the stored table.
        **kwargs: forwarded to keras.layers.Layer.
    """

    def __init__(self, max_steps, max_dims, dtype=tf.float32, **kwargs):
        super().__init__(dtype=dtype, **kwargs)
        if max_dims % 2 == 1:
            max_dims += 1
        # p: position index, i: index of the (sin, cos) pair; both have shape
        # (max_dims // 2, max_steps), hence the transposes below.
        p, i = np.meshgrid(np.arange(max_steps), np.arange(max_dims // 2))
        angles = p / 10000**(2 * i / max_dims)
        pos_emb = np.empty((1, max_steps, max_dims))
        pos_emb[0, :, ::2] = np.sin(angles).T
        pos_emb[0, :, 1::2] = np.cos(angles).T
        self.positional_embedding = tf.constant(pos_emb.astype(self.dtype))

    def call(self, inputs):
        shape = tf.shape(inputs)
        # Crop the table to the inputs' own (steps, dims) before adding; the
        # leading axis of size one broadcasts over the batch.
        return inputs + self.positional_embedding[:, :shape[-2], :shape[-1]]
    
# Creating the first layers of the transformer/attention architecture:
# a shared token embedding followed by the positional encoding.

embed_size = 512
max_steps = 500
vocab_size = 10000
encoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
decoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
# The layer class is Embedding (singular); keras.layers has no "Embeddings".
embeddings = keras.layers.Embedding(vocab_size, embed_size)
encoder_embeddings = embeddings(encoder_inputs)
decoder_embeddings = embeddings(decoder_inputs)
# The same positional-encoding layer instance is applied to both branches.
positional_encoding = PositionalEncoding(max_steps, max_dims=embed_size)
encoder_in = positional_encoding(encoder_embeddings)
decoder_in = positional_encoding(decoder_embeddings)

In [None]:
# Using Keras Attention layers in place of full multi-head attention blocks.

# Encoder: six self-attention layers (query and value are both z).
z = encoder_in
for _ in range(6):
    z = keras.layers.Attention(use_scale=True)([z, z])

encoder_outputs = z

# Decoder: six blocks of self-attention followed by attention over the
# encoder outputs.
z = decoder_in
for _ in range(6):
    # The argument is "causal" (it was misspelled "casual"): each position may
    # only attend to itself and earlier positions.
    # NOTE(review): recent Keras releases replace this constructor argument
    # with use_causal_mask=True passed at call time - confirm against the
    # installed version.
    z = keras.layers.Attention(use_scale=True, causal=True)([z, z])
    z = keras.layers.Attention(use_scale=True)([z, encoder_outputs])

outputs = keras.layers.TimeDistributed(keras.layers.Dense(vocab_size, activation="softmax"))(z)