Author: Yan Pan

This is the second part of the TF/Keras cheatsheets. It mainly collects recurrent models for language processing and time series prediction. For details on the data sources, check the Kaggle link above.

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from keras import layers
from keras.models import Sequential
from keras.preprocessing import image

# Pick the logging verbosity and the epoch budget together from one check of
# the kernel's connection-file path.
# NOTE(review): presumably a path containing 'runtime' identifies an interactive
# session (short run, progress bar) as opposed to a batch run — confirm.
opt_verbose, max_epochs = (
    (1, 2)
    if 'runtime' in get_ipython().config.IPKernelApp.connection_file
    else (2, 15)
)

Training is stopped once accuracy exceeds 97%, and the number of epochs is capped at `max_epochs` (15, or 2 when the kernel connection file path contains `runtime`).

In [None]:
class accCallback(tf.keras.callbacks.Callback):
    """Keras callback that ends training once training accuracy exceeds 97%."""

    def on_epoch_end(self, epoch, logs=None):
        """Inspect the finished epoch's metrics and request a stop above 97%.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Metric results for the epoch; may be None or may not
                contain an 'accuracy' entry.
        """
        accuracy = (logs or {}).get('accuracy')
        # Guard against a missing metric: comparing None with a float would
        # raise TypeError and abort training.
        if accuracy is not None and accuracy > 0.97:
            print("--> Callback: Reached 97% accuracy. Cheers and relax! <--")
            self.model.stop_training = True


# Shared instance, reused by the fit() calls that pass callbacks=[accCallbacks].
accCallbacks = accCallback()

# NLP: Sentiment Analysis

Use the IMDB review dataset, loaded fully into memory. The dataset is also available from `tensorflow_datasets`.

Keywords: NLP, recurrent neural network, RNN, tokenize, embedding, Long Short Term Memory, LSTM, Gated Recurrent Units, GRU

Reference: [Colab1](https://colab.research.google.com/github/lmoroney/dlaicourse/blob/master/TensorFlow%20In%20Practice/Course%203%20-%20NLP/Course%203%20-%20Week%203%20-%20Lesson%201c.ipynb) | [Colab2](https://colab.research.google.com/github/lmoroney/dlaicourse/blob/master/TensorFlow%20In%20Practice/Course%203%20-%20NLP/NLP%20Course%20-%20Week%203%20Exercise%20Answer.ipynb) | [Colab3](https://colab.research.google.com/github/lmoroney/dlaicourse/blob/master/TensorFlow%20In%20Practice/Course%203%20-%20NLP/Course%203%20-%20Week%203%20-%20Lesson%201b.ipynb)

In [None]:
# Load the 50k-review CSV; 'review' holds the text, 'sentiment' the class name.
imdb = pd.read_csv('../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
imdb_text = imdb['review']

# Binary target: 1 for 'positive', 0 for every other value.
imdb_label = imdb['sentiment'].map(lambda sentiment: int(sentiment == 'positive'))

# Sanity check: both columns must have the same number of rows.
len(imdb_text), len(imdb_label)

## Tokenizer and padding sequences

Padded sequences can be truncated to a smaller size using `maxlen`; by default the length of the longest sequence is used, which is usually unnecessary.

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Learn the word -> integer vocabulary from every review.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(imdb_text)
word_index = tokenizer.word_index

# One extra slot on top of the learned words; the Embedding layers below are
# sized with this value.
vocab_size = 1 + len(word_index)

# Encode each review as a list of word indices, then pad/truncate all of them
# to a fixed length of 1000 tokens.
imdb_sequences = tokenizer.texts_to_sequences(imdb_text)
imdb_padded = pad_sequences(imdb_sequences, maxlen=1000)

vocab_size

## Embedding, Conv, GRU or LSTM?

Over-fitting is very likely with LSTM (and in NLP generally). Remember that the Embedding input size needs +1.

In [None]:
# Sentiment classifier: trainable embedding -> 1D convolution -> bidirectional
# GRU -> single sigmoid unit for the positive/negative decision.
model1 = Sequential()
model1.add(layers.Embedding(vocab_size, 64))
model1.add(layers.Dropout(0.2))
model1.add(layers.Conv1D(128, 5, activation='relu'))
model1.add(layers.MaxPooling1D(pool_size=4))
# LSTM or GRU? This model seems to prefer GRU.
# Alternative recurrent layer: layers.LSTM(64)
model1.add(layers.Bidirectional(layers.GRU(64)))
model1.add(layers.Dense(1, activation='sigmoid'))

# Binary target, so binary cross-entropy; 'accuracy' is the metric the
# early-stopping callback reads.
model1.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train model1 on the padded reviews, holding out 20% of the data for
# validation; the callback may end training before max_epochs is reached.
hisfit1 = model1.fit(
    x=imdb_padded,
    y=imdb_label,
    validation_split=0.2,
    epochs=max_epochs,
    verbose=opt_verbose,
    callbacks=[accCallbacks],
)

In [None]:
# Two hand-written reviews (one negative, one positive) to try the model on.
mytext = [
    "The movie is really crappy. It lacks contents and the story is old-fashioned",
    "A reasonably attractive storyline. Scenes are nicely crafted and the ending is ok",
]

# Encode with the same tokenizer and pad to the width of the training matrix.
x = tokenizer.texts_to_sequences(mytext)
x = pad_sequences(x, maxlen=imdb_padded.shape[1])

model1.predict(x)

# Pretrained Embeddings - GloVe

Example of using [GloVe: Global Vectors for Word Representation](https://nlp.stanford.edu/projects/glove/) from Stanford. For simplicity and performance, the Kaggle copy is used as input.

> Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. GloVe: Global Vectors for Word Representation.

The tokenized and padded sequences are reused from the section above.

References: [Colab1](https://colab.research.google.com/github/lmoroney/dlaicourse/blob/master/TensorFlow%20In%20Practice/Course%203%20-%20NLP/Course%203%20-%20Week%203%20-%20Lesson%202d.ipynb#scrollTo=5NEpdhb8AxID)

## Processing GloVe txt data and mapping it to tokenized sequences

The tokenized and padded sequences are reused from the section above.

In [None]:
embedding_dim = 100  # depends on which GloVe file is used

# Parse the GloVe text file: every line is a token followed by the components
# of its vector, separated by whitespace.
embeddings_index = {}
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open('../input/glove6b/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Tolerate blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Row i holds the GloVe vector of the word whose tokenizer index is i; words
# that GloVe does not know keep an all-zero row.
embeddings_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector


f"the size should be equal to vocab {len(embeddings_matrix)} / {vocab_size}"

## Add non-trainable embedding layer

In [None]:
# Same task as the first model, but the embedding layer is initialised from
# the pre-trained GloVe matrix and frozen (trainable=False).
model2 = Sequential([
    layers.Embedding(
        vocab_size,
        embedding_dim,
        # Derive the sequence length from the padded data instead of repeating
        # the literal used when padding, so the two cannot drift apart.
        input_length=imdb_padded.shape[1],
        weights=[embeddings_matrix],
        trainable=False,
    ),
    layers.Dropout(0.2),
    layers.Conv1D(64, 5, activation='relu'),
    layers.MaxPooling1D(pool_size=4),
    layers.LSTM(64),
    layers.Dense(1, activation='sigmoid'),
])

model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train with 20% of the data held out for validation; the accuracy callback
# may end training before max_epochs is reached.
hisfit2 = model2.fit(
    imdb_padded,
    imdb_label,
    epochs=max_epochs,
    validation_split=0.2,
    callbacks=[accCallbacks],
    verbose=opt_verbose,
)

# NLP: predict next words

This is a toy example with very small training dataset. Blackpink Ice Cream.

Reference: [Colab1](https://colab.research.google.com/github/lmoroney/dlaicourse/blob/master/TensorFlow%20In%20Practice/Course%203%20-%20NLP/Course%203%20-%20Week%204%20-%20Lesson%201%20-%20Notebook.ipynb#scrollTo=Atey4zDdR0_C) | [Colab2](https://colab.research.google.com/github/lmoroney/dlaicourse/blob/master/TensorFlow%20In%20Practice/Course%203%20-%20NLP/Course%203%20-%20Week%204%20-%20Lesson%202%20-%20Notebook.ipynb#scrollTo=6Vc6PHgxa6Hm) | [Colab3](https://colab.research.google.com/github/lmoroney/dlaicourse/blob/master/TensorFlow%20In%20Practice/Course%203%20-%20NLP/NLP_Week4_Exercise_Shakespeare_Answer.ipynb)

In [None]:
ice_cream = "Come a little closer 'cause you looking thirsty\nI'ma make it better, sip it like a Slurpee\nSnow cone chilly\nGet it free like Willy\nIn the jeans like Billie\nYou be poppin' like a wheelie\nEven in the sun, you know I keep it icy\nYou could take a lick but it's too cold to bite me\nBrr, brr, frozen\nYou're the one been chosen\nPlay the part like Moses\nKeep it fresh like roses (oh)\nLook so good yeah, look so sweet (hey)\nLookin' good enough to eat\nColdest with the kiss, so he call me ice cream\nCatch me in the fridge, right where the ice be\nLook so good yeah, look so sweet (hey)\nBaby, you deserve a treat\nDiamonds on my wrist, so he call me ice cream\nYou can double dip 'cause I know you like me\nIce cream, chillin', chillin'\nIce cream, chillin'\nIce cream, chillin', chillin'\nIce cream, chillin'\nI know that my heart can be so cold\nBut I'm sweet for you, come put me in a cone\nYou're the only touch, yeah, that get me meltin'\nHe's my favorite flavor, always gonna pick him\nYou're the cherry piece, just stay on top of me, so\nI can't see nobody else for me, no\nGet it, flip it, scoop it\nDo it like that, ah yeah ah yeah\nLike it, love it, lick it\nDo it like la la la, oh yeah\nLook so good, yeah, look so sweet (hey)\nLookin' good enough to eat\nColdest with the kiss, so he call me ice cream\nCatch me in the fridge, right where the ice be\nLook so good, yeah, look so sweet (hey)\nBaby, you deserve a treat\nDiamonds on my wrist, so he call me ice cream\nYou can double dip 'cause I know you like me\nIce cream, chillin', chillin'\nIce cream, chillin'\nIce cream, chillin', chillin'\nIce cream, chillin'\nIce cream, chillin', chillin'\nIce cream, chillin'\nIce cream, chillin', chillin'\nIce cream\nKeep it movin' like my lease up\nThink you fly, boy, where your visa?\nMona Lisa kinda Lisa\nNeeds an ice cream man that treats her\nKeep it movin' like my lease up\nThink you fly, boy, where your visa?\nMona Lisa kinda Lisa\nNeeds an ice cream man that 
treats her (hey)\nNa na na na na\nNa na na na na (hey)\nIce on my wrist, yeah, I like it like this\nGet the bag with the cream\nIf you know what I mean\nIce cream, ice cream\nIce cream, chillin'\nNa na na na na\nNa na na na na (hey)\nIce on my wrist, yeah, I like it like this\nAnd I'm nice with the cream\nIf you know what I mean\nIce cream, ice cream\nIce cream"
# Lower-case the lyrics and split on newlines: from here on `ice_cream` is a
# list with one string per lyric line rather than a single string.
ice_cream = ice_cream.lower().split('\n')

In [None]:
# A fresh tokenizer fitted on the lyric lines only. Note that this rebinds
# `tokenizer` and `vocab_size`, replacing the IMDB ones from the cells above.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(ice_cream)

# Encode every lyric line and left-pad them all to a common length.
ice_cream_seq = tokenizer.texts_to_sequences(ice_cream)
ice_cream_pad = pad_sequences(ice_cream_seq, padding='pre')

vocab_size = 1 + len(tokenizer.word_index)  # added one!

f"Vocab size={vocab_size}; Padded sequence maxlen={len(ice_cream_pad[0])}"

## Determining features and labels

In [None]:
# Features: every token of a padded line except the last one.
xs = np.asarray(ice_cream_pad[:, :-1])
# Label: the final token of each padded line.
labels = ice_cream_pad[:, -1]
# One-hot encode the labels over the whole vocabulary for the softmax output.
ys = tf.keras.utils.to_categorical(labels, num_classes=vocab_size)

In [None]:
# Next-word model: embedding -> bidirectional LSTM -> softmax over the vocabulary.
model3 = Sequential()
model3.add(layers.Embedding(vocab_size, 64))
model3.add(layers.Bidirectional(layers.LSTM(20)))
model3.add(layers.Dense(vocab_size, activation='softmax'))

# One-hot targets, hence categorical cross-entropy.
model3.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# output is too large, use plot instead of verbose print
hisfit3 = model3.fit(xs, ys, epochs=500, verbose=0)
plt.plot(hisfit3.history['accuracy'])


## Serving Prediction

In [None]:
seed_text = "This is Blackpink Ice Cream"

# The model was trained on inputs one token shorter than the padded lyric lines.
context_len = ice_cream_pad.shape[1] - 1

# Greedy generation: append the most probable next word 20 times, feeding the
# growing text back in on every step.
for _ in range(20):
    encoded = tokenizer.texts_to_sequences([seed_text])[0]
    model_input = pad_sequences([encoded], maxlen=context_len)
    probabilities = model3.predict(model_input, verbose=0)
    best_index = np.argmax(probabilities)
    next_words = tokenizer.sequences_to_texts([[best_index]])
    seed_text = seed_text + " " + next_words[0]
print(seed_text)


# Time Series

Simulated data [0,1,2,3,...,9]: six sliding windows of five consecutive values [n, n+1, ..., n+4] (n = 0, ..., 5) are taken; the first 4 values of each window are the features and the last one is the label. Finally, the windows are shuffled and batched in pairs.

In [None]:
# Build a toy windowed dataset from the integers 0..9.
dataset = (
    tf.data.Dataset.range(10)
    # Sliding windows of 5 consecutive values, moving one step at a time;
    # incomplete windows at the end are dropped.
    .window(5, shift=1, drop_remainder=True)
    # Each window is itself a dataset; batch(5) turns it into a single tensor.
    .flat_map(lambda window: window.batch(5))
    # First four values are the features, the last value is the label.
    .map(lambda window: (window[:-1], window[-1:]))
    .shuffle(buffer_size=10)
    .batch(2)
    .prefetch(1)
)

# A plain loop instead of a list comprehension: print() is called only for its
# side effect, so there is no point building (and echoing) a list of None.
for x, y in dataset:
    print(f"x={x.numpy()}\ny={y.numpy()}")