# State of the Union - Part 2 - Generate Text

## Configuration

In [None]:
## Per-president presets. Each entry holds the model name, the saved-model
## version, and the MAX_SEQUENCE_LEN value obtained from the training.
## To switch presidents, change PRESIDENT below (no commenting/uncommenting).
MODEL_PRESETS = {
    'clinton': {'MODEL_NAME': 'sotu-clinton', 'MODEL_VERSION': '2', 'MAX_SEQUENCE_LEN': 284},
    'obama':   {'MODEL_NAME': 'sotu-obama',   'MODEL_VERSION': '1', 'MAX_SEQUENCE_LEN': 129},
    'trump':   {'MODEL_NAME': 'sotu-trump',   'MODEL_VERSION': '1', 'MAX_SEQUENCE_LEN': 157},
}

PRESIDENT = 'clinton'  # one of the keys of MODEL_PRESETS

# Fail early, with the valid choices listed, instead of a bare KeyError.
if PRESIDENT not in MODEL_PRESETS:
    raise ValueError("Unknown PRESIDENT {!r}; expected one of {}".format(
        PRESIDENT, sorted(MODEL_PRESETS)))

_preset = MODEL_PRESETS[PRESIDENT]
MODEL_NAME = _preset['MODEL_NAME']
MODEL_VERSION = _preset['MODEL_VERSION']
MAX_SEQUENCE_LEN = _preset['MAX_SEQUENCE_LEN']

In [None]:
## Derived file paths, assembled from MODEL_NAME / MODEL_VERSION
VOCAB_FILE = ''.join(['tokenizer-vocabulary/', MODEL_NAME, '-vocab.json'])
MODEL_FILE = ''.join(['models/', MODEL_NAME, '-model-', MODEL_VERSION, '.h5'])

In [3]:
try:
  # %tensorflow_version only exists in Colab.
  # Any exception raised by the magic is deliberately swallowed below,
  # so the cell carries on to the imports either way.
  %tensorflow_version 2.x
except Exception:
  pass

import tensorflow as tf
from tensorflow import keras
# Report which TensorFlow version was actually imported
print ('tensorflow version :', tf.__version__)
# Last expression of the cell: shows the physical devices TensorFlow reports
tf.config.experimental.list_physical_devices()

tensorflow version : 2.3.0


[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:XLA_CPU:0', device_type='XLA_CPU'),
 PhysicalDevice(name='/physical_device:XLA_GPU:0', device_type='XLA_GPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

## TF-GPU Debug
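The following block configures how TensorFlow allocates GPU memory and turns on device-placement logging, so you can confirm that TF is running on the GPU.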

In [4]:
## GPU tuning for TF.
## Safe to comment out when you are not using a GPU.

## ---- start Memory setting ----
## Let TF claim GPU memory as needed instead of allocating all of it at once.
## Without this the execution will fail with "failed to initialize algorithm" error

from tensorflow.compat.v1.keras.backend import set_session

config = tf.compat.v1.ConfigProto(
    # dynamically grow the memory used on the GPU
    gpu_options=tf.compat.v1.GPUOptions(allow_growth=True),
    # log device placement (on which device the operation ran)
    log_device_placement=True,
)
sess = tf.compat.v1.Session(config=config)
set_session(sess)
## ---- end Memory setting ----

Device mapping:
/job:localhost/replica:0/task:0/device:XLA_CPU:0 -> device: XLA_CPU device
/job:localhost/replica:0/task:0/device:XLA_GPU:0 -> device: XLA_GPU device
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: GeForce RTX 2070, pci bus id: 0000:01:00.0, compute capability: 7.5



## Step - Load Tokenizer from Saved Data

In [5]:
import json 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text import tokenizer_from_json

## Read the saved tokenizer definition, then rebuild the Tokenizer from it
with open(VOCAB_FILE) as f:
    data = json.load(f)
tokenizer = tokenizer_from_json(data)

## Forward lookup (word -> index) and its reverse (index -> word)
word2index = tokenizer.word_index
index2word = dict(zip(word2index.values(), word2index.keys()))

total_words = len(word2index) + 1
print ('tokenizer: num_uniq_words :', total_words)

tokenizer: num_uniq_words : 4526


In [6]:
## Basic info
from  collections import Counter
from pprint import pprint

def sample_from_dict(d, sample=10):
    """Return a new dict holding a random subset of the items of ``d``.

    Args:
        d: Dictionary to draw items from.
        sample: Maximum number of items to draw (default 10). When ``d``
            holds fewer items than this, all of its items are returned.

    Returns:
        dict: The randomly chosen keys mapped to their values in ``d``.

    Raises:
        ValueError: If ``sample`` is negative (raised by ``random.sample``).
    """
    import random

    # random.sample raises ValueError when asked for more items than the
    # population holds, so never request more than the dict contains.
    count = min(sample, len(d))
    keys = random.sample(list(d), count)
    return {k: d[k] for k in keys}

## Vocabulary size (word_index entries + 1) and a few random word -> index pairs
vocab_size = len(tokenizer.word_index) + 1
print ('total num words :', vocab_size)
print ('\nSome random word mappings : ')
random_mappings = sample_from_dict(tokenizer.word_index)
pprint (random_mappings)


## Ten most frequent words, taken from tokenizer.word_counts
counter = Counter(tokenizer.word_counts)
top_words = counter.most_common(10)
print ('\nTop-N words:')
pprint(top_words)

total num words : 4526

Some random word mappings : 
{'advance': 724,
 'every': 45,
 'explore': 1447,
 'hearts': 1406,
 'improvement': 2987,
 'matters': 1563,
 'occupational': 2868,
 'rather': 3134,
 'teams': 4310,
 'warn': 3905}

Top-N words:
[('the', 2539),
 ('to', 2231),
 ('and', 1761),
 ('of', 1295),
 ('we', 1224),
 ('a', 934),
 ('in', 922),
 ('our', 901),
 ('that', 717),
 ('i', 674)]


In [7]:
## Round-trip a single word through both lookup tables

# other words to try : america, congress, court, citizen
word = 'america'
idx = word2index[word]

forward_line = 'word2index["{}"] : {}'.format(word, idx)
print (forward_line)
reverse_line = 'index2word [{}] : {}'.format(idx, index2word[idx])
print (reverse_line)

word2index["america"] : 38
index2word [38] : america


## Step  - Load Model

In [8]:
from tensorflow.keras.models import load_model
import os

## Read the file size first, then load the model and report both
model_size_in_bytes = os.path.getsize(MODEL_FILE)
model = load_model(MODEL_FILE)

BYTES_PER_MB = 1024 * 1024
load_report = "Loaded model '{}',  size = {:,.1f} MB".format(
    MODEL_FILE, model_size_in_bytes / BYTES_PER_MB)
print (load_report)



Loaded model 'models/sotu-clinton-model-2.h5',  size = 14.2 MB


## Generate Text

In [9]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
np.set_printoptions(formatter={'float': '{: 0.3f}'.format})


## seed text
seed_text = "hello america"
## other seeds to try: "yes we can", "dear americans"

## number of words to append to the seed
next_words = 50


## Generate one word per iteration: tokenize the text so far, predict the
## next word index, append the matching word, and repeat.
text = seed_text
for i in range(next_words):
    print ('{} input text : {}'.format(i,text))
    token_list = tokenizer.texts_to_sequences([text])[0]
    print ('{} token_list: {}'.format(i, token_list))
    # Left-pad ('pre') the single sequence to length MAX_SEQUENCE_LEN-1
    token_list = pad_sequences([token_list], maxlen=MAX_SEQUENCE_LEN-1, padding='pre')

    prediction_softmax = model.predict(token_list, verbose=0)
    # Only one sequence was passed in, so only row 0 of the output matters;
    # take the index with the highest score in that row.
    predicted_idx = int(np.argmax(prediction_softmax[0]))

    print ('{} predicted_idx : {}'.format(i, predicted_idx))
    # Indices with no entry in index2word fall back to "<UNK>"
    output_word = index2word.get(predicted_idx, "<UNK>")
    print ('{} output_word : {}'.format(i, output_word))
    text += " " + output_word
    print ('{} output_text: {}'.format (i, text))
    print()


print('final competed text:\n', text)

0 input text : hello america
0 token_list: [1, 38]
0 predicted_idx : 6
0 output_word : we
0 output_text: hello america we

1 input text : hello america we
1 token_list: [1, 38, 6]
1 predicted_idx : 29
1 output_word : must
1 output_text: hello america we must

2 input text : hello america we must
2 token_list: [1, 38, 6, 29]
2 predicted_idx : 26
2 output_word : be
2 output_text: hello america we must be

3 input text : hello america we must be
3 token_list: [1, 38, 6, 29, 26]
3 predicted_idx : 7
3 output_word : a
3 output_text: hello america we must be a

4 input text : hello america we must be a
4 token_list: [1, 38, 6, 29, 26, 7]
4 predicted_idx : 57
4 output_word : world
4 output_text: hello america we must be a world

5 input text : hello america we must be a world
5 token_list: [1, 38, 6, 29, 26, 7, 57]
5 predicted_idx : 4
5 output_word : and
5 output_text: hello america we must be a world and

6 input text : hello america we must be a world and
6 token_list: [1, 38, 6, 29, 26, 7, 