## Using TensorFlow for Natural Language Processing

In [27]:
# system libraries
import os

In [28]:
# data preprocessing libraries
import pandas as pd
import numpy as np

In [29]:
# data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

In [30]:
%matplotlib inline
sns.set_style('darkgrid')

In [31]:
#tensorflow library
import tensorflow as tf

### Step 1: Reading the text data

In [32]:
# reading the data
DATA_DIR = os.path.join(os.curdir, '..', 'data')
text_data_path = os.path.join(DATA_DIR, 'tensorflow', 'shakespeare.txt')
# Read through a context manager so the file handle is closed as soon as the
# contents are loaded instead of being left open for the garbage collector.
with open(text_data_path, 'r') as text_file:
    text = text_file.read()

In [33]:
print(text[:550])


                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,
  His tender heir might bear his memory:
  But thou contracted to thine own bright eyes,
  Feed'st thy light's flame with self-substantial fuel,
  Making a famine where abundance lies,
  Thy self thy foe, to thy sweet self too cruel:
  Thou that art now the world's fresh ornament,
  And only herald to the gaudy spring,
  Within thine own bud buriest thy content,
  And tender churl mak'st w


In [34]:
# find all unique characters in the text
total_chars = sorted(set(text))
" ".join(total_chars)

'\n   ! " & \' ( ) , - . 0 1 2 3 4 5 6 7 8 9 : ; < > ? A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ ] _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z | }'

In [35]:
len(total_chars) # need to take care of this number while designing the Dense layer

84

### Step 2: Text processing and Vectorization

In [37]:
# Map every character to its position in the sorted vocabulary list.
char_to_index = dict(zip(total_chars, range(len(total_chars))))

In [38]:
char_to_index

{'\n': 0,
 ' ': 1,
 '!': 2,
 '"': 3,
 '&': 4,
 "'": 5,
 '(': 6,
 ')': 7,
 ',': 8,
 '-': 9,
 '.': 10,
 '0': 11,
 '1': 12,
 '2': 13,
 '3': 14,
 '4': 15,
 '5': 16,
 '6': 17,
 '7': 18,
 '8': 19,
 '9': 20,
 ':': 21,
 ';': 22,
 '<': 23,
 '>': 24,
 '?': 25,
 'A': 26,
 'B': 27,
 'C': 28,
 'D': 29,
 'E': 30,
 'F': 31,
 'G': 32,
 'H': 33,
 'I': 34,
 'J': 35,
 'K': 36,
 'L': 37,
 'M': 38,
 'N': 39,
 'O': 40,
 'P': 41,
 'Q': 42,
 'R': 43,
 'S': 44,
 'T': 45,
 'U': 46,
 'V': 47,
 'W': 48,
 'X': 49,
 'Y': 50,
 'Z': 51,
 '[': 52,
 ']': 53,
 '_': 54,
 '`': 55,
 'a': 56,
 'b': 57,
 'c': 58,
 'd': 59,
 'e': 60,
 'f': 61,
 'g': 62,
 'h': 63,
 'i': 64,
 'j': 65,
 'k': 66,
 'l': 67,
 'm': 68,
 'n': 69,
 'o': 70,
 'p': 71,
 'q': 72,
 'r': 73,
 's': 74,
 't': 75,
 'u': 76,
 'v': 77,
 'w': 78,
 'x': 79,
 'y': 80,
 'z': 81,
 '|': 82,
 '}': 83}

In [39]:
char_to_index['H']

33

In [40]:
index_to_char = np.array(total_chars)

In [41]:
index_to_char[33]

'H'

In [42]:
encoded_text = np.array([char_to_index[c] for c in text])

In [43]:
encoded_text

array([ 0,  1,  1, ..., 30, 39, 29])

In [44]:
len(encoded_text)

5445609

In [45]:
sample = text[:500]

In [46]:
sample

"\n                     1\n  From fairest creatures we desire increase,\n  That thereby beauty's rose might never die,\n  But as the riper should by time decease,\n  His tender heir might bear his memory:\n  But thou contracted to thine own bright eyes,\n  Feed'st thy light's flame with self-substantial fuel,\n  Making a famine where abundance lies,\n  Thy self thy foe, to thy sweet self too cruel:\n  Thou that art now the world's fresh ornament,\n  And only herald to the gaudy spring,\n  Within thine own bu"

In [47]:
encoded_text[:500]

array([ 0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1, 12,  0,  1,  1, 31, 73, 70, 68,  1, 61, 56, 64,
       73, 60, 74, 75,  1, 58, 73, 60, 56, 75, 76, 73, 60, 74,  1, 78, 60,
        1, 59, 60, 74, 64, 73, 60,  1, 64, 69, 58, 73, 60, 56, 74, 60,  8,
        0,  1,  1, 45, 63, 56, 75,  1, 75, 63, 60, 73, 60, 57, 80,  1, 57,
       60, 56, 76, 75, 80,  5, 74,  1, 73, 70, 74, 60,  1, 68, 64, 62, 63,
       75,  1, 69, 60, 77, 60, 73,  1, 59, 64, 60,  8,  0,  1,  1, 27, 76,
       75,  1, 56, 74,  1, 75, 63, 60,  1, 73, 64, 71, 60, 73,  1, 74, 63,
       70, 76, 67, 59,  1, 57, 80,  1, 75, 64, 68, 60,  1, 59, 60, 58, 60,
       56, 74, 60,  8,  0,  1,  1, 33, 64, 74,  1, 75, 60, 69, 59, 60, 73,
        1, 63, 60, 64, 73,  1, 68, 64, 62, 63, 75,  1, 57, 60, 56, 73,  1,
       63, 64, 74,  1, 68, 60, 68, 70, 73, 80, 21,  0,  1,  1, 27, 76, 75,
        1, 75, 63, 70, 76,  1, 58, 70, 69, 75, 73, 56, 58, 75, 60, 59,  1,
       75, 70,  1, 75, 63

### Step 3: Creating batches

1. Understand text sequences
2. Use tensorflow datasets to generate batches
3. Shuffle batches

In [48]:
seq_len = 120

In [50]:
total_num_seq = len(text) // (seq_len+1)

In [51]:
total_num_seq

45005

In [52]:
char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text)

In [53]:
type(char_dataset)

tensorflow.python.data.ops.dataset_ops.TensorSliceDataset

In [57]:
# for item in char_dataset.take(500):
#     print(index_to_char[item.numpy()])

In [58]:
sequences = char_dataset.batch(seq_len+1, drop_remainder=True)

In [60]:
def create_seq_targets(seq):
    """Split one chunk into an (input, target) pair shifted by one position.

    For "Hello my name" the input is "Hello my nam" (last element dropped)
    and the target is "ello my name" (first element dropped), so target[i]
    is the element that follows input[i] in the original chunk.
    """
    return seq[:-1], seq[1:]

In [61]:
dataset = sequences.map(create_seq_targets)

In [66]:
# Show the first (input, target) pair both as integer ids and as decoded text.
for input_txt, target_txt in dataset.take(1):
    input_ids = input_txt.numpy()
    target_ids = target_txt.numpy()
    print(input_ids)
    print("".join(index_to_char[input_ids]))
    print('\n')
    print(target_ids)
    print("".join(index_to_char[target_ids]))

[ 0  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 12  0
  1  1 31 73 70 68  1 61 56 64 73 60 74 75  1 58 73 60 56 75 76 73 60 74
  1 78 60  1 59 60 74 64 73 60  1 64 69 58 73 60 56 74 60  8  0  1  1 45
 63 56 75  1 75 63 60 73 60 57 80  1 57 60 56 76 75 80  5 74  1 73 70 74
 60  1 68 64 62 63 75  1 69 60 77 60 73  1 59 64 60  8  0  1  1 27 76 75]

                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But


[ 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 12  0  1
  1 31 73 70 68  1 61 56 64 73 60 74 75  1 58 73 60 56 75 76 73 60 74  1
 78 60  1 59 60 74 64 73 60  1 64 69 58 73 60 56 74 60  8  0  1  1 45 63
 56 75  1 75 63 60 73 60 57 80  1 57 60 56 76 75 80  5 74  1 73 70 74 60
  1 68 64 62 63 75  1 69 60 77 60 73  1 59 64 60  8  0  1  1 27 76 75  1]
                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But 


In [67]:
batch_size = 128

In [69]:
buffer_size = 10000
# Build the training pipeline from `sequences` rather than from the current
# value of `dataset`, so re-running this cell does not shuffle and batch an
# already-batched dataset.
dataset = (
    sequences
    .map(create_seq_targets)
    .shuffle(buffer_size)
    .batch(batch_size, drop_remainder=True)
)

In [70]:
dataset

<BatchDataset shapes: ((128, 120), (128, 120)), types: (tf.int64, tf.int64)>

### Step 4: Creating Model

1. Setup loss function
2. Create Model
    * Embedding
    * GRU
    * Dense

In [77]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense

In [71]:
vocab_size = len(total_chars)

In [72]:
vocab_size

84

In [73]:
embed_dim = 64

In [74]:
rnn_neurons = 1026

In [75]:
from tensorflow.keras.losses import sparse_categorical_crossentropy

In [76]:
def sparse_cat_loss(y_true, y_pred):
    """Sparse categorical cross-entropy computed on raw logits.

    Thin wrapper that pins from_logits=True while keeping the two-argument
    (y_true, y_pred) signature.
    """
    loss = sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
    return loss

In [84]:
def create_model(vocab_size, embed_dim, rnn_neurons, batch_size):
    """Build and compile the Embedding -> GRU -> Dense character model.

    vocab_size  -- number of distinct characters; used as the embedding input
                   size and as the width of the final Dense layer.
    embed_dim   -- size of each embedding vector.
    rnn_neurons -- number of GRU units.
    batch_size  -- fixed batch dimension; the sequence length is left as None.

    Returns the compiled Sequential model (Adam optimizer, sparse_cat_loss).
    """
    layers = [
        # The batch dimension is fixed here; the GRU below is stateful.
        Embedding(vocab_size, embed_dim, batch_input_shape=[batch_size, None]),
        GRU(
            rnn_neurons,
            return_sequences=True,  # one output per timestep
            stateful=True,
            recurrent_initializer='glorot_uniform',
        ),
        # No activation is passed, so this layer emits one raw score per
        # vocabulary entry; sparse_cat_loss is called with from_logits=True.
        Dense(vocab_size),
    ]
    model = Sequential(layers)
    model.compile(optimizer='adam', loss=sparse_cat_loss)
    return model

In [85]:
model = create_model(vocab_size=vocab_size,
                    embed_dim=embed_dim,
                    rnn_neurons=rnn_neurons,
                    batch_size=batch_size)

In [86]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (128, None, 64)           5376      
_________________________________________________________________
gru (GRU)                    (128, None, 1026)         3361176   
_________________________________________________________________
dense (Dense)                (128, None, 84)           86268     
Total params: 3,452,820
Trainable params: 3,452,820
Non-trainable params: 0
_________________________________________________________________


### Step 5: Training model

In [87]:
MODEL_PATH = os.path.join( os.curdir, '..', 'model')

In [88]:
os.makedirs(MODEL_PATH, exist_ok=True)

In [89]:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)

In [93]:
example_batch_predictions.shape

TensorShape([128, 120, 84])

In [96]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)

In [97]:
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

In [98]:
sampled_indices

array([83, 17, 52, 41, 17, 62, 76,  6, 25, 76, 14, 29,  9, 81, 12, 54, 61,
       62, 61, 61, 17, 45, 14, 40,  8, 21, 69, 49,  0, 43,  0, 16, 61,  7,
       46,  7, 24, 35, 61, 20, 66, 72,  3, 43, 15,  3, 63, 65,  2, 77, 11,
       78, 37, 46, 25, 37, 53, 69, 79, 52, 30, 32, 32, 78, 47, 83, 46, 39,
       20, 63, 65, 53, 79, 25,  4, 78,  3,  6, 48, 29, 57, 67, 79,  1, 11,
       42, 77, 30,  1,  4, 46,  3, 49, 33, 78, 49, 52, 27, 38, 41, 64, 18,
       45,  0, 13, 20, 81, 72, 76, 52, 25,  6,  7,  2, 56,  9, 75, 62, 68,
        3])

In [101]:
index_to_char[sampled_indices]

array(['}', '6', '[', 'P', '6', 'g', 'u', '(', '?', 'u', '3', 'D', '-',
       'z', '1', '_', 'f', 'g', 'f', 'f', '6', 'T', '3', 'O', ',', ':',
       'n', 'X', '\n', 'R', '\n', '5', 'f', ')', 'U', ')', '>', 'J', 'f',
       '9', 'k', 'q', '"', 'R', '4', '"', 'h', 'j', '!', 'v', '0', 'w',
       'L', 'U', '?', 'L', ']', 'n', 'x', '[', 'E', 'G', 'G', 'w', 'V',
       '}', 'U', 'N', '9', 'h', 'j', ']', 'x', '?', '&', 'w', '"', '(',
       'W', 'D', 'b', 'l', 'x', ' ', '0', 'Q', 'v', 'E', ' ', '&', 'U',
       '"', 'X', 'H', 'w', 'X', '[', 'B', 'M', 'P', 'i', '7', 'T', '\n',
       '2', '9', 'z', 'q', 'u', '[', '?', '(', ')', '!', 'a', '-', 't',
       'g', 'm', '"'], dtype='<U1')

In [102]:
"".join(index_to_char[sampled_indices])

'}6[P6gu(?u3D-z1_fgff6T3O,:nX\nR\n5f)U)>Jf9kq"R4"hj!v0wLU?L]nx[EGGwV}UN9hj]x?&w"(WDblx 0QvE &U"XHwX[BMPi7T\n29zqu[?()!a-tgm"'

In [104]:
epochs = 30

In [107]:
# Training is disabled in this run; the weights are read from `model_file`
# in a later cell instead. Uncomment the line below to train.
# NOTE(review): no visible cell saves weights after fit -- confirm how
# `model_file` was produced before relying on a fresh Run All.
# model.fit(dataset, epochs=epochs)

In [106]:
from tensorflow.keras.models import load_model, save_model

In [108]:
model_file = os.path.join(MODEL_PATH, 'shakespeare_gen.h5')

In [110]:
# Rebuild the network with a batch size of 1 (the batch dimension is fixed by
# create_model), then restore the weights stored in `model_file`.
model = create_model(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    rnn_neurons=rnn_neurons,
    batch_size=1,
)
model.load_weights(model_file)
model.build(tf.TensorShape([1, None]))

In [111]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (1, None, 64)             5376      
_________________________________________________________________
gru_2 (GRU)                  (1, None, 1026)           3361176   
_________________________________________________________________
dense_2 (Dense)              (1, None, 84)             86268     
Total params: 3,452,820
Trainable params: 3,452,820
Non-trainable params: 0
_________________________________________________________________


In [116]:
def generate_text(model, start_seed, gen_size=500, temp=1.0):
    """Generate text one character at a time, starting from `start_seed`.

    Parameters
    ----------
    model : callable Keras model
        Must expose `reset_states()` and, when called on an integer tensor
        with a leading batch dimension of 1, return per-timestep scores whose
        leading dimension is also 1 (it is squeezed away below).
    start_seed : str
        Non-empty prompt. Every character must be a key of `char_to_index`;
        an unknown character raises KeyError.
    gen_size : int
        Number of characters to generate after the seed.
    temp : float
        Sampling temperature, must be > 0. The scores are divided by it
        before sampling.

    Returns
    -------
    str
        `start_seed` followed by the generated characters.

    Raises
    ------
    ValueError
        If `start_seed` is empty or `temp` is not a positive number.
    """
    # Validate up front: an empty seed gives the model nothing to predict
    # from, and a non-positive temperature makes the division below invalid.
    if not start_seed:
        raise ValueError("start_seed must contain at least one character")
    if not temp > 0:
        raise ValueError("temp must be a positive number, got {!r}".format(temp))

    # Encode the whole seed and add the batch dimension.
    input_eval = tf.expand_dims([char_to_index[s] for s in start_seed], 0)
    text_generated = []

    # Clear the model's recurrent state before starting a new generation.
    model.reset_states()
    for _ in range(gen_size):
        # Drop the batch dimension and rescale the scores by the temperature.
        predictions = tf.squeeze(model(input_eval), 0) / temp
        # Sample one id per timestep and keep only the last timestep's sample.
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()
        # Only the new character is fed back as the next input.
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(index_to_char[predicted_id])
    return start_seed + "".join(text_generated)

In [122]:
print(generate_text(model, "where should I go?", gen_size=10))

where should I go?
  MARIA. 
