# Preprocessing data

In [1]:
anna = open('anna.txt', 'r')
txt = anna.read()

In [2]:
txt[:100]

'Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverythin'

### charcater tokenizer

 encode characters as integers. In Keras this can be done by the Tokenizer class

In [3]:
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


- By default, the Tokenizer class tokenizes the words in the text rather than individual characters. This can be changed by setting char_level = True

- The default tokenizer converts all alphabets to lower case, this can be changed by setting lower = False

- The default tokeniser ignores all punctuations, tabs and line breaks etc. This can be changed by passing an explicit list of characters to the keyworded argument 'filters' 



In [4]:
# we will keep all the default characters in the filter except line breaks, question marks, fullstops and exclamation marks
fltr = '"#$%&()*+,-/:;<=>@[\\]^_`{|}~\t' 

# create a tokenizer instance
# we will also require the text to be Case sensitive
tokenizer = Tokenizer(filters = fltr, lower = False, char_level = True)

 the Tokenizer should be thought of analogously to data transformers in sklearn so we first fit then to our training text and the use the fitten tokenizer to transform any given text.

In [5]:
# fit the tokenizer
# we can fit it on a list of multiple different texts
tokenizer.fit_on_texts([txt])

In [6]:
# chaecking the tokenizer indeed produces a sensible output
sample = 'How are you doing?'

tokenizer.texts_to_sequences([sample])

[[37, 5, 15, 1, 4, 10, 2, 1, 19, 5, 14, 1, 11, 5, 8, 6, 18, 35]]

In [7]:
# converting a sequence back to text
tokenizer.sequences_to_texts([[37, 5, 15, 1, 4, 10, 2, 1, 19, 5, 14, 1, 11, 5, 8, 6, 18, 35]])

['H o w   a r e   y o u   d o i n g ?']

In [8]:
# number of distinct characters in the text
print('Number of distinct characters in the text: {}'.format(len(tokenizer.word_index)))

Number of distinct characters in the text: 83


In [9]:
# total number of characters in the text
# the attribute .word_counts gives the number of times each character/word appears in the text
# it returns an ordered dictionary with the characters as its keys and their count as the corresponding value
character_count = tokenizer.word_counts

# computations on elements of a list can be done efficiently through reduce 
# for e.g. see here : https://book.pythontips.com/en/latest/map_filter.html
from functools import reduce

total_chars = reduce(lambda x, y: x+y , character_count.values())
print('total number of characters in the text: {}'.format(total_chars))

total number of characters in the text: 1985223


In [10]:
import numpy as np

In [11]:
# converting the whole text to a sequence of integers using tokenizer
encoded = np.array(tokenizer.texts_to_sequences([txt]))
encoded

array([[53,  7,  4, ...,  9, 24, 13]])

In [12]:
encoded.shape

(1, 1985223)

## Extract sequences from the text

We will now extract split the text into sequences. One way to do this is to simply to appropriately reshape the array containing the text. Note that Aurelion Geron extracts the sequences by using window() method of tf.data.Dataset class. We will learn how to do this later, but for now let's just simply reshape the arrays.

In [13]:
seq_len = 100
num_seq = encoded.shape[1]//(seq_len+1) # we will include one extra character as the target character for the last time_step in the sequence
print('expected number of sequences in the text: {}'.format(num_seq))

expected number of sequences in the text: 19655


In [14]:
sequences = encoded[:,:num_seq*(seq_len+1)].reshape(num_seq, seq_len+1,1)
sequences.shape

(19655, 101, 1)

In [15]:
# obtaining the input and target sequences 
X = sequences[:,:seq_len,:]
y = sequences[:, 1:seq_len+1, :]

In [16]:
X[0, 90:]

array([[13],
       [57],
       [25],
       [ 2],
       [10],
       [19],
       [ 3],
       [ 7],
       [ 8],
       [ 6]])

In [17]:
y[0, 90:]

array([[57],
       [25],
       [ 2],
       [10],
       [19],
       [ 3],
       [ 7],
       [ 8],
       [ 6],
       [18]])

## Split text into training and validation set

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
random_seed = 42
train_size = 0.75
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size = train_size, random_state = random_seed)



In [20]:
print('shape of X_train: {}'.format(X_train.shape))
print('shape of y_train: {}'.format(y_train.shape))
print('shape of X_val: {}'.format(X_val.shape))
print('shape of y_val: {}'.format(y_val.shape))

shape of X_train: (14741, 100, 1)
shape of y_train: (14741, 100, 1)
shape of X_val: (4914, 100, 1)
shape of y_val: (4914, 100, 1)


Check that the target sequences indeed represent the next character in the input sequence.

In [21]:
X_train[200, 90:]

array([[ 4],
       [ 9],
       [ 1],
       [17],
       [14],
       [16],
       [ 7],
       [ 1],
       [14],
       [ 6]])

In [22]:
y_train[200, 90:]

array([[ 9],
       [ 1],
       [17],
       [14],
       [16],
       [ 7],
       [ 1],
       [14],
       [ 6],
       [11]])

In [23]:
X_val[371, 90:]

array([[ 7],
       [ 2],
       [ 1],
       [16],
       [ 5],
       [14],
       [ 6],
       [ 3],
       [ 8],
       [ 6]])

In [24]:
y_val[371, 90:]

array([[ 2],
       [ 1],
       [16],
       [ 5],
       [14],
       [ 6],
       [ 3],
       [ 8],
       [ 6],
       [18]])

## One-hot-encoding the characters

In [25]:
# function to given batch of sequence into a sequence of one-hot-encoded vectors of a given dimension
# here we are assuming that the input array will be a 3d array of shape (batch_size, seq_length, 1)
# we want the output to be a 3d array of shape (batch_size, seq_length, encoding_dim)

def one_hot_encode(sequence, encoding_dim):
    
    # shape of the output array
    out_shape = (sequence.size, encoding_dim)
    
    one_hot_arr = np.zeros(out_shape)
    
    one_hot_arr[np.arange(sequence.size), sequence.flatten()-1] = 1  # This is based on the fact that in numpyt, the operation
                                                                   # X[[a1, a2, ..], [b1, b2, ...]] = y
                                                                   # produces the same output as
                                                                   # X[a1,b1] =y, X[a2,b2] = y, ....
                
    # reshape the one_hot_arr to have the shape shape as sequences 
    one_hot_arr = one_hot_arr.reshape(*sequence.shape[:-1], encoding_dim)
    
    return one_hot_arr

In [26]:
# testing one_hot_encode
seq = np.array([[[1], [5], [6]],[[3],[8],[4]]])
one_hot_encode(seq, 10)

array([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]],

       [[0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.]]])

In [27]:
# now one_hot_encoding all the sequences 
num_chars = len(tokenizer.word_index) # number of distinct characters in the text


X_train_one_hot = one_hot_encode(X_train, num_chars)
print('shape of one_hot_encoded X_train: {}'.format(X_train_one_hot.shape))
y_train_one_hot = one_hot_encode(y_train, num_chars)
print('shape of one_hot_encoded y_train: {}'.format(y_train_one_hot.shape))
X_val_one_hot = one_hot_encode(X_val, num_chars)
print('shape of one_hot_encoded X_val: {}'.format(X_val_one_hot.shape))
y_val_one_hot = one_hot_encode(y_val, num_chars)
print('shape of one_hot_encoded y_val: {}'.format(y_val_one_hot.shape))

shape of one_hot_encoded X_train: (14741, 100, 83)
shape of one_hot_encoded y_train: (14741, 100, 83)
shape of one_hot_encoded X_val: (4914, 100, 83)
shape of one_hot_encoded y_val: (4914, 100, 83)


## Creating and training a GRU cell based RNN

In [28]:
from keras.models import Sequential
from keras.layers import GRU
from keras.layers import Dense
from keras.callbacks import EarlyStopping

In [29]:
num_layers = 2
dropout = 0.2
num_units = 50

my_RNN1 = Sequential()
my_RNN1.add(GRU(units = num_units, input_shape = [None, num_chars], return_sequences = True, 
               dropout = dropout, recurrent_dropout = dropout))

for layer in range(num_layers-1):
    my_RNN1.add(GRU(units = num_units, return_sequences = True, dropout = dropout, recurrent_dropout = dropout))

my_RNN1.add(Dense(units = num_chars, activation = 'softmax' )) # note that we can directly apply softmax activation to the Dense layer instead of addinga softmax layer on top of it

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [30]:
# compiling the model
optimizer = 'adam'
loss = 'categorical_crossentropy' # since we one_hot_encoded our target values, we will use categorical_crossentropy 
                                  # instead of sparse_categorical_crossentropy as loss
my_RNN1.compile(optimizer = optimizer, loss = loss)

In [31]:
#Early stopping
patience = 30
min_delta = 0.1
stopper = EarlyStopping(monitor = 'val_loss', patience = patience, min_delta = min_delta, restore_best_weights = True)

In [69]:
# training the RNN 

verbose = 1
epochs = 300
my_RNN1.fit(X_train_one_hot, y_train_one_hot, batch_size = 1000, verbose = verbose, epochs = epochs , 
           callbacks = [stopper], validation_data = (X_val_one_hot, y_val_one_hot)  )

Train on 14741 samples, validate on 4914 samples
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300


<keras.callbacks.History at 0x1adcf7b7320>

# Generate a new text using the trained RNN

In [70]:
# prime the model with some initial starting string
starting_string = 'Where are yo'
# tokenize the starting string
starting_string_tokens = np.array(tokenizer.texts_to_sequences(starting_string))
# one_hot_encode the starting string
start_one_hot = np.array([one_hot_encode(starting_string_tokens, num_chars)])

In [71]:
# checking if the model is able to predict the character 'u' to complere the starting string?
output_txt = my_RNN1.predict_classes(start_one_hot)
output_txt

array([[ 6,  1,  0,  0,  0,  2,  5, 10,  0,  2,  4, 13]], dtype=int64)

In [72]:
tokenizer.sequences_to_texts(output_txt+1)[0][-1]

'u'

In [73]:
my_RNN1.predict(start_one_hot).shape

(1, 12, 83)

In [None]:
def next_char(tokenized_text, topk = 5):
    one_hot_text = np.array([one_hot_encode(tokenized_text, num_chars)])
    pred_char_prob = my_RNN1.predict(one_hot_text)[0,-1,:] # choose the predicted probabilities for the last time_step in the sequence
    top_prob_arg = np.argsort(pred_char_prob)[-topk:] # np.argsort sorts the values in increasing order and returns their original arguments
                                                      # to choose the last n elements of a numnpy array x, just call x[-n:]
                                                      # the negative index makes it count from the end
                                                      # https://stackoverflow.com/questions/646644/how-to-get-last-items-of-a-list-in-python
    char = np.random.choice(top_prob_arg+1, p = pred_char_prob[top_prob_arg]/np.sum(pred_char_prob[top_prob_arg]) )
    
    return char

In [75]:
def generate_text(starting_text, desired_length = 100):
    tokenize_starting_text = tokenizer.texts_to_sequences(starting_text)
    generated_text = tokenize_starting_text
    
    for itr in range(desired_length):
        nxt = next_char(np.array(generated_text))
        generated_text.append([nxt])
    
    output = tokenizer.sequences_to_texts(generated_text)
    
    return ''.join(output)
    

In [76]:
generate_text(starting_string, 200)

'Where are you, and a mersing a stang, and the whette that thas her\nwith and this said the pone the prentious wifh that to would somat was wherien and that they as the come on the homess, the somen of he was\nwen w'

This seems to have learnt some words but doesn't look as impressive as the LSTM based sequence generator based on PyTorch implementation in the Udacity execise.