# Preprocessing data

In [1]:
# read the whole text file into memory; the `with` block closes the file handle
# as soon as the read finishes instead of leaving it open for the kernel's lifetime
with open('anna.txt', 'r') as anna:
    txt = anna.read()

In [2]:
# peek at the first 100 characters of the raw text
txt[:100]

'Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverythin'

### Character tokenizer

We first encode the characters as integers. In Keras this can be done with the `Tokenizer` class.

In [3]:
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


- By default, the `Tokenizer` class splits the text into words rather than individual characters. This can be changed by setting `char_level = True`.

- The default tokenizer converts all letters to lower case; this can be changed by setting `lower = False`.

- The default tokenizer ignores punctuation, tabs, line breaks, etc. This can be changed by passing an explicit string of characters to the keyword argument `filters`.



In [4]:
# we will keep all the default characters in the filter except line breaks, question marks, full stops and exclamation marks
# NOTE(review): Keras appears to apply `filters` only when splitting text into words; with char_level = True
# they may be ignored altogether -- confirm against the installed Tokenizer before relying on this
fltr = '"#$%&()*+,-/:;<=>@[\\]^_`{|}~\t' 

# create a tokenizer instance
# char_level = True -> tokenize character by character; lower = False -> keep the text case sensitive
tokenizer = Tokenizer(filters = fltr, lower = False, char_level = True)

The Tokenizer should be thought of as analogous to the data transformers in sklearn: we first fit it to our training text and then use the fitted tokenizer to transform any given text.

In [5]:
# fit the tokenizer
# fit_on_texts takes a list, so it could be fitted on several texts at once; here the list holds only `txt`
tokenizer.fit_on_texts([txt])

In [6]:
# checking that the tokenizer indeed produces a sensible output
sample = 'How are you doing?'

# one list of integer ids is returned per input text
tokenizer.texts_to_sequences([sample])

[[37, 5, 15, 1, 4, 10, 2, 1, 19, 5, 14, 1, 11, 5, 8, 6, 18, 35]]

In [7]:
# converting a sequence back to text
# (the ids below are the ones produced for `sample` in the previous cell)
tokenizer.sequences_to_texts([[37, 5, 15, 1, 4, 10, 2, 1, 19, 5, 14, 1, 11, 5, 8, 6, 18, 35]])

['H o w   a r e   y o u   d o i n g ?']

In [8]:
# number of distinct characters in the text,
# taken as the number of entries in the fitted tokenizer's word_index mapping
print('Number of distinct characters in the text: {}'.format(len(tokenizer.word_index)))

Number of distinct characters in the text: 83


In [9]:
# total number of characters in the text
# the attribute .word_counts gives the number of times each character/word appears in the text
# it returns an ordered dictionary with the characters as its keys and their count as the corresponding value
character_count = tokenizer.word_counts

# `reduce` is no longer needed for the total below; the import is kept so that
# any other cell relying on it keeps working
from functools import reduce

# the built-in sum() adds up the per-character counts directly and, unlike reduce()
# without an initial value, returns 0 instead of raising when there are no counts
total_chars = sum(character_count.values())
print('total number of characters in the text: {}'.format(total_chars))

total number of characters in the text: 1985223


In [10]:
import numpy as np

In [11]:
# converting the whole text to a sequence of integers using tokenizer
# texts_to_sequences receives a one-element list, so the resulting array has a single row
encoded = np.array(tokenizer.texts_to_sequences([txt]))
encoded

array([[53,  7,  4, ...,  9, 24, 13]])

In [12]:
# (number of texts, number of encoded characters)
encoded.shape

(1, 1985223)

## Extract sequences from the text

We will now split the text into sequences. One way to do this is simply to reshape the array containing the encoded text appropriately. Note that Aurélien Géron extracts the sequences by using the window() method of the tf.data.Dataset class. We will learn how to do this later, but for now let's simply reshape the array.

In [21]:
# number of input time steps per sequence
seq_len = 100
# each sequence takes seq_len + 1 characters: the extra one is the target character for the
# last time step; integer division drops whatever does not fill a complete sequence
num_seq = encoded.shape[1]//(seq_len+1)
print('expected number of sequences in the text: {}'.format(num_seq))

expected number of sequences in the text: 19655


In [24]:
# number of leading characters that fill exactly num_seq rows of (seq_len + 1) characters
usable_len = num_seq * (seq_len + 1)

# drop the leftover tail, then lay the text out as (num_seq, seq_len + 1, 1)
sequences = encoded[:, :usable_len].reshape(num_seq, seq_len + 1, 1)
sequences.shape

(19655, 101, 1)

In [25]:
# inputs are the first seq_len steps of every row; targets are the same rows shifted one step ahead
X, y = sequences[:, :seq_len], sequences[:, 1:seq_len + 1]

In [29]:
# last 10 time steps of the first input sequence
X[0, 90:]

array([[13],
       [57],
       [25],
       [ 2],
       [10],
       [19],
       [ 3],
       [ 7],
       [ 8],
       [ 6]])

In [30]:
# last 10 time steps of the first target sequence, for comparison with the input one step earlier
y[0, 90:]

array([[57],
       [25],
       [ 2],
       [10],
       [19],
       [ 3],
       [ 7],
       [ 8],
       [ 6],
       [18]])

## Split text into training and validation set

In [32]:
from sklearn.model_selection import train_test_split

In [33]:
# fixed seed so the split is the same on every run
random_seed = 42
# fraction of the sequences used for training; the remainder becomes the validation set
train_size = 0.75
# X and y are split together so every input sequence stays paired with its target sequence
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size = train_size, random_state = random_seed)



In [39]:
# inputs and targets of each split should have matching shapes
print('shape of X_train: {}'.format(X_train.shape))
print('shape of y_train: {}'.format(y_train.shape))
print('shape of X_val: {}'.format(X_val.shape))
print('shape of y_val: {}'.format(y_val.shape))

shape of X_train: (14741, 100, 1)
shape of y_train: (14741, 100, 1)
shape of X_val: (4914, 100, 1)
shape of y_val: (4914, 100, 1)


Check that the target sequences indeed represent the next character in the input sequence.

In [35]:
# last 10 time steps of one training input sequence (index 200)
X_train[200, 90:]

array([[ 4],
       [ 9],
       [ 1],
       [17],
       [14],
       [16],
       [ 7],
       [ 1],
       [14],
       [ 6]])

In [36]:
# last 10 time steps of the training target at the same index (200), for comparison with the input
y_train[200, 90:]

array([[ 9],
       [ 1],
       [17],
       [14],
       [16],
       [ 7],
       [ 1],
       [14],
       [ 6],
       [11]])

In [37]:
# last 10 time steps of one validation input sequence (index 371)
X_val[371, 90:]

array([[ 7],
       [ 2],
       [ 1],
       [16],
       [ 5],
       [14],
       [ 6],
       [ 3],
       [ 8],
       [ 6]])

In [38]:
# last 10 time steps of the validation target at the same index (371), for comparison with the input
y_val[371, 90:]

array([[ 2],
       [ 1],
       [16],
       [ 5],
       [14],
       [ 6],
       [ 3],
       [ 8],
       [ 6],
       [18]])

## One-hot-encoding the characters

In [117]:
# function to convert a given batch of sequences into sequences of one-hot-encoded vectors of a given dimension
# here we are assuming that the input array will be a 3d array of shape (batch_size, seq_length, 1)
# we want the output to be a 3d array of shape (batch_size, seq_length, encoding_dim)

def one_hot_encode(sequence, encoding_dim, dtype=np.float64):
    """One-hot encode integer token ids that start at 1.

    Parameters
    ----------
    sequence : array-like of int
        Token ids, expected with a trailing axis of length 1, e.g.
        (batch_size, seq_length, 1). Id ``k`` is mapped to column ``k - 1``,
        so valid ids are 1..encoding_dim.
    encoding_dim : int
        Length of every one-hot vector.
    dtype : data-type, optional
        dtype of the returned array. Defaults to float64, which is what
        ``np.zeros`` produces when no dtype is given.

    Returns
    -------
    numpy.ndarray
        Array of shape ``sequence.shape[:-1] + (encoding_dim,)`` with a single
        1 per vector.

    Raises
    ------
    IndexError
        If any id is smaller than 1 or larger than ``encoding_dim``.
    """
    sequence = np.asarray(sequence)
    token_ids = sequence.reshape(-1)

    # Without this check an id of 0 (or a negative id) would be turned into a
    # negative column index, which numpy silently wraps around to the end of the
    # vector; ids above encoding_dim already raised IndexError, so the same
    # exception type is used for both ends of the range.
    if token_ids.size and (token_ids.min() < 1 or token_ids.max() > encoding_dim):
        raise IndexError(
            'token ids must lie in [1, {}], got values in [{}, {}]'.format(
                encoding_dim, token_ids.min(), token_ids.max()))

    # one row per token, one column per possible id
    one_hot_arr = np.zeros((token_ids.size, encoding_dim), dtype=dtype)

    # This is based on the fact that in numpy, the operation
    #     X[[a1, a2, ..], [b1, b2, ...]] = y
    # produces the same result as
    #     X[a1, b1] = y, X[a2, b2] = y, ....
    one_hot_arr[np.arange(token_ids.size), token_ids - 1] = 1

    # reshape one_hot_arr to have the same leading shape as the input,
    # with the trailing axis replaced by the encoding dimension
    return one_hot_arr.reshape(*sequence.shape[:-1], encoding_dim)

In [118]:
# testing one_hot_encode on a small batch of 2 sequences with 3 time steps each
seq = np.array([[[1], [5], [6]],[[3],[8],[4]]])
# each id k should show up as a 1 in column k - 1 of a length-10 vector
one_hot_encode(seq, 10)

array([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]],

       [[0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.]]])

In [120]:
# now one_hot_encoding all the sequences 
# the encoding dimension is the vocabulary size, i.e. the number of entries in the tokenizer's word_index
num_chars = len(tokenizer.word_index) # number of distinct characters in the text


# encode inputs and targets of both splits, printing each resulting shape as a sanity check
X_train_one_hot = one_hot_encode(X_train, num_chars)
print('shape of one_hot_encoded X_train: {}'.format(X_train_one_hot.shape))
y_train_one_hot = one_hot_encode(y_train, num_chars)
print('shape of one_hot_encoded y_train: {}'.format(y_train_one_hot.shape))
X_val_one_hot = one_hot_encode(X_val, num_chars)
print('shape of one_hot_encoded X_val: {}'.format(X_val_one_hot.shape))
y_val_one_hot = one_hot_encode(y_val, num_chars)
print('shape of one_hot_encoded y_val: {}'.format(y_val_one_hot.shape))

shape of one_hot_encoded X_train: (14741, 100, 83)
shape of one_hot_encoded y_train: (14741, 100, 83)
shape of one_hot_encoded X_val: (4914, 100, 83)
shape of one_hot_encoded y_val: (4914, 100, 83)
