In [1]:
import collections
import re
from d2l import tensorflow as d2l

## 1. Load text data

In [58]:
d2l.DATA_HUB['time_machine'] = (d2l.DATA_URL + 'timemachine.txt',
                                '090b5e7e70c295757f55df93cb0a180b9691891a')

def read_time_machine():  #@save
    """Load the time machine dataset into a list of text lines."""
    with open(d2l.download('time_machine'), 'r') as f:
        lines = f.readlines()
    return [re.sub('[^A-Za-z]+', ' ', line).strip().lower() for line in lines]

lines = read_time_machine()
print(f'# text lines: {len(lines)}')
print(lines[0])
print(lines[10])

# text lines: 3221
the time machine by h g wells
twinkled and his usually pale face was flushed and animated the


## 2. Tokenize

In [59]:
def tokenize(lines, token='word'):  #@save
    """Split text lines into word or character tokens."""
    if token == 'word':
        return [line.split() for line in lines]
    elif token == 'char':
        return [list(line) for line in lines]
    else:
        print('ERROR: unknown token type: ' + token)

tokens = tokenize(lines, token='char')
for i in range(11):
    print(tokens[i])

['t', 'h', 'e', ' ', 't', 'i', 'm', 'e', ' ', 'm', 'a', 'c', 'h', 'i', 'n', 'e', ' ', 'b', 'y', ' ', 'h', ' ', 'g', ' ', 'w', 'e', 'l', 'l', 's']
[]
[]
[]
[]
['i']
[]
[]
['t', 'h', 'e', ' ', 't', 'i', 'm', 'e', ' ', 't', 'r', 'a', 'v', 'e', 'l', 'l', 'e', 'r', ' ', 'f', 'o', 'r', ' ', 's', 'o', ' ', 'i', 't', ' ', 'w', 'i', 'l', 'l', ' ', 'b', 'e', ' ', 'c', 'o', 'n', 'v', 'e', 'n', 'i', 'e', 'n', 't', ' ', 't', 'o', ' ', 's', 'p', 'e', 'a', 'k', ' ', 'o', 'f', ' ', 'h', 'i', 'm']
['w', 'a', 's', ' ', 'e', 'x', 'p', 'o', 'u', 'n', 'd', 'i', 'n', 'g', ' ', 'a', ' ', 'r', 'e', 'c', 'o', 'n', 'd', 'i', 't', 'e', ' ', 'm', 'a', 't', 't', 'e', 'r', ' ', 't', 'o', ' ', 'u', 's', ' ', 'h', 'i', 's', ' ', 'g', 'r', 'e', 'y', ' ', 'e', 'y', 'e', 's', ' ', 's', 'h', 'o', 'n', 'e', ' ', 'a', 'n', 'd']
['t', 'w', 'i', 'n', 'k', 'l', 'e', 'd', ' ', 'a', 'n', 'd', ' ', 'h', 'i', 's', ' ', 'u', 's', 'u', 'a', 'l', 'l', 'y', ' ', 'p', 'a', 'l', 'e', ' ', 'f', 'a', 'c', 'e', ' ', 'w', 'a', 's', ' ', 'f

## 3. Get vocab

In [60]:
from typing import List

class Vocab:
    '''
        Vacab extract `vocabulary` from given tokens (List[List[str]])
    '''
    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
            
        self._token_freqs = self._get_token_freqs(tokens)
        self.idx_to_token = ["<unk>"] + reserved_tokens
        self.token_to_idx = {self.idx_to_token[idx]: idx for idx in range(len(self))}
        
        for token, freq in self._token_freqs:
            if freq < min_freq: break
                
            if token not in self.idx_to_token:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self)-1
        
    def __len__(self):
        '''
            Vocab_size
        '''
        return len(self.idx_to_token)
    
    def _get_token_freqs(self, tokens_list: List[List[str]]):
        '''
            get dict {token (str): amount_of_token}
        '''
        assert isinstance(tokens_list, list) and isinstance(tokens_list[0], list) and isinstance(tokens_list[0][0], str)
        
        tokens = []
        for line in tokens_list:
            for token in line:
                tokens.append(token)
        tokens_count = collections.Counter(tokens).items()
        return sorted(tokens_count, key=lambda x:x[1], reverse=True)
    
    def to_tokens(self, idx):
        '''
            convert indices (int) to token (str)
        '''
        if isinstance(idx, int):
            return self.idx_to_token[idx]
        
        return [self.idx_to_token[i] for i in idx]
    
    def __getitem__(self, tokens):
        '''
            convert token (str) to indices (int)
        '''
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]
        
    @property
    def token_freqs(self):
        return self._token_freqs
    
    @property
    def unk(self):
        return self.token_to_idx['<unk>']
    
    @property
    def get_vocab(self):
        return self.idx_to_token

In [62]:
vocab = Vocab(tokens)
vocab['t']

3

In [64]:
vocab.to_tokens(19)

'y'

In [63]:
len(vocab)  # vocab size

28

## 4. Create load data function

In [65]:
import random
import tensorflow as tf

class SeqDataLoader:
    def __init__(self, batch_size, num_steps, tokens: List[List[str]], use_random_iter=False):
        self.batch_size = batch_size
        self.num_steps = num_steps
        self.vocab = Vocab(tokens)
        self.corpus = [token for line in tokens for token in line]
        
        if use_random_iter:
            self.data_iter_fn = self.seq_data_iter_random
        else:
            self.data_iter_fn = self.seq_data_iter_sequential
            
    def seq_data_iter_random(self, corpus, batch_size, num_steps): 
        '''
            Generator to generate X, Y with both shape (batch_size, num_steps) 
            Each X, Y sequence is randomly selected
        '''
        num_subseqs = (len(corpus) - 1) // num_steps
        initial_indices = list(range(0, num_subseqs * num_steps, num_steps))
        
        random.shuffle(initial_indices)
        
        get_data = lambda pos: corpus[pos: pos+num_steps]
        num_batchs = num_subseqs // batch_size
        
        for i in range(0, batch_size * num_batches, batch_size):
        
            initial_indices_per_batch = initial_indices[i: i + batch_size]
            X = [data(j) for j in initial_indices_per_batch]
            Y = [data(j + 1) for j in initial_indices_per_batch]
            yield tf.constant(X), tf.constant(Y)
            
    def seq_data_iter_sequential(self, corpus, batch_size, num_steps): 
        '''
            Generator to generate X, Y with both shape (batch_size, num_steps) 
            Each X, Y sequence is collected in the original order
        '''
        offset = 0
        num_tokens = ((len(corpus) - 1) // batch_size) * batch_size
        
        Xs = tf.constant(corpus[offset: offset + num_tokens])
        Ys = tf.constant(corpus[offset + 1: offset + 1 + num_tokens])
        Xs = tf.reshape(Xs, (batch_size, -1))
        Ys = tf.reshape(Ys, (batch_size, -1))
        num_batches = Xs.shape[1] // num_steps
        for i in range(0, num_batches * num_steps, num_steps):
            X = Xs[:, i: i + num_steps]
            Y = Ys[:, i: i + num_steps]
            yield X, Y
            
    def __iter__(self):
        return self.data_iter_fn(self.corpus, self.batch_size, self.num_steps)

In [66]:
def load_data_time_machine(batch_size, num_steps, tokens,
                           use_random_iter=False, max_tokens=10000):
    """Return the iterator and the vocabulary of the time machine dataset."""
    data_iter = SeqDataLoader( batch_size, num_steps, tokens)
    return data_iter, data_iter.vocab

In [67]:
batch_size, num_steps = 32, 35
train_iter, vocab_iter = load_data_time_machine( batch_size, num_steps, tokens )

In [68]:
i = 0
for X, Y in train_iter:
    print(X)
    i+=1
    if i == 2: break
    

tf.Tensor(
[[b't' b'h' b'e' ... b'e' b' ' b't']
 [b' ' b'c' b'a' ... b'n' b'd' b' ']
 [b's' b' ' b'h' ... b'a' b't' b'u']
 ...
 [b'h' b'r' b'e' ... b'u' b't' b' ']
 [b't' b' ' b'l' ... b's' b' ' b'o']
 [b't' b'h' b'e' ... b'o' b's' b's']], shape=(32, 35), dtype=string)
tf.Tensor(
[[b'i' b'm' b'e' ... b'c' b'o' b'n']
 [b'l' b'e' b'f' ... b'a' b'r' b'd']
 [b's' b'i' b's' ... b'o' b'r' b' ']
 ...
 [b'o' b'f' b' ' ... b' ' b'i' b' ']
 [b'f' b' ' b'h' ... b'n' b't' b' ']
 [b' ' b'u' b'p' ... b'i' b'l' b' ']], shape=(32, 35), dtype=string)


In [69]:
X.shape, Y.shape

(TensorShape([32, 35]), TensorShape([32, 35]))

## 5. Model init

$$X : \text{intput with shape (timesteps, batch_size, vocab_size)}$$

$\text{At each timestep,}$

$\text{X} = [X ; H]$ ; (batch_size, vocab_size + hidden)

$H = \phi (X W_h+ b_h)$ ; (batch_size, hidden)

$O = W_q H + b_q$ ; (batch_size, num_outputs)

$\text{Thus, for all timesteps in 1 minibatch we get (timestep * batch_size, num_outputs) }$

In [157]:
class RNNModelScratch:
    
    def __init__(self, vocab_size, num_hiddens, initial_state_fn):
        
        self.initial_state_fn = initial_state_fn
        self.vocab_size = vocab_size
        self.num_hiddens = num_hiddens
        
        num_outputs = vocab_size
        
        normal = lambda shape: tf.random.normal(shape=shape, stddev=0.01, mean=0, dtype=tf.float32)
        
        self.W_h = tf.Variable(normal(shape = (vocab_size + num_hiddens, num_hiddens)), dtype=tf.float32)
        self.b_h = tf.Variable(normal(shape = (num_hiddens, )) , dtype=tf.float32)
        self.W_hq= tf.Variable(normal(shape = (num_hiddens, num_outputs)), dtype=tf.float32)
        self.b_q = tf.Variable(normal(shape = (num_outputs, )), dtype=tf.float32)
        
    
    def forward_fn(self, inputs, state):
        """
            inputs: 1 minibatch = (timesteps, batch_size, vocab_size)
        """
        outputs = []   # outputs of this batch
        H = state
        for X in inputs:   # for each timestep                 ; X = (batch_size, vocab_size)
            X = tf.concat([X, H], axis=1)                      # X = (batch_size, vocab_size + hidden)
            H = tf.tanh( tf.matmul(X, self.W_h) + self.b_h )   # H = (batch_size, hidden)
            O = tf.matmul(H, self.W_hq) + self.b_q             # O = (batch_size, num_outputs)
            outputs.append(O)
        return tf.concat(outputs, axis=0), H                   # y_pred = (timesteps * batch_size, num_outputs)
    
    def __call__(self, X, state):
        return self.forward_fn(X, state)
    
    def begin_state(self, batch_size):
        """
            this fn initialize and return initial state
        """
        return self.initial_state_fn(batch_size, self.num_hiddens)
        
    def _get_params(self):
        return [self.W_h, self.b_h, self.W_hq, self.b_q]
    
    @property
    def trainables(self):
        return self._get_params()
        
    @property
    def get_state(self):
        return self.state

### Test model

In [154]:
num_hiddens = 512
batch_size = 2
num_steps  = 1

initial_state_fn = lambda batch_size, num_hiddens: tf.zeros((batch_size, num_hiddens), dtype=tf.float32)

In [155]:
len(vocab)

28

In [156]:
net = RNNModelScratch(
    vocab_size = len(vocab), 
    num_hiddens = 512, 
    initial_state_fn = initial_state_fn
)
initial_state = net.begin_state(batch_size)

X = tf.random.normal(shape=(num_steps, batch_size, len(vocab)))
Y, new_state = net(X, initial_state)

Y.shape, new_state.shape

outputs.shape : (2, 28)


(TensorShape([2, 28]), TensorShape([2, 512]))

## 6. Prediction

In [158]:
batch_size = 1
num_hiddens = 512

net = RNNModelScratch(len(vocab), num_hiddens, initial_state_fn=initial_state_fn)
initial_state = net.begin_state(batch_size)

In [159]:
def predict(prefix, net, num_preds, vocab):
    state = net.begin_state(batch_size=1)
    outputs = [vocab[prefix[0]]]
    
    def get_inputs():
        return tf.one_hot( tf.reshape(outputs[-1], (1,1)), len(vocab))
    
    ## Accumulate information from all prefix
    for y in prefix[1:]:
        _, state = net(get_inputs(), state)
        outputs.append(vocab[y])
        
    ## Make prediction
    for _ in range(num_preds):
        y, state = net(get_inputs(), state)
        y_pred = tf.argmax(y, axis=1)[0].numpy()
        
        outputs.append(y_pred)
        
    return outputs

In [164]:
net = RNNModelScratch(len(vocab), num_hiddens=512, initial_state_fn=initial_state_fn)
O = predict('time traveller ', net, 10, vocab)

''.join([vocab.idx_to_token[i] for i in O])

'time traveller <unk>sssssssss'

## 7. Training

In [169]:
batch_size = 32
num_steps = 10
train_iter, vocab_iter = d2l.load_data_time_machine(
    batch_size, num_steps, use_random_iter=False)

for X, Y in train_iter:
    print(X.shape)   # (batch_size, num_steps)
    print(Y.shape)
    break

(32, 10)
(32, 10)


In [129]:
def grad_clipping(grads, theta):  #@save
    """Clip the gradient."""
    theta = tf.constant(theta, dtype=tf.float32)
    new_grad = []
    for grad in grads:
        if isinstance(grad, tf.IndexedSlices):
            new_grad.append(tf.convert_to_tensor(grad))
        else:
            new_grad.append(grad)
    norm = tf.math.sqrt(sum((tf.reduce_sum(grad ** 2)).numpy()
                        for grad in new_grad))
    norm = tf.cast(norm, tf.float32)
    if tf.greater(norm, theta):
        for i, grad in enumerate(new_grad):
            new_grad[i] = grad * theta / norm
    else:
        new_grad = new_grad
    return new_grad

In [165]:
import numpy as np

def train_epoch_ch8(net, train_iter, loss_fn, updater, use_random_iter):
    
    def get_inputs(X):
        X = tf.one_hot(X, len(vocab))        # X = (batch_size, timesteps, vocab_size)
        X = tf.transpose(X, perm=[1, 0, 2])  # X = (timesteps,  batch_size,vacab_size)
        return X
    
    state = None
    metric = d2l.Accumulator(2)
    for X, Y in train_iter:
        if state is None or use_random_iter:   ## If random_iter -> init state at every batch/ If seq_iter -> state can accumulate information over full epoch
            state = net.begin_state(batch_size=X.shape[0])
            
        with tf.GradientTape(persistent=True) as tape:
            y_pred, state = net(get_inputs(X), state)    # y_pred = (timesteps * batch_size, vocab_size)
            y = tf.reshape(tf.transpose(Y), (-1))        # Y = (batch_size, timesteps) -> y = (timestep * batch_size)
            loss = loss_fn(y_true = y, y_pred=y_pred)
            
        params = net.trainables
        grads = tape.gradient(loss, params)
        updater.apply_gradients(zip(grads, params))
        
        metric.add(loss * d2l.size(y), d2l.size(y))
    return np.exp(metric[0] / metric[1])

In [166]:
def train_ch8(net, train_iter, vocab, lr, num_epochs, use_random_iter=False):
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    updater = tf.keras.optimizers.SGD(lr)
    
    get_predict = lambda prefix: predict(prefix, net, 50, vocab)
    
    for e in range(num_epochs):
        ppl = train_epoch_ch8(net, train_iter, loss, updater, use_random_iter)
        
        if (e + 1) % 10 == 0:
            print(get_predict('time traveller'))
        print(f'perplexity {ppl:.1f}')

In [167]:
num_epochs, lr = 500, 1
net = RNNModelScratch(len(vocab), num_hiddens=512, initial_state_fn=initial_state_fn)

train_ch8(net, train_iter, vocab, lr, num_epochs)

perplexity 19.7
perplexity 16.5
perplexity 13.7
perplexity 11.9
perplexity 11.1
perplexity 10.6
perplexity 10.3
perplexity 9.9
perplexity 9.7
[3, 5, 13, 2, 1, 3, 10, 4, 22, 2, 12, 12, 2, 10, 2, 1, 3, 9, 2, 1, 3, 9, 2, 1, 3, 9, 2, 1, 3, 9, 2, 1, 3, 9, 2, 1, 3, 9, 2, 1, 3, 9, 2, 1, 3, 9, 2, 1, 3, 9, 2, 1, 3, 9, 2, 1, 3, 9, 2, 1, 3, 9, 2, 1]
perplexity 9.5
perplexity 9.4
perplexity 9.1
perplexity 9.1


KeyboardInterrupt: 

In [172]:
tf.transpose(Y)

<tf.Tensor: shape=(10, 32), dtype=int32, numpy=
array([[15, 21,  1, 10,  4,  4, 19,  3, 12,  5, 12, 20,  3, 19,  8,  7,
         2,  4, 15,  9,  3, 18,  7, 12,  9,  7,  2, 19,  1, 10,  6, 13],
       [ 9, 14, 16,  4, 14,  6,  1,  7,  2,  7,  2, 10,  9,  7,  1, 17,
         1, 10,  1,  2,  9,  7,  1,  2,  4, 16, 10,  2, 16,  1,  2,  3],
       [ 5, 21,  7,  3, 18,  1,  7, 14, 10, 14,  6,  7,  2, 14,  7,  8,
        17,  2, 26, 10,  2,  1, 14, 10, 12,  1, 13,  8,  4,  9,  1,  9],
       [ 6, 21, 10,  9,  9,  5, 16, 10,  1,  8, 18, 16, 19,  1, 16,  1,
         4,  1, 14,  2,  1, 21, 20, 19, 12,  3,  4,  1,  5,  2,  8,  2],
       [ 2, 12,  2,  2,  3,  6,  1,  1, 17,  6,  3,  2,  1,  5,  1,  3,
         8, 19, 13,  3,  2,  4,  1,  7,  1,  9,  6,  8,  6,  1, 14,  6],
       [ 1,  2, 16, 10,  1,  8,  3, 15,  5,  2,  9,  8, 15,  1,  9,  9,
         1,  7, 20,  9,  4, 15,  4, 14,  3,  2,  1,  7,  3,  9, 21,  1],
       [21,  8,  5,  1, 19,  3,  9,  7,  3,  8, 21,  8,  7,  9,  5,  2,
         4

In [171]:
tf.reshape(tf.transpose(Y), (-1))

<tf.Tensor: shape=(320,), dtype=int32, numpy=
array([15, 21,  1, 10,  4,  4, 19,  3, 12,  5, 12, 20,  3, 19,  8,  7,  2,
        4, 15,  9,  3, 18,  7, 12,  9,  7,  2, 19,  1, 10,  6, 13,  9, 14,
       16,  4, 14,  6,  1,  7,  2,  7,  2, 10,  9,  7,  1, 17,  1, 10,  1,
        2,  9,  7,  1,  2,  4, 16, 10,  2, 16,  1,  2,  3,  5, 21,  7,  3,
       18,  1,  7, 14, 10, 14,  6,  7,  2, 14,  7,  8, 17,  2, 26, 10,  2,
        1, 14, 10, 12,  1, 13,  8,  4,  9,  1,  9,  6, 21, 10,  9,  9,  5,
       16, 10,  1,  8, 18, 16, 19,  1, 16,  1,  4,  1, 14,  2,  1, 21, 20,
       19, 12,  3,  4,  1,  5,  2,  8,  2,  2, 12,  2,  2,  3,  6,  1,  1,
       17,  6,  3,  2,  1,  5,  1,  3,  8, 19, 13,  3,  2,  4,  1,  7,  1,
        9,  6,  8,  6,  1, 14,  6,  1,  2, 16, 10,  1,  8,  3, 15,  5,  2,
        9,  8, 15,  1,  9,  9,  1,  7, 20,  9,  4, 15,  4, 14,  3,  2,  1,
        7,  3,  9, 21,  1, 21,  8,  5,  1, 19,  3,  9,  7,  3,  8, 21,  8,
        7,  9,  5,  2,  4, 14,  5,  2, 10, 23, 18,  1,

In [141]:
O = predict('time traveller ', net, 50, vocab)

''.join([vocab.idx_to_token[i] for i in O])

'time traveller held in hing that is jelinctime se have not said t'

In [246]:
state = net.begin_state(batch_size=X.shape[0])
X = tf.one_hot(X, len(vocab)) # (batch_size, num_steps, vocab_size)
X = tf.transpose(X, perm=[1, 0, 2]) # (num_steps, batch_size, vocab_size)
y_hat, s = net(X, state)

In [247]:
y_hat.shape

TensorShape([320, 28])

In [248]:
X.shape

TensorShape([10, 32, 28])