### GRU and LSTM implementation with TensorFlow to build a language model
#### It is based on my previous notebook. Now I'm trying to improve my
#### previous character-level language model by using GRU and LSTM units
#### and making the model deeper

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import random
from typing import Union
from math import ceil, sqrt
from os import mkdir, listdir

In [None]:
# End-of-sentence marker: the newline character (ASCII code 10).
# It is also the first entry of the list returned by build_vocabulary().
EOS = chr(10)

def build_vocabulary() -> list:
    """Return the character vocabulary: ASCII codes 10 through 127.

    The list starts at chr(10) (newline) and holds one single-character
    string per code, in increasing code order.
    """
    return list(map(chr, range(10, 128)))

def word2index(vocabulary: list, word: str) -> int:
    """Return the position of the first occurrence of 'word' in 'vocabulary'.

    Raises:
        ValueError: if 'word' is not in the vocabulary.
    """
    position = vocabulary.index(word)
    return position

def words2onehot(vocabulary: list, words: list) -> np.ndarray:
    """Encode 'words' as a one-hot matrix over 'vocabulary'.

    Args:
        vocabulary: list of known words; the column order of the result.
        words: words to encode, one row per word.

    Returns:
        A float array of shape (len(words), len(vocabulary)) where row r
        has a single 1 in the column of words[r]. An empty 'words' list
        yields an array with zero rows.

    Raises:
        ValueError: if a word is not in the vocabulary.
    """
    n_words = len(words)
    n_voc = len(vocabulary)
    # dtype=int matters for the empty case: np.array([]) would be float64,
    # which NumPy rejects as an index array.
    indices = np.array([word2index(vocabulary, word) for word in words],
                       dtype=int)
    onehot = np.zeros((n_words, n_voc))
    onehot[np.arange(n_words), indices] = 1
    return onehot

def sample_word(vocabulary: list, prob: np.ndarray, threshold: float) -> str:
    """Draw one word from 'vocabulary' using truncated sampling.

    Words are ranked by decreasing probability and kept until their
    cumulative probability exceeds 'threshold'; every later word gets
    probability 0. The kept probabilities are renormalized by their sum
    and one word is drawn with np.random.choice.

    Args:
        vocabulary: candidate words, aligned with 'prob'.
        prob: probability of each word (the softmax output of the model).
        threshold: cumulative probability after which words are dropped.
    """
    ranked = sorted(
        ([vocabulary[idx], p] for idx, p in enumerate(prob.tolist())),
        key=lambda pair: pair[1],
        reverse=True,
    )

    kept_mass = 0
    candidates = []
    kept = []
    for token, p in ranked:
        # once enough mass has been collected, silence the remaining tail
        if kept_mass > threshold:
            p = 0
        kept_mass += p
        candidates.append(token)
        kept.append(p)

    weights = np.array([p / kept_mass for p in kept])

    return np.random.choice(candidates, p=weights)

In [None]:
class Model:
    def __init__(self,
                 vocabulary: list,
                 inter_time_step_size: int,
                 unit_type: str,
                 depth: int):
        """Build a stacked recurrent language model with fresh weights.

        Args:
            vocabulary: symbols the model reads and emits; its length sets
                the one-hot input width and the softmax output width.
            inter_time_step_size: width of the state each layer carries
                from one time step to the next.
            unit_type: recurrent unit to use, either 'gru' or 'lstm'.
            depth: number of stacked recurrent layers, at least 1.

        Raises:
            ValueError: if 'unit_type' or 'depth' is not supported.
        """
        # Fail early: any other unit_type would get LSTM weights here while
        # reset_state() only creates the cell state for 'lstm', and a depth
        # below 1 leaves no recurrent layer in front of the output layer.
        if unit_type not in ('gru', 'lstm'):
            raise ValueError(
                f"unit_type must be 'gru' or 'lstm', got {unit_type!r}")
        if depth < 1:
            raise ValueError(f'depth must be >= 1, got {depth!r}')

        self._init(vocabulary, inter_time_step_size, unit_type, depth)

        if unit_type == 'gru':
            self.weights = self._init_gru()
        else:
            self.weights = self._init_lstm()

        # weights and bias used to compute y (the softmax predictions)
        self.wy = tf.Variable(tf.random.normal(
            stddev=sqrt(2.0/(self.inter_time_step_size+self.vocab_size)),
            shape=(self.inter_time_step_size, self.vocab_size),
            dtype=tf.double))
        self.by = tf.Variable(tf.random.normal(
            stddev=sqrt(2.0/(1+self.vocab_size)),
            shape=(1, self.vocab_size),
            dtype=tf.double))

        # self.weights is the flat list of every trainable variable
        self.weights.extend([self.wy, self.by])
    
    def _init(self,
              vocabulary: list,
              inter_time_step_size: int,
              unit_type: str,
              depth: int):
        """Store the hyper-parameters and derive the weight recipes.

        No weights are created here: the method only records, for every
        layer, the (shape, standard deviation) pair later used to draw the
        initial values, and creates a fresh Adam optimizer.
        """
        hidden = inter_time_step_size
        n_vocab = len(vocabulary)

        self.vocab = vocabulary
        self.vocab_size = n_vocab
        self.inter_time_step_size = hidden
        self.combined_size = n_vocab + hidden
        self.unit_type = unit_type
        self.depth = depth

        # layer 0: rows cover the state plus a vocabulary-sized input
        self.weights_shape_0 = (self.combined_size, hidden)
        self.weights_std_dev_0 = sqrt(2.0 / (self.combined_size + hidden))

        # layers above 0: rows cover two state-sized blocks
        self.weights_shape_1 = (2 * hidden, hidden)
        self.weights_std_dev_1 = sqrt(2.0 / (3 * hidden))

        self.biases_shape = (1, hidden)
        self.biases_std_dev = sqrt(2.0 / (1 + hidden))

        first_layer = (self.weights_shape_0, self.weights_std_dev_0)
        upper_layer = (self.weights_shape_1, self.weights_std_dev_1)
        bias_recipe = (self.biases_shape, self.biases_std_dev)

        # one recipe per layer, in layer order
        self.w_shapes = [first_layer] + [upper_layer] * (depth - 1)
        self.b_shapes = [bias_recipe] * depth

        self.optimizer = tf.keras.optimizers.Adam()
    
    def _init_gru(self):
        """Create the GRU parameters of every layer and return them flat.

        Sets self.wr / self.wu / self.wa (weight matrices) and
        self.br / self.bu / self.ba (biases), each a list with one
        tf.Variable per layer, drawn from a normal distribution.

        Returns:
            A flat list of all created variables, grouped as
            wr, br, wu, bu, wa, ba.
        """
        def draw(recipes):
            # one randomly initialised variable per (shape, std_dev) recipe
            return [tf.Variable(tf.random.normal(shape=shape,
                                                 stddev=std_dev,
                                                 dtype=tf.double))
                    for shape, std_dev in recipes]

        for name in ('wr', 'wu', 'wa'):
            setattr(self, name, draw(self.w_shapes))

        for name in ('br', 'bu', 'ba'):
            setattr(self, name, draw(self.b_shapes))

        groups = (self.wr, self.br, self.wu, self.bu, self.wa, self.ba)
        return [variable for group in groups for variable in group]
    
    def _init_lstm(self):
        """Create the LSTM parameters of every layer and return them flat.

        Sets self.wu / self.wf / self.wo / self.wc (weight matrices) and
        self.bu / self.bf / self.bo / self.bc (biases), each a list with
        one tf.Variable per layer, drawn from a normal distribution.

        Returns:
            A flat list of all created variables, grouped as
            wu, bu, wf, bf, wo, bo, wc, bc.
        """
        def draw(recipes):
            # one randomly initialised variable per (shape, std_dev) recipe
            return [tf.Variable(tf.random.normal(shape=shape,
                                                 stddev=std_dev,
                                                 dtype=tf.double))
                    for shape, std_dev in recipes]

        for name in ('wu', 'wf', 'wo', 'wc'):
            setattr(self, name, draw(self.w_shapes))

        for name in ('bu', 'bf', 'bo', 'bc'):
            setattr(self, name, draw(self.b_shapes))

        groups = (self.wu, self.bu, self.wf, self.bf,
                  self.wo, self.bo, self.wc, self.bc)
        return [variable for group in groups for variable in group]
    
    def reset_state(self, num_samples: int) -> None:
        """Reset the recurrent state of every layer to zeros.

        Creates self.a (and self.c when the unit type is 'lstm') as lists
        holding one (num_samples, inter_time_step_size) zero tensor per
        layer.
        """
        state_shape = (num_samples, self.inter_time_step_size)

        def zeros_per_layer():
            return [tf.zeros(state_shape, dtype=tf.double)
                    for _ in range(self.depth)]

        self.a = zeros_per_layer()
        if self.unit_type == 'lstm':
            self.c = zeros_per_layer()
    
    def __call__(self,
                 x: Union[np.ndarray, tf.Tensor],
                 y: Union[np.ndarray, tf.Tensor, None] = None) -> tf.Tensor:
        """Advance the model by one time step.

        The input goes through every recurrent layer in turn (updating
        the stored state of each one), then through the output layer.

        Args:
            x: input of this time step, one row per sample.
            y: target labels for this time step, or None when predicting.

        Returns:
            The softmax probabilities when 'y' is None, otherwise the mean
            softmax cross-entropy between 'y' and the computed logits.
        """
        hidden = x
        for level in range(self.depth):
            hidden = self._call_level(level, hidden)

        logits = tf.linalg.matmul(hidden, self.wy) + self.by

        if y is not None:
            # training: average the per-sample cross-entropy
            per_sample = tf.nn.softmax_cross_entropy_with_logits(y, logits)
            return tf.math.reduce_mean(per_sample)

        # prediction: turn the logits into probabilities
        return tf.nn.softmax(logits)
    
    def _call_level(self,
                    level: int,
                    x: Union[np.ndarray, tf.Tensor]) -> tf.Tensor:
        """Run layer 'level' on 'x' with the unit selected by unit_type."""
        if self.unit_type == 'gru':
            return self._call_gru(level, x)
        return self._call_lstm(level, x)
    
    def _call_gru(self,
                  level: int,
                  x: Union[np.ndarray, tf.Tensor]) -> tf.Tensor:
        """Apply one GRU step of layer 'level' and return its new state.

        The stored state self.a[level] is first cut down to the number of
        rows of 'x', then replaced by the updated state.
        """
        batch = x.shape[0]

        # keep only as many state rows as there are input rows
        self.a[level] = self.a[level][0:batch]
        prev_state = self.a[level]

        stacked = tf.concat([prev_state, x], axis=1)

        def gate(w, b):
            return tf.math.sigmoid(
                tf.linalg.matmul(stacked, w[level]) + b[level])

        relevance_gate = gate(self.wr, self.br)
        update_gate = gate(self.wu, self.bu)

        # the candidate sees the previous state scaled by the relevance gate
        gated_input = tf.concat([relevance_gate * prev_state, x], axis=1)
        a_candidate = tf.math.tanh(
            tf.linalg.matmul(gated_input, self.wa[level]) + self.ba[level])

        # blend candidate and previous state with the update gate
        new_state = (update_gate * a_candidate
                     + (1 - update_gate) * prev_state)
        self.a[level] = new_state

        return new_state
    
    def _call_lstm(self,
                   level: int,
                   x: Union[np.ndarray, tf.Tensor]) -> tf.Tensor:
        """Apply one LSTM step of layer 'level' and return its new output.

        The stored states self.a[level] and self.c[level] are first cut
        down to the number of rows of 'x', then replaced by the updated
        values.
        """
        batch = x.shape[0]

        # keep only as many state rows as there are input rows
        self.a[level] = self.a[level][0:batch]
        self.c[level] = self.c[level][0:batch]
        prev_a = self.a[level]
        prev_c = self.c[level]

        stacked = tf.concat([prev_a, x], axis=1)

        def affine(w, b):
            return tf.linalg.matmul(stacked, w[level]) + b[level]

        update_gate = tf.math.sigmoid(affine(self.wu, self.bu))
        forget_gate = tf.math.sigmoid(affine(self.wf, self.bf))
        output_gate = tf.math.sigmoid(affine(self.wo, self.bo))

        c_candidate = tf.math.tanh(affine(self.wc, self.bc))

        # new cell state: gated candidate plus gated previous cell state
        next_c = update_gate * c_candidate + forget_gate * prev_c
        self.c[level] = next_c

        next_a = output_gate * tf.math.tanh(next_c)
        self.a[level] = next_a

        return next_a
    
    def fit(self,
            sentences: list,
            batch_size: int = 128,
            epochs: int = 10) -> None:
        """Train the model on 'sentences' with mini-batch gradient descent.

        At every time step the model receives the previous symbol of each
        sentence (an all-zero vector at t=0) and is trained to predict the
        next one; EOS is the target right after the last symbol. One
        optimizer step is applied per batch, on the mean of the per-step
        losses.

        Args:
            sentences: training sentences; every symbol must belong to
                the vocabulary. The list itself is left untouched.
            batch_size: number of sentences per batch.
            epochs: number of passes over the whole data set.
        """
        # Shuffle a private copy so the caller's list keeps its order.
        sentences = list(sentences)
        n_sent = len(sentences)
        num_batches = ceil(n_sent / batch_size)
        total_steps = epochs * num_batches

        for epoch in range(epochs):

            random.shuffle(sentences)

            for batch_idx, start in enumerate(range(0, n_sent, batch_size)):

                print('Training model: %05.2f%%' %
                      (100*(epoch*num_batches+batch_idx+1)/total_steps,),
                      end='\r')

                # Longest first: the sentences still running at step t are
                # then always the first rows of the batch, which is what
                # the state slicing in the recurrent layers relies on.
                batch_sent = sorted(sentences[start:start+batch_size],
                                    key=len, reverse=True)

                init_num_words = len(batch_sent)
                self.reset_state(init_num_words)
                x = np.zeros((init_num_words, self.vocab_size))

                # one extra step so the longest sentence also gets its EOS
                time_steps = len(batch_sent[0])+1

                with tf.GradientTape() as tape:

                    losses = []
                    for t in range(time_steps):
                        words = []
                        for sent in batch_sent:
                            if t > len(sent):
                                # this sentence and all shorter ones after
                                # it already emitted their EOS
                                break
                            # Every sentence ending exactly at step t gets
                            # an EOS target, not only the first one found.
                            words.append(sent[t] if t < len(sent) else EOS)

                        y = words2onehot(self.vocab, words)
                        n = y.shape[0]
                        losses.append(self(x[0:n], y))
                        # the targets of this step are the next inputs
                        x = y

                    loss_value = tf.math.reduce_mean(losses)

                grads = tape.gradient(loss_value, self.weights)
                self.optimizer.apply_gradients(zip(grads, self.weights))

    def sample(self, threshold: float = 0.9) -> str:
        """Generate a new sentence from the learned model.

        Starting from a zero state and an all-zero input, symbols are
        drawn one at a time with sample_word() and fed back as the next
        input until EOS is drawn.

        Args:
            threshold: cumulative-probability cut-off given to
                sample_word().

        Returns:
            The generated sentence, without the final EOS.
        """
        self.reset_state(1)
        pieces = []
        x = np.zeros((1, self.vocab_size))
        while True:
            probabilities = tf.reshape(self(x), (-1,)).numpy()
            word = sample_word(self.vocab, probabilities, threshold)
            if word == EOS:
                return ''.join(pieces)
            pieces.append(word)
            x = words2onehot(self.vocab, [word])
    
    def predict_next(self, sentence: str,
                     threshold: float = 0.9) -> str:
        """Sample a continuation for the beginning of a sentence.

        Args:
            sentence: the text to continue; surrounding whitespace is
                stripped and every remaining symbol must belong to the
                vocabulary. May be empty, in which case a whole sentence
                is generated.
            threshold: cumulative-probability cut-off given to
                sample_word().

        Returns:
            The sampled continuation only (without 'sentence' and without
            the final EOS).
        """
        self.reset_state(1)

        # Begin with the same all-zero input that fit() and sample() use
        # at t=0, so the prompt is read from the state the model was
        # trained with. This also defines y_hat when the prompt is empty.
        y_hat = self(np.zeros((1, self.vocab_size)))
        for word in sentence.strip():
            y_hat = self(words2onehot(self.vocab, [word]))

        pieces = []
        while True:
            word = sample_word(self.vocab,
                               tf.reshape(y_hat, (-1,)).numpy(),
                               threshold)
            if word == EOS:
                break
            pieces.append(word)
            # feed the sampled symbol back to get the next distribution
            y_hat = self(words2onehot(self.vocab, [word]))
        return ''.join(pieces)
    
    def save(self, name: str) -> None:
        """Write the model to a new directory './<name>'.

        The directory receives the hyper-parameters as small text files
        and one .npy file per weight under './<name>/weights'.

        Args:
            name: directory name, relative to the working directory.

        Raises:
            FileExistsError: if the directory already exists; an existing
                model is never overwritten.
        """
        mkdir(f'./{name}')
        mkdir(f'./{name}/weights')

        # newline='' writes the symbols exactly as they are: the vocabulary
        # may contain '\n' and '\r', which text mode could otherwise
        # translate to the platform line ending.
        with open(f'./{name}/vocabulary.txt', 'w', newline='') as f:
            f.write('[separator]'.join(self.vocab))
        with open(f'./{name}/inter_time_step_size.txt', 'w') as f:
            f.write(str(self.inter_time_step_size))
        with open(f'./{name}/unit_type.txt', 'w') as f:
            f.write(self.unit_type)
        with open(f'./{name}/depth.txt', 'w') as f:
            f.write(str(self.depth))

        if self.unit_type == 'gru':
            weight_names = ['wr', 'br', 'wu', 'bu', 'wa', 'ba']
        else:
            weight_names = ['wu', 'bu', 'wf', 'bf', 'wo', 'bo', 'wc', 'bc']

        # one file per parameter and per layer: '<parameter>_<layer>.npy'
        for s in weight_names:
            layers = getattr(self, s)
            for i in range(self.depth):
                np.save(f'./{name}/weights/{s}_{i}.npy', layers[i].numpy())

        np.save(f'./{name}/weights/wy.npy', self.wy.numpy())
        np.save(f'./{name}/weights/by.npy', self.by.numpy())
    
    def load(self, name: str) -> None:
        """Replace this model's configuration and weights with a saved one.

        Reads the directory './<name>' written by save(): the
        hyper-parameters, then one .npy file per parameter and per layer,
        then the output layer. A fresh optimizer is created by _init().

        Args:
            name: directory name, relative to the working directory.
        """
        # newline='' reads the symbols exactly as stored; the default
        # universal-newline mode would turn a '\r' symbol into '\n'.
        with open(f'./{name}/vocabulary.txt', 'r', newline='') as f:
            vocabulary = f.read().split('[separator]')
        # NOTE(review): a file written in text mode on Windows stores the
        # '\n' symbol as '\r\n'; map it back so such files still load.
        vocabulary = ['\n' if word == '\r\n' else word
                      for word in vocabulary]
        with open(f'./{name}/inter_time_step_size.txt', 'r') as f:
            inter_time_step_size = int(f.read())
        with open(f'./{name}/unit_type.txt', 'r') as f:
            unit_type = f.read().strip()
        with open(f'./{name}/depth.txt', 'r') as f:
            depth = int(f.read())

        self._init(vocabulary, inter_time_step_size,
                   unit_type, depth)

        if unit_type == 'gru':
            weights_names = ['wr', 'br', 'wu', 'bu', 'wa', 'ba']
        else:
            weights_names = ['wu', 'bu', 'wf', 'bf', 'wo', 'bo', 'wc', 'bc']

        # Load by explicit name and layer index rather than by directory
        # listing: sorted file names would put layer 10 before layer 2.
        self.weights = []
        for attr_name in weights_names:
            layers = [
                tf.Variable(np.load(f'./{name}/weights/{attr_name}_{i}.npy'))
                for i in range(depth)]
            setattr(self, attr_name, layers)
            # keep self.weights a flat list of variables, as the
            # constructor does; fit() zips it with the gradients
            self.weights.extend(layers)

        self.wy = tf.Variable(np.load(f'./{name}/weights/wy.npy'))
        self.by = tf.Variable(np.load(f'./{name}/weights/by.npy'))
        self.weights.extend([self.wy, self.by])

In [None]:
# Read the headlines CSV into a DataFrame.
df = pd.read_csv('../input/million-headlines/abcnews-date-text.csv')
# Last expression of the cell: the notebook displays the DataFrame.
df

In [None]:
# Character vocabulary shared by the model's input and output layers.
vocabulary = build_vocabulary()

In [None]:
# Training sentences: the 'headline_text' column as a plain Python list.
sentences = df['headline_text'].values.tolist()

In [None]:
# Three stacked LSTM layers, each carrying a 512-wide state.
model = Model(vocabulary,
              inter_time_step_size=512,
              unit_type='lstm',
              depth=3)

In [None]:
# Train for 20 passes over the headlines, 1024 sentences per batch.
model.fit(sentences, batch_size=1024, epochs=20)

In [None]:
# Write the trained model to ./news_headlines_model
# (save() creates the directory with mkdir, which fails if it already exists).
model.save('news_headlines_model')
# To restore a previously saved model instead of training:
# model.load('news_headlines_model')

In [None]:
# Print 20 sentences sampled from the trained model.
for i in range(20):
    print(model.sample())

In [None]:
# Beginning of a sentence for the model to continue.
s = 'scientists just discovered'
# predict_next() returns only the continuation, so append it to the prompt.
s += model.predict_next(s)
# Last expression of the cell: the notebook displays the completed string.
s