In [None]:
import collections
import functools
import requests
import nltk
import re
import numpy as np
import pandas as pd
import warnings
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    import tensorflow as tf

In [None]:
# Download the Tiny Shakespeare corpus as a single UTF-8 string.
url = 'https://raw.githubusercontent.com/jcjohnson/torch-rnn/master/data/tiny-shakespeare.txt'
# The timeout stops the cell from hanging on a stalled connection, and
# raise_for_status() fails loudly on an HTTP error instead of silently
# tokenizing an error page as if it were the corpus.
response = requests.get(url, timeout=30)
response.raise_for_status()
text = response.content.decode('utf-8')
text[:100]

In [None]:
def sent_to_tokens(sent, tokenizer):
    """Tokenize one sentence and mask URLs, hashtags and user mentions.

    Args:
        sent: Sentence text passed to ``tokenizer.tokenize``.
        tokenizer: Object exposing ``tokenize(text)`` that returns an iterable
            of word strings.

    Returns:
        list[str]: The words, with every ``http(s)://...`` run replaced by
        ``<URL>``, every ``#...`` run by ``<TOPIC>`` and every ``@...`` run by
        ``<USER>``.
    """
    # Rules run in this order on every word; a later rule sees the output of
    # the earlier ones.
    masking_rules = (
        (r'https?://\S+', '<URL>'),
        (r'#\S+', '<TOPIC>'),
        (r'@\S+', '<USER>'),
    )
    masked_words = []
    for word in tokenizer.tokenize(sent):
        for pattern, placeholder in masking_rules:
            word = re.sub(pattern, placeholder, word)
        masked_words.append(word)
    return masked_words


def basic_text_preprocess(text):
    """Lower-case the corpus, split it into sentences and tokenize each one.

    Args:
        text: Raw corpus as one string.

    Returns:
        list[list[str]]: One token list per sentence, as produced by
        ``sent_to_tokens`` with an ``nltk.TweetTokenizer``.

    Side effects:
        Prints the number of sentences.
    """
    # Removing '-\n' glues together words that were hyphenated across a line break.
    normalized = text.lower().replace('-\n', '')
    word_tokenizer = nltk.TweetTokenizer()
    sentences = [
        sent_to_tokens(sentence, tokenizer=word_tokenizer)
        for sentence in nltk.tokenize.sent_tokenize(normalized)
    ]
    print(f'corpus size: {len(sentences)}')
    return sentences

# Turn the downloaded corpus into a list of per-sentence token lists.
tokens = basic_text_preprocess(text)
# Preview the first ten tokenized sentences.
tokens[:10]

In [None]:
def build_vocab(tokens):
    """Build token<->id lookup tables from tokenized sentences.

    Args:
        tokens: Iterable of sentences, each an iterable of token strings.
            May be empty.

    Returns:
        tuple: ``(tok2idx, idx2tok)``.
        ``idx2tok`` is a list whose first four entries are the special tokens
        ``<UNK>``, ``<START>``, ``<END>``, ``<PAD>`` followed by the corpus
        vocabulary. ``tok2idx`` is a ``defaultdict`` mapping token -> position
        in ``idx2tok``; unknown tokens map to 0 (``<UNK>``).

    Side effects:
        Prints the number of distinct tokens and the final vocabulary size.
    """
    # One linear pass. Counter keeps first-seen order, so token ids are
    # reproducible between runs and a token is never listed twice, even when
    # the corpus holds a single sentence.
    word_counts = collections.Counter()
    for sent in tokens:
        word_counts.update(sent)
    unique_tokens = list(word_counts)
    print(len(unique_tokens))
    if len(unique_tokens) > 1000:
        # Large vocabularies keep only the most frequent 80% of the types;
        # the rest fall back to <UNK> at lookup time.
        max_len = int(len(unique_tokens) * 0.8)
        unique_tokens = [tok for tok, _ in word_counts.most_common(max_len)]
    print(f'vocab size: {len(unique_tokens)}')
    idx2tok = ['<UNK>', '<START>', '<END>', '<PAD>'] + unique_tokens
    # NOTE: a defaultdict stores a key the first time it is looked up, so
    # len(tok2idx) can grow after out-of-vocabulary lookups; len(idx2tok) is
    # the stable vocabulary size.
    tok2idx = collections.defaultdict(lambda: 0)
    for i, tok in enumerate(idx2tok):
        tok2idx[tok] = i
    return tok2idx, idx2tok

# Build the lookup tables from the tokenized corpus: tok2idx (token -> id) and idx2tok (id -> token).
tok2idx, idx2tok = build_vocab(tokens)

In [None]:
def pad_train(x, tok2idx, seq_len):
    """Prefix an id sequence with ``<START>`` and fit it to ``seq_len`` entries.

    Args:
        x: List of token ids.
        tok2idx: Mapping that contains the ``<START>`` and ``<PAD>`` ids.
        seq_len: Length of the returned array.

    Returns:
        tuple: ``(ids, n_real)`` where ``ids`` is an int32 array that is either
        truncated to ``seq_len`` or right-padded with ``<PAD>``, and ``n_real``
        is the count of non-padding entries (``seq_len`` when truncated).
    """
    sequence = [tok2idx['<START>']] + x
    n_real = len(sequence)
    if n_real < seq_len:
        n_missing = seq_len - n_real
        sequence = sequence + n_missing * [tok2idx['<PAD>']]
        return np.array(sequence, dtype=np.int32), n_real
    # Long enough already: cut to the requested length.
    return np.array(sequence[:seq_len], dtype=np.int32), seq_len


def pad_target(y, tok2idx, seq_len):
    """Append ``<END>`` to an id sequence and fit it to ``seq_len`` entries.

    Args:
        y: List of token ids.
        tok2idx: Mapping that contains the ``<END>`` and ``<PAD>`` ids.
        seq_len: Length of the returned array.

    Returns:
        tuple: ``(ids, n_real)`` where ``ids`` is an int32 array that is either
        truncated to ``seq_len`` or right-padded with ``<PAD>``, and ``n_real``
        is the count of non-padding entries (``seq_len`` when truncated).
    """
    sequence = y + [tok2idx['<END>']]
    n_real = len(sequence)
    if n_real < seq_len:
        n_missing = seq_len - n_real
        sequence = sequence + n_missing * [tok2idx['<PAD>']]
        return np.array(sequence, dtype=np.int32), n_real
    # Long enough already: cut to the requested length.
    return np.array(sequence[:seq_len], dtype=np.int32), seq_len


def batches_generator(batch_size, tokens, tok2idx, seq_len):
    """Yield shuffled, padded ``(inputs, targets, lengths)`` training batches.

    A sentence ``w1 .. wn`` becomes the input row ``<START> w1 .. wn`` and the
    target row ``w1 .. wn <END>``, so the target at step ``t`` is the token
    that follows the input at step ``t``. Both rows are cut to at most
    ``seq_len`` ids, the same convention as ``pad_train`` / ``pad_target``.

    Args:
        batch_size: Number of sentences per batch; must be >= 1.
        tokens: Sequence of sentences, each a sequence of token strings.
        tok2idx: Mapping token -> id that contains ``<START>``, ``<END>`` and
            ``<PAD>``.
        seq_len: Maximum number of time steps per row; ``None`` disables
            truncation.

    Yields:
        tuple: ``(X, Y, lengths)``. ``X`` and ``Y`` are int32 arrays of shape
        ``[batch, longest_row_in_batch]`` right-padded with the ``<PAD>`` id;
        ``lengths`` is an int32 array with the unpadded length of each row.
        Every yielded batch holds at least one sentence.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be >= 1, got {batch_size}')
    start_id = tok2idx['<START>']
    end_id = tok2idx['<END>']
    pad_id = tok2idx['<PAD>']
    n_samples = len(tokens)
    order = np.random.permutation(n_samples)  # shuffle data
    # Stepping through the permutation in strides of batch_size never produces
    # a trailing empty batch when n_samples is a multiple of batch_size.
    for batch_start in range(0, n_samples, batch_size):
        x_list, y_list = [], []
        for idx in order[batch_start:batch_start + batch_size]:
            ids = [tok2idx[t] for t in tokens[idx]]
            # Slicing with seq_len=None keeps the full row.
            x_list.append(([start_id] + ids)[:seq_len])
            y_list.append((ids + [end_id])[:seq_len])
        actual_lengths = np.array([len(row) for row in x_list], dtype=np.int32)
        max_len = int(actual_lengths.max())
        X = np.full([len(x_list), max_len], pad_id, dtype=np.int32)
        Y = np.full([len(x_list), max_len], pad_id, dtype=np.int32)
        for n, (x_row, y_row) in enumerate(zip(x_list, y_list)):
            X[n, :len(x_row)] = x_row
            Y[n, :len(y_row)] = y_row
        yield X, Y, actual_lengths

In [None]:
class NeuralLanguageModel:
    """Empty shell for the language model.

    The class body defines nothing; later cells attach the behaviour by
    assigning functions to attributes of this class
    (``NeuralLanguageModel.<name> = classmethod(...)``).
    """
    pass

In [None]:
def declare_placeholders(self):
    """Create the graph inputs and store them as attributes of ``self``.

    Attributes set:
        input_batch: int32 placeholder with two unspecified dimensions.
        target_batch: int32 placeholder with two unspecified dimensions.
        lengths: int32 placeholder with one unspecified dimension.
        dropout_ph: float32 scalar that evaluates to 1.0 unless fed. The graph
            node is named 'dropout_rate'.
        learning_rate_ph: float32 scalar placeholder.
    """
    self.input_batch = tf.placeholder(tf.int32, [None, None], name='input_batch')
    self.target_batch = tf.placeholder(tf.int32, [None, None], name='target_batch')
    self.lengths = tf.placeholder(tf.int32, [None], name='lengths')
    default_keep = tf.cast(1.0, tf.float32)
    self.dropout_ph = tf.placeholder_with_default(default_keep, shape=[], name='dropout_rate')
    self.learning_rate_ph = tf.placeholder(tf.float32, [], name='learning_rate')

In [None]:
# Attach declare_placeholders to the class. Wrapped in classmethod, its first
# argument receives the class itself, so the attributes it sets live on
# NeuralLanguageModel. The assignment is outside a class body, so the
# double-underscore name is stored exactly as written (no name mangling).
NeuralLanguageModel.__declare_placeholders = classmethod(declare_placeholders)

In [None]:
def build_layers(self, vocabulary_size, embedding_dim, n_hidden, n_layers=2):
    """Build embedding -> stacked LSTM -> dense projection and set ``self.logits``.

    Reads ``self.input_batch``, ``self.lengths`` and ``self.dropout_ph``.

    Args:
        vocabulary_size: Number of rows in the embedding matrix and number of
            output units of the final dense layer.
        embedding_dim: Width of each embedding vector.
        n_hidden: Number of units in every LSTM layer.
        n_layers: Number of stacked LSTM layers (default 2).
    """
    initial_embedding_matrix = np.random.randn(vocabulary_size, embedding_dim) / np.sqrt(embedding_dim)
    embedding_matrix = tf.Variable(initial_embedding_matrix, dtype=tf.float32, name='embedding_matrix')

    def make_cell():
        # Every layer needs its own cell object. Repeating one object
        # (``[cell] * 2``) makes the layers share a single set of weights, or
        # fails outright, depending on the TensorFlow version.
        lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden)
        return tf.nn.rnn_cell.DropoutWrapper(
            cell=lstm_cell, input_keep_prob=self.dropout_ph, output_keep_prob=self.dropout_ph,
            state_keep_prob=self.dropout_ph)

    mul_cell = tf.nn.rnn_cell.MultiRNNCell(cells=[make_cell() for _ in range(n_layers)])
    embeddings = tf.nn.embedding_lookup(embedding_matrix, self.input_batch)
    # The final state is not needed; only the per-step outputs feed the projection.
    output, _ = tf.nn.dynamic_rnn(cell=mul_cell,
                                  inputs=embeddings,
                                  sequence_length=self.lengths,
                                  dtype=tf.float32)
    self.logits = tf.layers.dense(output, vocabulary_size, activation=None)

In [None]:
# Attach build_layers as a classmethod; the name is stored unmangled because
# the assignment happens outside a class body.
NeuralLanguageModel.__build_layers = classmethod(build_layers)

In [None]:
def compute_predictions(self):
    """Derive outputs from ``self.logits``.

    Attributes set:
        probs: Softmax of the logits.
        predictions: Index of the largest probability along the last axis.
    """
    self.probs = tf.nn.softmax(self.logits)
    self.predictions = tf.argmax(self.probs, axis=-1)

In [None]:
# Attach compute_predictions as a classmethod; the name is stored unmangled
# because the assignment happens outside a class body.
NeuralLanguageModel.__compute_predictions = classmethod(compute_predictions)

In [None]:
def compute_loss(self, vocabulary_size, pad_index):
    """Set ``self.loss`` to the mean cross-entropy over non-padding positions.

    Reads ``self.target_batch``, ``self.logits`` and ``self.input_batch``.

    Args:
        vocabulary_size: Kept for interface compatibility; the sparse loss
            takes the class count from the last dimension of the logits.
        pad_index: Token id used for padding. Positions whose *input* id
            equals this value are left out of the average.
    """
    # The sparse variant consumes integer class ids directly, so no one-hot
    # tensor with a vocabulary-sized last axis is materialised.
    # Target ids must be valid class indices for the logits.
    loss_tensor = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.target_batch,
                                                                 logits=self.logits)
    # tf.boolean_mask expects a boolean mask; tf.not_equal already returns one.
    mask = tf.not_equal(self.input_batch, pad_index)
    self.loss = tf.reduce_mean(tf.boolean_mask(loss_tensor, mask))

In [None]:
# Attach compute_loss as a classmethod; the name is stored unmangled because
# the assignment happens outside a class body.
NeuralLanguageModel.__compute_loss = classmethod(compute_loss)

In [None]:
def perform_optimization(self):
    """Create the Adam optimizer, clip each gradient and build ``self.train_op``.

    Reads ``self.learning_rate_ph`` and ``self.loss``.

    Attributes set:
        optimizer: ``tf.train.AdamOptimizer`` driven by ``learning_rate_ph``.
        grads_and_vars: ``(gradient, variable)`` pairs with every gradient
            clipped to an L2 norm of 5.0 (each tensor clipped on its own).
        train_op: Op that applies the clipped gradients.
    """
    self.optimizer = tf.train.AdamOptimizer(self.learning_rate_ph)
    grads_and_vars = self.optimizer.compute_gradients(self.loss)

    clip_norm = tf.cast(5.0, tf.float32)
    # compute_gradients reports None for a variable the loss does not depend
    # on. Such pairs are passed through untouched, since clipping None raises.
    self.grads_and_vars = [
        (grad if grad is None else tf.clip_by_norm(grad, clip_norm), var)
        for grad, var in grads_and_vars
    ]

    self.train_op = self.optimizer.apply_gradients(self.grads_and_vars)

In [None]:
# Attach perform_optimization as a classmethod; the name is stored unmangled
# because the assignment happens outside a class body.
NeuralLanguageModel.__perform_optimization = classmethod(perform_optimization)

In [None]:
def init_model(self, vocabulary_size, embedding_dim, n_hidden, pad_index):
    """Build the whole graph by running the construction steps in order.

    Args:
        vocabulary_size: Forwarded to the layer-building and loss steps.
        embedding_dim: Forwarded to the layer-building step.
        n_hidden: Forwarded to the layer-building step.
        pad_index: Forwarded to the loss step.
    """
    # Order matters: each step reads attributes set by the steps before it
    # (placeholders -> logits -> predictions / loss -> train op).
    # This function is defined outside a class body, so the double-underscore
    # attribute names below are looked up exactly as written.
    self.__declare_placeholders()
    self.__build_layers(vocabulary_size, embedding_dim, n_hidden)
    self.__compute_predictions()
    self.__compute_loss(vocabulary_size, pad_index)
    self.__perform_optimization()

In [None]:
# Install init_model as the constructor, wrapped in classmethod: calling
# NeuralLanguageModel(...) runs init_model with the class (not the new
# instance) as its first argument, so the graph tensors end up as class
# attributes shared by every instance.
NeuralLanguageModel.__init__ = classmethod(init_model)

In [None]:
def train_on_batch(self, sess, x_batch, y_batch, lengths, learning_rate, dropout_keep_prob):
    """Run one optimisation step and return the loss of that same step.

    Args:
        sess: Session-like object exposing ``run(fetches, feed_dict=...)``.
        x_batch: Values fed to ``self.input_batch``.
        y_batch: Values fed to ``self.target_batch``.
        lengths: Values fed to ``self.lengths``.
        learning_rate: Value fed to ``self.learning_rate_ph``.
        dropout_keep_prob: Value fed to ``self.dropout_ph``.

    Returns:
        The value of ``self.loss`` from the same forward pass that produced
        the gradients for this update.
    """
    feed_dict = {
        self.input_batch: x_batch,
        self.target_batch: y_batch,
        self.learning_rate_ph: learning_rate,
        self.dropout_ph: dropout_keep_prob,
        self.lengths: lengths
    }
    # Fetching both in one run avoids a second forward pass, whose loss would
    # also be taken after the update and under a different dropout mask.
    _, batch_loss = sess.run([self.train_op, self.loss], feed_dict=feed_dict)
    return batch_loss

In [None]:
# Expose train_on_batch on the class as a classmethod, so its first argument
# is the class, which is where the graph tensors it reads are looked up.
NeuralLanguageModel.train_on_batch = classmethod(train_on_batch)

In [None]:
def predict_for_batch(self, sess, x_batch, lengths):
    """Evaluate ``self.predictions`` for a batch.

    Args:
        sess: Session-like object exposing ``run(fetches, feed_dict=...)``.
        x_batch: Values fed to ``self.input_batch``.
        lengths: Values fed to ``self.lengths``.

    Returns:
        Whatever ``sess.run`` returns for ``self.predictions``. The dropout
        placeholder is not fed here.
    """
    inputs = {self.input_batch: x_batch, self.lengths: lengths}
    return sess.run(self.predictions, feed_dict=inputs)

In [None]:
# Expose predict_for_batch on the class as a classmethod, so its first
# argument is the class, which is where the graph tensors it reads are looked up.
NeuralLanguageModel.predict_for_batch = classmethod(predict_for_batch)

In [None]:
# Clear the default graph so re-running this cell does not add a second copy
# of the model to the graph left over from a previous run.
tf.reset_default_graph()

# Build the graph; vocabulary size is taken from the current size of tok2idx.
model = NeuralLanguageModel(vocabulary_size=len(tok2idx),
                            embedding_dim=256,
                            n_hidden=256,
                            pad_index=tok2idx['<PAD>'])
# Training hyperparameters used by the training loop.
batch_size = 50
n_epochs = 4
learning_rate = 0.005
learning_rate_decay = 1.1  # the learning rate is divided by this factor after each epoch
dropout_keep_probability = 0.7  # fed to the model as dropout_keep_prob during training

In [None]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())
for epoch in range(n_epochs):
    print('-' * 20 + ' Epoch {} '.format(epoch+1) + 'of {} '.format(n_epochs) + '-' * 20)

    # Derive this epoch's rate from the configured starting value rather than
    # overwriting `learning_rate`, so re-running the cell restarts the
    # schedule from the same point.
    epoch_learning_rate = learning_rate / learning_rate_decay ** epoch

    # Train the model for one pass over the corpus.
    losses = []
    for x_batch, y_batch, lengths in batches_generator(batch_size, tokens, tok2idx, 100):
        if len(x_batch) == 0:
            # An empty batch has nothing to average over; its NaN loss would
            # poison the epoch total.
            continue
        batch_loss = model.train_on_batch(sess, x_batch, y_batch, lengths, epoch_learning_rate,
                                          dropout_keep_probability)
        # Weight by the number of sentences so the epoch figure is an average per sentence.
        losses.append(batch_loss * len(x_batch))
    print(f'traning loss: {np.sum(losses)/len(tokens)}')

print('...training finished.')

In [None]:
def get_cond_prob(sess, model, seq_toks):
    """Return next-token log-probabilities for every sequence in ``seq_toks``.

    Args:
        sess: Session-like object exposing ``run(fetches, feed_dict=...)``.
        model: Object exposing ``input_batch``, ``lengths`` and ``probs``.
        seq_toks: Rectangular 2-D batch of token ids (list of equal-length
            lists or an array), one row per sequence.

    Returns:
        Array with one row per sequence holding ``log p(token | sequence)``,
        taken from the model output at the last time step.
    """
    # Every row is fully populated, so each true length is the number of
    # columns, not a figure derived from how many rows there are.
    n_steps = np.asarray(seq_toks).shape[1]
    feed_dict = {
        model.input_batch: seq_toks,
        model.lengths: [n_steps] * len(seq_toks)
    }
    last_step_probs = sess.run(model.probs, feed_dict=feed_dict)[:, -1]
    # Beam scores are sums of log-probabilities. log1p(p) = log(1 + p) is not
    # a log-probability and ranks multi-step sequences differently.
    # The clamp keeps log() finite when a probability underflows to 0.
    return np.log(np.maximum(last_step_probs, np.finfo(np.float32).tiny))


def find_topk_2d(arr, topk):
    """Locate the ``topk`` largest entries of an array.

    Args:
        arr: numpy array of scores (2-D in this notebook).
        topk: Number of entries wanted. If it exceeds ``arr.size`` every
            entry is returned instead of raising.

    Returns:
        tuple: ``(vals, idx)`` where ``idx`` is the tuple of index arrays from
        ``np.unravel_index`` (largest score first) and ``vals`` is the list of
        the corresponding scores.
    """
    order = arr.reshape(-1,).argsort()[::-1][:max(topk, 0)]
    idx = np.unravel_index(order, arr.shape)
    # Gathering through idx keeps vals and idx the same length by construction.
    vals = list(arr[idx])
    return vals, idx


def init_beam_search(sess, model, start_toks, topk):
    """Create the initial beam from one or more start sequences.

    Args:
        sess: Session passed through to ``get_cond_prob``.
        model: Model passed through to ``get_cond_prob``.
        start_toks: Rectangular 2-D batch of token ids, one row per start
            sequence.
        topk: Beam width.

    Returns:
        tuple: ``(seq_toks, seq_probs)``. ``seq_toks`` has one row per beam
        (a start sequence plus one chosen token); ``seq_probs`` is a column
        vector with the score of each beam.
    """
    cond_probs = get_cond_prob(sess, model, start_toks)
    vals, idx = find_topk_2d(cond_probs, topk)
    # idx[0] names the start row each winning token was scored for. Indexing
    # with it pairs every token with its own prefix, also when several start
    # rows are supplied.
    prefixes = np.asarray(start_toks)[idx[0]]
    seq_toks = np.column_stack((prefixes, idx[1].reshape(-1, 1)))
    seq_probs = np.reshape(vals, (-1, 1))
    return seq_toks, seq_probs


def extend_seq(seq_toks, idx):
    """Append the selected next token to each selected sequence.

    Args:
        seq_toks: Rectangular 2-D batch of token ids, one row per sequence.
        idx: Pair ``(seq_idx, tok_idx)`` of equal-length index arrays.
            ``seq_idx[i]`` picks the row of ``seq_toks`` to extend and
            ``tok_idx[i]`` is the token id to append.

    Returns:
        2-D array with one row per entry of ``idx`` (not per row of
        ``seq_toks``): the chosen prefix followed by the chosen token.
    """
    seq_idx, tok_idx = idx
    # One output row per selected candidate, however many rows the input has.
    prefixes = np.asarray(seq_toks)[np.asarray(seq_idx, dtype=np.intp)]
    return np.column_stack((prefixes, np.asarray(tok_idx)))


def calculate_seq_probs(seq_probs, cond_probs):
    """Add each sequence's running score to the scores of its candidate tokens.

    Args:
        seq_probs: Array with one score per sequence (any shape that reshapes
            to a column).
        cond_probs: Array with one row of candidate scores per sequence.

    Returns:
        Array shaped like ``cond_probs`` where row ``i`` is
        ``cond_probs[i] + seq_probs[i]``.
    """
    running_scores = seq_probs.reshape(-1, 1)
    return cond_probs + running_scores
    

def itrate_search(sess, seq_toks, seq_probs, topk, model=None):
    """Advance the beam by one token.

    Args:
        sess: Session passed through to ``get_cond_prob``.
        seq_toks: Current beams, one row of token ids per beam.
        seq_probs: Current score of each beam.
        topk: Number of beams to keep.
        model: Model to query. When omitted, the notebook-level ``model``
            variable is used, which is what earlier callers relied on.

    Returns:
        tuple: ``(seq_toks, seq_probs)`` for the ``topk`` best one-token
        extensions. ``seq_probs`` is a 1-D array of their scores.
    """
    if model is None:
        # Backward compatibility for callers that pass only four arguments.
        model = globals()['model']
    cond_probs = get_cond_prob(sess, model, seq_toks)
    seq_probs = calculate_seq_probs(seq_probs, cond_probs)
    _, idx = find_topk_2d(seq_probs, topk)
    seq_toks = extend_seq(seq_toks, idx)
    seq_probs = seq_probs[idx[0], idx[1]]
    return seq_toks, seq_probs


def beam_search(sess, model, start_toks, topk, idx2tok, n_steps=10):
    """Generate text continuations with beam search.

    Args:
        sess: Session used to evaluate the model.
        model: Language model queried at every step.
        start_toks: 2-D batch of token ids to start from.
        topk: Beam width.
        idx2tok: Sequence mapping token id -> token string.
        n_steps: Number of extension steps after the initial one (default 10).

    Returns:
        tuple: ``(output_sents, seq_toks, seq_probs)``: the detokenized
        beams, their token ids and their scores.
    """
    detokenizer = nltk.tokenize.treebank.TreebankWordDetokenizer()
    seq_toks, seq_probs = init_beam_search(sess, model, start_toks, topk)
    for _ in range(n_steps):
        # Pass the model explicitly so the search uses the one it was given,
        # not whatever `model` is bound to at notebook level.
        seq_toks, seq_probs = itrate_search(sess, seq_toks, seq_probs, topk, model=model)
    output_sents = [
        detokenizer.detokenize([idx2tok[i] for i in seq])
        for seq in seq_toks
    ]
    return output_sents, seq_toks, seq_probs

In [None]:
# Take one tokenized sentence from the corpus and print it detokenized for reference.
orig = tokens[99]
detokenizer = nltk.tokenize.treebank.TreebankWordDetokenizer()
print(detokenizer.detokenize(orig))
# Seed the search with only the first token of that sentence.
start = orig[:1]
print(start)
# Single-row batch: the <START> id followed by the ids of the seed tokens.
start_toks = [[tok2idx['<START>']] + [tok2idx[w] for w in start]]
# Run beam search with 20 passed as the `topk` argument.
output_sents, seq_toks, seq_probs = beam_search(sess, model, start_toks, 20, idx2tok)

In [None]:
# Display the sentences returned by beam_search.
output_sents

In [None]:
# Look up the vocabulary id assigned to the comma token.
tok2idx[',']

In [None]:
# NOTE(review): `idx` is not assigned at notebook level in any earlier visible
# cell (only inside functions), so this looks like leftover scratch work that
# would raise NameError on a fresh kernel. Confirm and consider deleting the cell.
seq_toks = extend_seq(seq_toks, idx)

In [None]:
# NOTE(review): displays `idx`, which no earlier visible cell assigns at
# notebook level. Likely a debugging leftover; confirm before keeping.
idx

In [None]:
# Display the current value of seq_probs (the beam scores).
seq_probs

In [None]:
# NOTE(review): in the visible cells `new_seq` exists only as a local variable
# inside extend_seq, so this cell would raise NameError on a fresh kernel.
# Likely a debugging leftover; confirm.
new_seq

In [None]:
# Indices of the three largest entries of the flattened array `x`.
# NOTE(review): `x` is not defined at notebook level in the visible cells.
# Likely scratch work; confirm.
x.reshape(-1, ).argsort()[::-1][:3]

In [None]:
# Convert the flat positions 7, 3 and 5 into per-axis indices for the shape of `x`.
# NOTE(review): `x` is not defined at notebook level in the visible cells.
# Likely scratch work; confirm.
idx = np.unravel_index([7, 3, 5], x.shape)

In [None]:
# NOTE(review): this loop only prints `topk` empty lines. It looks like an
# unfinished experiment; confirm whether it can be removed.
topk = 3
for i in range(topk):
    print()