In [0]:
%tensorflow_version 1.x

In [0]:
import collections
import functools
import requests
import nltk
import re
import numpy as np
import pandas as pd
import warnings
import tensorflow as tf

In [5]:
tf.test.gpu_device_name()

'/device:GPU:0'

In [6]:
url = 'https://raw.githubusercontent.com/jcjohnson/torch-rnn/master/data/tiny-shakespeare.txt'
text = requests.get(url).content.decode('utf-8')
text[:100]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

In [5]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
def sent_to_tokens(sent, tokenizer):
    words = tokenizer.tokenize(sent)
    tokens = []
    for w in words:
        w = re.sub(r'https?://\S+', '<URL>', w)
        w = re.sub(r'#\S+', '<TOPIC>', w)
        w = re.sub(r'@\S+', '<USER>', w)
        tokens.append(w)
    return tokens


def basic_text_preprocess(text):
    # lemmatizer = nltk.stem.WordNetLemmatizer()
    # regtext = text.lower().replace('-\n', '')
    # custom_tokenize = functools.partial(sent_to_tokens, tokenizer=nltk.TweetTokenizer())
    # custom_tokenize = lambda sent: sent.split(' ')
    # tokenizer = nltk.tokenize.TreebankWordTokenizer()
    tokens = [nltk.regexp_tokenize(s, r'[^\s\n]+') for s in nltk.tokenize.sent_tokenize(text)]
    print(f'corpus size: {len(tokens)}')
    return tokens

tokens = basic_text_preprocess(text)
tokens[:2]

corpus size: 12460


[['First',
  'Citizen:',
  'Before',
  'we',
  'proceed',
  'any',
  'further,',
  'hear',
  'me',
  'speak.'],
 ['All:', 'Speak,', 'speak.']]

In [61]:
def build_vocab(tokens):
    tok2idx = collections.defaultdict(lambda: 0)
    unique_tokens = list(functools.reduce(lambda a, b: set(a).union(b), tokens))
    print(len(unique_tokens))
    if len(unique_tokens) > 1000:
        max_len = int(len(unique_tokens) * 1.)
        wc = collections.Counter(functools.reduce(lambda a, b: a + b, tokens))
        unique_tokens = [*map(lambda w: w[0], wc.most_common(max_len))]
    print(f'vocab size: {len(unique_tokens)}')
    idx2tok = ['<UNK>', '<START>', '<END>', '<PAD>'] + unique_tokens
    for i, tok in enumerate(idx2tok):
        tok2idx[tok] = i
    return tok2idx, idx2tok

tok2idx, idx2tok = build_vocab(tokens)

25670
vocab size: 25670


In [0]:
def pad_train(x, tok2idx, seq_len):
    x = [tok2idx['<START>']] + x
    if len(x) >= seq_len:
        return np.array(x[:seq_len], dtype=np.int32), seq_len
    pad_x = x + [tok2idx['<PAD>']] * (seq_len - len(x))
    return np.array(pad_x, dtype=np.int32), len(x)


def pad_target(y, tok2idx, seq_len):
    y = y + [tok2idx['<END>']]
    if len(y) >= seq_len:
        return np.array(y[:seq_len], dtype=np.int32), seq_len
    pad_y = y + [tok2idx['<PAD>']] * (seq_len - len(y))
    return np.array(pad_y, dtype=np.int32), len(y)


def batches_generator(batch_size, tokens, tok2idx, seq_len):
    n_samples = len(tokens)
    order = np.random.permutation(n_samples) # shuffle data
    n_batches = n_samples // batch_size + 1
    for k in range(n_batches):
        batch_start = k * batch_size
        batch_end = min((k + 1) * batch_size, n_samples)
        current_batch_size = batch_end - batch_start
        x_list, y_list, max_len = [], [], 0
        for idx in order[batch_start:batch_end]:
            x_list.append([tok2idx[t] for t in tokens[idx][:-1]])
            y_list.append([tok2idx[t] for t in tokens[idx][1:]])
            max_len = max(max_len, len(tokens[idx]))
        X = np.ones([current_batch_size, max_len], dtype=np.int32) * tok2idx['<PAD>']
        Y = np.ones([current_batch_size, max_len], dtype=np.int32) * tok2idx['<PAD>']
        actual_lengths = np.zeros(current_batch_size, dtype=np.int32)
        for n in range(current_batch_size):
            tok_len = len(x_list[n])
            actual_lengths[n] = tok_len
            X[n, :tok_len] = x_list[n]
            Y[n, :tok_len] = y_list[n]
        yield X, Y, actual_lengths

In [0]:
x, y, lens = next(batches_generator(16, tokens, tok2idx, 0))

In [0]:
class NeuralLanguageModel:
    pass

In [0]:
def declare_placeholders(self):
    self.input_batch = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input_batch')
    self.target_batch = tf.placeholder(dtype=tf.int32, shape=[None, None], name='target_batch')
    self.lengths = tf.placeholder(dtype=tf.int32, shape=[None], name='lengths')
    self.dropout_ph = tf.placeholder_with_default(tf.cast(1.0, tf.float32), shape=[], name='dropout_rate')
    self.learning_rate_ph = tf.placeholder(dtype=tf.float32, shape=[], name='learning_rate')

In [0]:
NeuralLanguageModel.__declare_placeholders = classmethod(declare_placeholders)

In [0]:
def build_layers(self, vocabulary_size, embedding_dim, n_hidden):
    initial_embedding_matrix = np.random.randn(vocabulary_size, embedding_dim) / np.sqrt(embedding_dim)
    embedding_matrix = tf.Variable(initial_embedding_matrix, dtype=tf.float32, name='embedding_matrix')
    
    rnn_cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden)
    # rnn_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=n_hidden, activation=tf.nn.tanh)
    regulrized_rnn_cell = tf.nn.rnn_cell.DropoutWrapper(
        cell=rnn_cell, input_keep_prob=self.dropout_ph, output_keep_prob=self.dropout_ph,
        state_keep_prob=self.dropout_ph)
    
    mul_cell = tf.nn.rnn_cell.MultiRNNCell(cells=[regulrized_rnn_cell]*2)
    embeddings = tf.nn.embedding_lookup(embedding_matrix, self.input_batch)
    output, state = tf.nn.dynamic_rnn(cell=mul_cell,
                                      inputs=embeddings,
                                      sequence_length=self.lengths,
                                      dtype=tf.float32)
    self.logits = tf.layers.dense(output, vocabulary_size, activation=None)

In [0]:
NeuralLanguageModel.__build_layers = classmethod(build_layers)

In [0]:
def compute_predictions(self):
    softmax_output = tf.nn.softmax(self.logits)
    self.predictions = tf.argmax(softmax_output, axis=-1)
    self.probs = softmax_output

In [0]:
NeuralLanguageModel.__compute_predictions = classmethod(compute_predictions)

In [0]:
def compute_loss(self, vocabulary_size, pad_index):
    with tf.device('/device:GPU:0'):
        targets_one_hot = tf.one_hot(self.target_batch, vocabulary_size)
        loss_tensor = tf.nn.softmax_cross_entropy_with_logits_v2(labels=targets_one_hot,
                                                                logits=self.logits)
        mask = tf.cast(tf.not_equal(self.input_batch, pad_index), tf.float32)
        self.loss = tf.reduce_mean(tf.boolean_mask(loss_tensor, mask))

In [0]:
NeuralLanguageModel.__compute_loss = classmethod(compute_loss)

In [0]:
def perform_optimization(self):
    with tf.device('/device:GPU:0'):
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate_ph)
        grads_and_vars = self.optimizer.compute_gradients(self.loss)
        
        clip_norm = tf.cast(5.0, tf.float32)
        self.grads_and_vars = [(tf.clip_by_norm(grad, clip_norm), var) for grad, var in grads_and_vars]
        
        self.train_op = self.optimizer.apply_gradients(self.grads_and_vars)

In [0]:
NeuralLanguageModel.__perform_optimization = classmethod(perform_optimization)

In [0]:
def init_model(self, vocabulary_size, embedding_dim, n_hidden, pad_index):
    self.__declare_placeholders()
    self.__build_layers(vocabulary_size, embedding_dim, n_hidden)
    self.__compute_predictions()
    self.__compute_loss(vocabulary_size, pad_index)
    self.__perform_optimization()

In [0]:
NeuralLanguageModel.__init__ = classmethod(init_model)

In [0]:
def train_on_batch(self, sess, x_batch, y_batch, lengths, learning_rate, dropout_keep_prob):
    feed_dict = {
        self.input_batch: x_batch,
        self.target_batch: y_batch,
        self.learning_rate_ph: learning_rate,
        self.dropout_ph: dropout_keep_prob,
        self.lengths: lengths
    }
    sess.run(self.train_op, feed_dict=feed_dict)
    return sess.run(self.loss, feed_dict=feed_dict)

In [0]:
NeuralLanguageModel.train_on_batch = classmethod(train_on_batch)

In [0]:
def predict_for_batch(self, sess, x_batch, lengths):
    feed_dict = {
        self.input_batch: x_batch,
        self.lengths: lengths
    }
    predictions = sess.run(self.predictions, feed_dict=feed_dict)
    return predictions

In [0]:
NeuralLanguageModel.predict_for_batch = classmethod(predict_for_batch)

In [0]:
tf.reset_default_graph()

model = NeuralLanguageModel(vocabulary_size=len(tok2idx),
                            embedding_dim=256,
                            n_hidden=256,
                            pad_index=tok2idx['<PAD>'])
batch_size = 50
n_epochs = 50
learning_rate = 0.002
learning_rate_decay = 1.03
dropout_keep_probability = 1.

In [82]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())
for epoch in range(n_epochs):
    # For each epoch evaluate the model on train and validation data
    print('-' * 20 + ' Epoch {} '.format(epoch+1) + 'of {} '.format(n_epochs) + '-' * 20)
    
    # Train the model
    losses = []
    for x_batch, y_batch, lengths in batches_generator(batch_size, tokens, tok2idx, 100):
        batch_loss = model.train_on_batch(sess, x_batch, y_batch, lengths, learning_rate,
                                          dropout_keep_probability)
        losses.append(batch_loss * len(x_batch))
    print(f'traning loss: {np.sum(losses)/len(tokens)}')
        
    # Decaying the learning rate
    learning_rate = learning_rate / learning_rate_decay
    
print('...training finished.')

-------------------- Epoch 1 of 50 --------------------
traning loss: 8.198165399496284
-------------------- Epoch 2 of 50 --------------------
traning loss: 7.581930865062756
-------------------- Epoch 3 of 50 --------------------
traning loss: 7.137873792724855
-------------------- Epoch 4 of 50 --------------------
traning loss: 6.727495897639047
-------------------- Epoch 5 of 50 --------------------
traning loss: 6.400176626912663
-------------------- Epoch 6 of 50 --------------------
traning loss: 6.102600703461404
-------------------- Epoch 7 of 50 --------------------
traning loss: 5.82996601086367
-------------------- Epoch 8 of 50 --------------------
traning loss: 5.564170134010131
-------------------- Epoch 9 of 50 --------------------
traning loss: 5.309084876199787
-------------------- Epoch 10 of 50 --------------------
traning loss: 5.057772687311923
-------------------- Epoch 11 of 50 --------------------
traning loss: 4.8129616560560935
-------------------- Epoch 12 

In [84]:
input_seq = tokens[18]
print(input_seq)
x = [[tok2idx['<START>']] + [*map(tok2idx.get, input_seq)]]
feed_dict = {model.input_batch: x, model.lengths: [*map(len, x)]}
probs = sess.run(model.probs, feed_dict=feed_dict)
print([idx2tok[i] for i in probs[0].argmax(axis=1)])

['First', 'Citizen:', 'I', 'say', 'unto', 'you,', 'what', 'he', 'hath', 'done', 'famously,', 'he', 'did', 'it', 'to', 'that', 'end:', 'though', 'soft-conscienced', 'men', 'can', 'be', 'content', 'to', 'say', 'it', 'was', 'for', 'his', 'country', 'he', 'did', 'it', 'to', 'please', 'his', 'mother', 'and', 'to', 'be', 'partly', 'proud;', 'which', 'he', 'is,', 'even', 'till', 'the', 'altitude', 'of', 'his', 'virtue.']
['I', 'Citizen:', 'I', 'have', 'it', 'your', 'and', 'he', 'should', 'done', 'famously,', 'he', 'did', 'it', 'to', 'the', 'end:', 'though', 'soft-conscienced', 'men', 'can', 'be', 'content', 'to', 'say', 'it', 'is', 'an', 'his', 'country', 'he', 'did', 'lent', 'to', 'use', 'his', 'mother', 'and', 'to', 'have', 'of', 'proud;', 'which', 'he', 'is,', 'even', 'so', 'the', 'altitude', 'of', 'his', 'virtue.', 'By']


In [85]:
de = nltk.tokenize.treebank.TreebankWordDetokenizer()
de.detokenize([idx2tok[i] for i in probs[0].argmax(axis=1)])

'I Citizen: I have it your and he should done famously, he did it to the end: though soft-conscienced men can be content to say it is an his country he did lent to use his mother and to have of proud; which he is, even so the altitude of his virtue. By'

In [0]:
# def get_cond_prob(sess, model, seq_toks):
#     feed_dict = {
#         model.input_batch: seq_toks,
#         model.lengths: [len(seq_toks)+10] * len(seq_toks)
#     }
#     return np.log1p(sess.run(model.probs, feed_dict=feed_dict)[:, -1])


# def find_topk_2d(arr, topk):
# #     arr[:, 2] = 0
# #     arr[:, 6] = 0
# #     arr[:, 5] = 0
# #     arr[:, 7] = 0
# #     arr[:, 11] = 0
# #     arr[:, 4] = 0
#     order = arr.reshape(-1,).argsort()[::-1][:topk]
#     idx = np.unravel_index(order, arr.shape)
#     vals = []
#     for i in range(topk):
#         vals.append(arr[idx[0][i], idx[1][i]])
#     return vals, idx


# def init_beam_search(sess, model, start_toks, topk):
#     cond_probs = get_cond_prob(sess, model, start_toks)
#     vals, idx = find_topk_2d(cond_probs, topk)
#     seq_toks = np.column_stack((np.repeat(start_toks, topk, axis=0),
#                                 idx[1].reshape(-1,1)))
#     seq_probs = np.reshape(vals, (-1, 1))
#     return seq_toks, seq_probs


# def extend_seq(seq_toks, idx):
#     new_seq = []
#     for i in range(len(seq_toks)):
#         seq_idx = idx[0][i]
#         tok_idx = idx[1][i]
#         new_seq.append(np.array(list(seq_toks[seq_idx]) + [tok_idx]))
#     return np.array(new_seq)


# def calculate_seq_probs(seq_probs, cond_probs):
#     return cond_probs + seq_probs.reshape(-1, 1)
    

# def itrate_search(sess, seq_toks, seq_probs, topk):
#     cond_probs = get_cond_prob(sess, model, seq_toks)
#     seq_probs = calculate_seq_probs(seq_probs, cond_probs)
#     vals, idx = find_topk_2d(seq_probs, topk)
#     seq_toks = extend_seq(seq_toks, idx)
#     seq_probs = seq_probs[idx[0], idx[1]]
#     return seq_toks, seq_probs

# def beam_search(sess, model, start_toks, topk, idx2tok):
#     detokenizer = nltk.tokenize.treebank.TreebankWordDetokenizer()
#     seq_toks, seq_probs = init_beam_search(sess, model, start_toks, topk)
#     for i in range(10):
#         seq_toks, seq_probs = itrate_search(sess, seq_toks, seq_probs, topk)
#     output_sents = []
#     for seq in seq_toks:
#         sent = detokenizer.detokenize([idx2tok[i] for i in seq])
#         output_sents.append(sent)
#     return output_sents, seq_toks, seq_probs