In [2]:
import warnings
warnings.filterwarnings('ignore')

import random
import time
import multiprocessing as mp
import numpy as np

import mxnet as mx
from mxnet import nd, gluon, autograd

import gluonnlp as nlp
import pickle

# Seed the MXNet, NumPy and Python random generators with the same value so
# that repeated runs draw the same random numbers.
mx.random.seed(123)
np.random.seed(123)
random.seed(123)

In [3]:
# NOTE(review): both splits are read from the same dev_processed.p file, so
# the "test" set is identical to the training set — confirm this is intended.
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
# Context managers make sure the file handles are closed after loading.
with open('../data/dev_processed.p', 'rb') as train_file:
    train_dataset = pickle.load(train_file)
with open('../data/dev_processed.p', 'rb') as test_file:
    test_dataset = pickle.load(test_file)

In [4]:
# word -> [index, occurrence count]; the four special tokens take indices 0-3.
vocabulary = {'<pad>': [0, 1], '<unk>': [1, 1], '<BOS>': [2, 1], '<EOS>': [3, 1]}
for item in train_dataset + test_dataset:
    # The transcript is the third field of every raw sample.
    for word in item[2].split(' '):
        entry = vocabulary.get(word)
        if entry is None:
            # First sighting: the next free index is the current vocabulary size.
            vocabulary[word] = [len(vocabulary), 1]
        else:
            entry[1] += 1

# Reverse lookup: index -> word.
vocabulary_inv = {entry[0]: word for word, entry in vocabulary.items()}

In [5]:
def preprocess(x):
    """Turn one raw sample into model-ready fields.

    Parameters
    ----------
    x : tuple
        ``(name, audio, words)`` where ``words`` is a space-separated transcript.

    Returns
    -------
    tuple
        ``(audio, token_ids, audio_length, token_count)``: the audio object
        unchanged, a NumPy array of vocabulary indices for
        ``<BOS> + words + <EOS>``, and both lengths as floats.
    """
    name, audio, words = x
    split_words = ['<BOS>'] + words.split(' ') + ['<EOS>']
    # Words missing from the vocabulary map to <unk> instead of raising KeyError.
    unk_entry = vocabulary['<unk>']
    token_ids = np.array([vocabulary.get(word, unk_entry)[0] for word in split_words])
    return audio, token_ids, float(len(audio)), float(len(split_words))

def get_length(x):
    """Return the length of the second field of ``x`` as a float."""
    second_field = x[1]
    return float(len(second_field))

def preprocess_dataset(dataset):
    """Preprocess every sample in parallel and collect the sequence lengths.

    Parameters
    ----------
    dataset : sequence
        Raw samples accepted by ``preprocess``.

    Returns
    -------
    tuple
        ``(dataset, lengths)``: a ``SimpleDataset`` of preprocessed samples and
        a ``SimpleDataset`` holding ``get_length`` of each of them, in order.
    """
    start = time.time()
    with mp.Pool() as pool:
        processed = pool.map(preprocess, dataset)
    # get_length is a plain len() call; doing it in-process avoids sending
    # every processed sample to the worker pool a second time.
    lengths = gluon.data.SimpleDataset([get_length(sample) for sample in processed])
    dataset = gluon.data.SimpleDataset(processed)
    end = time.time()
    print('Done! Processing Time={:.2f}s, #Samples={}'.format(end - start, len(dataset)))
    return dataset, lengths

In [6]:
# Rebinds train_dataset/test_dataset to their preprocessed form. preprocess()
# unpacks 3-field samples but returns 4-field ones, so this cell cannot be
# re-run without first re-running the loading cell.
train_dataset, train_data_lengths = preprocess_dataset(train_dataset)
test_dataset, test_data_lengths = preprocess_dataset(test_dataset)

Done! Processing Time=2.56s, #Samples=2703
Done! Processing Time=2.16s, #Samples=2703


In [7]:
# Optimisation settings read by train() and get_dataloader().
learning_rate = 0.005
batch_size = 32
grad_clip = None  # falsy, so train() skips gradient clipping

# Settings passed to the FixedBucketSampler in get_dataloader().
bucket_num = 10
bucket_ratio = 0.2

# train() prints a progress line every `log_interval` batches.
log_interval = 5

def get_dataloader():
    """Create the training and test data loaders from the module-level datasets.

    Training batches come from a shuffled ``FixedBucketSampler`` built over
    ``train_data_lengths`` (its statistics are printed). Test batches have a
    fixed size and are not shuffled.

    Returns
    -------
    tuple
        ``(train_dataloader, test_dataloader)``.
    """
    # One batchify function per sample field: the first two fields are padded,
    # the last two are stacked.
    field_batchifiers = [
        nlp.data.batchify.Pad(dtype='float32'),
        nlp.data.batchify.Pad(dtype='float32'),
        nlp.data.batchify.Stack(dtype='float32'),
        nlp.data.batchify.Stack(dtype='float32'),
    ]
    collate = nlp.data.batchify.Tuple(*field_batchifiers)

    bucket_sampler = nlp.data.sampler.FixedBucketSampler(
        train_data_lengths,
        batch_size=batch_size,
        num_buckets=bucket_num,
        ratio=bucket_ratio,
        shuffle=True)
    print(bucket_sampler.stats())

    train_loader = gluon.data.DataLoader(
        dataset=train_dataset,
        batch_sampler=bucket_sampler,
        batchify_fn=collate)
    test_loader = gluon.data.DataLoader(
        dataset=test_dataset,
        batch_size=batch_size,
        shuffle=False,
        batchify_fn=collate)
    return train_loader, test_loader

# Build both loaders once; train() and evaluate() read them as globals.
train_dataloader, test_dataloader = get_dataloader()

FixedBucketSampler:
  sample_num=2703, batch_num=82
  key=[13, 22, 31, 40, 49, 58, 67, 76, 85, 94]
  cnt=[837, 805, 531, 282, 120, 70, 30, 15, 8, 5]
  batch_size=[46, 32, 32, 32, 32, 32, 32, 32, 32, 32]


In [8]:
# Device on which the network is initialised and batches are placed.
context = mx.cpu()

In [9]:
import numpy as np
import mxnet as mx
from io import open
from mxnet import gluon, autograd
from mxnet.gluon import nn, rnn, Block
from mxnet import ndarray as F
from gluonnlp.model.transformer import TransformerEncoder
from gluonnlp.model.transformer import TransformerDecoder

class SubSampler(gluon.HybridBlock):
    """Shorten a sequence by max-pooling non-overlapping windows along axis 0.

    Parameters
    ----------
    size : int
        Pooling window and stride, i.e. the sub-sampling factor.
    prefix, params
        Forwarded to ``gluon.HybridBlock``.
    """

    def __init__(self, size=3, prefix=None, params=None):
        super(SubSampler, self).__init__(prefix=prefix, params=params)
        self.size = size

    def forward(self, data, valid_length):
        """Mask, pool and return ``(subsampled, sub_valid_length)``.

        ``data`` is masked along axis 0 using ``valid_length`` and then pooled
        along that same axis. ``sub_valid_length`` is ``ceil(valid_length / size)``.
        # assumes data is (time, batch, channels) — TODO confirm against callers
        """
        masked_encoded = F.SequenceMask(data,
                                        sequence_length=valid_length,
                                        use_sequence_length=True)
        # Pooling works on the last axis, so move axis 0 there and back again.
        # pooling_convention='full' keeps the trailing partial window, giving
        # ceil(T / size) output steps. This matches the ceil() used for the
        # valid lengths below; the default 'valid' convention would give
        # floor(T / size), shorter than the reported lengths.
        pooled = F.Pooling(masked_encoded.swapaxes(0, 2),
                           kernel=(self.size,),
                           stride=(self.size,),
                           pool_type='max',
                           pooling_convention='full')
        subsampled = pooled.swapaxes(0, 2)
        sub_valid_length = mx.nd.ceil(valid_length / self.size)
        return subsampled, sub_valid_length

In [10]:
class AudioEncoder(Block):
    """Audio encoder: projection, transformer, sub-sampling, transformer.

    Parameters
    ----------
    input_size : int
        Stored on the instance; the projection layer infers its input width.
    hidden_size : int
        Width of the projection and of both transformer encoders.
    sub_sample_size : int
        Sub-sampling factor applied between the two transformer encoders.
    max_length : int
        ``max_length`` handed to both ``TransformerEncoder`` layers.
        NOTE(review): presumably this bounds the sequence length the
        transformer can position-encode; raise it if inputs are longer than
        the default of 50 — confirm against the GluonNLP documentation.
    """

    def __init__(self, input_size, hidden_size, sub_sample_size=3, max_length=50):
        super(AudioEncoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.sub_sample_size = sub_sample_size
        self.max_length = max_length

        with self.name_scope():
            self.proj = nn.Dense(hidden_size, flatten=False)
            self.t1 = TransformerEncoder(units=self.hidden_size, num_layers=1, hidden_size=16,
                                         max_length=self.max_length, num_heads=1)
            self.subsampler = SubSampler(self.sub_sample_size)
            self.t2 = TransformerEncoder(units=self.hidden_size, num_layers=1, hidden_size=16,
                                         max_length=self.max_length, num_heads=1)

    def forward(self, input, lengths):
        """Encode ``input`` and return ``(output, sub_lengths)``.

        ``sub_lengths`` are the valid lengths produced by the sub-sampler.
        """
        input = self.proj(input)
        output, _ = self.t1(input, None, lengths)
        # The sub-sampler masks and pools along axis 0, so swap axes 0 and 1
        # before it and swap them back afterwards.
        output = output.swapaxes(0, 1)
        subsampled, sub_lengths = self.subsampler(output, lengths)
        subsampled = subsampled.swapaxes(0, 1)
        output, _ = self.t2(subsampled, None, sub_lengths)
        return output, sub_lengths

In [11]:
class AudioWordDecoder(Block):
    """Word decoder: embedding, transformer decoder, vocabulary projection.

    Parameters
    ----------
    output_size : int
        Number of embedding rows and width of the output projection.
    hidden_size : int
        Embedding width and transformer decoder width.
    max_length : int
        ``max_length`` handed to the ``TransformerDecoder``.
        NOTE(review): presumably this bounds the sequence length the
        transformer can position-encode; raise it if transcripts are longer
        than the default of 50 — confirm against the GluonNLP documentation.
    """

    def __init__(self, output_size, hidden_size, max_length=50):
        super(AudioWordDecoder, self).__init__()
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.max_length = max_length

        with self.name_scope():
            self.embedding = nn.Embedding(output_size, hidden_size)
            self.t = TransformerDecoder(units=self.hidden_size, num_layers=1, hidden_size=16,
                                        max_length=self.max_length, num_heads=1)
            self.out = nn.Dense(output_size, in_units=self.hidden_size, flatten=False)

    def forward(self, input, enc_outs, enc_valid_lengths, dec_valid_lengths):
        """Decode a whole token sequence against the encoder outputs.

        Returns the output of the final dense layer applied to every decoder
        position.
        """
        output = self.embedding(input)
        # Decoder state is initialised from the encoder outputs and their lengths.
        dec_states = self.t.init_state_from_encoder(enc_outs, enc_valid_lengths)
        output, _, _ = self.t.decode_seq(output, dec_states, dec_valid_lengths)
        output = self.out(output)
        return output

In [12]:
class Seq2Seq(Block):
    """Pairs an ``AudioEncoder`` with an ``AudioWordDecoder``.

    Parameters
    ----------
    input_size, enc_hidden_size
        Passed to the encoder as ``input_size`` and ``hidden_size``.
    output_size, dec_hidden_size
        Passed to the decoder as ``output_size`` and ``hidden_size``.
    """

    def __init__(self, input_size, output_size, enc_hidden_size, dec_hidden_size):
        super(Seq2Seq, self).__init__()
        self.input_size, self.output_size = input_size, output_size
        self.enc_hidden_size, self.dec_hidden_size = enc_hidden_size, dec_hidden_size
        with self.name_scope():
            self.encoder = AudioEncoder(input_size=input_size,
                                        hidden_size=enc_hidden_size)
            self.decoder = AudioWordDecoder(output_size=output_size,
                                            hidden_size=dec_hidden_size)

    def forward(self, audio, alengths, words, wlengths):
        """Encode ``audio``, then decode ``words`` against it; return the decoder output."""
        memory, memory_lengths = self.encoder(audio, alengths)
        return self.decoder(words, memory, memory_lengths, wlengths)

In [13]:
from mxnet.gluon.loss import Loss as Loss

class SoftmaxSequenceCrossEntropyLoss(Loss):
    """Softmax cross-entropy over sequences with positions past ``valid_length`` masked.

    Parameters
    ----------
    axis : int
        Axis of ``pred`` holding the class scores.
    sparse_label : bool
        Stored on the instance; labels are always treated as class indices.
    from_logits : bool
        If True, ``pred`` is assumed to already be log-probabilities and
        ``log_softmax`` is skipped.
    weight, batch_axis, **kwargs
        Forwarded to ``Loss``.
    """

    def __init__(self, axis=-1, sparse_label=True, from_logits=False, weight=None,
                 batch_axis=0, **kwargs):
        super(SoftmaxSequenceCrossEntropyLoss, self).__init__(
            weight, batch_axis, **kwargs)
        self._axis = axis
        self._sparse_label = sparse_label
        self._from_logits = from_logits

    def hybrid_forward(self, F, pred, label, valid_length):
        """Return one loss value per sample.

        The per-position negative log-likelihood is zeroed beyond
        ``valid_length`` and then averaged over every non-batch axis.
        """
        if not self._from_logits:
            pred = F.log_softmax(pred, self._axis)
        # keepdims=False drops the class axis directly. This replaces the
        # former mx.nd.squeeze(..., axis=2), which only worked imperatively
        # and assumed the class axis was axis 2.
        loss = -F.pick(pred, label, axis=self._axis, keepdims=False)
        # SequenceMask masks along axis 0, so swap axes 0 and 1 around it.
        loss = F.swapaxes(loss, dim1=0, dim2=1)
        loss = F.SequenceMask(loss,
                              sequence_length=valid_length,
                              use_sequence_length=True)
        loss = F.swapaxes(loss, dim1=0, dim2=1)
        return F.mean(loss, axis=self._batch_axis, exclude=True)

In [14]:
# The output layer must have one unit per vocabulary entry, so take its size
# from the vocabulary instead of hard-coding it.
net = Seq2Seq(input_size=13, output_size=len(vocabulary), enc_hidden_size=16, dec_hidden_size=16)
net.initialize(mx.init.Xavier(), ctx=context)

class beamDecoder(object):
    """Callable that runs one decoding step of ``model.decoder``.

    It is passed as the ``decoder`` argument of ``BeamSearchSampler``.
    """

    def __init__(self, model):
        self._model = model

    def __call__(self, outputs, dec_states):
        """Embed ``outputs``, run the transformer decoder, project the result.

        Returns ``(projected_output, new_states)``.
        """
        decoder = self._model.decoder
        embedded = decoder.embedding(outputs)
        step_output, new_states, _ = decoder.t(embedded, dec_states)
        projected = decoder.out(step_output)
        return projected, new_states

# Beam search stops a hypothesis once it emits the <EOS> index.
eos_id = vocabulary['<EOS>'][0]
scorer = nlp.model.BeamSearchScorer(alpha=0, K=5, from_logits=False)

# Sampler with 5 beams per input and max_length=20.
beam_sampler = nlp.model.BeamSearchSampler(
    decoder=beamDecoder(net),
    beam_size=5,
    scorer=scorer,
    eos_id=eos_id,
    max_length=20)

In [15]:
def get_sequence_accuracy(s1, l1, s2, l2):
    """Fraction of tokens in ``s1`` that ``s2`` reproduces at the same position.

    Parameters
    ----------
    s1, s2 : NDArray
        Token-id matrices whose rows are sequences; their widths may differ.
    l1, l2 : NDArray
        Valid lengths of the rows of ``s1`` and ``s2``.

    Returns
    -------
    float
        Number of matching positions within ``min(l1, l2)`` of each row,
        divided by ``l1.sum()``.
    """
    s1 = mx.nd.cast(s1, dtype='int32')
    l1 = mx.nd.cast(l1, dtype='int32')
    s2 = mx.nd.cast(s2, dtype='int32')
    l2 = mx.nd.cast(l2, dtype='int32')

    # Pad the narrower matrix with zeros so both can be compared elementwise.
    # Nothing is built when the widths already match, which avoids a
    # zero-width array; the padding is created on the padded array's context.
    width_gap = s1.shape[1] - s2.shape[1]
    if width_gap > 0:
        padding = mx.nd.zeros((s2.shape[0], width_gap), ctx=s2.context, dtype='int32')
        s2 = mx.nd.concat(s2, padding, dim=1)
    elif width_gap < 0:
        padding = mx.nd.zeros((s1.shape[0], -width_gap), ctx=s1.context, dtype='int32')
        s1 = mx.nd.concat(s1, padding, dim=1)

    # SequenceMask masks along axis 0, hence the swap. Only positions valid in
    # both sequences can count as matches.
    accs = F.SequenceMask((s1 == s2).swapaxes(0, 1),
                          sequence_length=mx.nd.minimum(l1, l2),
                          use_sequence_length=True)
    matched = mx.nd.cast(accs.sum(), dtype='float32')
    reference_total = mx.nd.cast(l1.sum(), dtype='float32')
    return (matched / reference_total).asnumpy().item()

In [16]:
def evaluate(net, context):
    """Beam-search decode the whole test loader and return token accuracy.

    For every batch the top beam is compared with the reference transcript
    using ``get_sequence_accuracy``. Batch accuracies are combined weighted by
    their reference token counts.

    Parameters
    ----------
    net : Seq2Seq
        Model providing ``encoder`` and ``decoder``.
    context : mx.Context
        Device on which the batches are placed.

    Returns
    -------
    float
        Accuracy over all test batches, or 0.0 when there is nothing to score.
    """
    bos_id = vocabulary['<BOS>'][0]
    matched_tokens = 0.0
    reference_tokens = 0.0

    # Every batch is scored; the original returned after the first one.
    for audio, words, alength, wlength in test_dataloader:
        audio = audio.as_in_context(context)
        alength = alength.as_in_context(context)
        words = words.as_in_context(context)
        wlength = wlength.as_in_context(context)

        encoder_outputs, encoder_out_lengths = net.encoder(audio, alength)
        outputs = mx.nd.array([bos_id] * words.shape[0], ctx=context)
        decoder_states = net.decoder.t.init_state_from_encoder(encoder_outputs, encoder_out_lengths)
        samples, scores, valid_lengths = beam_sampler(outputs, decoder_states)

        # Drop the leading <BOS> from both hypothesis and reference so that
        # tokens are compared position by position.
        best_samples = samples[:, 0, 1:]
        best_vlens = valid_lengths[:, 0] - 1
        reference = words[:, 1:]
        reference_lengths = wlength - 1

        batch_tokens = float(reference_lengths.sum().asscalar())
        if batch_tokens == 0:
            continue
        batch_acc = get_sequence_accuracy(reference, reference_lengths, best_samples, best_vlens)
        matched_tokens += batch_acc * batch_tokens
        reference_tokens += batch_tokens

    return matched_tokens / reference_tokens if reference_tokens else 0.0

In [17]:
def train(net, context, epochs):
    """Train ``net`` with teacher forcing and print loss, throughput and accuracy.

    Parameters
    ----------
    net : Seq2Seq
        Model to optimise in place.
    context : mx.Context
        Device on which every batch is placed.
    epochs : int
        Number of passes over ``train_dataloader``.

    Progress is printed every ``log_interval`` batches. ``evaluate`` is called
    after each epoch.
    """
    trainer = gluon.Trainer(net.collect_params(), 'ftml',
                            {'learning_rate': learning_rate})
    loss = SoftmaxSequenceCrossEntropyLoss()

    parameters = net.collect_params().values()

    for epoch in range(epochs):
        start_epoch_time = time.time()
        epoch_L = 0.0
        epoch_sent_num = 0
        epoch_wc = 0

        start_log_interval_time = time.time()
        log_interval_wc = 0
        log_interval_sent_num = 0
        log_interval_L = 0.0

        for i, (audio, words, alength, wlength) in enumerate(train_dataloader):
            # Throughput is counted in audio frames.
            wc = alength.sum().asscalar()
            log_interval_wc += wc
            epoch_wc += wc
            # Axis 0 is the batch axis (evaluate() also uses shape[0] as the
            # batch size); shape[1] would count padded frames, not sentences.
            batch_sentences = audio.shape[0]
            log_interval_sent_num += batch_sentences
            epoch_sent_num += batch_sentences

            audio = audio.as_in_context(context)
            alength = alength.as_in_context(context)
            words = words.as_in_context(context)
            wlength = wlength.as_in_context(context)

            # Teacher forcing: the decoder reads tokens [0, T-1) and must
            # predict tokens [1, T). Feeding and predicting the same tokens
            # would let the model simply copy its input.
            decoder_inputs = words[:, :-1]
            targets = words[:, 1:]
            target_lengths = wlength - 1

            with autograd.record():
                decoder_outputs = net(audio, alength, decoder_inputs, target_lengths)
                L = loss(decoder_outputs, targets, target_lengths).sum()
            L.backward()

            if grad_clip:
                gluon.utils.clip_global_norm(
                    [p.grad(context) for p in parameters],
                    grad_clip)

            trainer.step(1)
            # Read the scalar loss once rather than twice.
            batch_loss = L.asscalar()
            log_interval_L += batch_loss
            epoch_L += batch_loss
            if (i + 1) % log_interval == 0:
                print(
                    '[Epoch {} Batch {}/{}] elapsed {:.2f} s, '
                    'avg loss {:.6f}, throughput {:.2f}K fps'.format(
                        epoch, i + 1, len(train_dataloader),
                        time.time() - start_log_interval_time,
                        log_interval_L / log_interval_sent_num, log_interval_wc
                        / 1000 / (time.time() - start_log_interval_time)))
                start_log_interval_time = time.time()
                log_interval_wc = 0
                log_interval_sent_num = 0
                log_interval_L = 0.0

        end_epoch_time = time.time()
        test_acc = evaluate(net, context)
        print('[Epoch {}] train avg loss {:.6f}, test acc {:.2f}, '
              'throughput {:.2f}K fps'.format(
              epoch, epoch_L / epoch_sent_num, test_acc,
              epoch_wc / 1000 / (end_epoch_time - start_epoch_time)))

In [None]:
# Train for a single epoch on the configured context.
train(net, context, 1)

In [19]:
def generate_sequences(sampler, inputs, begin_states, num_print_outcomes):
    """Run ``sampler`` and print the first hypotheses for every input.

    Parameters
    ----------
    sampler : callable
        Called as ``sampler(inputs, begin_states)``; must return
        ``(samples, scores, valid_lengths)``.
    inputs, begin_states
        Passed to ``sampler`` unchanged.
    num_print_outcomes : int
        How many hypotheses to print for each input.

    Each hypothesis is printed as ``[sentence, score]``, where the sentence is
    the valid part of the sample mapped through ``vocabulary_inv``.
    """
    samples, scores, valid_lengths = sampler(inputs, begin_states)
    print('Generation Result:')

    num_inputs = samples.shape[0]
    for row in range(num_inputs):
        candidates = samples[row].asnumpy()
        candidate_scores = scores[row].asnumpy()
        candidate_lengths = valid_lengths[row].asnumpy()

        for rank in range(num_print_outcomes):
            # Keep only the valid prefix of the hypothesis.
            token_ids = candidates[rank][:candidate_lengths[rank]]
            text = ' '.join(vocabulary_inv[token_id] for token_id in token_ids)
            print([text, candidate_scores[rank]])

In [21]:
# Decoder inputs: eight copies of token id 2, the index given to '<BOS>' in
# the vocabulary.
inputs = mx.nd.array([2] * 8)
# Random values are used in place of real encoder outputs for this demo.
begin_states = mx.nd.random.normal(0, 1, shape=(8, 1, 16))
decoder_states = net.decoder.t.init_state_from_encoder(begin_states)

# Build a sampler with the same settings as before around the current network.
beam_sampler = nlp.model.BeamSearchSampler(
    decoder=beamDecoder(net),
    beam_size=5,
    scorer=scorer,
    eos_id=eos_id,
    max_length=20)

# Print the top hypothesis for each of the eight inputs.
generate_sequences(beam_sampler, inputs, decoder_states, 1)

Generation Result:
['<BOS> A IN IN IN A A A IN IN IN IN IN IN IN IN IN A A A A <EOS>', -165.40189]
['<BOS> IN IN IN IN THE ALL ALL IN IN IN IN ALL ALL ALL IN IN A A A A <EOS>', -166.51385]
['<BOS> THE IN IN IN IN IN IN IN IN IN IN IN IN IN IN IN IN THE THE THE <EOS>', -163.97751]
['<BOS> THE THE THE THE THE THE THE THE THE THE THE THE THE THE THE THE THE THE THE THE <EOS>', -164.52054]
['<BOS> WAS WAS WAS WAS WAS WAS SO SO SO SO SO SO SO SO SO SO WAS WAS WAS WAS <EOS>', -163.64218]
['<BOS> IN IN IN IN IN IN IN IN IN IN IN IN IN IN IN IN SHE SHE SHE SHE <EOS>', -165.07233]
['<BOS> HAD HAD HAD HAD HAD HAD HAD HAD HAD HAD HAD HAD HAD HAD SHE SHE HIS HIS HIS HIS <EOS>', -169.05925]
['<BOS> IN IN IN IN IN IN IN IN IN IN IN IN IN IN IN IN IN A A A <EOS>', -164.42769]
