### Generating news headlines with a Recurrent Neural Network

#### This notebook demonstrates an NLP pipeline for news headline generation, based on the framework described in this <a href="https://nlp.stanford.edu/courses/cs224n/2015/reports/1.pdf">paper</a>.

#### IMPORTS

In [28]:
import numpy as np
import csv
import random

# Chainer Framework
import chainer
from chainer import training
from chainer.training import extensions
import chainer.functions as F
import chainer.links as L
from chainer import reporter

#### MODEL DEFINITION

An LSTMBlock is a fundamental unit of the model described in the aforementioned paper.

In [29]:
class LSTMBlock(chainer.ChainList):
    """A stack of ``num_layers`` LSTM links applied one after another.

    Every layer maps ``num_hidden`` units to ``num_hidden`` units, and
    dropout is applied to the input of each layer.
    """

    def __init__(self, num_layers, num_hidden, dropout_ratio=0.5):
        """Create the stacked layers.

        Args:
            num_layers: Number of LSTM links in the stack.
            num_hidden: Input and output size of every LSTM link.
            dropout_ratio: Ratio handed to ``F.dropout`` before each layer.
        """
        self.num_layers = num_layers
        self.dropout_ratio = dropout_ratio

        stacked = []
        for _ in range(num_layers):
            stacked.append(L.LSTM(num_hidden, num_hidden))

        super(LSTMBlock, self).__init__(*stacked)

    def reset_state(self):
        """Reset the recurrent state of every layer in the stack."""
        for index in range(self.num_layers):
            layer = self[index]
            layer.reset_state()

    def __call__(self, x, train):
        """Advance the whole stack by one time step.

        Args:
            x: Input for this time step.
            train: Forwarded to ``F.dropout`` as its ``train`` flag.

        Returns:
            The output of the last layer in the stack.
        """
        hidden = x
        for index in range(self.num_layers):
            dropped = F.dropout(
                hidden, ratio=self.dropout_ratio, train=train
            )
            hidden = self[index](dropped)
        return hidden

In [30]:
class LSTMNet(chainer.Chain):
    """Token embedding followed by a stacked-LSTM block unrolled over time.

    Registered links:
        embed: ``L.EmbedID(vocab_size, num_hidden)``.
        cell:  ``LSTMBlock(num_rnn_layers, num_hidden, dropout_ratio)``.
    """

    def __init__(
            self,
            num_rnn_layers, num_hidden,
            vocab_size,
            dropout_ratio=0.5):
        super(LSTMNet, self).__init__(
            embed=L.EmbedID(vocab_size, num_hidden),
            cell=LSTMBlock(num_rnn_layers, num_hidden, dropout_ratio),
        )

    def __call__(self, xs, train):
        """Run the recurrent block over every time step of ``xs``.

        Args:
            xs: Token IDs. The embedded result is indexed as
                ``[:, i, :]``, i.e. the time axis is axis 1.
            train: Forwarded to the LSTM block (dropout flag).

        Returns:
            A list with one block output per time step.
        """
        # Start every sequence from a clean recurrent state.
        self.cell.reset_state()
        embeddings = self.embed(xs)
        return [self.cell(embeddings[:, i, :], train) for i in
                range(embeddings.shape[1])]

    def predict(self, xs, ys, train):
        """Project the per-step outputs through a ``linear`` link.

        ``ys`` is accepted only so the signature matches
        ``EncoderDecoder.predict``; this network does not use it. The
        previous implementation forwarded ``ys`` to ``__call__``, which
        only accepts ``(xs, train)``, and therefore always raised
        ``TypeError``.

        Raises:
            AttributeError: If no ``linear`` link has been registered on
                this chain. ``__init__`` does not create one; it has to
                be added by the caller (e.g. with ``add_link``).
        """
        linear = getattr(self, "linear", None)
        if linear is None:
            raise AttributeError(
                "LSTMNet.predict requires a 'linear' link, but none is "
                "registered on this chain"
            )
        outputs = self(xs, train)
        return linear(F.concat(outputs, axis=0))

In [31]:
class EncoderDecoder(chainer.Chain):
    """Sequence-to-sequence model: LSTM encoder plus LSTM decoder.

    Registered links:
        encoder:      ``LSTMNet`` that reads the source tokens.
        decoder_cell: ``LSTMBlock`` stepped once per target position.
        linear:       ``L.Linear(num_hidden, vocab_size)`` used by
                      ``predict`` to turn decoder outputs into logits.

    Target tokens are embedded with the encoder's embedding link
    (``self.encoder.embed``), so source and target share one embedding.
    """

    def __init__(
            self,
            num_encode_rnn_layers, num_decode_rnn_layers, num_hidden,
            vocab_size,
            dropout_ratio=0.5):
        super(EncoderDecoder, self).__init__(
            encoder=LSTMNet(
                        num_encode_rnn_layers, num_hidden, vocab_size,
                        dropout_ratio
                    ),
            decoder_cell=LSTMBlock(
                        num_decode_rnn_layers, num_hidden,
                        dropout_ratio
                    ),
            linear=L.Linear(num_hidden, vocab_size)
        )

    def __call__(self, xs, ys, train):
        """Encode ``xs`` and decode one output per position of ``ys``.

        Args:
            xs: Source token IDs.
            ys: Target token IDs; the embedded result is indexed as
                ``[:, i, :]``, i.e. the time axis is axis 1.
            train: Forwarded to encoder and decoder (dropout flag).

        Returns:
            A list of decoder outputs, one per target position.
        """
        # Only the encoder output of the last time step is used.
        encoder_hidden = self.encoder(xs, train)[-1]

        self.decoder_cell.reset_state()
        ys_embeddings = self.encoder.embed(ys)
        length = ys_embeddings.shape[1]

        outputs = []
        for i in range(length):
            if i > 0:
                # Feed the previous target token alongside the encoder
                # summary when producing position i.
                decoder_inputs = encoder_hidden + ys_embeddings[:, i - 1, :]
            else:
                # No previous token at the first position.
                decoder_inputs = encoder_hidden
            outputs.append(self.decoder_cell(decoder_inputs, train))
        return outputs

    def predict(self, xs, ys, train):
        """Return logits for every target position.

        The per-step decoder outputs are concatenated along axis 0
        (step 0 rows first, then step 1, ...) and passed through
        ``self.linear``.
        """
        outputs = self(xs, ys, train)
        return self.linear(F.concat(outputs, axis=0))

In [32]:
class GlobalAttention(chainer.Chain):
    """Additive attention over a sequence of encoder outputs.

    ``precompute`` must be called with the encoder outputs before the
    link is called; it caches the stacked outputs and their ``w1``
    projection so they are not recomputed at every decoding step.
    """

    def __init__(self, num_hidden):
        super(GlobalAttention, self).__init__(
            w1=L.Linear(num_hidden, num_hidden),
            w2=L.Linear(num_hidden, num_hidden),
            v=L.Linear(num_hidden, 1)
        )
        # Both caches are filled in by precompute().
        self.encoder_hiddens = None
        self.w1hi = None

    def __call__(self, output_hidden, length):
        """Return the attention-weighted sum of the cached encoder outputs.

        Args:
            output_hidden: Current decoder output; axis 0 is read as the
                batch size and axis 1 as the hidden size.
            length: Number of cached encoder steps to attend over.

        Returns:
            The cached encoder outputs weighted by the softmax scores and
            summed over the step axis (axis 0).
        """
        n_batch = output_hidden.shape[0]
        n_units = output_hidden.shape[1]

        # Repeat the projected decoder state for every encoder step.
        projected_state = self.w2(output_hidden)
        tiled_state = F.broadcast_to(
            projected_state, shape=(length, n_batch, n_units)
        )

        # (length, batch, units) -> (batch, length, units) -> 2-D rows.
        combined = F.swapaxes(self.w1hi + tiled_state, 0, 1)
        combined = F.reshape(combined, shape=(n_batch * length, -1))

        # One scalar score per (batch item, encoder step).
        scores = self.v(F.tanh(combined))
        logits = F.reshape(scores, shape=(n_batch, -1))

        # Expand the weights so they multiply every hidden unit.
        weights = F.softmax(logits)
        weights = F.broadcast_to(
            weights, shape=(n_units, n_batch, length)
        )
        weights = F.swapaxes(weights, 0, 2)

        return F.sum(self.encoder_hiddens * weights, axis=0)

    def precompute(self, encoder_hiddens):
        """Cache the stacked encoder outputs and their ``w1`` projection.

        Args:
            encoder_hiddens: Sequence of per-step encoder outputs; axis 0
                of each element is read as the batch size.
        """
        n_steps = len(encoder_hiddens)
        n_batch = encoder_hiddens[0].shape[0]

        self.encoder_hiddens = F.stack(encoder_hiddens)

        # Linear works on 2-D input: flatten, project, restore the shape.
        flattened = F.reshape(
            self.encoder_hiddens, shape=(n_steps * n_batch, -1)
        )
        self.w1hi = F.reshape(
            self.w1(flattened), shape=(n_steps, n_batch, -1)
        )


In [33]:
class AttentionalEncoderDecoder(EncoderDecoder):
    """Encoder-decoder whose decoder input comes from an attention link.

    At every decoding step the attention link is queried with the
    previous decoder output (zeros for the first step) instead of using
    only the last encoder output.
    """

    def __init__(
            self, attention_model, num_encode_rnn_layers,
            num_decode_rnn_layers, num_hidden, vocab_size, dropout_ratio=0.5):
        """Build the base encoder-decoder and register the attention link.

        Args:
            attention_model: Link exposing ``precompute(encoder_hiddens)``
                and ``__call__(output_hidden, length)``.
            num_encode_rnn_layers: Number of LSTM layers in the encoder.
            num_decode_rnn_layers: Number of LSTM layers in the decoder.
            num_hidden: Hidden size shared by all layers.
            vocab_size: Vocabulary size.
            dropout_ratio: Dropout ratio for encoder and decoder.
        """
        # The decoder depth used to be filled with num_encode_rnn_layers,
        # silently ignoring num_decode_rnn_layers.
        super(AttentionalEncoderDecoder, self).__init__(
                num_encode_rnn_layers,
                num_decode_rnn_layers, num_hidden, vocab_size, dropout_ratio
            )
        self.add_link("attention", attention_model)

    def __call__(self, xs, ys, train):
        """Encode ``xs`` and decode one output per position of ``ys``.

        Returns:
            A list of decoder outputs, one per target position.
        """
        encoder_hiddens = self.encoder(xs, train)
        self.attention.precompute(encoder_hiddens)
        # Attention spans the encoder steps, so its length must be the
        # source length; the target length is only correct when both
        # sequences happen to be equally long.
        source_length = len(encoder_hiddens)

        self.decoder_cell.reset_state()
        ys_embeddings = self.encoder.embed(ys)
        target_length = ys_embeddings.shape[1]

        outputs = []
        # Query for the first step: there is no decoder output yet.
        output = self.xp.zeros(
                    (xs.shape[0], self.decoder_cell[0].state_size),
                    dtype=self.xp.float32
                )
        for i in range(target_length):
            decoder_inputs = self.attention(output, source_length)
            if i > 0:
                # Add the embedding of the previous target token.
                decoder_inputs = decoder_inputs + ys_embeddings[:, i - 1, :]
            output = self.decoder_cell(decoder_inputs, train)
            outputs.append(output)
        return outputs

#### CLASSIFIER

In [34]:
class HeadlineGeneratorClassifier(chainer.Chain):
    """Wraps a predictor and turns its logits into a loss and an accuracy."""

    def __init__(self, predictor):
        super(HeadlineGeneratorClassifier, self).__init__(predictor=predictor)

    def __call__(self, xs, ys, train):
        """Compute and report the softmax cross-entropy for one batch.

        Args:
            xs: Source token IDs, passed through to ``predictor.predict``.
            ys: Target token IDs; also used as the labels.
            train: Passed through to ``predictor.predict``.

        Returns:
            The loss. ``loss`` and ``accuracy`` are also reported through
            ``chainer.reporter`` with this link as the observer.
        """
        logits = self.predictor.predict(xs, ys, train)
        # Transpose before flattening so the label order is step-major.
        targets = F.flatten(F.transpose(ys))
        loss = F.softmax_cross_entropy(logits, targets)
        observation = {
            "loss": loss,
            "accuracy": F.accuracy(logits, targets),
        }
        reporter.report(observation, self)
        return loss


#### DATASET ITERATOR

In [35]:
class DSIterator(chainer.dataset.Iterator):
    """Mini-batch iterator over ``(source, target)`` token-ID pairs.

    Each call to ``__next__`` returns a pair of int32 arrays
    ``(x_batch, y_batch)``. Every sequence gets ``eos`` appended and is
    then right-padded with ``pad`` to the longest sequence of the batch.
    The last batch of an epoch may be smaller than ``batch_size``.

    Epoch bookkeeping counts completed passes: ``epoch`` is the number of
    finished passes over the dataset and ``is_new_epoch`` is True exactly
    for the call that returned the last batch of a pass. ``epoch``
    therefore starts at 0, whereas earlier revisions of this class bumped
    it on the first batch of a pass.

    Note: with ``shuffle=True`` the ``dataset`` list passed in is shuffled
    in place at the start of every pass.
    """

    def __init__(
            self,
            xp,
            dataset,
            batch_size,
            eos, pad,
            shuffle=True, repeat=True):
        """
        Args:
            xp: Array module (e.g. ``numpy``) used to build the batches.
            dataset: List of ``(source_ids, target_ids)`` pairs.
            batch_size: Maximum number of pairs per batch; must not
                exceed ``len(dataset)``.
            eos: Token ID appended to every sequence.
            pad: Token ID used for right-padding.
            shuffle: Shuffle ``dataset`` in place before each pass.
            repeat: If False, stop after one complete pass.
        """
        self.xp = xp
        self.dataset = dataset
        self.size = len(dataset)
        self.batch_size = batch_size
        assert batch_size <= self.size
        self.eos = eos
        self.pad = pad
        self.shuffle = shuffle
        self.repeat = repeat

        self.epoch = 0
        self.is_new_epoch = False
        self.iteration = 0
        self.offset = 0

    def __next__(self):
        """Return the next padded ``(x_batch, y_batch)`` pair.

        Raises:
            StopIteration: When ``repeat`` is False and one full pass
                has already been completed.
        """
        # Check for exhaustion before touching the data, so a finished
        # iterator no longer reshuffles the dataset.
        if not self.repeat and self.epoch > 0:
            raise StopIteration

        if self.offset == 0 and self.shuffle:
            random.shuffle(self.dataset)

        next_offset = min(self.size, self.offset + self.batch_size)
        batch = self.dataset[self.offset: next_offset]
        assert len(batch) > 0

        if next_offset >= self.size:
            # This batch completes a pass over the dataset.
            self.offset = 0
            self.epoch += 1
            self.is_new_epoch = True
        else:
            self.offset = next_offset
            self.is_new_epoch = False
        self.iteration += 1

        return self._pad_batch(batch)

    def _pad_batch(self, batch):
        """Append ``eos`` and right-pad every sequence of ``batch``."""
        max_x_length = max(len(pair[0]) for pair in batch)
        max_y_length = max(len(pair[1]) for pair in batch)

        x_batch = []
        y_batch = []
        for x, y in batch:
            x_batch.append(
                x + [self.eos] + [self.pad] * (max_x_length - len(x))
            )
            y_batch.append(
                y + [self.eos] + [self.pad] * (max_y_length - len(y))
            )
        x_batch = self.xp.array(x_batch, dtype=np.int32)
        y_batch = self.xp.array(y_batch, dtype=np.int32)
        return x_batch, y_batch

    @property
    def epoch_detail(self):
        """Completed passes plus the fraction of the current pass consumed."""
        return self.epoch + (self.offset * 1.0 / self.size)

    def serialize(self, serializer):
        """Save or restore the iteration counters.

        The key names are kept unchanged so existing snapshots still load.
        """
        self.iteration = serializer("iteration", self.iteration)
        self.epoch = serializer("self.epoch", self.epoch)
        self.offset = serializer("self.offset", self.offset)

#### UPDATER

In [36]:

class HeadlineGeneratorUpdater(chainer.training.StandardUpdater):
    """Updater that feeds ``(x_batch, y_batch)`` pairs straight to the model.

    The iterator is expected to yield ready-made array pairs, so no
    converter is applied to the batch.
    """

    def __init__(self, data_iter, optimizer, device):
        super(HeadlineGeneratorUpdater, self).__init__(
            data_iter, optimizer, device=device
        )

    def update_core(self):
        """Run one forward/backward pass and apply the optimizer update."""
        batch_iterator = self.get_iterator("main")
        main_optimizer = self.get_optimizer("main")

        sources, targets = next(batch_iterator)

        network = main_optimizer.target
        loss = network(sources, targets, train=True)
        # Drop gradients left over from the previous step before backprop.
        network.cleargrads()
        loss.backward()
        main_optimizer.update()


### TRAINING THE MODEL

#### CONSTANTS

In [43]:
TRAIN_SIZE = 0.7  # Fraction of the generated samples that gen_dataset puts in the training split.
VALID_SIZE = 0.3  # Intended validation fraction. NOTE(review): not referenced in the visible cells; gen_dataset derives the validation split from TRAIN_SIZE.
MAX_STRING_LEN = 140  # NOTE(review): not referenced in the visible cells - confirm whether it is still needed.
NUM_ENCODE_LAYERS = 3 # Number of stacked LSTM layers in the encoder.
NUM_DECODE_LAYERS = 3 # Number of stacked LSTM layers in the decoder.
NUM_HIDDEN = 128 # Hidden units per LSTM layer (also the embedding size).
BATCH_SIZE = 1 # Mini-batch size; the author suggests 1 on CPU and 128 on GPU.
NUM_EPOCHS = 20  # Stop trigger handed to the Trainer, in epochs.
DROPOUT = 0.5  # Dropout ratio passed to the model.
DEVICE = -1 # -1 = CPU; a non-negative value selects that GPU.
DS_PATH = './dataset/news_summary.csv'  # CSV read by gen_word_id_map and gen_dataset.
GRADIENT_CLIP = 5  # Threshold passed to the GradientClipping optimizer hook.
LOG = "./log"  # Output directory handed to the Trainer.
EOS = "<eos>"  # End-of-sequence token added to the vocabulary.
PAD = "<pad>"  # Padding token added to the vocabulary.

#### GENERATING DATASET

In [None]:
def gen_word_id_map(path):
    """Build the token -> integer ID vocabulary from the dataset CSV.

    For every row, the lower-cased ``ctext`` column is tokenised first and
    the lower-cased ``text`` column second, both by splitting on single
    spaces. IDs are handed out in order of first appearance. The ``EOS``
    and ``PAD`` tokens are assigned last.

    Args:
        path: Path of a CSV file with ``ctext`` and ``text`` columns,
            read with latin-1 encoding.

    Returns:
        A dict mapping each token to its integer ID.
    """
    vocabulary = {}
    with open(path, 'r', encoding='latin-1') as handle:
        for record in csv.DictReader(handle):
            article_tokens = record['ctext'].lower().split(' ')
            summary_tokens = record['text'].lower().split(' ')
            for token in article_tokens + summary_tokens:
                # setdefault keeps the ID of a token that is already known.
                vocabulary.setdefault(token, len(vocabulary))

    for special in (EOS, PAD):
        vocabulary[special] = len(vocabulary)
    return vocabulary


def gen_dataset(word_id_map, path):
    """Turn the dataset CSV into training and validation sample lists.

    Each row becomes one ``[article_ids, summary_ids]`` pair, built from
    the lower-cased ``ctext`` and ``text`` columns split on single
    spaces. A summary shorter than its article is extended with one
    ``EOS`` ID followed by ``PAD`` IDs until both have the same length.
    Rows whose summary is longer than the article are skipped.

    Two defects of the previous version are fixed here: it stored the
    summary as both source and target (``[tar, tar]``), so the article
    was never fed to the model, and its validation slice ended at ``-1``,
    which dropped the last sample.

    Args:
        word_id_map: Token -> ID dict containing every token of the
            file plus the ``EOS`` and ``PAD`` tokens.
        path: Path of a CSV file with ``ctext`` and ``text`` columns,
            read with latin-1 encoding.

    Returns:
        ``(train, validation)``: the first ``TRAIN_SIZE`` fraction of
        the pairs and all remaining pairs.

    Raises:
        KeyError: If a token of the file is missing from ``word_id_map``.
    """
    eos_id = word_id_map[EOS]
    pad_id = word_id_map[PAD]

    pairs = []
    with open(path, 'r', encoding='latin-1') as csvfile:
        for row in csv.DictReader(csvfile):
            inp = [word_id_map[w] for w in row['ctext'].lower().split(' ')]
            tar = [word_id_map[w] for w in row['text'].lower().split(' ')]

            shortfall = len(inp) - len(tar)
            if shortfall < 0:
                # Summary longer than the article: not representable
                # with equal-length pairs, so the row is skipped.
                continue
            if shortfall > 0:
                tar += [eos_id] + [pad_id] * (shortfall - 1)

            pairs.append([inp, tar])

    split = int(len(pairs) * TRAIN_SIZE)
    return pairs[:split], pairs[split:]

####  CHAINER DRIVER FOR LOSS REPORTER

In [None]:
# Vocabulary and train/validation samples.
word_id_map = gen_word_id_map(DS_PATH)
VOCAB_SIZE = len(word_id_map)


train_set, val_set = gen_dataset(word_id_map, DS_PATH)


# NOTE(review): the attention link is created but the plain EncoderDecoder
# is what gets trained below; confirm whether AttentionalEncoderDecoder
# was meant to be used here.
attention_model = GlobalAttention(NUM_HIDDEN)
model = EncoderDecoder(
            NUM_ENCODE_LAYERS,
            NUM_DECODE_LAYERS,
            NUM_HIDDEN,
            VOCAB_SIZE,
            DROPOUT
        )
model = HeadlineGeneratorClassifier(model)


xp = np


# OPTIMIZER
optimizer = chainer.optimizers.Adam()
optimizer.setup(model)
optimizer.add_hook(
    chainer.optimizer.GradientClipping(GRADIENT_CLIP)
)


# DS ITERATORS.
train_iter = DSIterator(
    xp,
    train_set,
    BATCH_SIZE,
    word_id_map[EOS],
    word_id_map[PAD],
    shuffle=False,
    repeat=True
)

val_iter = DSIterator(
    xp,
    val_set,
    BATCH_SIZE,
    word_id_map[EOS],
    word_id_map[PAD],
    shuffle=False,
    repeat=False
)


# UPDATER
updater = HeadlineGeneratorUpdater(
    train_iter,
    optimizer,
    DEVICE
)


# TRAINER
trainer = training.Trainer(
    updater,
    stop_trigger=(NUM_EPOCHS, "epoch"),
    out=LOG
)


# EXTENSIONS.
# The Evaluator's default converter stacks the (x_batch, y_batch) pair
# produced by DSIterator into a single array, hence the indexing in
# eval_func. This relies on both arrays having the same shape.
trainer.extend(
    extensions.Evaluator(
        val_iter, model, device=DEVICE,
        eval_func=lambda batch: model(batch[0], batch[1], train=False)
    )
)

interval = 1


def _alias_report_keys(result):
    """Add the short column names used by PrintReport to a log entry.

    The classifier reports "loss" and "accuracy"; the trainer and the
    Evaluator record them under "main/..." and "validation/main/...".
    LogReport calls this hook with the aggregated dict and expects it to
    be modified in place. The original keys are kept.

    This replaces a call to ``compute_loss``, which is not defined
    anywhere in this notebook and raised NameError on the first log
    entry.
    """
    aliases = {
        "main/loss": "train_loss",
        "main/accuracy": "train_acc",
        "validation/main/loss": "val_loss",
        "validation/main/accuracy": "val_acc",
    }
    for reported_key, alias in aliases.items():
        if reported_key in result:
            result[alias] = result[reported_key]


trainer.extend(
    extensions.LogReport(
        postprocess=_alias_report_keys,
        trigger=(interval, "epoch")
    )
)

trainer.extend(
    extensions.PrintReport(
        [
            "epoch", "iteration",
            "train_loss", "train_acc",
            "val_loss", "val_acc"
        ]
    ),
    trigger=(interval, "epoch")
)

trainer.extend(
    extensions.ProgressBar(update_interval=1)
)

# Full trainer snapshot plus a model-only snapshot.
trainer.extend(
    extensions.snapshot(trigger=(interval, "epoch"))
)

trainer.extend(
    extensions.snapshot_object(model, "model_epoch_{.updater.epoch}")
)

trainer.run()


364 364
402 402
333 333
404 404
526 526
370 370
404 404
581 581
231 231
249 249
354 354
439 439
372 372
937 937
148 148
359 359
350 350
129 129
250 250
540 540
316 316
492 492
327 327
378 378
382 382
300 300
215 215
150 150
167 167
1132 1132
203 203
159 159
247 247
841 841
203 203
124 124
408 408
278 278
348 348
510 510
232 232
508 508
364 364
280 280
214 214
603 603
165 165
349 349
256 256
166 166
168 168
306 306
173 173
722 722
408 408
288 288
592 592
664 664
585 585
522 522
373 373
262 262
345 345
325 325
324 324
174 174
429 429
478 478
376 376
183 183
429 429
559 559
410 410
315 315
257 257
343 343
361 361
599 599
110 110
292 292
237 237
295 295
298 298
345 345
400 400
386 386
298 298
206 206
484 484
374 374
107 107
436 436
115 115
318 318
280 280
80 80
398 398
123 123
317 317
110 110
949 949
392 392
94 94
638 638
205 205
183 183
309 309
242 242
592 592
305 305
101 101
257 257
290 290
409 409
337 337
886 886
68 68
509 509
206 206
1520 1520
468 468
593 593
285 285
311 311
213 213
56