# Data

In [4]:
import io
import json
import os

from torchtext import data
from torchtext.vocab import GloVe
import spacy

In [5]:
class SQuAD(object):
    def __init__(self, squad_version="1.1", word_vec_dim=100, train_batch_size=60, dev_batch_size=60, gpu=0):
        self.spacy = spacy.load('en')
        train_file = f'train-v{squad_version}.json'
        dev_file = f'dev-v{squad_version}.json'
        raw_dir = os.path.join('data', 'raw')
        processed_dir = os.path.join('data', 'processed')
        if not os.path.exists(os.path.join(processed_dir, train_file)):
            self.pre_process(raw_dir, train_file, processed_dir)
        if not os.path.exists(os.path.join(processed_dir, dev_file)):
            self.pre_process(raw_dir, dev_file, processed_dir)

        self.CHAR_NESTING = data.Field(batch_first=True, lower=True, tokenize=list)
        self.CHAR = data.NestedField(self.CHAR_NESTING, tokenize=self.tokenizer)
        self.WORD = data.Field(batch_first=True, include_lengths=True, lower=True, tokenize=self.tokenizer)
        self.LABEL = data.Field(sequential=False, unk_token=None, use_vocab=False)

        dict_fields = {'context': [('x_word', self.WORD), ('x_char', self.CHAR)],
                       'query': [('q_word', self.WORD), ('q_char', self.CHAR)],
                       'p_begin': ('p_begin', self.LABEL),
                       'p_end': ('p_end', self.LABEL)}

        train, dev = data.TabularDataset.splits(path=processed_dir,
                                                train=train_file,
                                                validation=dev_file,
                                                format='json',
                                                fields=dict_fields)

        self.CHAR.build_vocab(train, dev)
        self.WORD.build_vocab(train, dev, vectors=GloVe(name='6B', dim=word_vec_dim))
        self.train_iter, self.dev_iter = data.BucketIterator.splits(
            (train, dev),
            batch_sizes=[train_batch_size, dev_batch_size],
            device=gpu,
            sort_key=lambda x: len(x.c_word))

    def tokenizer(self, text):
        return [t.text for t in self.spacy.tokenizer(text)]

    def pre_process(self, input_dir, input_file, output_dir):
        in_filename = os.path.join(input_dir, input_file)
        out = []
        with io.open(in_filename, 'r', encoding='utf-8', errors='ignore') as f:
            data = json.load(f)['data']
            for article in data:
                for paragraph in article['paragraphs']:
                    context = paragraph['context']
                    tokens = self.tokenizer(context)
                    for qa in paragraph['qas']:
                        question = qa['question']
                        for ans in qa['answers']:
                            s_idx = ans['answer_start']
                            e_idx = s_idx + len(ans['text'])
                            cum_len = 0
                            p_begin = -1
                            p_end = -1
                            answer = ""
                            for i, t in enumerate(tokens):
                                while context[cum_len] == ' ':
                                    cum_len += 1
                                if p_begin == -1 and s_idx <= cum_len:
                                    p_begin = i
                                if p_begin != -1:
                                    if len(answer) > 0:
                                        answer += ' '
                                    answer += t
                                cum_len += len(t)
                                if p_end == -1 and e_idx <= cum_len:
                                    p_end = i
                                    if p_begin == -1:
                                        p_begin = i
                                    break
                            assert p_begin != -1 and p_end != -1
                            out.append(dict([('context', context),
                                             ('query', question),
                                             ('answer', answer),
                                             ('p_begin', p_begin),
                                             ('p_end', p_end)]))

        out_filename = os.path.join(output_dir, input_file)
        with open(out_filename, 'w', encoding='utf-8') as f:
            for o in out:
                json.dump(o, f)
                f.write('\n')

In [6]:
squad_data = SQuAD(squad_version='1.1')

The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.


# Model

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from copy import deepcopy

In [8]:
def copy_module(module, N):
    return nn.ModuleList([deepcopy(module) for _ in range(N)])

In [9]:
class Linear(nn.Module):
    def __init__(self, input_dim, output_dim, dropout):
        super(Linear, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        x = self.dropout(x)
        x = self.linear(x)
        return x

In [10]:
class HighwayMLP(nn.Module):
    def __init__(self, input_size, output_size, num_layer=2):
        super(HighwayMLP, self).__init__()
        self.num_layer = num_layer
        self.gate = copy_module(
            nn.Sequential(nn.Linear(input_size, output_size), nn.Sigmoid()),
            num_layer)
        self.transform = copy_module(
            nn.Sequential(nn.Linear(input_size, output_size), nn.ReLU()),
            num_layer)

    def forward(self, x1, x2):
        x = torch.cat((x1, x2), dim=-1)
        for i in range(self.num_layer):
            t = self.transform[i](x)
            g = self.gate[i](x)
            x = t * g + (1-g) * x
        return x

In [11]:
class SingleLayerLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, bidirectional, dropout):
        super(SingleLayerLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=1,
                            batch_first=True, bidirectional=bidirectional)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, x_len):
        x = self.dropout(x)

        sorted_x_len, x_idx = torch.sort(x_len, descending=True)
        sorted_x = x.index_select(dim=0, index=x_idx)
        _, x_ori_idx = torch.sort(x_idx)

        x_packed = nn.utils.rnn.pack_padded_sequence(sorted_x, sorted_x_len, batch_first=True)
        x_packed, _ = self.lstm(x_packed, None)

        x, _ = nn.utils.rnn.pad_packed_sequence(x_packed, batch_first=True)
        x = x.index_select(dim=0, index=x_ori_idx)
        return x

In [12]:
class CharCNN(nn.Module):
    def __init__(self, char_emb_dim, char_vocab_size, channel_num, channel_width, dropout):
        super(CharCNN, self).__init__()
        self.char_emb_dim = char_emb_dim
        self.channel_num = channel_num
        self.char_embedding = nn.Embedding(char_vocab_size, char_emb_dim)
        self.char_cnn = nn.Conv2d(1, channel_num, (channel_width, char_emb_dim))
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        batch_len = x.size(0)
        seq_len = x.size(1)
        x = self.dropout(self.char_embedding(x))                            # (batch_len, seq_len, word_len, char_dim)
        x = x.view(batch_len*seq_len, -1, self.char_emb_dim).unsqueeze(1)   # (batch * seq_len, 1, char_dim, word_len)
        x = self.char_cnn(x).squeeze()                                      # (batch * seq_len, channel_num, convolved)
        x = F.max_pool1d(x, x.size(-1)).squeeze()                           # (batch * seq_len, channel_num)
        x = x.view(batch_len, seq_len, self.channel_num)                    # (batch, seq_len, channel_num)
        return x

In [25]:
class BiDAF(nn.Module):
    def __init__(self, pretrain_embedding, char_vocab_size,
                 hidden_size=100, char_emb_dim=8, char_channel_num=100, char_channel_width=5, dropout=0.2):
        super(BiDAF, self).__init__()
        self.char_emb = CharCNN(char_emb_dim=char_emb_dim,
                                char_vocab_size=char_vocab_size,
                                channel_num=char_channel_num,
                                channel_width=char_channel_width,
                                dropout=dropout)
        self.word_emb = nn.Embedding.from_pretrained(pretrain_embedding, freeze=True)
        self.highway = HighwayMLP(input_size=hidden_size*2,
                                  output_size=hidden_size*2,
                                  num_layer=2)
        self.contextual_emb = SingleLayerLSTM(input_size=hidden_size*2,
                                              hidden_size=hidden_size,
                                              bidirectional=True, dropout=dropout)
        self.ws_h = Linear(hidden_size * 2, 1, dropout)
        self.ws_u = Linear(hidden_size * 2, 1, dropout)
        self.ws_hu = Linear(hidden_size * 2, 1, dropout)
        self.modeling_lstm_1 = SingleLayerLSTM(input_size=hidden_size * 8,
                                               hidden_size=hidden_size,
                                               bidirectional=True, dropout=dropout)
        self.modeling_lstm_2 = SingleLayerLSTM(input_size=hidden_size * 2,
                                               hidden_size=hidden_size,
                                               bidirectional=True, dropout=dropout)
        self.output_lstm = SingleLayerLSTM(input_size=hidden_size * 2,
                                           hidden_size=hidden_size,
                                           bidirectional=True, dropout=dropout)
        self.wp1_g = Linear(hidden_size * 8, 1, dropout=dropout)
        self.wp1_m = Linear(hidden_size * 2, 1, dropout=dropout)
        self.wp2_g = Linear(hidden_size * 8, 1, dropout=dropout)
        self.wp2_m = Linear(hidden_size * 2, 1, dropout=dropout)

    def bidaf(self, h, u):
        t = h.size(1)  # x_len
        j = u.size(1)  # q_len
        hh = h.unsqueeze(2).repeat(1, 1, j, 1)  # (batch, x_len, q_len, hidden*2)
        uu = u.unsqueeze(1).repeat(1, t, 1, 1)  # (batch, x_len, q_len, hidden*2)
        s = self.ws_h(hh) + self.ws_u(uu) + self.ws_hu(hh * uu)  # (batch, x_len, q_len)
        s = s.squeeze()
        
        a = F.softmax(s, dim=2)     # (batch, x_len, q_len)
        c2q_att = torch.bmm(a, u)   # (batch, x_len, hidden*2)

        b = F.softmax(torch.max(s, dim=2)[0], dim=1).unsqueeze(1)  # (batch, 1, x_len)
        q2c_att = torch.bmm(b, h).squeeze()                        # (batch, hidden*2)
        q2c_att = q2c_att.unsqueeze(1).repeat(1, t, 1)             # (batch, x_len, hidden*2)

        return torch.cat((h, c2q_att, h * c2q_att, h * q2c_att), dim=-1)

    def forward(self, batch):
        # Character Embedding Layer
        x_char_emb = self.char_emb(batch.x_char)
        q_char_emb = self.char_emb(batch.q_char)

        # Word Embedding Layer
        x_word_emb = self.word_emb(batch.x_word[0])
        q_word_emb = self.word_emb(batch.q_word[0])
        x_lens = batch.x_word[1]
        q_lens = batch.q_word[1]
        
        x = self.highway(x_char_emb, x_word_emb)
        q = self.highway(q_char_emb, q_word_emb)

        # Contextual Embedding Layer
        h = self.contextual_emb(x, x_lens)
        u = self.contextual_emb(q, q_lens)

        # Attention Flow Layer
        g = self.bidaf(h, u)

        # Modeling Layer
        m = self.modeling_lstm_1(g, x_lens)
        m = self.modeling_lstm_2(m, x_lens)

        # Output Layer
        p1 = (self.wp1_g(g) + self.wp1_m(m)).squeeze()
        p1 = F.softmax(p1, dim=-1)
        m2 = self.output_lstm(m, x_lens)
        p2 = (self.wp2_g(g) + self.wp2_m(m2)).squeeze()
        p2 = F.softmax(p2, dim=-1)
        return p1, p2

In [36]:
device = torch.device(f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu")
model = BiDAF(pretrain_embedding=squad_data.WORD.vocab.vectors,
              char_vocab_size=len(squad_data.CHAR_NESTING.vocab)).to(device)

# Train

In [37]:
from torch import optim
from tensorboardX import SummaryWriter

optimizer = optim.Adagrad(filter(lambda p: p.requires_grad, model.parameters()), lr=0.2)
criterion = nn.CrossEntropyLoss()
model.train()
writer = SummaryWriter('log')



In [None]:
iter = 0
for i_epoch in range(12):
    print(f'Epoch {i_epoch}')
    squad_data.train_iter.init_epoch()
    epoch_loss = 0.0

    cur_batch_loss = 0.0
    for i, batch in enumerate(squad_data.train_iter):
        p1, p2 = model(batch)
        loss = criterion(p1, batch.p_begin) + criterion(p2, batch.p_end)
        cur_batch_loss += loss.item()
        epoch_loss += loss.item()
        loss.backward()
        optimizer.step()
        iter += 1

        if iter % 100:
            writer.add_scalar('Train/Loss', cur_batch_loss, iter // 100)
            cur_batch_loss = 0.0

    print(f"Total epoch loss {epoch_loss}")

Epoch 0
