In [68]:
import json
import random
from pathlib import Path
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from tqdm import tqdm

In [69]:
def read_squad(path):
    path = Path(path)
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    contexts = []
    questions = []
    answers = []
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)

    return contexts, questions, answers

train_contexts, train_questions, train_answers = read_squad('input/squad/train-v2.0.json')
val_contexts, val_questions, val_answers = read_squad('input/squad/dev-v2.0.json')

In [70]:
def add_end_idx(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # sometimes squad answers are off by a character or two – fix this
        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        elif context[start_idx-1:end_idx-1] == gold_text:
            answer['answer_start'] = start_idx - 1
            answer['answer_end'] = end_idx - 1     # When the gold label is off by one character
        elif context[start_idx-2:end_idx-2] == gold_text:
            answer['answer_start'] = start_idx - 2
            answer['answer_end'] = end_idx - 2     # When the gold label is off by two characters

add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [71]:
pretrained_dir = "pretrained/distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(pretrained_dir, model_max_length=512)
print(repr(tokenizer))

PreTrainedTokenizerFast(name_or_path='pretrained/distilbert-base-cased-distilled-squad', vocab_size=28996, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


In [72]:
#train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
#val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)
train_encodings = tokenizer(train_questions, train_contexts, truncation=True, padding=True)
val_encodings = tokenizer(val_questions, val_contexts, truncation=True, padding=True)

In [73]:
def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))

        # if start position is None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        if end_positions[-1] is None:
            end_positions[-1] = tokenizer.model_max_length

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [74]:
class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [75]:
%%time
model = AutoModelForQuestionAnswering.from_pretrained(pretrained_dir)
print(repr(model.config))

DistilBertConfig {
  "_name_or_path": "pretrained/distilbert-base-cased-distilled-squad",
  "activation": "gelu",
  "architectures": [
    "DistilBertForQuestionAnswering"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": true,
  "tie_weights_": true,
  "transformers_version": "4.5.1",
  "vocab_size": 28996
}

CPU times: user 2.27 s, sys: 46.9 ms, total: 2.31 s
Wall time: 1.8 s


In [76]:
from torch.utils.data import DataLoader, Subset
from transformers import AdamW

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
model.train()
max_examples = 3 * 16
indices = range(len(train_dataset))
indices = random.sample(indices, max_examples)
sample_ds = Subset(train_dataset, indices)
train_loader = DataLoader(sample_ds, batch_size=16, shuffle=True)
#train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
optim = AdamW(model.parameters(), lr=5e-5)
epochs = 2
for epoch in range(epochs):
    loss_mean = 0
    steps = len(train_loader)
    for batch in tqdm(train_loader):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs[0]
        loss_mean += loss / steps
        loss.backward()
        optim.step()
    print(f"epoch={epoch}, loss={loss_mean:.4f}")

100%|██████████| 3/3 [01:17<00:00, 25.87s/it]
  0%|          | 0/3 [00:00<?, ?it/s]

epoch=0, loss=10.4121


100%|██████████| 3/3 [01:10<00:00, 23.63s/it]

epoch=1, loss=4.7230





In [77]:
model.eval()

DistilBertForQuestionAnswering(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            

In [78]:
model.save_pretrained("tmp")

In [79]:
%%time
model = AutoModelForQuestionAnswering.from_pretrained("tmp")
print(repr(model.config))

DistilBertConfig {
  "_name_or_path": "tmp",
  "activation": "gelu",
  "architectures": [
    "DistilBertForQuestionAnswering"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": true,
  "tie_weights_": true,
  "transformers_version": "4.5.1",
  "vocab_size": 28996
}

CPU times: user 2.03 s, sys: 62.5 ms, total: 2.09 s
Wall time: 1.73 s


In [98]:
questions = [
    "How many pretrained models are available in Transformers?",
    "What does Transformers provide?",
    "Transformers provides interoperability between which frameworks?",
    "What is Transformers previously known as?",
    "Who invented Transformers?",
    "when was Transformers invented?",
]
contexts = [r"""
🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
TensorFlow 2.0 and PyTorch.
"""] * len(questions)
enc = tokenizer(questions, contexts, truncation=True, padding=True, return_tensors="pt")
print(repr(enc))

{'input_ids': tensor([[  101,  1731,  1242,  3073,  4487,  9044,  3584,  1132,  1907,  1107,
         25267,   136,   102,   100, 25267,   113,  3147,  1227,  1112,   185,
         25669,  1766,  1732,   118, 11303,  1468,  1105,   185, 25669,  1766,
          1732,   118,  3073,  4487,  9044,   118,  1129,  3740,   114,  2790,
          1704,   118,  3007,  4220,  1116,   113,   139,  9637,  1942,   117,
         15175,  1942,   118,   123,   117,   155,  1186, 27211, 10460,  1161,
           117,   161, 22074,   117, 12120,  2050,  2723,  2064,  7340,   117,
           161,  2162, 25264,   795,   114,  1111,  6240,  6828, 21300,   113,
         21239,  2591,   114,  1105,  6240,  6828, 10617,   113, 21239,  2349,
           114,  1114,  1166,  2724,   116,  3073,  4487,  9044,  3584,  1107,
          1620,   116,  3483,  1105,  1996,  9455, 19807,  6328,  1206,  5157,
         21484,  2271,  6737,   123,   119,   121,  1105,   153,  1183,  1942,
          1766,  1732,   119,   102],


In [99]:
outputs = model(**enc).values()
start_logits, end_logits = outputs
print(f"""len(outputs)={len(outputs)}, start_logits.size={start_logits.size()}
{repr(outputs)}
""")

len(outputs)=2, start_logits.size=torch.Size([6, 124])
odict_values([tensor([[ -3.6490,  -3.6246,  -6.2458,  -5.6943,  -8.1695,  -7.8217,  -5.5975,
          -7.1176,  -5.9592,  -7.1712,  -3.8118,  -4.3419,  -7.1055,  -2.8557,
          -3.3907,  -6.6875,  -6.4565,  -7.9877,  -8.9799,  -5.4271,  -8.4327,
          -9.4773,  -8.8143,  -9.0330,  -8.1665,  -8.5648,  -8.3357,  -5.4174,
          -9.0065,  -9.9294,  -8.7472,  -9.0862,  -7.5517,  -9.0491,  -8.9072,
          -9.0021,  -8.3361,  -8.7396,  -6.5901,  -6.0266,  -5.4052,  -7.8211,
          -7.3749,  -6.7588,  -7.3100,  -5.9891,  -5.1472,  -8.3681,  -8.1012,
          -7.6546,  -6.4833,  -8.7807,  -8.8381,  -7.3546,  -8.1153,  -6.9874,
          -8.8220,  -8.7417,  -8.9461,  -8.4285,  -8.0074,  -7.0086,  -8.7146,
          -8.1232,  -7.0831,  -9.4768,  -8.7962,  -8.7904,  -8.6294,  -8.2740,
          -6.6604,  -8.7491,  -8.2894,  -6.4729,  -6.3459,  -6.4018,  -4.8097,
          -7.0090,  -6.5348,  -6.0109,  -4.4547,  -7.0041,  -6

In [100]:
input_ids = enc["input_ids"]
for i in range(len(input_ids)):
    j = torch.argmax(start_logits[i])  
    k = torch.argmax(end_logits[i]) + 1
    tokens = tokenizer.convert_ids_to_tokens(input_ids[i][j:k])
    a = tokenizer.convert_tokens_to_string(tokens)
    print(f"{questions[i]}\nj={j}, k={k}, a={a}")

How many pretrained models are available in Transformers?
j=92, k=99, a=over 32 + pretrained models
What does Transformers provide?
j=34, k=39, a=general - purpose architectures
Transformers provides interoperability between which frameworks?
j=108, k=121, a=TensorFlow 2. 0 and PyTorch
What is Transformers previously known as?
j=15, k=34, a=pytorch - transformers and pytorch - pretrained - bert
Who invented Transformers?
j=3, k=4, a=Transformers
when was Transformers invented?
j=7, k=1, a=
