In [1]:
from transformers import BertTokenizerFast, BertForQuestionAnswering

# there are pre fine-tuned models that learned from the SQuAD dataset

bert_tokenizer = BertTokenizerFast.from_pretrained(
    'bert-large-uncased-whole-word-masking-finetuned-squad', return_token_type_ids=True
)
qa_bert = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')


In [2]:
qa_bert

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-12,

In [3]:
from datasets import Dataset, load_dataset
import pandas as pd

q, c, s, e, f_a = [], [], [], [], []

def get_sub_list_positions(context_encoded, answer_encoded):
    for idx in range(len(context_encoded) - len(answer_encoded) + 1):
        if context_encoded[idx : idx + len(answer_encoded)] == answer_encoded:
            return idx, idx + len(answer_encoded) - 1
    return None, None
    
for example in load_dataset('adversarial_qa', 'adversarialQA', split='train'):
    context_encoded = bert_tokenizer.encode(example['question'], example['context'])
    for answer, answer_start in zip(example['answers']['text'], example['answers']['answer_start']):
        answer_encoded = bert_tokenizer.encode(answer, add_special_tokens=False)  # ignore the CLS and SEP token
        start_pos, end_pos = get_sub_list_positions(context_encoded, answer_encoded)
        if start_pos:
            q.append(example['question'])
            c.append(example['context'])
            s.append(start_pos)  # TODO does this need to include the question??
            e.append(end_pos)# TODO does this need to include the question??
            f_a.append(answer)
            break
    
qa_df = pd.DataFrame({
    'question': q, 'context': c, 'start_positions': s, 'end_positions': e, 'answer':  f_a
})

Reusing dataset adversarial_qa (/Users/sinanozdemir/.cache/huggingface/datasets/adversarial_qa/adversarialQA/1.0.0/92356be07b087c5c6a543138757828b8d61ca34de8a87807d40bbc0e6c68f04b)
Token indices sequence length is longer than the specified maximum sequence length for this model (542 > 512). Running this sequence through the model will result in indexing errors


In [4]:
qa_df.shape

(29989, 5)

In [5]:
example

{'id': '51fc3ffa78a1eb0d126ad19f48d84c4c8e8220d8',
 'title': 'Film_speed',
 'context': 'Some high-speed black-and-white films, such as Ilford Delta 3200 and Kodak T-MAX P3200, are marketed with film speeds in excess of their true ISO speed as determined using the ISO testing method. For example, the Ilford product is actually an ISO 1000 film, according to its data sheet. The manufacturers do not indicate that the 3200 number is an ISO rating on their packaging. Kodak and Fuji also marketed E6 films designed for pushing (hence the "P" prefix), such as Ektachrome P800/1600 and Fujichrome P1600, both with a base speed of ISO 400.',
 'question': 'What letter designates what Ektachrome is designed for?',
 'answers': {'text': ['P'], 'answer_start': [450]},
 'metadata': {'split': 'train', 'model_in_the_loop': 'Combined'}}

In [6]:
qa_dataset = Dataset.from_pandas(qa_df.sample(2000, random_state=42))

# Dataset has a built in train test split method
qa_dataset = qa_dataset.train_test_split(test_size=0.2)

qa_train = qa_dataset['train']
qa_test = qa_dataset['test']


qa_train[0]

{'question': 'What are biblical books compared to?',
 'context': "Many ancient works, such as the Bible and the Greek tragedies,[citation needed] survive in hundreds of copies, and the relationship of each copy to the original may be unclear. Textual scholars have debated for centuries which sources are most closely derived from the original, hence which readings in those sources are correct.[citation needed] Although biblical books that are letters, like Greek plays, presumably had one original, the question of whether some biblical books, like the Gospels, ever had just one original has been discussed. Interest in applying textual criticism to the Qur'an has also developed after the discovery of the Sana'a manuscripts in 1972, which possibly date back to the 7–8th centuries.",
 'start_positions': 84,
 'end_positions': 85,
 'answer': 'Greek plays',
 '__index_level_0__': 2320}

In [7]:
# We will pad our dataset so that our input matrices are the same length and truncate anything longer than 512 tokens
def preprocess(data):
    return bert_tokenizer(data['question'], data['context'], padding='max_length', truncation=True)

qa_train = qa_train.map(preprocess, batched=True, batch_size=512)
qa_test = qa_test.map(preprocess, batched=True, batch_size=512)



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [8]:
qa_train[0]

{'__index_level_0__': 2320,
 'answer': 'Greek plays',
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0

In [9]:
# We use the token_type_ids to differentiate between sentence A and sentence B

qa_train.set_format(
    'torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions']
)
qa_test.set_format(
    'torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions']
)

In [10]:
list(qa_bert.bert.named_parameters())[357]

('encoder.layer.22.attention.self.query.weight',
 Parameter containing:
 tensor([[ 0.0185,  0.0095, -0.0095,  ..., -0.0581,  0.0758,  0.0024],
         [ 0.0438,  0.0317,  0.0085,  ..., -0.0350, -0.0656,  0.0154],
         [-0.0217, -0.0324, -0.0038,  ...,  0.0086,  0.0222, -0.0580],
         ...,
         [-0.0281,  0.0188, -0.0022,  ..., -0.0140, -0.0115, -0.0380],
         [ 0.0588,  0.0527, -0.0517,  ...,  0.0172, -0.0193, -0.0368],
         [ 0.0208, -0.0004,  0.0406,  ...,  0.0185, -0.0111,  0.0547]],
        requires_grad=True))

In [11]:
# freeze all but the last 2 encoder layers in BERT to speed up training
for param in list(qa_bert.bert.parameters())[:357]:
    param.requires_grad = False  # disable training in BERT

In [12]:
from transformers import TrainingArguments, Trainer

batch_size = 32
epochs = 2

training_args = TrainingArguments(
    output_dir='./qa/results',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_dir='./qa/logs',
    save_strategy='epoch',
    logging_strategy='steps',
    logging_steps=1,
    logging_first_step=True
)

trainer = Trainer(
    model=qa_bert,
    args=training_args,
    train_dataset=qa_train,
    eval_dataset=qa_test
)

# Get initial metrics
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `BertForQuestionAnswering.forward` and have been ignored: __index_level_0__, context, question, answer.
***** Running Evaluation *****
  Num examples = 400
  Batch size = 32


{'eval_loss': 3.6653783321380615,
 'eval_runtime': 221.2114,
 'eval_samples_per_second': 1.808,
 'eval_steps_per_second': 0.059}

In [48]:
example = qa_df.iloc[50]
encoded = bert_tokenizer.encode_plus(example.question, example.context, return_tensors='pt')

response = qa_bert(**encoded)
response.start_logits.argmax(), response.end_logits.argmax()

example

question           Where is the suprachiasmatic nucleus lococated...
context            The SCN projects to a set of areas in the hypo...
start_positions                                                   28
end_positions                                                     32
answer                                                  hypothalamus
Name: 50, dtype: object

In [49]:
bert_tokenizer.decode(encoded['input_ids'][0][response.start_logits.argmax(): response.end_logits.argmax()])


'hypothalamus, brainstem, and midbra'

In [20]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForQuestionAnswering.forward` and have been ignored: __index_level_0__, context, question, answer.
***** Running training *****
  Num examples = 1600
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 100


Step,Training Loss
1,3.7115
2,3.7753
3,2.4804
4,3.0738
5,3.7672
6,2.8261
7,2.5261
8,2.6538
9,2.9576
10,2.9626


Saving model checkpoint to ./qa/results/checkpoint-50
Configuration saved in ./qa/results/checkpoint-50/config.json
Model weights saved in ./qa/results/checkpoint-50/pytorch_model.bin
Saving model checkpoint to ./qa/results/checkpoint-100
Configuration saved in ./qa/results/checkpoint-100/config.json
Model weights saved in ./qa/results/checkpoint-100/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=100, training_loss=2.698631682395935, metrics={'train_runtime': 4335.5782, 'train_samples_per_second': 0.738, 'train_steps_per_second': 0.023, 'total_flos': 2971862374809600.0, 'train_loss': 2.698631682395935, 'epoch': 2.0})

In [21]:
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `BertForQuestionAnswering.forward` and have been ignored: __index_level_0__, context, question, answer.
***** Running Evaluation *****
  Num examples = 400
  Batch size = 32


{'eval_loss': 2.4868643283843994,
 'eval_runtime': 217.7224,
 'eval_samples_per_second': 1.837,
 'eval_steps_per_second': 0.06,
 'epoch': 2.0}

In [22]:
qa_bert.save_pretrained(
    './qa/results', 
    push_to_hub=False, repo_name="finetuned-qa-model"
)

Configuration saved in ./qa/results/config.json
Model weights saved in ./qa/results/pytorch_model.bin


In [50]:
example = qa_df.iloc[50]
encoded = bert_tokenizer.encode_plus(example.question, example.context, return_tensors='pt')

response = qa_bert(**encoded)
response.start_logits.argmax(), response.end_logits.argmax()

example

question           Where is the suprachiasmatic nucleus lococated...
context            The SCN projects to a set of areas in the hypo...
start_positions                                                   28
end_positions                                                     32
answer                                                  hypothalamus
Name: 50, dtype: object

In [51]:
bert_tokenizer.decode(encoded['input_ids'][0][response.start_logits.argmax(): response.end_logits.argmax()])


'hypothalamus, brainstem, and midbra'