In [1]:
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
from transformers import BertTokenizerFast, BertForQuestionAnswering

# there are pre fine-tuned models that learned from the SQuAD dataset

bert_tokenizer = BertTokenizerFast.from_pretrained('bert-large-uncased', return_token_type_ids=True)
qa_bert = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

# bert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')  # distilbert doesn't have token type IDs
# qa_bert = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased-distilled-squad')


INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


In [2]:
qa_bert

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-12,

In [3]:
from datasets import Dataset, load_dataset
import pandas as pd

q, c, s, e, f_a = [], [], [], [], []

def get_sub_list_positions(context_encoded, answer_encoded):
    for idx in range(len(context_encoded) - len(answer_encoded) + 1):
        if context_encoded[idx : idx + len(answer_encoded)] == answer_encoded:
            return idx, idx + len(answer_encoded) - 1
    return None, None
    
for example in load_dataset('adversarial_qa', 'adversarialQA', split='train'):
    context_encoded = bert_tokenizer.encode(example['context'])
    for answer, answer_start in zip(example['answers']['text'], example['answers']['answer_start']):
        answer_encoded = bert_tokenizer.encode(answer, add_special_tokens=False)  # ignore the CLS and SEP token
        start_pos, end_pos = get_sub_list_positions(context_encoded, answer_encoded)
        if start_pos:
            q.append(example['question'])
            c.append(example['context'])
            s.append(start_pos)
            f_a.append(answer)
            e.append(end_pos)
            break
    
qa_df = pd.DataFrame({
    'question': q, 'context': c, 'start_positions': s, 'end_positions': e, 'answer':  f_a
})

Reusing dataset adversarial_qa (/Users/sinanozdemir/.cache/huggingface/datasets/adversarial_qa/adversarialQA/1.0.0/92356be07b087c5c6a543138757828b8d61ca34de8a87807d40bbc0e6c68f04b)
Token indices sequence length is longer than the specified maximum sequence length for this model (520 > 512). Running this sequence through the model will result in indexing errors


In [4]:
qa_df.shape

(29989, 5)

In [5]:
example

{'id': '51fc3ffa78a1eb0d126ad19f48d84c4c8e8220d8',
 'title': 'Film_speed',
 'context': 'Some high-speed black-and-white films, such as Ilford Delta 3200 and Kodak T-MAX P3200, are marketed with film speeds in excess of their true ISO speed as determined using the ISO testing method. For example, the Ilford product is actually an ISO 1000 film, according to its data sheet. The manufacturers do not indicate that the 3200 number is an ISO rating on their packaging. Kodak and Fuji also marketed E6 films designed for pushing (hence the "P" prefix), such as Ektachrome P800/1600 and Fujichrome P1600, both with a base speed of ISO 400.',
 'question': 'What letter designates what Ektachrome is designed for?',
 'answers': {'text': ['P'], 'answer_start': [450]},
 'metadata': {'split': 'train', 'model_in_the_loop': 'Combined'}}

In [6]:
qa_dataset = Dataset.from_pandas(qa_df.sample(500, random_state=42))

# Dataset has a built in train test split method
qa_dataset = qa_dataset.train_test_split(test_size=0.2)

qa_train = qa_dataset['train']
qa_test = qa_dataset['test']


qa_train[0]

{'question': 'Which western N.C. settler was second mentioned?',
 'context': 'Differences in the settlement patterns of eastern and western North Carolina, or the Low Country and uplands, affected the political, economic, and social life of the state from the 18th until the 20th century. The Tidewater in eastern North Carolina was settled chiefly by immigrants from rural England and the Scottish Highlands. The upcountry of western North Carolina was settled chiefly by Scots-Irish, English, and German Protestants, the so-called "cohee". Arriving during the mid- to late 18th century, the Scots-Irish from what is today Northern Ireland were the largest non-English immigrant group before the Revolution; English indentured servants were overwhelmingly the largest immigrant group before the Revolution. During the American Revolutionary War, the English and Highland Scots of eastern North Carolina tended to remain loyal to the British Crown, because of longstanding business and personal conne

In [7]:
# We will pad our dataset so that our input matrices are the same length and truncate anything longer than 512 tokens
def preprocess(data):
    return bert_tokenizer(data['question'], data['context'], padding='max_length', truncation=True)

qa_train = qa_train.map(preprocess, batched=True, batch_size=512)
qa_test = qa_test.map(preprocess, batched=True, batch_size=512)



  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [33]:
qa_train[0]

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0

In [None]:
# We use the token_type_ids to differentiate between sentence A and sentence B

qa_train.set_format(
    'torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions']
)
qa_test.set_format(
    'torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions']
)

In [8]:
list(qa_bert.bert.named_parameters())[357]

('encoder.layer.22.attention.self.query.weight',
 Parameter containing:
 tensor([[ 0.0185,  0.0095, -0.0095,  ..., -0.0581,  0.0758,  0.0024],
         [ 0.0438,  0.0317,  0.0085,  ..., -0.0350, -0.0656,  0.0154],
         [-0.0217, -0.0324, -0.0038,  ...,  0.0086,  0.0222, -0.0580],
         ...,
         [-0.0281,  0.0188, -0.0022,  ..., -0.0140, -0.0115, -0.0380],
         [ 0.0588,  0.0527, -0.0517,  ...,  0.0172, -0.0193, -0.0368],
         [ 0.0208, -0.0004,  0.0406,  ...,  0.0185, -0.0111,  0.0547]],
        requires_grad=True))

In [9]:
# freeze all but the last 2 encoder layers in BERT to speed up training
for param in list(qa_bert.bert.parameters())[:357]:
    param.requires_grad = False  # disable training in BERT

In [10]:
from transformers import TrainingArguments, Trainer

batch_size = 32
epochs = 2

training_args = TrainingArguments(
    output_dir='./qa/results',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_dir='./qa/logs',
    save_strategy='epoch',
    logging_strategy='steps',
    logging_steps=1,
    logging_first_step=True
)

trainer = Trainer(
    model=qa_bert,
    args=training_args,
    train_dataset=qa_train,
    eval_dataset=qa_test
)

# Get initial metrics
trainer.evaluate()

{'eval_loss': 10.091727256774902,
 'eval_runtime': 420.5896,
 'eval_samples_per_second': 0.238,
 'init_mem_cpu_alloc_delta': 1732608,
 'init_mem_cpu_peaked_delta': 16384,
 'eval_mem_cpu_alloc_delta': 131903488,
 'eval_mem_cpu_peaked_delta': 1425113088}

In [11]:
trainer.train()

Step,Training Loss
1,9.7317
2,8.2808
3,8.7218
4,7.1253
5,6.6936
6,6.7143
7,6.3916
8,6.1216
9,5.5144
10,5.5333


TrainOutput(global_step=26, training_loss=5.955727558869582, metrics={'train_runtime': 4114.4155, 'train_samples_per_second': 0.006, 'total_flos': 0, 'epoch': 2.0, 'train_mem_cpu_alloc_delta': 501358592, 'train_mem_cpu_peaked_delta': 6627184640})

In [12]:
trainer.evaluate()

{'eval_loss': 5.133622646331787,
 'eval_runtime': 388.6363,
 'eval_samples_per_second': 0.257,
 'epoch': 2.0,
 'eval_mem_cpu_alloc_delta': -8876032,
 'eval_mem_cpu_peaked_delta': 1421017088}

In [13]:
qa_bert.save_pretrained(
    './qa/results', 
    push_to_hub=False, repo_name="finetuned-qa-model"
)

In [34]:
qa_df.iloc[100]

question           Who do they try to get to contribute to campai...
context            Provide Campaign Assistance. Minority leaders ...
start_positions                                                  112
end_positions                                                    113
answer                                                outside groups
Name: 100, dtype: object

In [36]:
response = qa_bert(bert_tokenizer.encode(qa_df.iloc[100].question, qa_df.iloc[100].context, return_tensors='pt'))
response.start_logits.argmax(), response.end_logits.argmax()

(tensor(150), tensor(151))

In [44]:
bert_tokenizer.decode(bert_tokenizer.encode(qa_df.iloc[100].context)[response.start_logits.argmax(): response.end_logits.argmax()])


'"'

In [18]:
response.start_logits.argmax(), response.end_logits.argmax()

(tensor(69), tensor(69))

In [None]:
# https://huggingface.co/docs/datasets/quicktour.html#fine-tuning-a-deep-learning-model