In [23]:
import os
import numpy as np
import matplotlib.pyplot as plt

import torch

import transformers
from transformers import AutoTokenizer, BertForQuestionAnswering
from datasets import load_dataset

In [24]:
# Run on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('Device: {}'.format(device))

# Load the 'adversarialQA' configuration of the adversarial_qa dataset
# (the datasets library reuses its local cache when present).
dataset = load_dataset('adversarial_qa', 'adversarialQA')

# Summarise every split. The former bare `dataset['train'][42]` expression was
# dropped: in the middle of a cell its value is discarded, so it showed nothing.
for split_name in ('train', 'validation', 'test'):
    print(dataset[split_name])

Device: cuda:0


Reusing dataset adversarial_qa (C:\Users\water\.cache\huggingface\datasets\adversarial_qa\adversarialQA\1.0.0\92356be07b087c5c6a543138757828b8d61ca34de8a87807d40bbc0e6c68f04b)


  0%|          | 0/3 [00:00<?, ?it/s]

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'metadata'],
    num_rows: 30000
})
Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'metadata'],
    num_rows: 3000
})
Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'metadata'],
    num_rows: 3000
})


In [22]:
# HuggingFace input preprocessing: build the tokenizer that turns
# (question, context) pairs into model inputs.
TOKENIZER_CHECKPOINT = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

def _answer_token_span(offsets, sequence_ids, start_char, end_char):
    """Map a character span of the context onto token indices.

    Args:
        offsets: per-token (char_start, char_end) pairs for one encoded sample.
        sequence_ids: per-token sequence id for the same sample; context
            tokens are the ones whose id is 1.
        start_char: character index where the answer starts in the context.
        end_char: character index just past the end of the answer.

    Returns:
        (start_token, end_token), or (0, 0) when the answer is not fully
        contained in the context tokens that are present (e.g. it was
        truncated away).
    """
    # First and last token belonging to the context (sequence id 1).
    context_start = 0
    while sequence_ids[context_start] != 1:
        context_start += 1
    context_end = context_start
    while context_end + 1 < len(sequence_ids) and sequence_ids[context_end + 1] == 1:
        context_end += 1

    # The answer must lie entirely inside the surviving context window:
    # the window has to begin at/before the answer start and end at/after
    # the answer end. Otherwise the sample is labelled (0, 0).
    if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
        return 0, 0

    # Last context token that starts at or before the answer start.
    idx = context_start
    while idx <= context_end and offsets[idx][0] <= start_char:
        idx += 1
    start_token = idx - 1

    # First context token that ends at or after the answer end.
    idx = context_end
    while idx >= context_start and offsets[idx][1] >= end_char:
        idx -= 1
    end_token = idx + 1

    return start_token, end_token


def preprocess(samples, max_length=384):
    """Tokenize a batch of QA samples and attach answer token positions.

    Uses the module-level `tokenizer`. Questions are stripped of surrounding
    whitespace, each (question, context) pair is encoded with only the context
    being truncated, and every sequence is padded to `max_length`.

    Args:
        samples: batch mapping with 'question', 'context' and 'answers' keys;
            each entry of 'answers' holds parallel 'text' and 'answer_start'
            lists, of which only the first answer is used.
        max_length: length every encoded sequence is truncated/padded to.

    Returns:
        The tokenizer output without 'offset_mapping' and with
        'start_positions' / 'end_positions' lists added. A sample whose answer
        is missing or not fully inside the (possibly truncated) context gets
        the label (0, 0).
    """
    questions = [q.strip() for q in samples['question']]
    inputs = tokenizer(
        questions,
        samples['context'],
        max_length=max_length,
        truncation='only_second',
        return_offsets_mapping=True,
        padding='max_length'
    )

    # Offsets are only needed to compute the labels; the model does not take them.
    offset_mapping = inputs.pop('offset_mapping')
    answers = samples['answers']
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]

        # Samples without a gold answer cannot be located; label them (0, 0).
        if not answer["answer_start"] or not answer["text"]:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        start_token, end_token = _answer_token_span(
            offset, inputs.sequence_ids(i), start_char, end_char
        )
        start_positions.append(start_token)
        end_positions.append(end_token)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# Encode the train and validation splits in batches. The raw columns are
# dropped so that only the tokenizer output and the answer positions remain;
# each split removes its own columns rather than borrowing the train split's.
tokenized_train = dataset['train'].map(
    preprocess,
    batched=True,
    remove_columns=dataset['train'].column_names,
)
tokenized_validation = dataset['validation'].map(
    preprocess,
    batched=True,
    remove_columns=dataset['validation'].column_names,
)

  0%|          | 0/30 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [25]:
# Collator handed to the Trainer below for assembling batches. `preprocess`
# already pads every sample to the same length (padding='max_length').
data_collator = transformers.DefaultDataCollator()

In [None]:
# Fine-tune DistilBERT with a question-answering head on the tokenized data.
model = transformers.AutoModelForQuestionAnswering.from_pretrained('distilbert-base-uncased')
model.to(device)

epochs = 10
batch_size = 16
eval_batch_size = 16

# Step bookkeeping for a 10% warm-up schedule.
# NOTE(review): these values are computed but not passed to TrainingArguments,
# so no warm-up is configured from them — confirm whether
# `warmup_steps=num_warmup_steps` was intended.
train_data_size = len(dataset['train'])
steps_per_epoch = train_data_size // batch_size
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1 * num_train_steps)
initial_learning_rate = 1e-5

training_args = transformers.TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=initial_learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=eval_batch_size,
    num_train_epochs=epochs,
    weight_decay=0.01
    )

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    # Evaluate on the held-out validation split; evaluating on the training
    # data would only report how well the model fits what it was trained on.
    eval_dataset=tokenized_validation,
    tokenizer=tokenizer,
    data_collator=data_collator
    )

trainer.train()

In [34]:
# Qualitative check: answer one question from the test split with the model.
test_sample = dataset['test'][43]
test_context = test_sample['context']
test_question = test_sample['question']

question, text = test_question, test_context

# Truncate only the context, with the same limit used during preprocessing,
# so a long passage cannot exceed the model's maximum sequence length.
inputs = tokenizer(
    question,
    text,
    max_length=384,
    truncation='only_second',
    return_tensors='pt'
)
# Put the inputs on whatever device the model currently lives on.
inputs = inputs.to(model.device)

# Inference mode: disables dropout so the prediction is deterministic.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Most likely start and end token of the answer span.
answer_start_index = int(outputs.start_logits.argmax())
answer_end_index = int(outputs.end_logits.argmax())

# If the predicted end precedes the start, the slice is empty and the decoded
# answer is an empty string.
predict_answer_tokens = inputs.input_ids[0, answer_start_index:answer_end_index+1]

print(test_context)
print('')
print(test_question)
print('')
print(tokenizer.decode(predict_answer_tokens))

One of its earliest massive implementations was brought about by Egyptians against the British occupation in the 1919 Revolution. Civil disobedience is one of the many ways people have rebelled against what they deem to be unfair laws. It has been used in many nonviolent resistance movements in India (Gandhi's campaigns for independence from the British Empire), in Czechoslovakia's Velvet Revolution and in East Germany to oust their communist governments, In South Africa in the fight against apartheid, in the American Civil Rights Movement, in the Singing Revolution to bring independence to the Baltic countries from the Soviet Union, recently with the 2003 Rose Revolution in Georgia and the 2004 Orange Revolution in Ukraine, among other various movements worldwide.

What fabric shares its name with a fight?

velvet revolution
