In [1]:
! pip install datasets transformers
! pip install accelerate cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
# ! pip install git+https://github.com/huggingface/accelerate

Collecting datasets
  Downloading datasets-1.9.0-py3-none-any.whl (262 kB)
[K     |████████████████████████████████| 262 kB 5.0 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.8.2-py3-none-any.whl (2.5 MB)
[K     |████████████████████████████████| 2.5 MB 9.6 MB/s 
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 34.1 MB/s 
Collecting huggingface-hub<0.1.0
  Downloading huggingface_hub-0.0.14-py3-none-any.whl (43 kB)
[K     |████████████████████████████████| 43 kB 1.5 MB/s 
Collecting fsspec>=2021.05.0
  Downloading fsspec-2021.7.0-py3-none-any.whl (118 kB)
[K     |████████████████████████████████| 118 kB 45.7 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 43.7 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.

In [2]:
import pandas as pd
import numpy as np

from datasets import (
    load_dataset, load_metric, Dataset,
    ClassLabel, Sequence
)

import json
import torch
from torch.utils.data import DataLoader
import collections
# from accelerate import Accelerator, DistributedType

from transformers import (
     AutoConfig,
     AutoModelForQuestionAnswering, 
     TrainingArguments, 
     Trainer, 
     AutoTokenizer, 
     default_data_collator,
     set_seed
)
from tqdm.auto import tqdm




In [3]:
# squad = load_dataset('squad_v2')


In [3]:
def read_quac_data(file_location):
    with open(file_location + '/quac_val.json') as f1, open(file_location + '/quac_train.json') as f2:
      quac_val = json.load(f1)
      quac_train = json.load(f2)

      quac_val = Dataset.from_dict(quac_val)
      quac_train = Dataset.from_dict(quac_train)

      return quac_train, quac_val

In [4]:
path = '/content/drive/MyDrive/thesis/data/quac_as_squad'
quac_train, quac_val = read_quac_data(path)


In [6]:
print(quac_train)
print(quac_val)
exs = quac_train[:2]
for k, v in exs.items():
  print(k, len(v), v)

Dataset({
    features: ['id', 'context', 'question', 'answers'],
    num_rows: 83568
})
Dataset({
    features: ['id', 'context', 'question', 'answers'],
    num_rows: 7354
})
id 2 ['C_69758fcdfc1f46baba0e92c0f3b0919c_1_q#0', 'C_69758fcdfc1f46baba0e92c0f3b0919c_1_q#1']
context 2 ['According to the Indian census of 2001, there were 30,803,747 speakers of Malayalam in Kerala, making up 93.2% of the total number of Malayalam speakers in India, and 96.7% of the total population of the state. There were a further 701,673 (2.1% of the total number) in Karnataka, 557,705 (1.7%) in Tamil Nadu and 406,358 (1.2%) in Maharashtra. The number of Malayalam speakers in Lakshadweep is 51,100, which is only 0.15% of the total number, but is as much as about 84% of the population of Lakshadweep. In all, Malayalis made up 3.22% of the total Indian population in 2001. Of the total 33,066,392 Malayalam speakers in India in 2001, 33,015,420 spoke the standard dialects, 19,643 spoke the Yerava dialect and 3

In [7]:
tokenizer = AutoTokenizer.from_pretrained('drive/MyDrive/thesis/weights/quac-trained/distilbert')
# config = AutoModelForQuestionAnswering.from_pretrained('distilbert-base-uncased', output_hidden_states=True) 
config = AutoConfig.from_pretrained('drive/MyDrive/thesis/weights/quac-trained/distilbert', output_hidden_states = True, output_attentions=True)
model = AutoModelForQuestionAnswering.from_config(config)


In [8]:
max_length = 384
doc_stride = 128

In [9]:
# example_list = []
# for i, example in enumerate(squad['train']):
#     if len(tokenizer(example['context'])['input_ids']) > 384:
#         break

# example = squad['train'][i]
# print(len(example_list), example_list)
# print(example)
# print(len(example))

In [10]:
# tokenized_example = tokenizer(
#     example['question'],
#     example['context'],
#     max_length = max_length,
#     truncation = 'only_second',
#     return_overflowing_tokens = True,
#     return_offsets_mapping = True,
#     stride = doc_stride
# )


# for key in tokenized_example.keys():
#     print(key, ':', tokenized_example[str(key)])

# sequence_id = tokenized_example.sequence_ids(1)
# print(len(sequence_id), sequence_id)


We use offsets mapping to find the start and end character index of our answer span. Since answer lies in the context, we need to know from where the context starts. This is done using sequence_ids.

In [11]:
# print(offsets[start_token_index], offsets[end_token_index])

In [13]:
pad_on_right = tokenizer.padding_side == "right"

In [21]:
def prepare_train_features(examples):
   ##### Tokenization #####
    tokenized_examples = tokenizer(
        examples['question' if pad_on_right else 'context'],
        examples['context' if pad_on_right else 'question'],
        truncation = 'only_second' if pad_on_right else 'only_first',
        max_length = max_length,
        stride = doc_stride,
        return_overflowing_tokens = True,
        return_offsets_mapping = True,
        padding = 'max_length'
    )

    # Since one example might give us several features if it has a long context, we need to map from a feature to 
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop('overflow_to_sample_mapping')
    # The offset mapping will give us a map from a token to its start and end character positions in the original context.
    # This will help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop('offset_mapping')

    # let's label those positions
    tokenized_examples['start_positions'] = []
    tokenized_examples['end_positions'] = []
    # print(len(offset_mapping))
    
    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples['input_ids'][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples['answers'][sample_index]

        #### check if this question has an answer #####
        # If no answers are given, set the cls_index as answer.
        if len(answers['answer_start']) == 0:
            tokenized_examples['start_positions'].append(cls_index)
            tokenized_examples['end_positions'].append(cls_index)
        else:
            # Start and end character index of the answer in the text
            start_char = answers['answer_start'][0]
            end_char = start_char + len(answers['text'][0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1


            ##### checking that since this question has an answer, does this span have the answer #####
            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

# New Section

In [9]:
squad = quac_train
squad_val = quac_val
print(squad)

Dataset({
    features: ['id', 'context', 'question', 'answers'],
    num_rows: 83568
})


In [22]:
#remove_columns = squad['train'].column_names when using actual squad_dataset, other wise just squad.column_names
tokenized_squad = squad.map(prepare_train_features, batched=True, remove_columns = squad.column_names)


HBox(children=(FloatProgress(value=0.0, max=84.0), HTML(value='')))




In [24]:
tokenized_squad_val = quac_val.map(prepare_train_features, batched=True, remove_columns = quac_val.column_names)

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




In [17]:
print(len(tokenized_squad), len(tokenized_squad_val))
num_epochs = 3
training_steps = num_epochs * len(tokenized_squad)
print(training_steps)
print(training_steps/24)

175024 17281
525072
21878.0


In [16]:
args = TrainingArguments(
    f"/content/test-quac",
    overwrite_output_dir = True,
    # evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    
    gradient_accumulation_steps = 24,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy = 'epoch',
    save_total_limit = 1,
    seed = 21,
    # tpu_num_cores=8,
    # dataloader_drop_last=True
)

In [25]:
trainer = Trainer(
    model,
    args,
    train_dataset = tokenized_squad,
    eval_dataset = tokenized_squad_val,
    # data_collator=data_collator,
    tokenizer=tokenizer,

)

In [20]:
trainer.train()

***** Running training *****
  Num examples = 175024
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 384
  Gradient Accumulation steps = 24
  Total optimization steps = 1365


Step,Training Loss
500,2.896
1000,2.7778


Saving model checkpoint to /content/test-quac/checkpoint-455
Configuration saved in /content/test-quac/checkpoint-455/config.json
Model weights saved in /content/test-quac/checkpoint-455/pytorch_model.bin
tokenizer config file saved in /content/test-quac/checkpoint-455/tokenizer_config.json
Special tokens file saved in /content/test-quac/checkpoint-455/special_tokens_map.json
Saving model checkpoint to /content/test-quac/checkpoint-910
Configuration saved in /content/test-quac/checkpoint-910/config.json
Model weights saved in /content/test-quac/checkpoint-910/pytorch_model.bin
tokenizer config file saved in /content/test-quac/checkpoint-910/tokenizer_config.json
Special tokens file saved in /content/test-quac/checkpoint-910/special_tokens_map.json
Deleting older checkpoint [/content/test-quac/checkpoint-455] due to args.save_total_limit
Saving model checkpoint to /content/test-quac/checkpoint-1365
Configuration saved in /content/test-quac/checkpoint-1365/config.json
Model weights saved

TrainOutput(global_step=1365, training_loss=2.8114748329470007, metrics={'train_runtime': 5788.4649, 'train_samples_per_second': 90.71, 'train_steps_per_second': 0.236, 'total_flos': 8.02389263731753e+16, 'train_loss': 2.8114748329470007, 'epoch': 3.0})

In [47]:
trainer.save_model("drive/MyDrive/thesis/weights/quac-trained/distilbert")

Saving model checkpoint to drive/MyDrive/thesis/weights/quac-trained/distilbert
Configuration saved in drive/MyDrive/thesis/weights/quac-trained/distilbert/config.json
Model weights saved in drive/MyDrive/thesis/weights/quac-trained/distilbert/pytorch_model.bin
tokenizer config file saved in drive/MyDrive/thesis/weights/quac-trained/distilbert/tokenizer_config.json
Special tokens file saved in drive/MyDrive/thesis/weights/quac-trained/distilbert/special_tokens_map.json


In [31]:


for batch in trainer.get_eval_dataloader():
    break
batch = {k:v.to(trainer.args.device) for k,v in batch.items()}
with torch.no_grad():
    output = trainer.model(**batch)
output.keys()

odict_keys(['loss', 'start_logits', 'end_logits', 'hidden_states', 'attentions'])

In [23]:
print(len(output))
hidden_states= output[3]

5


In [None]:
output.start_logits.shape, output.end_logits.shape

In [None]:
model.config.output_hidden_states

In [None]:
print(output['start_logits'])

print(output['end_logits'])

In [None]:
output.start_logits.argmax(dim=-1), output.end_logits.argmax(dim=-1)

Score obtained by adding start and end logits will be used to detect the most suitable answer. We try to find the top 20 probable answers and then pick the best one.

In [10]:
n_best_size = 15

In [None]:
# import numpy as np

# start_logits = output.start_logits[0].cpu().numpy()
# end_logits = output.end_logits[0].cpu().numpy()
# # print(len(start_logits))
# #top 20 start indexes
# start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1: -1].tolist()
# end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1: -1].tolist()
# print(start_indexes)
# print(end_indexes)

In [None]:
# valid_answers = []
# for start_index in start_indexes:
#     for end_index in end_indexes:
#         if start_index <= end_index:
#             valid_answers.append(
#                 {
#                     "score": start_logits[start_index] + end_logits[end_index],
#                     "text" : ""
#                 }
#             )

In [11]:
def prepare_validation_features(examples):
  tokenized_examples = tokenizer(
      examples['question' if pad_on_right else 'context'],
      examples['context' if pad_on_right else 'question'],
      truncation = 'only_second' if pad_on_right else 'only_first',
      max_length = max_length,
      stride = doc_stride,
      return_overflowing_tokens = True,
      return_offsets_mapping = True,
      padding = "max_length"
  )
 
  # one example can give several several features, we need a map from a feature to its corresponding example.
  # overflow_to_sample_mapping gives us just that.
  sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

  # we keep the sample id that gave us this feature and we will store the offset mappings
  tokenized_examples['example_id'] = []
 
  # print(len(s))
  for i in range(len(tokenized_examples['input_ids'])):
      sequence_ids = tokenized_examples.sequence_ids(i)
      # print(sequence_ids)
      context_index = 1 if pad_on_right else 0

      # one example can give several spans, this is the index of the example containing this span of text.
      sample_index = sample_mapping[i]
      
      tokenized_examples['example_id'].append(examples['id'][sample_index])
      
      # set to None the offset_mapping that are not part of the context so it's easy to determine if a token
      # position is part of the context or not.
      # print(len(tokenized_examples['offset_mapping'][i]), len(sequence_ids))
      # print('tokenized_examples:', tokenized_examples['offset_mapping'][i])
      for k in range(len(sequence_ids)):
          if sequence_ids[k] == context_index:
              break
          else:
            tokenized_examples['offset_mapping'][i][k] = None
      # tokenized_examples['offset_mapping'][i] = [
      #     (o if sequence_ids[k] == context_index else None) for k,o in enumerate(tokenized_examples['offset_mapping'][i])                                         
      # ]
      tokenized_examples['offset_mapping'][i][-1] = None
      # print('tokenized_examples:',tokenized_examples['offset_mapping'][i])
      # print('------')
  return tokenized_examples


In [14]:
validation_features = squad_val.map(
    prepare_validation_features,
    batched=True,
    remove_columns = squad_val.column_names
)

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




In [26]:
raw_predictions = trainer.predict(validation_features, ignore_keys=['hidden_states', 'attentions'])

The following columns in the test set  don't have a corresponding argument in `DistilBertForQuestionAnswering.forward` and have been ignored: offset_mapping, example_id.
***** Running Prediction *****
  Num examples = 17281
  Batch size = 16


In [27]:
validation_features.set_format(type=validation_features.format["type"], columns=list(validation_features.features.keys()))
print(validation_features)

Dataset({
    features: ['attention_mask', 'example_id', 'input_ids', 'offset_mapping'],
    num_rows: 17281
})


In [51]:
print(raw_predictions[0][0][0].shape)

(384,)


In [40]:
max_answer_length = 100

In [52]:
start_logits = output.start_logits[0].cpu().numpy()
end_logits = output.end_logits[0].cpu().numpy()
offset_mapping = validation_features[0]["offset_mapping"]
# The first feature comes from the first example. For the more general case, we will need to be match the example_id to
# an example index
# context = squad["validation"][0]["context"]
context = squad_val[0]["context"]

# Gather the indices the best start/end logits:
start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
valid_answers = []

for start_index in start_indexes:
    for end_index in end_indexes:
        # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
        # to part of the input_ids that are not in the context.
        if (
            start_index >= len(offset_mapping)
            or end_index >= len(offset_mapping)
            or offset_mapping[start_index] is None
            or offset_mapping[end_index] is None
        ):
            continue
        # Don't consider answers with a length that is either < 0 or > max_answer_length.
        if end_index < start_index or end_index - start_index + 1 > max_answer_length:
            continue
        if start_index <= end_index: # We need to refine that test to check the answer is inside the context
            start_char = offset_mapping[start_index][0]
            end_char = offset_mapping[end_index][1]
            valid_answers.append(
                {
                    "score": start_logits[start_index] + end_logits[end_index],
                    "text": context[start_char: end_char]
                }
            )

valid_answers = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[:n_best_size]
valid_answers

[{'score': 2.168593,
  'text': 'her debut as a radio producer on ANT1 Radio. Her radio program was titled after one of her songs Ta Koritsia Einai Atakt'},
 {'score': 2.0317109,
  'text': 'velas. Since 1975, all her releases have become gold or platinum and have included songs by Karvelas. In 1986, she participated at the Cypriot National Final for Eurovision Song Contest with the song Thelo Na Gino Star ("I Want To Be A Star"), taking second place. This song is still unreleased up to date. In 1984, Vissi left her record company'},
 {'score': 2.0161757, 'text': 'following year her seventh album Kati Simveni'},
 {'score': 2.005964,
  'text': 'velas. Since 1975, all her releases have become gold or platinum and have included songs by Karvelas. In 1986, she participated at the Cypriot National Final for Eurovision Song Contest with the song Thelo Na Gino Star ("I Want To Be A Star'},
 {'score': 1.9234507,
  'text': 'You Had a Heart"). The album was certified gold. The following year her s

In [30]:
squad_v2 = True

In [33]:
def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size=20, max_answer_length=30):

    all_start_logits, all_end_logits = raw_predictions
    #building a map example to its corresponding features
    example_id_to_index = {k:i for i, k in enumerate(examples['id'])}

    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature['example_id']]].append(i)

    predictions = collections.OrderedDict()

    print(f"Post processing {len(examples)} example predictions split into {len(features)} features.")

    for example_index, example in enumerate(tqdm(examples)):

        feature_indices = features_per_example[example_index]

        min_null_score = None
        valid_answers = []

        context = example['context']

         # Looping through all the features associated to the current example.
        for feature_index in feature_indices:
            # We grab the predictions of the model for this feature.
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # This is what will allow us to map some the positions in our logits to span of texts in the original
            # context.
            offset_mapping = features[feature_index]["offset_mapping"]

            # Update minimum null prediction.
            cls_index = features[feature_index]["input_ids"].index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or min_null_score < feature_null_score:
                min_null_score = feature_null_score

            # Go through all possibilities for the `n_best_size` greater start and end logits.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
                    # to part of the input_ids that are not in the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )
        
        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
            # failure.
            best_answer = {"text": "", "score": 0.0}
        
        # Let's pick our final answer: the best one or the null answer (only for squad_v2)
    
        answer = best_answer["text"] if best_answer["score"] > min_null_score else ""
        predictions[example["id"]] = answer

    return predictions


In [34]:
final_predictions = postprocess_qa_predictions(squad_val, validation_features, raw_predictions.predictions)

Post processing 7354 example predictions split into 17281 features.


HBox(children=(FloatProgress(value=0.0, max=7354.0), HTML(value='')))




In [35]:
metric = load_metric("squad_v2")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2264.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3182.0, style=ProgressStyle(description…




In [41]:
# metric = load_metric("squad_v2" if squad_v2 else "squad")
formatted_predictions = [{"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in final_predictions.items()]

In [42]:
references = [{"id": ex["id"], "answers": ex["answers"]} for ex in squad_val]
metric.compute(predictions=formatted_predictions, references=references)

{'HasAns_exact': 0.0,
 'HasAns_f1': 9.392240400703653,
 'HasAns_total': 5868,
 'NoAns_exact': 1.009421265141319,
 'NoAns_f1': 1.009421265141319,
 'NoAns_total': 1486,
 'best_exact': 20.20669023660593,
 'best_exact_thresh': 0.0,
 'best_f1': 20.2082900062391,
 'best_f1_thresh': 0.0,
 'exact': 0.20397062822953493,
 'f1': 7.698350104885649,
 'total': 7354}

In [43]:
print(formatted_predictions)




In [44]:
print(references)

