In [2]:
import transformers
from transformers import PreTrainedTokenizerFast
from datasets import concatenate_datasets
from datasets import load_from_disk, Dataset
from transformers import AutoTokenizer
from utils_qa import postprocess_qa_predictions
from trainer_qa import QuestionAnsweringTrainer
from utils_data import read_annotation_gzip

max_length = 512
doc_stride = 128
batch_size = 1
max_val_samples= 5
dir_name = 'roberta'
model_pretrain = 'roberta-large'   # "bert-large-uncased" 
tokenizer = AutoTokenizer.from_pretrained(
    model_pretrain,
    use_fast=True,
)
pad_on_right = tokenizer.padding_side == "right"
train_dataset=load_from_disk("/storage/{}/train_{}_{}".format(dir_name, 20000,15950)).shuffle()
'''
eval_examples = load_from_disk("/storage/{}/val_example".format(dir_name))
eval_examples = eval_examples.select(range(max_val_samples))
eval_dataset = eval_examples.map(prepare_validation_features, batched=True, remove_columns=eval_examples.column_names)
#eval_dataset = load_from_disk("/storage/{}/val".format(dir_name))
'''




Loading cached shuffled indices for dataset at /storage/roberta/train_20000_15950/cache-761a4a9ab4960eef.arrow


'\neval_examples = load_from_disk("/storage/{}/val_example".format(dir_name))\neval_examples = eval_examples.select(range(max_val_samples))\neval_dataset = eval_examples.map(prepare_validation_features, batched=True, remove_columns=eval_examples.column_names)\n#eval_dataset = load_from_disk("/storage/{}/val".format(dir_name))\n'

In [3]:
from utils_data import read_annotation_gzip
from datasets import Dataset
path = '/storage/datset/v1.0_sample_nq-dev-sample.jsonl.gz'
dic = read_annotation_gzip(path)
eval_examples = Dataset.from_dict(dic).select(range(max_val_samples))
eval_dataset = eval_examples.map(prepare_validation_features, batched=True, remove_columns=eval_examples.column_names)


from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer, EvalPrediction
from transformers import default_data_collator
# Post-processing:
def post_processing_function(examples, features, predictions, stage="eval"):
    # Post-processing: we match the start logits and end logits to answers in the original context.
    predictions = postprocess_qa_predictions(
        examples=examples,
        features=features,
        predictions=predictions,
        n_best_size=20,
        output_dir= args.output_dir,
        is_world_process_zero=trainer.is_world_process_zero(),
        prefix=stage,
    )
    # Format the result to the format the metric expects.
    formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
    #references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples]

    references = [{"id": ex['id'], 'start_token': ex['annotations'][0]['long_answer']['start_token'], 
                   'end_token': ex['annotations'][0]['long_answer']['end_token']
      } for ex in examples ]

    return EvalPrediction(predictions=formatted_predictions, label_ids=references)


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [4]:
model = AutoModelForQuestionAnswering.from_pretrained(model_pretrain)
args = TrainingArguments(
    "/storage/model/{}".format(dir_name),
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size*16,
    num_train_epochs=1,
    save_steps = 300,
    eval_steps = 10,
    evaluation_strategy ='steps',
    gradient_accumulation_steps=8,
)

data_collator = default_data_collator

# Initialize our Trainer
trainer = QuestionAnsweringTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    eval_examples=eval_examples,
    tokenizer=tokenizer,
    data_collator=data_collator,
    post_process_function=post_processing_function,
)
# TODO:  compute_metrics=compute_metrics,

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForQuestionAnswering: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-large and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to us

In [5]:
trainer.train()

Step,Training Loss,Validation Loss
10,No log,4.732738


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Saving predictions to /storage/model/roberta/eval_predictions.json.


KeyboardInterrupt: 

In [4]:
# OLD VERSION Validation preprocessing
def prepare_validation_features(examples):
    # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    #examples = simplify_nq_example(examples)
    
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
    # corresponding example_id and we will store the offset mappings.
    tokenized_examples["example_id"] = []

    for i in range(len(tokenized_examples["input_ids"])):
        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)           
        context_index = 1 if pad_on_right else 0

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["id"][sample_index])

        # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
        # position is part of the context or not.
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]

    return tokenized_examples

In [1]:
# Validation preprocessing
def prepare_validation_features(examples):
    # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    #examples = simplify_nq_example(examples)
    
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    offset_mapping = tokenized_examples["offset_mapping"]

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []
    # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
    # corresponding example_id and we will store the offset mappings.
    tokenized_examples["example_id"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        
        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)           
        context_index = 1 if pad_on_right else 0

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["id"][sample_index])

        # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
        # position is part of the context or not.
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]

        ## TODO for gz file
        start_char, end_char = -1, -1
        # Start/end character index of the answer in the text.
        for candidate in examples["annotations"][sample_index]:
            cur = candidate["long_answer"]["start_char"]
            if cur != -1:
                start_char = cur
                end_char = candidate["long_answer"]["end_char"]
        # If no answers are given, set the cls_index as answer.
        if start_char == -1:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)
                
    return tokenized_examples
