In [1]:
import os

import datasets
import evaluate
import numpy as np
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    EvalPrediction,
    Seq2SeqTrainingArguments,
)
from transformers.trainer_utils import EvalLoopOutput

from trainer_seq2seq_qa import QuestionAnsweringSeq2SeqTrainer

model_name = "t5-base"
# Pin the tokenizer limit explicitly. The library warns that the implicit
# 512-token limit of t5-base must not be relied upon, and later cells read
# `tokenizer.model_max_length` to filter and truncate examples.
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [4]:
from datasets import load_dataset

# Training split of SQuAD; later cells filter and tokenize this object.
dataset = load_dataset(path="squad", split="train")

In [5]:
def filter_function(examples):
    """Return True when the question/context pair is shorter than the tokenizer limit.

    The pair is tokenized without truncation and the number of input ids is
    compared against `tokenizer.model_max_length`.

    NOTE(review): despite the plural name, this is passed to `dataset.filter`
    without `batched=True`, so it presumably receives one example at a time.
    """
    encoded = tokenizer(examples["question"], examples["context"], truncation=False)
    token_count = len(encoded.input_ids)
    return token_count < tokenizer.model_max_length

# Keep only the examples that `filter_function` accepts.
dataset = dataset.filter(function=filter_function)

Filter:   0%|          | 0/87599 [00:00<?, ? examples/s]

In [10]:
# Show the first answer text of the first remaining example.
dataset[0]["answers"]["text"][0]

'Saint Bernadette Soubirous'

In [11]:
def preprocess_squad_batch(examples):
    """Build prompt strings and target strings from a batch of SQuAD rows.

    Args:
        examples: mapping with parallel sequences under "question", "context"
            and "answers"; each answers entry is a mapping with a "text" list.

    Returns:
        A `(inputs, targets)` tuple of lists. Each input has the form
        "question: <question> context: <context>" with leading whitespace
        stripped from both parts. Each target is the first answer text, or an
        empty string when the example has no answer text.
    """
    question_batch = examples["question"]
    context_batch = examples["context"]
    answer_batch = examples["answers"]

    prompts = []
    for question_text, context_text in zip(question_batch, context_batch):
        pieces = ("question:", question_text.lstrip(), "context:", context_text.lstrip())
        prompts.append(" ".join(pieces))

    gold_texts = []
    for answer in answer_batch:
        candidates = answer["text"]
        if len(candidates) > 0:
            gold_texts.append(candidates[0])
        else:
            gold_texts.append("")

    return prompts, gold_texts

def preprocess_function(examples, max_input_length=None, max_target_length=30):
    """Tokenize a batch of SQuAD rows into model inputs with labels.

    Args:
        examples: batch accepted by `preprocess_squad_batch`.
        max_input_length: token limit for the prompt. `None` lets the
            tokenizer fall back to its own `model_max_length`.
        max_target_length: token limit for the answer labels.

    Returns:
        The tokenizer output for the prompts, with the tokenized answers
        stored under "labels".
    """
    inputs, targets = preprocess_squad_batch(examples)
    # Truncate the prompts: the "question: ... context: ..." template adds
    # tokens on top of the raw question/context pair, so a prompt can end up
    # longer than the pair-based length check done elsewhere in the notebook.
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [12]:
# Tokenize in batches across 4 worker processes and drop the raw columns,
# leaving only what `preprocess_function` returns.
raw_columns = dataset.column_names
train_dataset = dataset.map(function=preprocess_function, batched=True, num_proc=4, remove_columns=raw_columns)

Map (num_proc=4):   0%|          | 0/87310 [00:00<?, ? examples/s]

In [13]:
# Collator used by the trainer to assemble seq2seq batches.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
# This notebook loads the "squad" dataset (v1), so the matching metric is
# loaded directly; the previous lookup went through a `data_args` object that
# is never defined in this notebook.
metric = evaluate.load("squad")

def compute_metrics(p: EvalPrediction):
    """Score predictions against references with the module-level `metric`.

    `p.predictions` and `p.label_ids` are forwarded unchanged as the
    `predictions` and `references` arguments of `metric.compute`.
    """
    scores = metric.compute(predictions=p.predictions, references=p.label_ids)
    return scores

# Post-processing:
def post_processing_function(
    examples: datasets.Dataset,
    features: datasets.Dataset,
    outputs: EvalLoopOutput,
    stage="eval",
    version_2_with_negative=False,
    answer_column="answers",
):
    """Decode generated token ids and pair them with reference answers.

    Args:
        examples: original examples; each needs an "id" and the answer column.
        features: tokenized features; each needs an "example_id" that matches
            an example "id".
        outputs: evaluation output whose `predictions` holds token ids (or a
            tuple whose first item does).
        stage: accepted for interface compatibility; not used here.
        version_2_with_negative: when True, add the "no_answer_probability"
            field to every prediction, as the SQuAD v2 format does.
        answer_column: name of the column in `examples` holding the answers.

    Returns:
        An `EvalPrediction` with formatted predictions and references.
    """
    # Decode the predicted tokens.
    preds = outputs.predictions
    if isinstance(preds, tuple):
        preds = preds[0]
    # Replace the -100 padding markers, which cannot be decoded.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Map each example index to the index of its feature.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    feature_per_example = {example_id_to_index[feature["example_id"]]: i for i, feature in enumerate(features)}

    # Collect the decoded prediction for every example, keyed by example id.
    predictions = {}
    for example_index, example in enumerate(examples):
        feature_index = feature_per_example[example_index]
        predictions[example["id"]] = decoded_preds[feature_index]

    # Format the result the way the metric expects.
    if version_2_with_negative:
        formatted_predictions = [
            {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items()
        ]
    else:
        formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]

    references = [{"id": ex["id"], "answers": ex[answer_column]} for ex in examples]
    return EvalPrediction(predictions=formatted_predictions, label_ids=references)

In [None]:
# Root directory for checkpoints. The default is the original cluster path;
# set QA_CHECKPOINT_ROOT to run on a machine where that path does not exist.
save_path: str = os.environ.get(
    "QA_CHECKPOINT_ROOT",
    "/nfs/turbo/umms-vgvinodv/models/finetuned-checkpoints/nlp-gen/qa",
)
# Use only the last path component of the model id for the folder name.
name = model_name.split("/")[-1]
save_path = f"{save_path}/{name}-squad"
batch_size = 32
num_train_epochs = 1

training_args = Seq2SeqTrainingArguments(
    # Output location and checkpointing: one checkpoint per epoch, keep one.
    output_dir=save_path,
    overwrite_output_dir=True,
    save_strategy="epoch",
    save_total_limit=1,
    # No evaluation during training.
    evaluation_strategy="no",
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    fp16=True,
    #push_to_hub=True,
)


# Build the trainer from the model, arguments, tokenized data and helpers
# defined in the cells above.
trainer = QuestionAnsweringSeq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # the comma here was missing (SyntaxError)
    tokenizer=tokenizer,
    data_collator=data_collator,
    post_process_function=post_processing_function,
)