In [1]:
from datasets import load_dataset
from transformers import TrainingArguments, Trainer
from sklearn.metrics import matthews_corrcoef, accuracy_score, f1_score, balanced_accuracy_score
from transformers import RobertaForSequenceClassification, RobertaModel, RobertaConfig, RobertaTokenizer

In [2]:
# Load the MultiNLI dataset; later cells index the result by split name
# ("train", "validation_matched").
mnli = load_dataset(path="multi_nli")

Found cached dataset multi_nli (/home/students/loeser/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# Pretrained roberta-base encoder (bare RobertaModel). It is only used as a
# source of weights: a later cell copies its state dict into the classifier.
roberta = RobertaModel.from_pretrained(pretrained_model_name_or_path="roberta-base")

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# Classifier architecture is described by a local JSON config file.
config = RobertaConfig.from_json_file(json_file="../models/sequence_classification.json")
# Built from the config only (no from_pretrained here); the encoder weights are
# filled in from `roberta` in the next cell.
model = RobertaForSequenceClassification(config=config)

In [5]:
# Copy the pretrained encoder weights into the classifier's `roberta` submodule.
state_dict = roberta.state_dict()
# The pooler entries are removed before loading; with them gone every remaining
# key matches `model.roberta` (presumably the classifier's encoder is built
# without a pooling layer - confirm against the config).
for pooler_key in ("pooler.dense.weight", "pooler.dense.bias"):
    state_dict.pop(pooler_key)  # raises KeyError if the key is absent
model.roberta.load_state_dict(state_dict)

<All keys matched successfully>

In [6]:
# Tokenizer matching the roberta-base checkpoint used for the encoder weights.
tokenizer = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path="roberta-base")

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [7]:
# Hypothesis-only training split: the premise is deliberately left out, so the
# model sees just the hypothesis text (copied into column "x") and the label.
# NOTE(review): padding="max_length" without an explicit `max_length` pads to the
# tokenizer's model maximum - likely much longer than most hypotheses; consider
# dynamic padding if training speed matters.
mnli_train = (
    mnli["train"]
    .map(lambda batch: {"x": batch["hypothesis"]}, batched=True)
    .map(
        lambda batch: tokenizer(batch["x"], padding="max_length", truncation=True),
        batched=True,
    )
)
# Expose only the model inputs and the label, as torch tensors.
mnli_train.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

Map:   0%|          | 0/392702 [00:00<?, ? examples/s]

Map:   0%|          | 0/392702 [00:00<?, ? examples/s]

In [8]:
# Hypothesis-only validation split (matched genres), prepared in the same way as
# the training split: copy the hypothesis into "x", then tokenize that column.
mnli_val = mnli["validation_matched"]
mnli_val = mnli_val.map(lambda batch: {"x": batch["hypothesis"]}, batched=True)
mnli_val = mnli_val.map(
    lambda batch: tokenizer(batch["x"], padding="max_length", truncation=True),
    batched=True,
)
# Expose only the model inputs and the label, as torch tensors.
mnli_val.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

Map:   0%|          | 0/9815 [00:00<?, ? examples/s]

Map:   0%|          | 0/9815 [00:00<?, ? examples/s]

In [9]:
def compute_metrics(pred):
    """
    Compute classification metrics for one evaluation pass.

    Nothing is printed or written to disk here; the metrics are only returned.

    :param pred: object exposing ``label_ids`` (gold labels) and ``predictions``
        (per-class scores; the arg-max over the last axis is taken as the
        predicted class)
    :return: dict with the keys "MCC", "F1" (macro-averaged), "Acc" and "BAcc"
    """

    gold = pred.label_ids
    guessed = pred.predictions.argmax(-1)

    # Insertion order of this table fixes the order of the returned metrics.
    scorers = {
        "MCC": matthews_corrcoef,
        "F1": lambda y_true, y_pred: f1_score(y_true, y_pred, average='macro'),
        "Acc": accuracy_score,
        "BAcc": balanced_accuracy_score,
    }

    return {name: scorer(gold, guessed) for name, scorer in scorers.items()}

In [10]:
# Training configuration.
# Logging, evaluation and checkpointing all happen every 1000 optimizer steps.
# `load_best_model_at_end` needs the save interval to be a multiple of the eval
# interval, so `eval_steps` is spelled out instead of relying on its fallback to
# `logging_steps`.
training_args = TrainingArguments(
    output_dir='./results',          # checkpoints are written here
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=8,   # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.0001,             # strength of weight decay
    learning_rate=2e-5,
    logging_dir='./logs',            # directory for storing logs
    load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)
    # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
    logging_steps=1000,              # log the training loss every 1000 steps
    eval_steps=1000,                 # evaluate every 1000 steps (same value the default would pick)
    save_steps=1000,                 # write a checkpoint every 1000 steps
    # Without a limit, a full checkpoint is kept for every 1000 steps of the run
    # (well over a hundred for 3 epochs at batch size 8). Keep only the best and
    # the most recent one; the best checkpoint is never deleted while
    # `load_best_model_at_end` is enabled.
    save_total_limit=2,
    evaluation_strategy="steps",     # evaluate on a step schedule rather than per epoch
)

trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=mnli_train,            # training dataset (hypothesis-only)
    eval_dataset=mnli_val,               # evaluation dataset (hypothesis-only)
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
)

In [11]:
# Start fine-tuning; evaluation and checkpointing follow the schedule set in
# `training_args`. The call's return value is displayed as the cell output.
trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: hypothesis_parse, hypothesis_binary_parse, pairID, premise, premise_binary_parse, hypothesis, promptID, premise_parse, genre, x. If hypothesis_parse, hypothesis_binary_parse, pairID, premise, premise_binary_parse, hypothesis, promptID, premise_parse, genre, x are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 392702
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 147264
  Number of trainable parameters = 124647939


Step,Training Loss,Validation Loss,Mcc,F1,Acc,Bacc
1000,1.0373,0.971715,0.294289,0.526591,0.527254,0.52557


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: hypothesis_parse, hypothesis_binary_parse, pairID, premise, premise_binary_parse, hypothesis, promptID, premise_parse, genre, x. If hypothesis_parse, hypothesis_binary_parse, pairID, premise, premise_binary_parse, hypothesis, promptID, premise_parse, genre, x are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9815
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin


KeyboardInterrupt: 