In [26]:
import os, json
import torch
import ft_utils
import transformers
import datasets
import evaluate
import numpy as np
from tqdm import tqdm
from datasets import load_from_disk
from transformers import AutoTokenizer, RobertaForSequenceClassification
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
from typing import Optional, Union
from sklearn.metrics import roc_auc_score, accuracy_score, precision_recall_fscore_support

In [2]:
PATH = '/mount/studenten/arbeitsdaten-studenten1/semantic-plausibility/plausible-parrots/'
CACHE_DIR = PATH + 'cache/'
DATA_PATH = PATH + '5-dataset_construction/5-2-template/output/'

In [3]:
device = f'cuda:{torch.cuda.current_device()}' if torch.cuda.is_available() else 'cpu'
device_name = torch.cuda.get_device_name()
print(f"Using device: {device} ({device_name})")

Using device: cuda:0 (NVIDIA RTX A6000)


In [4]:
torch.cuda.empty_cache()

In [5]:
# Load dataset.
dataset = load_from_disk(DATA_PATH + 'dataset_5-2_1/')
dataset = dataset.remove_columns('id')
dataset = dataset.with_format('torch')
dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 4911
    })
    dev: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 614
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 615
    })
})

In [6]:
# def adjust_embedding_dim(data_split):
#     data_split['labels'] = data_split['labels'].unsqueeze(0)
#     return data_split

# dataset = dataset.map(adjust_embedding_dim)
# dataset

In [7]:
# Load tokenizer and model.
tokenizer = AutoTokenizer.from_pretrained('roberta-large-mnli', cache_dir=CACHE_DIR)
model = RobertaForSequenceClassification.from_pretrained('roberta-large-mnli', cache_dir=CACHE_DIR)

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
print(tokenizer.vocab_size)
tokenizer.special_tokens_map

50265


{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>'}

In [9]:
special_tokens = {
    'additional_special_tokens': ['[STYPE]',
                                  '[/STYPE]',
                                  '[ETYPE]',
                                  '[/ETYPE]',
                                  '[OTYPE]',
                                  '[/OTYPE]',
                                  '[DEF]',
                                  '[/DEF]',
                                  '[EVT]',
                                  '[/EVT]',
                                 ]
}

tokenizer.add_special_tokens(special_tokens)
model.resize_token_embeddings(len(tokenizer))
model.classifier.out_proj = torch.nn.Linear(in_features=1024, out_features=2, bias=True)
model.num_labels = 2

vocab = tokenizer.get_vocab()
print("Vocab size:", len(vocab))
tokenizer.special_tokens_map

Vocab size: 50275


{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>',
 'additional_special_tokens': ['[ETYPE]',
  '[/STYPE]',
  '[OTYPE]',
  '[/OTYPE]',
  '[EVT]',
  '[DEF]',
  '[/EVT]',
  '[STYPE]',
  '[/ETYPE]',
  '[/DEF]']}

In [10]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50275, 1024)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (L

### Training

In [11]:
# Training hyperparameters
num_epochs = 10
batch_size = 8
optimizer = "adamw_torch"
lr = 1e-3
weight_decay = 0.01
warmup_steps = 10

In [12]:
# def compute_auc(y_pred):
#     """
#     This code snippet is produced by ChatGPT.
#     """
    
#     logits, labels = y_pred
#     predictions = logits[:, 1]
#     auc = roc_auc_score(labels, predictions)
#     return {"auc": auc}

In [30]:
def compute_metrics(y_pred):
    """
    This code snippet is produced by ChatGPT.
    """
    
    logits, labels = y_pred
    
    # Compute precision, recall and F1-score.
    predictions = logits.argmax(axis=-1)
    precision, recall, f1, num_each_label_in_y_true = precision_recall_fscore_support(labels, predictions, average='binary')
    accuracy = accuracy_score(labels, predictions)
    
    # Compute AUC.
    probs = logits[:, 1]
    auc = roc_auc_score(labels, probs)
    
    return {
        'auc': auc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'accuracy': accuracy
    }

In [14]:
earlystopping_callback = EarlyStoppingCallback(
    early_stopping_patience=5, 
    early_stopping_threshold=0.0,
)

In [15]:
training_args = TrainingArguments(
    output_dir='./output/dataset_5-2_1_3/',
    overwrite_output_dir=True,
    evaluation_strategy='epoch',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    optim=optimizer,
    learning_rate=lr,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    num_train_epochs=num_epochs,
    logging_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='auc',
    greater_is_better=True,
    remove_unused_columns=False,
)
training_args

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=epoch,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
gradient_checkpointing_kwargs=None,
greater_is_better=True,
group_by_

In [17]:
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['dev'],
    compute_metrics=compute_metrics,
    callbacks=[earlystopping_callback],
)
trainer

<transformers.trainer.Trainer at 0x7f24ffd50dc0>

In [18]:
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Auc,Precision,Recall,F1,Num Each Label In Y True
1,0.7738,0.694467,0.503889,0.5,1.0,0.666667,
2,0.7704,0.864942,0.473597,0.0,0.0,0.0,
3,0.7593,0.722965,0.520647,0.0,0.0,0.0,
4,0.7595,0.738356,0.508143,0.0,0.0,0.0,
5,0.7402,0.740607,0.492557,0.0,0.0,0.0,
6,0.7423,0.724763,0.503549,0.0,0.0,0.0,
7,0.729,0.779132,0.504886,0.5,1.0,0.666667,
8,0.7233,0.71442,0.479984,0.5,1.0,0.666667,


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=4912, training_loss=0.7497208623233758, metrics={'train_runtime': 3829.3351, 'train_samples_per_second': 12.825, 'train_steps_per_second': 1.603, 'total_flos': 3.661371940243046e+16, 'train_loss': 0.7497208623233758, 'epoch': 8.0})

In [19]:
model.save_pretrained('./final_ckpt/dataset_5-2_1_3/')
tokenizer.save_pretrained('./final_ckpt/dataset_5-2_1_3/')

('./final_ckpt/dataset_5-2_1_3/tokenizer_config.json',
 './final_ckpt/dataset_5-2_1_3/special_tokens_map.json',
 './final_ckpt/dataset_5-2_1_3/vocab.json',
 './final_ckpt/dataset_5-2_1_3/merges.txt',
 './final_ckpt/dataset_5-2_1_3/added_tokens.json',
 './final_ckpt/dataset_5-2_1_3/tokenizer.json')

In [32]:
eval_performance = trainer.evaluate(dataset['test'])
eval_performance

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.7225662469863892,
 'eval_auc': 0.49787427556157204,
 'eval_precision': 0.0,
 'eval_recall': 0.0,
 'eval_f1': 0.0,
 'eval_num_each_label_in_y_true': None,
 'eval_runtime': 17.0434,
 'eval_samples_per_second': 36.084,
 'eval_steps_per_second': 4.518,
 'epoch': 8.0}

In [25]:
# with open('./scores/eval_scores.json', 'w') as file:
#     json.dump(eval_performance, file)