In [1]:
import json
import torch
import random
import numpy as np
from datasets import load_from_disk
import transformers
from transformers import pipeline
from transformers import AutoTokenizer, RobertaForSequenceClassification, RobertaConfig
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback

from typing import Optional, Union

import evaluate

In [2]:
torch.manual_seed(0)
np.random.seed(0)
random.seed(0)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(0)

In [3]:
PATH = '/mount/studenten/arbeitsdaten-studenten1/semantic-plausibility/plausible-parrots/'
CACHE_DIR = PATH + 'cache/'
DATA_PATH = PATH + '5-dataset_construction/5-2-template/output/'

In [4]:
device = f'cuda:{torch.cuda.current_device()}' if torch.cuda.is_available() else 'cpu'
device_name = torch.cuda.get_device_name()
print(f"Using device: {device} ({device_name})")

Using device: cuda:0 (NVIDIA RTX A6000)


In [5]:
# Load dataset.
dataset = load_from_disk(DATA_PATH + 'dataset_5-2_3/')
dataset = dataset.remove_columns('id')
dataset = dataset.remove_columns('prompt')
# dataset = dataset.remove_columns('input_ids')
# dataset = dataset.remove_columns('attention_mask')
# dataset = dataset.with_format('torch')
dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 4911
    })
    dev: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 614
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 615
    })
})

In [6]:
small_train_dataset = dataset["train"].shuffle(seed=42).select(range(10))
small_dev_dataset = dataset["dev"].shuffle(seed=42).select(range(10))

In [7]:
model_name = 'roberta-large'

In [8]:
# Load tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=CACHE_DIR)

In [9]:
print(tokenizer.vocab_size)
tokenizer.special_tokens_map

50265


{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>'}

In [10]:
special_tokens = {
    'additional_special_tokens': ['[ETYPE]',
                                  '[/ETYPE]',
                                  '[DEF]',
                                  '[/DEF]',
                                  '[EVT]',
                                  '[/EVT]',
                                 ]
}

tokenizer.add_special_tokens(special_tokens)
vocab = tokenizer.get_vocab()
print("Vocab size:", len(vocab))
tokenizer.special_tokens_map

Vocab size: 50271


{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>',
 'additional_special_tokens': ['[ETYPE]',
  '[EVT]',
  '[DEF]',
  '[/EVT]',
  '[/ETYPE]',
  '[/DEF]']}

In [11]:
# Speicify configuration for the model.
config_roberta_large = RobertaConfig.from_pretrained(
    model_name,
    id2label={0: 'implausible', 1: 'plausible'},
    label2id={'implausible': 0, 'plausible': 1},
)

config_roberta_large_mnli = RobertaConfig.from_pretrained(
    model_name, 
    num_labels=2, 
    id2label={0: 'implausible', 1: 'plausible'},
    label2id={'implausible': 0, 'plausible': 1},
#     out_proj = torch.nn.Linear(in_features=1024, out_features=2, bias=True),
)

if model_name == 'roberta-large':
    model_config = config_roberta_large
    print("Model config: roberta-large")

if model_name == 'roberta-large-mnli':
    model_config = config_roberta_large_mnli
    print("Model config: roberta-large-mnli")

Model config: roberta-large


In [12]:
# Load model.
model = RobertaForSequenceClassification.from_pretrained(
    model_name, 
    config=model_config, 
    cache_dir=CACHE_DIR, 
    ignore_mismatched_sizes=True,
)
model.resize_token_embeddings(len(tokenizer))

model = model.to(device)
model

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50271, 1024)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (L

In [13]:
print(model.config.num_labels)
print(model.config.id2label)
print(model.config.label2id)

2
{0: 'implausible', 1: 'plausible'}
{'implausible': 0, 'plausible': 1}


In [14]:
print(tokenizer.vocab_size)
print(len(tokenizer.vocab))
tokenizer.special_tokens_map

50265
50271


{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>',
 'additional_special_tokens': ['[ETYPE]',
  '[EVT]',
  '[DEF]',
  '[/EVT]',
  '[/ETYPE]',
  '[/DEF]']}

### Training

In [15]:
# Load evaluation metrics.
auc = evaluate.load('roc_auc')
accuracy = evaluate.load('accuracy')
precision = evaluate.load('precision')
recall = evaluate.load('recall')
f1 = evaluate.load('f1')

In [16]:
# Training hyperparameters
num_epochs = 10
batch_size = 16
optimizer = "adamw_torch"
lr = 1e-5
weight_decay = 0.01
warmup_steps = 100   ### 1000 ###

In [17]:
def compute_metrics(y_pred):
    """
    This code snippet is produced by ChatGPT.
    """
    
    logits, labels = y_pred
#     print("y_pred:", y_pred, type(y_pred))
#     print("logits:", logits, type(logits))
#     print("labels:", labels, type(labels))
    
    # Compute accuracy, precision, recall and F1-score.
    predictions = logits.argmax(axis=-1)
#     print("predictions:", predictions)
#     precision, recall, f1, num_each_label_in_y_true = precision_recall_fscore_support(labels, predictions, average='binary')
#     accuracy = accuracy_score(labels, predictions)
    accuracy_score = accuracy.compute(references=labels, predictions=predictions)
#     print("accuracy_score:", accuracy_score)
    
    precision_score = precision.compute(references=labels, predictions=predictions)
#     print("precision_score:", precision_score)
    
    recall_score = recall.compute(references=labels, predictions=predictions)
#     print("recall_score:", recall_score)
    
    f1_score = f1.compute(references=labels, predictions=predictions)
#     print("f1_score:", f1_score)
    
    # Compute AUC.
#     probs = logits[:, 1]
#     print("probs:", probs)
#     auc = roc_auc_score(labels, probs)
    auc_score = auc.compute(references=labels, prediction_scores=predictions)
#     print("auc_score:", auc_score)
    
    return {
        'auc': auc_score['roc_auc'],
        'precision': precision_score['precision'],
        'recall': recall_score['recall'],
        'f1': f1_score['f1'],
        'accuracy': accuracy_score['accuracy'],
    }

In [18]:
# Set early stopping.
earlystopping_callback = EarlyStoppingCallback(
    early_stopping_patience=5, 
    early_stopping_threshold=0.0,
)

In [19]:
training_args = TrainingArguments(
    output_dir='./output/template_evt_ft_1/',
    overwrite_output_dir=True,
    evaluation_strategy='epoch',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    optim=optimizer,
    learning_rate=lr,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    num_train_epochs=num_epochs,
    logging_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='auc',
    greater_is_better=True,
    remove_unused_columns=False,
)
training_args

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=epoch,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
gradient_checkpointing_kwargs=None,
greater_is_better=True,
group_by_

In [20]:
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dataset['train'],
#     train_dataset=small_train_dataset,
    eval_dataset=dataset['dev'],
#     eval_dataset=small_dev_dataset,
    compute_metrics=compute_metrics,
    callbacks=[earlystopping_callback],
)
trainer

<transformers.trainer.Trainer at 0x7f401d8c66e0>

In [21]:
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Auc,Precision,Recall,F1,Accuracy
1,0.6562,0.582158,0.708469,0.675824,0.801303,0.733234,0.708469
2,0.5167,0.492666,0.7557,0.812749,0.664495,0.731183,0.7557
3,0.3975,0.503204,0.770358,0.756173,0.798046,0.776545,0.770358
4,0.3151,0.632869,0.767101,0.75625,0.788274,0.77193,0.767101
5,0.2594,0.838123,0.758958,0.771331,0.736156,0.753333,0.758958
6,0.22,0.938346,0.7557,0.738602,0.791531,0.764151,0.7557
7,0.1963,0.988496,0.767101,0.759494,0.781759,0.770465,0.767101
8,0.1765,1.21876,0.762215,0.738872,0.811075,0.773292,0.762215


TrainOutput(global_step=2456, training_loss=0.3422026913795098, metrics={'train_runtime': 3162.171, 'train_samples_per_second': 15.53, 'train_steps_per_second': 0.971, 'total_flos': 3.661371940243046e+16, 'train_loss': 0.3422026913795098, 'epoch': 8.0})

In [22]:
model.save_pretrained('./final_ckpt/template_evt_ft_3/')
tokenizer.save_pretrained('./final_ckpt/template_evt_ft_3/')

('./final_ckpt/template_evt_ft_3/tokenizer_config.json',
 './final_ckpt/template_evt_ft_3/special_tokens_map.json',
 './final_ckpt/template_evt_ft_3/vocab.json',
 './final_ckpt/template_evt_ft_3/merges.txt',
 './final_ckpt/template_evt_ft_3/added_tokens.json',
 './final_ckpt/template_evt_ft_3/tokenizer.json')

### Evaluation

In [23]:
# Load test set.
testset = load_from_disk(DATA_PATH + 'testsets_5-2_3/')
testset = testset.remove_columns('id')
testset = testset.remove_columns('prompt')
testset = testset.with_format('torch')
testset

DatasetDict({
    pap: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 308
    })
    pep: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 307
    })
})

In [24]:
pap_test = testset['pap']
pap_test

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 308
})

In [25]:
pep_test = testset['pep']
pep_test

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 307
})

### Evaluate the final model checkpoint.

In [26]:
# Predict fine-tuned model on pap_test.
preds_pap_test = trainer.predict(pap_test)
preds_pap_test

PredictionOutput(predictions=array([[-0.09545393,  0.8196105 ],
       [ 0.6981491 , -0.5472973 ],
       [ 0.3171613 ,  0.30046797],
       [ 0.38524115,  0.06493053],
       [ 0.9053168 , -0.09668155],
       [-0.8798623 ,  1.4583045 ],
       [ 0.68735725, -0.18728969],
       [ 1.750799  , -0.9118597 ],
       [-0.70538044,  0.8975759 ],
       [-1.5915138 ,  1.8691651 ],
       [ 0.94754326,  0.04449895],
       [-1.5927913 ,  1.688275  ],
       [ 0.02003617,  0.39700124],
       [ 0.7177704 ,  0.00838428],
       [-0.57297814,  0.9626558 ],
       [-0.91071725,  1.4419339 ],
       [-1.5476074 ,  1.6321563 ],
       [ 0.9972162 , -0.2511214 ],
       [-1.438503  ,  1.4377561 ],
       [ 2.111767  , -1.7555542 ],
       [-1.5547751 ,  1.7440739 ],
       [-0.6345087 ,  1.2370738 ],
       [ 0.3090633 ,  0.28854582],
       [ 0.39515477,  0.40304643],
       [ 0.71520144,  0.00555976],
       [ 0.30996853,  0.0644213 ],
       [-1.6829264 ,  1.6458299 ],
       [-1.1840553 ,  1.16

In [27]:
# Evaluate fine-tuned model on pap_test.
scores_pap_test = trainer.evaluate(pap_test)
scores_pap_test

{'eval_loss': 0.7335963845252991,
 'eval_auc': 0.6363636363636364,
 'eval_precision': 0.64,
 'eval_recall': 0.6233766233766234,
 'eval_f1': 0.6315789473684211,
 'eval_accuracy': 0.6363636363636364,
 'eval_runtime': 7.1921,
 'eval_samples_per_second': 42.825,
 'eval_steps_per_second': 2.781,
 'epoch': 8.0}

In [28]:
# Predict fine-tuned model on pep_test.
preds_pep_test = trainer.predict(pep_test)
preds_pep_test

PredictionOutput(predictions=array([[-3.65860045e-01,  5.66222072e-01],
       [-3.93048882e-01,  1.55445492e+00],
       [-1.40626419e+00,  1.41438174e+00],
       [-1.26462018e+00,  1.89892006e+00],
       [ 1.92773890e+00, -1.70465505e+00],
       [-1.08247650e+00,  1.19990373e+00],
       [ 2.10780978e+00, -1.73036253e+00],
       [ 1.37143183e+00, -1.07860601e+00],
       [ 1.90523112e+00, -1.50187969e+00],
       [-7.54434466e-02,  8.89013290e-01],
       [-2.90899098e-01,  7.40289688e-01],
       [-1.46821296e+00,  1.59438598e+00],
       [ 1.88986981e+00, -1.42740357e+00],
       [ 1.44668257e+00, -1.17420340e+00],
       [ 1.20034397e+00, -9.94521379e-01],
       [ 1.62463319e+00, -1.70547867e+00],
       [ 1.47486973e+00, -1.28142941e+00],
       [-1.35989177e+00,  1.54579127e+00],
       [ 7.97460675e-02,  9.77326870e-01],
       [ 1.73108232e+00, -1.53899527e+00],
       [-1.56644607e+00,  1.56205022e+00],
       [-8.85292411e-01,  1.57412887e+00],
       [ 1.70571297e-01, 

In [29]:
# Evaluate fine-tuned model on pep_test.
scores_pep_test = trainer.evaluate(pep_test)
scores_pep_test

{'eval_loss': 0.3587769865989685,
 'eval_auc': 0.8439224174518292,
 'eval_precision': 0.7932960893854749,
 'eval_recall': 0.9281045751633987,
 'eval_f1': 0.8554216867469879,
 'eval_accuracy': 0.8436482084690554,
 'eval_runtime': 7.207,
 'eval_samples_per_second': 42.597,
 'eval_steps_per_second': 2.775,
 'epoch': 8.0}

In [30]:
softmax = torch.nn.Softmax(dim=1)
softmax

Softmax(dim=1)

In [32]:
probs_pap_test = softmax(torch.tensor(preds_pap_test.predictions)).tolist()

y_pred_pap_test = np.argmax(preds_pap_test.predictions, axis=-1).tolist()

metrics_pap_test = preds_pap_test.metrics
metrics_pap_test['test_loss'] = scores_pap_test['eval_loss']
metrics_pap_test['test_runtime'] = scores_pap_test['eval_runtime']
metrics_pap_test['test_samples_per_second'] = scores_pap_test['eval_samples_per_second']
metrics_pap_test['test_steps_per_second'] = scores_pap_test['eval_steps_per_second']

with open('./evaluation_outputs/template_evt_ft_1/preds_pap.json', 'w') as file:
    json.dump({'probabilities': probs_pap_test, 
               'y_pred': y_pred_pap_test, 
               'y_true': preds_pap_test.label_ids.tolist(), 
               'metrics': metrics_pap_test,
              }, file)

In [33]:
probs_pep_test = softmax(torch.tensor(preds_pep_test.predictions)).tolist()

y_pred_pep_test = np.argmax(preds_pep_test.predictions, axis=-1).tolist()

metrics_pep_test = preds_pep_test.metrics
metrics_pep_test['test_loss'] = scores_pep_test['eval_loss']
metrics_pep_test['test_runtime'] = scores_pep_test['eval_runtime']
metrics_pep_test['test_samples_per_second'] = scores_pep_test['eval_samples_per_second']
metrics_pep_test['test_steps_per_second'] = scores_pep_test['eval_steps_per_second']

with open('./evaluation_outputs/template_evt_ft_1/preds_pep.json', 'w') as file:
    json.dump({'probabilities': probs_pep_test, 
               'y_pred': y_pred_pep_test, 
               'y_true': preds_pep_test.label_ids.tolist(), 
               'metrics': metrics_pep_test,
              }, file)

### Load model checkpoint with lowest validation loss.
In the training_args above, I set metric_for_best_model='auc'. This is suboptimal since the model checkpoint, for which the validation loss is the lowest and is about to increase, should be considered the best model checkpoint.

In [35]:
tokenizer_best = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path='./output/template_evt_ft_1/checkpoint-614/',
    cache_dir=CACHE_DIR,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [36]:
print(tokenizer_best.vocab_size)
print(len(tokenizer_best.vocab))
tokenizer_best.special_tokens_map

50265
50271


{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>',
 'additional_special_tokens': ['[ETYPE]',
  '[EVT]',
  '[DEF]',
  '[/EVT]',
  '[/ETYPE]',
  '[/DEF]']}

In [37]:
print(model_config.vocab_size)
model_config.vocab_size = len(tokenizer_best)
print(model_config.vocab_size)

50265
50271


In [38]:
model_best = RobertaForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path='./output/template_evt_ft_1/checkpoint-921/',
    config=model_config, 
    cache_dir=CACHE_DIR, 
    ignore_mismatched_sizes=True,
)
model_best.resize_token_embeddings(len(tokenizer_best))
model_best = model_best.to(device)
model_best.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50271, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
 

In [39]:
trainer_eval = Trainer(
    model=model_best,
    tokenizer=tokenizer_best,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['dev'],
    compute_metrics=compute_metrics,
    callbacks=[earlystopping_callback],
)
trainer_eval

<transformers.trainer.Trainer at 0x7f4016239960>

In [44]:
# Predict the best checkpoint (the one with the lowest validation error) on pap_test.
preds_pap_best_model = trainer_eval.predict(pap_test)
preds_pap_best_model

PredictionOutput(predictions=array([[-0.09545393,  0.8196105 ],
       [ 0.6981491 , -0.5472973 ],
       [ 0.3171613 ,  0.30046797],
       [ 0.38524115,  0.06493053],
       [ 0.9053168 , -0.09668155],
       [-0.8798623 ,  1.4583045 ],
       [ 0.68735725, -0.18728969],
       [ 1.750799  , -0.9118597 ],
       [-0.70538044,  0.8975759 ],
       [-1.5915138 ,  1.8691651 ],
       [ 0.94754326,  0.04449895],
       [-1.5927913 ,  1.688275  ],
       [ 0.02003617,  0.39700124],
       [ 0.7177704 ,  0.00838428],
       [-0.57297814,  0.9626558 ],
       [-0.91071725,  1.4419339 ],
       [-1.5476074 ,  1.6321563 ],
       [ 0.9972162 , -0.2511214 ],
       [-1.438503  ,  1.4377561 ],
       [ 2.111767  , -1.7555542 ],
       [-1.5547751 ,  1.7440739 ],
       [-0.6345087 ,  1.2370738 ],
       [ 0.3090633 ,  0.28854582],
       [ 0.39515477,  0.40304643],
       [ 0.71520144,  0.00555976],
       [ 0.30996853,  0.0644213 ],
       [-1.6829264 ,  1.6458299 ],
       [-1.1840553 ,  1.16

In [45]:
# Evaluate the best checkpoint (the one with the lowest validation error) on pap_test.
scores_pap_best_model = trainer_eval.evaluate(pap_test)
scores_pap_best_model

{'eval_loss': 0.7335963845252991,
 'eval_auc': 0.6363636363636364,
 'eval_precision': 0.64,
 'eval_recall': 0.6233766233766234,
 'eval_f1': 0.6315789473684211,
 'eval_accuracy': 0.6363636363636364,
 'eval_runtime': 7.1942,
 'eval_samples_per_second': 42.812,
 'eval_steps_per_second': 2.78}

In [46]:
probs_pap_test_best_model = softmax(torch.tensor(preds_pap_best_model.predictions)).tolist()

y_pred_pap_test_best_model = np.argmax(preds_pap_best_model.predictions, axis=-1).tolist()

metrics_pap_test_best_model = preds_pap_best_model.metrics
metrics_pap_test_best_model['test_loss'] = scores_pap_best_model['eval_loss']
metrics_pap_test_best_model['test_runtime'] = scores_pap_best_model['eval_runtime']
metrics_pap_test_best_model['test_samples_per_second'] = scores_pap_best_model['eval_samples_per_second']
metrics_pap_test_best_model['test_steps_per_second'] = scores_pap_best_model['eval_steps_per_second']

with open('./evaluation_outputs/template_evt_ft_1/preds_pap_best_ckpt.json', 'w') as file:
    json.dump({'probabilities': probs_pap_test_best_model, 
               'y_pred': y_pred_pap_test_best_model, 
               'y_true': preds_pap_best_model.label_ids.tolist(), 
               'metrics': metrics_pap_test_best_model,
              }, file)

In [47]:
# Predict the best checkpoint (the one with the highest auc) on pep_test.
preds_pep_best_model = trainer_eval.predict(pep_test)
preds_pep_best_model

PredictionOutput(predictions=array([[-3.65860045e-01,  5.66222072e-01],
       [-3.93048882e-01,  1.55445492e+00],
       [-1.40626419e+00,  1.41438174e+00],
       [-1.26462018e+00,  1.89892006e+00],
       [ 1.92773890e+00, -1.70465505e+00],
       [-1.08247650e+00,  1.19990373e+00],
       [ 2.10780978e+00, -1.73036253e+00],
       [ 1.37143183e+00, -1.07860601e+00],
       [ 1.90523112e+00, -1.50187969e+00],
       [-7.54434466e-02,  8.89013290e-01],
       [-2.90899098e-01,  7.40289688e-01],
       [-1.46821296e+00,  1.59438598e+00],
       [ 1.88986981e+00, -1.42740357e+00],
       [ 1.44668257e+00, -1.17420340e+00],
       [ 1.20034397e+00, -9.94521379e-01],
       [ 1.62463319e+00, -1.70547867e+00],
       [ 1.47486973e+00, -1.28142941e+00],
       [-1.35989177e+00,  1.54579127e+00],
       [ 7.97460675e-02,  9.77326870e-01],
       [ 1.73108232e+00, -1.53899527e+00],
       [-1.56644607e+00,  1.56205022e+00],
       [-8.85292411e-01,  1.57412887e+00],
       [ 1.70571297e-01, 

In [49]:
# Evaluate the best checkpoint (the one with the highest auc) on pep_test.
scores_pep_best_model = trainer_eval.evaluate(pep_test)
scores_pep_best_model

{'eval_loss': 0.3587769865989685,
 'eval_auc': 0.8439224174518292,
 'eval_precision': 0.7932960893854749,
 'eval_recall': 0.9281045751633987,
 'eval_f1': 0.8554216867469879,
 'eval_accuracy': 0.8436482084690554,
 'eval_runtime': 7.1696,
 'eval_samples_per_second': 42.82,
 'eval_steps_per_second': 2.79}

In [50]:
probs_pep_test_best_model = softmax(torch.tensor(preds_pep_best_model.predictions)).tolist()

y_pred_pep_test_best_model = np.argmax(preds_pep_best_model.predictions, axis=-1).tolist()

metrics_pep_test_best_model = preds_pep_best_model.metrics
metrics_pep_test_best_model['test_loss'] = scores_pep_best_model['eval_loss']
metrics_pep_test_best_model['test_runtime'] = scores_pep_best_model['eval_runtime']
metrics_pep_test_best_model['test_samples_per_second'] = scores_pep_best_model['eval_samples_per_second']
metrics_pep_test_best_model['test_steps_per_second'] = scores_pep_best_model['eval_steps_per_second']

with open('./evaluation_outputs/template_evt_ft_1/preds_pep_best_ckpt.json', 'w') as file:
    json.dump({'probabilities': probs_pep_test_best_model, 
               'y_pred': y_pred_pep_test_best_model, 
               'y_true': preds_pep_best_model.label_ids.tolist(), 
               'metrics': metrics_pep_test_best_model,
              }, file)