In [1]:
import json
import torch
import random
import numpy as np
from datasets import load_from_disk
import transformers
from transformers import pipeline
from transformers import AutoTokenizer, RobertaForSequenceClassification, RobertaConfig
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback

from typing import Optional, Union

import evaluate

In [2]:
torch.manual_seed(0)
np.random.seed(0)
random.seed(0)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(0)

In [3]:
PATH = '/mount/studenten/arbeitsdaten-studenten1/semantic-plausibility/plausible-parrots/'
CACHE_DIR = PATH + 'cache/'
DATA_PATH = PATH + '5-dataset_construction/5-2-template/output/'

In [4]:
device = f'cuda:{torch.cuda.current_device()}' if torch.cuda.is_available() else 'cpu'
device_name = torch.cuda.get_device_name()
print(f"Using device: {device} ({device_name})")

Using device: cuda:0 (NVIDIA RTX A6000)


In [5]:
# Load dataset.
dataset = load_from_disk(DATA_PATH + 'dataset_5-2_4/')
dataset = dataset.remove_columns('id')
dataset = dataset.remove_columns('prompt')
# dataset = dataset.remove_columns('input_ids')
# dataset = dataset.remove_columns('attention_mask')
dataset = dataset.with_format('torch')
dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 4911
    })
    dev: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 614
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 615
    })
})

In [6]:
small_train_dataset = dataset["train"].shuffle(seed=42).select(range(10))
small_dev_dataset = dataset["dev"].shuffle(seed=42).select(range(10))

In [7]:
model_name = 'roberta-large'

In [8]:
# Load tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=CACHE_DIR)

In [9]:
print(tokenizer.vocab_size)
tokenizer.special_tokens_map

50265


{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>'}

In [10]:
special_tokens = {
    'additional_special_tokens': ['[STYPE]',
                                  '[/STYPE]',
                                  '[OTYPE]',
                                  '[/OTYPE]',
                                  '[DEF]',
                                  '[/DEF]',
                                  '[EVT]',
                                  '[/EVT]',
                                 ]
}

tokenizer.add_special_tokens(special_tokens)
vocab = tokenizer.get_vocab()
print("Vocab size:", len(vocab))
tokenizer.special_tokens_map

Vocab size: 50273


{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>',
 'additional_special_tokens': ['[/STYPE]',
  '[OTYPE]',
  '[/OTYPE]',
  '[EVT]',
  '[DEF]',
  '[/EVT]',
  '[STYPE]',
  '[/DEF]']}

In [11]:
# Speicify configuration for the model.
config_roberta_large = RobertaConfig.from_pretrained(
    model_name,
    id2label={0: 'implausible', 1: 'plausible'},
    label2id={'implausible': 0, 'plausible': 1},
)

config_roberta_large_mnli = RobertaConfig.from_pretrained(
    model_name, 
    num_labels=2, 
    id2label={0: 'implausible', 1: 'plausible'},
    label2id={'implausible': 0, 'plausible': 1},
#     out_proj = torch.nn.Linear(in_features=1024, out_features=2, bias=True),
)

if model_name == 'roberta-large':
    model_config = config_roberta_large
    print("Model config: roberta-large")

if model_name == 'roberta-large-mnli':
    model_config = config_roberta_large_mnli
    print("Model config: roberta-large-mnli")

Model config: roberta-large


In [12]:
# Load model.
model = RobertaForSequenceClassification.from_pretrained(
    model_name, 
    config=model_config, 
    cache_dir=CACHE_DIR, 
    ignore_mismatched_sizes=True,
)
model.resize_token_embeddings(len(tokenizer))

model = model.to(device)
model

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50273, 1024)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (L

In [13]:
print(model.config.num_labels)
print(model.config.id2label)
print(model.config.label2id)

2
{0: 'implausible', 1: 'plausible'}
{'implausible': 0, 'plausible': 1}


In [14]:
print(tokenizer.vocab_size)
print(len(tokenizer.vocab))
tokenizer.special_tokens_map

50265
50273


{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>',
 'additional_special_tokens': ['[/STYPE]',
  '[OTYPE]',
  '[/OTYPE]',
  '[EVT]',
  '[DEF]',
  '[/EVT]',
  '[STYPE]',
  '[/DEF]']}

### Training

In [15]:
# Load evaluation metrics.
auc = evaluate.load('roc_auc')
accuracy = evaluate.load('accuracy')
precision = evaluate.load('precision')
recall = evaluate.load('recall')
f1 = evaluate.load('f1')

In [16]:
# Training hyperparameters
num_epochs = 10
batch_size = 16
optimizer = "adamw_torch"
lr = 1e-5
weight_decay = 0.01
warmup_steps = 10

In [17]:
def compute_metrics(y_pred):
    """
    This code snippet is produced by ChatGPT.
    """
    
    logits, labels = y_pred
#     print("y_pred:", y_pred, type(y_pred))
#     print("logits:", logits, type(logits))
#     print("labels:", labels, type(labels))
    
    # Compute accuracy, precision, recall and F1-score.
    predictions = logits.argmax(axis=-1)
#     print("predictions:", predictions)
#     precision, recall, f1, num_each_label_in_y_true = precision_recall_fscore_support(labels, predictions, average='binary')
#     accuracy = accuracy_score(labels, predictions)
    accuracy_score = accuracy.compute(references=labels, predictions=predictions)
#     print("accuracy_score:", accuracy_score)
    
    precision_score = precision.compute(references=labels, predictions=predictions)
#     print("precision_score:", precision_score)
    
    recall_score = recall.compute(references=labels, predictions=predictions)
#     print("recall_score:", recall_score)
    
    f1_score = f1.compute(references=labels, predictions=predictions)
#     print("f1_score:", f1_score)
    
    # Compute AUC.
#     probs = logits[:, 1]
#     print("probs:", probs)
#     auc = roc_auc_score(labels, probs)
    auc_score = auc.compute(references=labels, prediction_scores=predictions)
#     print("auc_score:", auc_score)
    
    return {
        'auc': auc_score['roc_auc'],
        'precision': precision_score['precision'],
        'recall': recall_score['recall'],
        'f1': f1_score['f1'],
        'accuracy': accuracy_score['accuracy'],
    }

In [18]:
# Set early stopping.
earlystopping_callback = EarlyStoppingCallback(
    early_stopping_patience=5, 
    early_stopping_threshold=0.0,
)

In [19]:
training_args = TrainingArguments(
    output_dir='./output/template_ent_ft_1/',
    overwrite_output_dir=True,
    evaluation_strategy='epoch',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    optim=optimizer,
    learning_rate=lr,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    num_train_epochs=num_epochs,
    logging_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='auc',
    greater_is_better=True,
    remove_unused_columns=False,
)
training_args

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=epoch,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
gradient_checkpointing_kwargs=None,
greater_is_better=True,
group_by_

In [20]:
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dataset['train'],
#     train_dataset=small_train_dataset,
    eval_dataset=dataset['dev'],
#     eval_dataset=small_dev_dataset,
    compute_metrics=compute_metrics,
    callbacks=[earlystopping_callback],
)
trainer

<transformers.trainer.Trainer at 0x7f53ff8112a0>

In [21]:
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Auc,Precision,Recall,F1,Accuracy
1,0.6564,0.556655,0.723127,0.720257,0.729642,0.724919,0.723127
2,0.5141,0.521438,0.7443,0.802419,0.648208,0.717117,0.7443
3,0.3934,0.523073,0.760586,0.78777,0.713355,0.748718,0.760586
4,0.3172,0.629003,0.765472,0.743284,0.811075,0.775701,0.765472
5,0.2562,0.797057,0.745928,0.749175,0.739414,0.744262,0.745928
6,0.2147,1.006404,0.742671,0.765125,0.700326,0.731293,0.742671
7,0.1969,0.960268,0.750814,0.763699,0.726384,0.744574,0.750814
8,0.1732,1.248359,0.754072,0.758278,0.745928,0.752053,0.754072
9,0.1581,1.377525,0.757329,0.754839,0.762215,0.758509,0.757329


TrainOutput(global_step=2763, training_loss=0.3200197968843662, metrics={'train_runtime': 3697.3145, 'train_samples_per_second': 13.283, 'train_steps_per_second': 0.83, 'total_flos': 4.119043432773427e+16, 'train_loss': 0.3200197968843662, 'epoch': 9.0})

In [22]:
model.save_pretrained('./final_ckpt/template_ent_ft_1/')
tokenizer.save_pretrained('./final_ckpt/template_ent_ft_1/')

('./final_ckpt/template_ent_ft_1/tokenizer_config.json',
 './final_ckpt/template_ent_ft_1/special_tokens_map.json',
 './final_ckpt/template_ent_ft_1/vocab.json',
 './final_ckpt/template_ent_ft_1/merges.txt',
 './final_ckpt/template_ent_ft_1/added_tokens.json',
 './final_ckpt/template_ent_ft_1/tokenizer.json')

### Evaluation

In [23]:
# Load test set.
testset = load_from_disk(DATA_PATH + 'testsets_5-2_4/')
testset = testset.remove_columns('id')
testset = testset.remove_columns('prompt')
testset = testset.with_format('torch')
testset

DatasetDict({
    pap: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 308
    })
    pep: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 307
    })
})

In [24]:
pap_test = testset['pap']
pap_test

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 308
})

In [25]:
pep_test = testset['pep']
pep_test

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 307
})

### Evaluate the final model checkpoint.

In [26]:
# Predict fine-tuned model on pap_test.
preds_pap_test = trainer.predict(pap_test)
preds_pap_test

PredictionOutput(predictions=array([[-4.73561913e-01,  1.10336852e+00],
       [ 7.56607234e-01, -1.10628650e-01],
       [ 1.11311167e-01,  4.14863259e-01],
       [-8.98470078e-03,  5.09271443e-01],
       [ 7.64729917e-01, -1.40330538e-01],
       [-1.62547445e+00,  2.35714936e+00],
       [ 6.38178587e-01,  5.03672361e-02],
       [ 2.13039351e+00, -1.44637322e+00],
       [-8.73845160e-01,  1.19736779e+00],
       [-2.66495371e+00,  3.24850726e+00],
       [ 1.07960391e+00, -4.39836532e-01],
       [-2.29465079e+00,  2.69149113e+00],
       [-5.54651320e-02,  6.40702188e-01],
       [ 2.87066698e-01,  2.68618882e-01],
       [-2.18685403e-01,  6.68370664e-01],
       [ 9.78627801e-02,  5.13027847e-01],
       [-2.11057568e+00,  2.77819180e+00],
       [ 8.69760215e-01, -3.60649139e-01],
       [-2.06378031e+00,  2.18618274e+00],
       [ 2.69792175e+00, -2.20688009e+00],
       [-1.85717165e+00,  2.43586206e+00],
       [-1.69785810e+00,  2.05402732e+00],
       [ 7.94592679e-01, 

In [27]:
# Evaluate fine-tuned model on pap_test.
scores_pap_test = trainer.evaluate(pap_test)
scores_pap_test

{'eval_loss': 0.8566758632659912,
 'eval_auc': 0.6493506493506493,
 'eval_precision': 0.6455696202531646,
 'eval_recall': 0.6623376623376623,
 'eval_f1': 0.6538461538461539,
 'eval_accuracy': 0.6493506493506493,
 'eval_runtime': 7.3687,
 'eval_samples_per_second': 41.798,
 'eval_steps_per_second': 2.714,
 'epoch': 9.0}

In [28]:
# Predict fine-tuned model on pep_test.
preds_pep_test = trainer.predict(pep_test)
preds_pep_test

PredictionOutput(predictions=array([[-5.4327196e-01,  7.9589498e-01],
       [ 1.3535860e+00, -7.7280217e-01],
       [-2.4280555e+00,  3.4010165e+00],
       [-2.8134615e+00,  3.2646658e+00],
       [-7.1644861e-01,  1.1302695e+00],
       [-2.5892820e+00,  3.3576794e+00],
       [ 2.6626935e+00, -2.6540132e+00],
       [ 2.0234661e+00, -1.7855581e+00],
       [ 2.5040255e+00, -2.3652720e+00],
       [-1.6237352e+00,  2.4505975e+00],
       [-2.8294227e-01,  5.4638338e-01],
       [-1.7662029e+00,  2.4977202e+00],
       [ 1.1347661e+00, -6.3995832e-01],
       [ 2.5192387e+00, -2.4573789e+00],
       [ 2.6117771e+00, -2.4607151e+00],
       [ 1.6991653e+00, -2.1513519e+00],
       [ 2.2395124e+00, -1.9932567e+00],
       [-1.7015889e+00,  2.2918241e+00],
       [ 2.2610312e+00, -1.8331885e+00],
       [ 2.6206846e+00, -2.5835853e+00],
       [-2.9265842e+00,  3.2704711e+00],
       [-2.4049718e+00,  3.2720337e+00],
       [-1.8715892e+00,  2.5682459e+00],
       [-2.9196229e+00,  3.7

In [29]:
# Evaluate fine-tuned model on pep_test.
scores_pep_test = trainer.evaluate(pep_test)
scores_pep_test

{'eval_loss': 0.46809372305870056,
 'eval_auc': 0.8502673796791443,
 'eval_precision': 0.8282208588957055,
 'eval_recall': 0.8823529411764706,
 'eval_f1': 0.8544303797468354,
 'eval_accuracy': 0.8501628664495114,
 'eval_runtime': 7.3531,
 'eval_samples_per_second': 41.751,
 'eval_steps_per_second': 2.72,
 'epoch': 9.0}

In [30]:
softmax = torch.nn.Softmax(dim=1)
softmax

Softmax(dim=1)

In [31]:
probs_pap_test = softmax(torch.tensor(preds_pap_test.predictions)).tolist()

y_pred_pap_test = np.argmax(preds_pap_test.predictions, axis=-1).tolist()

metrics_pap_test = preds_pap_test.metrics
metrics_pap_test['test_loss'] = scores_pap_test['eval_loss']
metrics_pap_test['test_runtime'] = scores_pap_test['eval_runtime']
metrics_pap_test['test_samples_per_second'] = scores_pap_test['eval_samples_per_second']
metrics_pap_test['test_steps_per_second'] = scores_pap_test['eval_steps_per_second']

with open('./evaluation_outputs/template_ent_ft_1/preds_pap.json', 'w') as file:
    json.dump({'probabilities': probs_pap_test, 
               'y_pred': y_pred_pap_test, 
               'y_true': preds_pap_test.label_ids.tolist(), 
               'metrics': metrics_pap_test,
              }, file)

In [32]:
probs_pep_test = softmax(torch.tensor(preds_pep_test.predictions)).tolist()

y_pred_pep_test = np.argmax(preds_pep_test.predictions, axis=-1).tolist()

metrics_pep_test = preds_pep_test.metrics
metrics_pep_test['test_loss'] = scores_pep_test['eval_loss']
metrics_pep_test['test_runtime'] = scores_pep_test['eval_runtime']
metrics_pep_test['test_samples_per_second'] = scores_pep_test['eval_samples_per_second']
metrics_pep_test['test_steps_per_second'] = scores_pep_test['eval_steps_per_second']

with open('./evaluation_outputs/template_ent_ft_1/preds_pep.json', 'w') as file:
    json.dump({'probabilities': probs_pep_test, 
               'y_pred': y_pred_pep_test, 
               'y_true': preds_pep_test.label_ids.tolist(), 
               'metrics': metrics_pep_test,
              }, file)

### Load model checkpoint with lowest validation loss.
In the training_args above, I set metric_for_best_model='auc'. This is suboptimal since the model checkpoint, for which the validation loss is the lowest and is about to increase, should be considered the best model checkpoint.

In [33]:
tokenizer_best = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path='./output/template_ent_ft_1/checkpoint-614/',
    cache_dir=CACHE_DIR,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [34]:
print(tokenizer_best.vocab_size)
print(len(tokenizer_best.vocab))
tokenizer_best.special_tokens_map

50265
50273


{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>',
 'additional_special_tokens': ['[/STYPE]',
  '[OTYPE]',
  '[/OTYPE]',
  '[EVT]',
  '[DEF]',
  '[/EVT]',
  '[STYPE]',
  '[/DEF]']}

In [35]:
print(model_config.vocab_size)
model_config.vocab_size = len(tokenizer_best)
print(model_config.vocab_size)

50265
50273


In [36]:
model_best = RobertaForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path='./output/template_ent_ft_1/checkpoint-614/',
    config=model_config, 
    cache_dir=CACHE_DIR, 
    ignore_mismatched_sizes=True,
)
model_best.resize_token_embeddings(len(tokenizer_best))
model_best = model_best.to(device)
model_best.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50273, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
 

In [37]:
trainer_eval = Trainer(
    model=model_best,
    tokenizer=tokenizer_best,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['dev'],
    compute_metrics=compute_metrics,
    callbacks=[earlystopping_callback],
)
trainer_eval

<transformers.trainer.Trainer at 0x7f53fc0c61a0>

In [38]:
# Predict the best checkpoint (the one with the lowest validation error) on pap_test.
preds_pap_best_model = trainer_eval.predict(pap_test)
preds_pap_best_model

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


PredictionOutput(predictions=array([[ 4.40472275e-01,  2.47462258e-01],
       [ 1.20184290e+00, -3.89434218e-01],
       [ 8.35068703e-01, -1.22283451e-01],
       [ 6.44237638e-01, -5.27993217e-02],
       [ 8.20061564e-01, -1.45730317e-01],
       [-2.12975159e-01,  1.46935773e+00],
       [ 1.24904895e+00, -5.17199278e-01],
       [ 1.89732134e+00, -9.37673509e-01],
       [-2.21561760e-01,  9.15698946e-01],
       [-1.95727551e+00,  2.57822299e+00],
       [ 1.31493044e+00, -6.29285395e-01],
       [-9.09452379e-01,  1.53305519e+00],
       [ 7.41549730e-01, -1.79090321e-01],
       [ 8.98041368e-01, -2.47451454e-01],
       [-6.51953891e-02,  6.22790992e-01],
       [ 9.27866757e-01, -2.67001837e-01],
       [-1.26267409e+00,  1.91313851e+00],
       [ 7.62468815e-01,  1.63326040e-02],
       [-7.70490885e-01,  1.11057341e+00],
       [ 2.06698751e+00, -1.34987891e+00],
       [-8.26475561e-01,  1.62550151e+00],
       [ 2.63329417e-01,  3.30810040e-01],
       [ 5.27037799e-01, 

In [39]:
# Evaluate the best checkpoint (the one with the lowest validation error) on pap_test.
scores_pap_best_model = trainer_eval.evaluate(pap_test)
scores_pap_best_model

{'eval_loss': 0.6579061150550842,
 'eval_auc': 0.6655844155844156,
 'eval_precision': 0.7628865979381443,
 'eval_recall': 0.4805194805194805,
 'eval_f1': 0.5896414342629482,
 'eval_accuracy': 0.6655844155844156,
 'eval_runtime': 7.3595,
 'eval_samples_per_second': 41.851,
 'eval_steps_per_second': 2.718}

In [40]:
probs_pap_test_best_model = softmax(torch.tensor(preds_pap_best_model.predictions)).tolist()

y_pred_pap_test_best_model = np.argmax(preds_pap_best_model.predictions, axis=-1).tolist()

metrics_pap_test_best_model = preds_pap_best_model.metrics
metrics_pap_test_best_model['test_loss'] = scores_pap_best_model['eval_loss']
metrics_pap_test_best_model['test_runtime'] = scores_pap_best_model['eval_runtime']
metrics_pap_test_best_model['test_samples_per_second'] = scores_pap_best_model['eval_samples_per_second']
metrics_pap_test_best_model['test_steps_per_second'] = scores_pap_best_model['eval_steps_per_second']

with open('./evaluation_outputs/template_ent_ft_1/preds_pap_best_ckpt.json', 'w') as file:
    json.dump({'probabilities': probs_pap_test_best_model, 
               'y_pred': y_pred_pap_test_best_model, 
               'y_true': preds_pap_best_model.label_ids.tolist(), 
               'metrics': metrics_pap_test_best_model,
              }, file)

In [41]:
# Predict the best checkpoint (the one with the highest auc) on pep_test.
preds_pep_best_model = trainer_eval.predict(pep_test)
preds_pep_best_model

PredictionOutput(predictions=array([[-0.21783298,  0.26344204],
       [ 0.17869961,  0.25687966],
       [-1.133978  ,  2.0279145 ],
       [-0.7179218 ,  1.5097646 ],
       [ 1.2804841 , -0.9466778 ],
       [-1.2093849 ,  2.46459   ],
       [ 2.3462687 , -1.8425871 ],
       [ 0.5123263 , -0.09697188],
       [ 1.9275458 , -1.100197  ],
       [-1.9444183 ,  2.5632706 ],
       [-0.25179377,  0.609434  ],
       [-1.0535184 ,  1.8768914 ],
       [ 1.5816312 , -1.0902127 ],
       [ 1.37652   , -0.8296985 ],
       [ 1.5928031 , -0.9937654 ],
       [ 1.8059907 , -1.7113189 ],
       [ 1.3179221 , -0.77280784],
       [-0.2584227 ,  0.9936443 ],
       [ 0.18360998,  0.53902143],
       [ 1.8271334 , -1.1443833 ],
       [-1.8736788 ,  2.348386  ],
       [-0.982753  ,  1.8265766 ],
       [-0.06713235,  0.84450096],
       [-1.2919563 ,  2.6023097 ],
       [ 1.1991544 , -0.62400204],
       [ 0.13417959,  0.65853256],
       [-2.0429995 ,  2.7301261 ],
       [ 1.1178931 , -0.61

In [42]:
# Evaluate the best checkpoint (the one with the highest auc) on pep_test.
scores_pep_best_model = trainer_eval.evaluate(pep_test)
scores_pep_best_model

{'eval_loss': 0.3879683315753937,
 'eval_auc': 0.8403997962821493,
 'eval_precision': 0.8376623376623377,
 'eval_recall': 0.8431372549019608,
 'eval_f1': 0.8403908794788273,
 'eval_accuracy': 0.8403908794788274,
 'eval_runtime': 7.3985,
 'eval_samples_per_second': 41.495,
 'eval_steps_per_second': 2.703}

In [43]:
probs_pep_test_best_model = softmax(torch.tensor(preds_pep_best_model.predictions)).tolist()

y_pred_pep_test_best_model = np.argmax(preds_pep_best_model.predictions, axis=-1).tolist()

metrics_pep_test_best_model = preds_pep_best_model.metrics
metrics_pep_test_best_model['test_loss'] = scores_pep_best_model['eval_loss']
metrics_pep_test_best_model['test_runtime'] = scores_pep_best_model['eval_runtime']
metrics_pep_test_best_model['test_samples_per_second'] = scores_pep_best_model['eval_samples_per_second']
metrics_pep_test_best_model['test_steps_per_second'] = scores_pep_best_model['eval_steps_per_second']

with open('./evaluation_outputs/template_ent_ft_1/preds_pep_best_ckpt.json', 'w') as file:
    json.dump({'probabilities': probs_pep_test_best_model, 
               'y_pred': y_pred_pep_test_best_model, 
               'y_true': preds_pep_best_model.label_ids.tolist(), 
               'metrics': metrics_pep_test_best_model,
              }, file)