In [1]:
import json
import torch
import random
import numpy as np
from datasets import load_from_disk
import transformers
from transformers import pipeline
from transformers import AutoTokenizer, RobertaForSequenceClassification, RobertaConfig
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback

from typing import Optional, Union

import evaluate

In [2]:
torch.manual_seed(0)
np.random.seed(0)
random.seed(0)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(0)

In [3]:
PATH = '/mount/studenten/arbeitsdaten-studenten1/semantic-plausibility/plausible-parrots/'
CACHE_DIR = PATH + 'cache/'
DATA_PATH = PATH + '5-dataset_construction/5-2-template/output/'

In [4]:
device = f'cuda:{torch.cuda.current_device()}' if torch.cuda.is_available() else 'cpu'
device_name = torch.cuda.get_device_name()
print(f"Using device: {device} ({device_name})")

Using device: cuda:0 (NVIDIA RTX A6000)


In [5]:
# Load dataset.
dataset = load_from_disk(DATA_PATH + 'dataset_5-2_1/')
dataset = dataset.remove_columns('id')
dataset = dataset.remove_columns('prompt')
# dataset = dataset.remove_columns('input_ids')
# dataset = dataset.remove_columns('attention_mask')
dataset = dataset.with_format('torch')
dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 4911
    })
    dev: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 614
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 615
    })
})

In [6]:
small_train_dataset = dataset["train"].shuffle(seed=42).select(range(10))
small_dev_dataset = dataset["dev"].shuffle(seed=42).select(range(10))

In [7]:
model_name = 'roberta-large'

In [8]:
# Load tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=CACHE_DIR)

In [9]:
print(tokenizer.vocab_size)
tokenizer.special_tokens_map

50265


{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>'}

In [10]:
special_tokens = {
    'additional_special_tokens': ['[STYPE]',
                                  '[/STYPE]',
                                  '[ETYPE]',
                                  '[/ETYPE]',
                                  '[OTYPE]',
                                  '[/OTYPE]',
                                  '[DEF]',
                                  '[/DEF]',
                                  '[EVT]',
                                  '[/EVT]',
                                 ]
}

tokenizer.add_special_tokens(special_tokens)
vocab = tokenizer.get_vocab()
print("Vocab size:", len(vocab))
tokenizer.special_tokens_map

Vocab size: 50275


{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>',
 'additional_special_tokens': ['[ETYPE]',
  '[/STYPE]',
  '[OTYPE]',
  '[/OTYPE]',
  '[EVT]',
  '[DEF]',
  '[/EVT]',
  '[STYPE]',
  '[/ETYPE]',
  '[/DEF]']}

In [11]:
# Speicify configuration for the model.
config_roberta_large = RobertaConfig.from_pretrained(
    model_name,
    id2label={0: 'implausible', 1: 'plausible'},
    label2id={'implausible': 0, 'plausible': 1},
)

config_roberta_large_mnli = RobertaConfig.from_pretrained(
    model_name, 
    num_labels=2, 
    id2label={0: 'implausible', 1: 'plausible'},
    label2id={'implausible': 0, 'plausible': 1},
#     out_proj = torch.nn.Linear(in_features=1024, out_features=2, bias=True),
)

if model_name == 'roberta-large':
    model_config = config_roberta_large
    print("Model config: roberta-large")

if model_name == 'roberta-large-mnli':
    model_config = config_roberta_large_mnli
    print("Model config: roberta-large-mnli")

Model config: roberta-large


In [12]:
# Load model.
model = RobertaForSequenceClassification.from_pretrained(
    model_name, 
    config=model_config, 
    cache_dir=CACHE_DIR, 
    ignore_mismatched_sizes=True,
)
model.resize_token_embeddings(len(tokenizer))

model = model.to(device)
model

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50275, 1024)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (L

In [13]:
print(model.config.num_labels)
print(model.config.id2label)
print(model.config.label2id)

2
{0: 'implausible', 1: 'plausible'}
{'implausible': 0, 'plausible': 1}


In [14]:
print(tokenizer.vocab_size)
print(len(tokenizer.vocab))
tokenizer.special_tokens_map

50265
50275


{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>',
 'additional_special_tokens': ['[ETYPE]',
  '[/STYPE]',
  '[OTYPE]',
  '[/OTYPE]',
  '[EVT]',
  '[DEF]',
  '[/EVT]',
  '[STYPE]',
  '[/ETYPE]',
  '[/DEF]']}

### Training

In [15]:
# Load evaluation metrics.
auc = evaluate.load('roc_auc')
accuracy = evaluate.load('accuracy')
precision = evaluate.load('precision')
recall = evaluate.load('recall')
f1 = evaluate.load('f1')

In [16]:
# Training hyperparameters
num_epochs = 10
batch_size = 16
optimizer = "adamw_torch"
lr = 1e-5
weight_decay = 0.01
warmup_steps = 10

In [17]:
def compute_metrics(y_pred):
    """
    This code snippet is produced by ChatGPT.
    """
    
    logits, labels = y_pred
#     print("y_pred:", y_pred, type(y_pred))
#     print("logits:", logits, type(logits))
#     print("labels:", labels, type(labels))
    
    # Compute accuracy, precision, recall and F1-score.
    predictions = logits.argmax(axis=-1)
#     print("predictions:", predictions)
#     precision, recall, f1, num_each_label_in_y_true = precision_recall_fscore_support(labels, predictions, average='binary')
#     accuracy = accuracy_score(labels, predictions)
    accuracy_score = accuracy.compute(references=labels, predictions=predictions)
#     print("accuracy_score:", accuracy_score)
    
    precision_score = precision.compute(references=labels, predictions=predictions)
#     print("precision_score:", precision_score)
    
    recall_score = recall.compute(references=labels, predictions=predictions)
#     print("recall_score:", recall_score)
    
    f1_score = f1.compute(references=labels, predictions=predictions)
#     print("f1_score:", f1_score)
    
    # Compute AUC.
#     probs = logits[:, 1]
#     print("probs:", probs)
#     auc = roc_auc_score(labels, probs)
    auc_score = auc.compute(references=labels, prediction_scores=predictions)
#     print("auc_score:", auc_score)
    
    return {
        'auc': auc_score['roc_auc'],
        'precision': precision_score['precision'],
        'recall': recall_score['recall'],
        'f1': f1_score['f1'],
        'accuracy': accuracy_score['accuracy'],
    }

In [18]:
# Set early stopping.
earlystopping_callback = EarlyStoppingCallback(
    early_stopping_patience=5, 
    early_stopping_threshold=0.0,
)

In [19]:
training_args = TrainingArguments(
    output_dir='./output/template_evtent_ft_3/',
    overwrite_output_dir=True,
    evaluation_strategy='epoch',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    optim=optimizer,
    learning_rate=lr,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    num_train_epochs=num_epochs,
    logging_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='auc',
    greater_is_better=True,
    remove_unused_columns=False,
)
training_args

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=epoch,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
gradient_checkpointing_kwargs=None,
greater_is_better=True,
group_by_

In [20]:
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dataset['train'],
#     train_dataset=small_train_dataset,
    eval_dataset=dataset['dev'],
#     eval_dataset=small_dev_dataset,
    compute_metrics=compute_metrics,
    callbacks=[earlystopping_callback],
)
trainer

<transformers.trainer.Trainer at 0x7ff68426ebc0>

In [21]:
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Auc,Precision,Recall,F1,Accuracy
1,0.6421,0.541163,0.728013,0.714724,0.758958,0.736177,0.728013
2,0.4897,0.514999,0.741042,0.786822,0.661238,0.718584,0.741042
3,0.3605,0.508871,0.770358,0.786207,0.742671,0.763819,0.770358
4,0.2967,0.72452,0.758958,0.74613,0.785016,0.765079,0.758958
5,0.2382,0.877769,0.750814,0.745223,0.762215,0.753623,0.750814
6,0.2085,1.031265,0.7557,0.76431,0.739414,0.751656,0.7557
7,0.1918,1.011652,0.7557,0.754045,0.758958,0.756494,0.7557
8,0.1589,1.315999,0.752443,0.750809,0.7557,0.753247,0.752443


TrainOutput(global_step=2456, training_loss=0.3232909584666696, metrics={'train_runtime': 3219.1533, 'train_samples_per_second': 15.256, 'train_steps_per_second': 0.954, 'total_flos': 3.661371940243046e+16, 'train_loss': 0.3232909584666696, 'epoch': 8.0})

In [32]:
model.save_pretrained('./final_ckpt/template_evtent_ft_3/')
tokenizer.save_pretrained('./final_ckpt/template_evtent_ft_3/')

('./final_ckpt/template_evtent_ft_3/tokenizer_config.json',
 './final_ckpt/template_evtent_ft_3/special_tokens_map.json',
 './final_ckpt/template_evtent_ft_3/vocab.json',
 './final_ckpt/template_evtent_ft_3/merges.txt',
 './final_ckpt/template_evtent_ft_3/added_tokens.json',
 './final_ckpt/template_evtent_ft_3/tokenizer.json')

### Evaluation

In [22]:
# Load test set.
testset = load_from_disk(DATA_PATH + 'testsets_5-2_1/')
testset = testset.remove_columns('id')
testset = testset.remove_columns('prompt')
testset = testset.with_format('torch')
testset

DatasetDict({
    pap: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 308
    })
    pep: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 307
    })
})

In [23]:
pap_test = testset['pap']
pap_test

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 308
})

In [24]:
pep_test = testset['pep']
pep_test

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 307
})

### Evaluate the final model checkpoint.

In [25]:
# Predict fine-tuned model on pap_test.
preds_pap_test = trainer.predict(pap_test)
preds_pap_test

PredictionOutput(predictions=array([[-0.07695419,  0.09097256],
       [ 1.10351   , -1.0288893 ],
       [ 0.3544017 , -0.4355814 ],
       [-1.6715767 ,  1.8748543 ],
       [ 0.6496273 , -0.6409176 ],
       [-0.43215907,  1.1121173 ],
       [ 0.53248394, -0.6034033 ],
       [ 1.7040869 , -1.5045595 ],
       [-0.81670874,  1.0158082 ],
       [-2.1329386 ,  2.829992  ],
       [ 0.6192957 , -0.6054772 ],
       [-0.26018614,  0.15806405],
       [ 0.24840078, -0.3205238 ],
       [ 0.46864712, -0.51844746],
       [ 0.35085532, -0.36399367],
       [-0.49660116,  0.66711694],
       [-1.261898  ,  1.999942  ],
       [ 0.72379684, -0.80886185],
       [ 0.57491815, -0.3234126 ],
       [ 2.2315798 , -2.168121  ],
       [-0.70688415,  1.0164164 ],
       [-0.80264354,  0.6064186 ],
       [ 0.98902154, -0.6367403 ],
       [ 0.1853594 , -0.14774019],
       [ 0.58409756, -0.44243842],
       [ 0.45929134, -0.7494145 ],
       [-1.2410796 ,  1.753201  ],
       [-0.30936986,  0.74

In [26]:
# Evaluate fine-tuned model on pap_test.
scores_pap_test = trainer.evaluate(pap_test)
scores_pap_test

{'eval_loss': 0.7246501445770264,
 'eval_auc': 0.6590909090909091,
 'eval_precision': 0.7168141592920354,
 'eval_recall': 0.525974025974026,
 'eval_f1': 0.6067415730337079,
 'eval_accuracy': 0.6590909090909091,
 'eval_runtime': 7.4504,
 'eval_samples_per_second': 41.34,
 'eval_steps_per_second': 2.684,
 'epoch': 8.0}

In [27]:
# Predict fine-tuned model on pep_test.
preds_pep_test = trainer.predict(pep_test)
preds_pep_test

PredictionOutput(predictions=array([[-0.37333322,  0.51198786],
       [-0.23042738,  0.8809738 ],
       [-1.3442569 ,  2.0817366 ],
       [-1.3686925 ,  2.1946526 ],
       [ 0.26165345, -0.06819616],
       [-1.3039305 ,  2.191734  ],
       [ 2.6200514 , -2.908581  ],
       [ 1.3393333 , -1.2048677 ],
       [ 2.053032  , -2.1532092 ],
       [ 0.90846664, -0.8564866 ],
       [ 0.01263479, -0.05534066],
       [-1.8231641 ,  2.5171142 ],
       [ 0.93718106, -0.63805896],
       [ 1.7193288 , -1.7538651 ],
       [ 1.8379629 , -1.788284  ],
       [ 1.831647  , -2.3450916 ],
       [ 2.2911506 , -2.4321241 ],
       [-1.2149854 ,  1.7858719 ],
       [ 0.6475396 , -0.5850425 ],
       [ 2.0737739 , -2.1893568 ],
       [-2.2252767 ,  2.984403  ],
       [-1.6022655 ,  2.2856288 ],
       [-0.5508145 ,  1.2032682 ],
       [-2.0608132 ,  2.9585154 ],
       [-0.9664281 ,  1.4796926 ],
       [-1.637783  ,  2.4487643 ],
       [-1.0616807 ,  1.3889828 ],
       [ 1.3336751 , -1.48

In [28]:
# Evaluate fine-tuned model on pep_test.
scores_pep_test = trainer.evaluate(pep_test)
scores_pep_test

{'eval_loss': 0.3405836820602417,
 'eval_auc': 0.8828834564128681,
 'eval_precision': 0.8502994011976048,
 'eval_recall': 0.9281045751633987,
 'eval_f1': 0.8875000000000001,
 'eval_accuracy': 0.8827361563517915,
 'eval_runtime': 7.4674,
 'eval_samples_per_second': 41.112,
 'eval_steps_per_second': 2.678,
 'epoch': 8.0}

In [29]:
softmax = torch.nn.Softmax(dim=1)
softmax

Softmax(dim=1)

In [30]:
probs_pap_test = softmax(torch.tensor(preds_pap_test.predictions)).tolist()

y_pred_pap_test = np.argmax(preds_pap_test.predictions, axis=-1).tolist()

metrics_pap_test = preds_pap_test.metrics
metrics_pap_test['test_loss'] = scores_pap_test['eval_loss']
metrics_pap_test['test_runtime'] = scores_pap_test['eval_runtime']
metrics_pap_test['test_samples_per_second'] = scores_pap_test['eval_samples_per_second']
metrics_pap_test['test_steps_per_second'] = scores_pap_test['eval_steps_per_second']

with open('./evaluation_outputs/template_evtent_ft_3/preds_pap.json', 'w') as file:
    json.dump({'probabilities': probs_pap_test, 
               'y_pred': y_pred_pap_test, 
               'y_true': preds_pap_test.label_ids.tolist(), 
               'metrics': metrics_pap_test,
              }, file)

In [31]:
probs_pep_test = softmax(torch.tensor(preds_pep_test.predictions)).tolist()

y_pred_pep_test = np.argmax(preds_pep_test.predictions, axis=-1).tolist()

metrics_pep_test = preds_pep_test.metrics
metrics_pep_test['test_loss'] = scores_pep_test['eval_loss']
metrics_pep_test['test_runtime'] = scores_pep_test['eval_runtime']
metrics_pep_test['test_samples_per_second'] = scores_pep_test['eval_samples_per_second']
metrics_pep_test['test_steps_per_second'] = scores_pep_test['eval_steps_per_second']

with open('./evaluation_outputs/template_evtent_ft_3/preds_pep.json', 'w') as file:
    json.dump({'probabilities': probs_pep_test, 
               'y_pred': y_pred_pep_test, 
               'y_true': preds_pep_test.label_ids.tolist(), 
               'metrics': metrics_pep_test,
              }, file)

### Load model checkpoint with lowest validation loss.
In the training_args above, I set metric_for_best_model='auc'. This is suboptimal since the model checkpoint, for which the validation loss is the lowest and is about to increase, should be considered the best model checkpoint.

In [79]:
tokenizer_best = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path='./output/template_evtent_ft_3/checkpoint-921/',
    cache_dir=CACHE_DIR,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [80]:
print(tokenizer_best.vocab_size)
print(len(tokenizer_best.vocab))
tokenizer_best.special_tokens_map

50265
50275


{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>',
 'additional_special_tokens': ['[ETYPE]',
  '[/STYPE]',
  '[OTYPE]',
  '[/OTYPE]',
  '[EVT]',
  '[DEF]',
  '[/EVT]',
  '[STYPE]',
  '[/ETYPE]',
  '[/DEF]']}

In [81]:
print(model_config.vocab_size)
model_config.vocab_size = len(tokenizer_best)
print(model_config.vocab_size)

50275
50275


In [82]:
model_best = RobertaForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path='./output/template_evtent_ft_3/checkpoint-921/',
    config=model_config, 
    cache_dir=CACHE_DIR, 
    ignore_mismatched_sizes=True,
)
model_best.resize_token_embeddings(len(tokenizer_best))
model_best = model_best.to(device)
model_best.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50275, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
 

In [83]:
trainer_eval = Trainer(
    model=model_best,
    tokenizer=tokenizer_best,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['dev'],
    compute_metrics=compute_metrics,
    callbacks=[earlystopping_callback],
)
trainer_eval

<transformers.trainer.Trainer at 0x7ff6351e3940>

In [84]:
# Predict the best checkpoint (the one with the lowest validation error) on pap_test.
preds_pap_best_model = trainer_eval.predict(pap_test)
preds_pap_best_model

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


PredictionOutput(predictions=array([[-0.07695419,  0.09097256],
       [ 1.10351   , -1.0288893 ],
       [ 0.3544017 , -0.4355814 ],
       [-1.6715767 ,  1.8748543 ],
       [ 0.6496273 , -0.6409176 ],
       [-0.43215907,  1.1121173 ],
       [ 0.53248394, -0.6034033 ],
       [ 1.7040869 , -1.5045595 ],
       [-0.81670874,  1.0158082 ],
       [-2.1329386 ,  2.829992  ],
       [ 0.6192957 , -0.6054772 ],
       [-0.26018614,  0.15806405],
       [ 0.24840078, -0.3205238 ],
       [ 0.46864712, -0.51844746],
       [ 0.35085532, -0.36399367],
       [-0.49660116,  0.66711694],
       [-1.261898  ,  1.999942  ],
       [ 0.72379684, -0.80886185],
       [ 0.57491815, -0.3234126 ],
       [ 2.2315798 , -2.168121  ],
       [-0.70688415,  1.0164164 ],
       [-0.80264354,  0.6064186 ],
       [ 0.98902154, -0.6367403 ],
       [ 0.1853594 , -0.14774019],
       [ 0.58409756, -0.44243842],
       [ 0.45929134, -0.7494145 ],
       [-1.2410796 ,  1.753201  ],
       [-0.30936986,  0.74

In [85]:
# Evaluate the best checkpoint (the one with the lowest validation error) on pap_test.
scores_pap_best_model = trainer_eval.evaluate(pap_test)
scores_pap_best_model

{'eval_loss': 0.7246501445770264,
 'eval_auc': 0.6590909090909091,
 'eval_precision': 0.7168141592920354,
 'eval_recall': 0.525974025974026,
 'eval_f1': 0.6067415730337079,
 'eval_accuracy': 0.6590909090909091,
 'eval_runtime': 7.512,
 'eval_samples_per_second': 41.001,
 'eval_steps_per_second': 2.662}

In [86]:
probs_pap_test_best_model = softmax(torch.tensor(preds_pap_best_model.predictions)).tolist()

y_pred_pap_test_best_model = np.argmax(preds_pap_best_model.predictions, axis=-1).tolist()

metrics_pap_test_best_model = preds_pap_best_model.metrics
metrics_pap_test_best_model['test_loss'] = scores_pap_best_model['eval_loss']
metrics_pap_test_best_model['test_runtime'] = scores_pap_best_model['eval_runtime']
metrics_pap_test_best_model['test_samples_per_second'] = scores_pap_best_model['eval_samples_per_second']
metrics_pap_test_best_model['test_steps_per_second'] = scores_pap_best_model['eval_steps_per_second']

with open('./evaluation_outputs/template_evtent_ft_3/preds_pap_best_ckpt.json', 'w') as file:
    json.dump({'probabilities': probs_pap_test_best_model, 
               'y_pred': y_pred_pap_test_best_model, 
               'y_true': preds_pap_best_model.label_ids.tolist(), 
               'metrics': metrics_pap_test_best_model,
              }, file)

In [87]:
# Predict the best checkpoint (the one with the highest auc) on pep_test.
preds_pep_best_model = trainer_eval.predict(pep_test)
preds_pep_best_model

PredictionOutput(predictions=array([[-0.37333322,  0.51198786],
       [-0.23042738,  0.8809738 ],
       [-1.3442569 ,  2.0817366 ],
       [-1.3686925 ,  2.1946526 ],
       [ 0.26165345, -0.06819616],
       [-1.3039305 ,  2.191734  ],
       [ 2.6200514 , -2.908581  ],
       [ 1.3393333 , -1.2048677 ],
       [ 2.053032  , -2.1532092 ],
       [ 0.90846664, -0.8564866 ],
       [ 0.01263479, -0.05534066],
       [-1.8231641 ,  2.5171142 ],
       [ 0.93718106, -0.63805896],
       [ 1.7193288 , -1.7538651 ],
       [ 1.8379629 , -1.788284  ],
       [ 1.831647  , -2.3450916 ],
       [ 2.2911506 , -2.4321241 ],
       [-1.2149854 ,  1.7858719 ],
       [ 0.6475396 , -0.5850425 ],
       [ 2.0737739 , -2.1893568 ],
       [-2.2252767 ,  2.984403  ],
       [-1.6022655 ,  2.2856288 ],
       [-0.5508145 ,  1.2032682 ],
       [-2.0608132 ,  2.9585154 ],
       [-0.9664281 ,  1.4796926 ],
       [-1.637783  ,  2.4487643 ],
       [-1.0616807 ,  1.3889828 ],
       [ 1.3336751 , -1.48

In [88]:
# Evaluate the best checkpoint (the one with the highest auc) on pep_test.
scores_pep_best_model = trainer_eval.evaluate(pep_test)
scores_pep_best_model

{'eval_loss': 0.3405836820602417,
 'eval_auc': 0.8828834564128681,
 'eval_precision': 0.8502994011976048,
 'eval_recall': 0.9281045751633987,
 'eval_f1': 0.8875000000000001,
 'eval_accuracy': 0.8827361563517915,
 'eval_runtime': 7.4134,
 'eval_samples_per_second': 41.412,
 'eval_steps_per_second': 2.698}

In [89]:
probs_pep_test_best_model = softmax(torch.tensor(preds_pep_best_model.predictions)).tolist()

y_pred_pep_test_best_model = np.argmax(preds_pep_best_model.predictions, axis=-1).tolist()

metrics_pep_test_best_model = preds_pep_best_model.metrics
metrics_pep_test_best_model['test_loss'] = scores_pep_best_model['eval_loss']
metrics_pep_test_best_model['test_runtime'] = scores_pep_best_model['eval_runtime']
metrics_pep_test_best_model['test_samples_per_second'] = scores_pep_best_model['eval_samples_per_second']
metrics_pep_test_best_model['test_steps_per_second'] = scores_pep_best_model['eval_steps_per_second']

with open('./evaluation_outputs/template_evtent_ft_3/preds_pep_best_ckpt.json', 'w') as file:
    json.dump({'probabilities': probs_pep_test_best_model, 
               'y_pred': y_pred_pep_test_best_model, 
               'y_true': preds_pep_best_model.label_ids.tolist(), 
               'metrics': metrics_pep_test_best_model,
              }, file)